I'm trying to use scrapy-spider on oneblockdown.it to get all the products from the latest products and to store them into a DB.
Some sites in my monitor are working, but some, such as OBD, are not working and are not uploading anything to the DB. This is my spider:
class OneBlockDownSpider(Spider):
    """Spider for the oneblockdown.it product listing (code as posted in the question).

    Starts from ``OneBlockDownURL``, builds one ``OneBlockDownItem`` per
    selected node and then re-queues the same URL so the page is polled again.
    """

    name = "OneBlockDownSpider"
    allowed_domains = ["oneblockdown.it"]
    start_urls = [OneBlockDownURL]

    def __init__(self):
        logging.critical("OneBlockDown STARTED.")

    def parse(self, response):
        """Build items from the product list and schedule the next poll.

        NOTE: the selector and the ``extract.first`` accesses are kept exactly
        as the asker wrote them; they are the issues discussed in the answer
        that follows this code.
        """
        products = Selector(response).xpath("//div[@id='product-list']")
        for product in products:
            item = OneBlockDownItem()
            item['name'] = product.xpath('.//div[@class="catalogue-product-title"]//h3').extract.first
            item['link'] = product.xpath('.//div[@class="catalogue-product-title"]//h3/a/@href').extract.first
            # # item['image'] = "http:" + product.xpath("/div[@class='catalogue-product-cover']/a[@class='catalogue-product-cover-image']/img/@src").extract()[0]
            # item['size'] = '**NOT SUPPORTED YET**'
            yield item

        # dont_filter=True lets the already-seen URL through the duplicate filter.
        yield Request(OneBlockDownURL, callback=self.parse, dont_filter=True, priority=15)
I guess I'm using the wrong xpath, but I can't solve it
First of all the site is Cloudflare protected (prevent scraping).
Also you have several issues with your code:
Your products is a single node
You're using extract.first instead of extract_first()
products = response.xpath("//div[#id='product-list']/div")
for product in products:
item = OneBlockDownItem()
item['name'] = product.xpath('.//div[#class="catalogue-product-title"]//h3').extract_first()
item['link'] = product.xpath('.//div[#class="catalogue-product-title"]//h3/a/#href').extract_first()
yield item
You should start all your xpaths with '.' when using a relative selector like product:
# The leading '.' makes the query relative to the current product node.
item['image'] = "http:" + product.xpath("./div[@class='catalogue-product-cover']/a[@class='catalogue-product-cover-image']/img/@src").extract()[0]
Otherwise, it will try to get the element with this xpath: /body/div[@class='catalogue-product-cover']
Related
So I've managed to write a spider that extracts the download links of "Videos" and "English Transcripts" from this site. Looking at the cmd window, I can see that all the correct information has been scraped.
The issue I am having is that the output csv file only contains the "Video" links and not the "English Transcripts" links (even though you can see that it's been scraped in the cmd window).
I've tried a few suggestions from other posts but none of them seem to work.
The following picture is how I'd like the output to look like:
CSV Output Picture
this is my current spider code:
import scrapy
class SuhbaSpider(scrapy.Spider):
    """Spider for saltanat.org video pages (code as posted in the question).

    Yields one dict holding all video links of a listing page, then requests
    the transcript page of every video and yields a separate dict per
    transcript response.
    """

    name = "suhba2"
    start_urls = ["http://saltanat.org/videos.php?topic=SheikhBahauddin&gopage={numb}".format(numb=numb)
                  for numb in range(1, 3)]

    def parse(self, response):
        """Yield the page's video links and follow each transcript URL."""
        yield {
            "video": response.xpath("//span[@class='download make-cursor']/a/@href").extract(),
        }
        fullvideoid = response.xpath("//span[@class='media-info make-cursor']/@onclick").extract()
        for videoid in fullvideoid:
            # The slice keeps the part of the onclick value used as the `vid` parameter.
            url = ("http://saltanat.org/ajax_transcription.php?vid=" + videoid[21:-2])
            yield scrapy.Request(url, callback=self.parse_transcript)

    def parse_transcript(self, response):
        """Yield the hrefs on the transcript page that contain 'english'."""
        yield {
            "transcript": response.xpath("//a[contains(@href,'english')]/@href").extract(),
        }
You are yielding two different kinds of items - one containing just video attribute and one containing just transcript attribute. You have to yield one kind of item composed of both attributes. For that, you have to create item in parse and pass it to second level request using meta. Then, in the parse_transcript, you take it from meta, fill additional data and finally yield the item. The general pattern is described in Scrapy documentation.
The second thing is that you extract all videos at once using extract() method. This yields a list where it's hard afterwards to link each individual element with corresponding transcript. Better approach is to loop over each individual video element in the HTML and yield item for each video.
Applied to your example:
import scrapy
class SuhbaSpider(scrapy.Spider):
    """Spider for saltanat.org that yields one item per video.

    Each item carries the video download link and, once the transcript page
    has been fetched, the link to the English transcript.
    """

    name = "suhba2"
    start_urls = ["http://saltanat.org/videos.php?topic=SheikhBahauddin&gopage={numb}".format(numb=numb) for numb in range(1, 3)]

    def parse(self, response):
        """Create an item per video row and request its transcript page."""
        for video in response.xpath("//tr[@class='video-doclet-row']"):
            item = dict()
            item["video"] = video.xpath(".//span[@class='download make-cursor']/a/@href").extract_first()
            videoid = video.xpath(".//span[@class='media-info make-cursor']/@onclick").extract_first()
            if videoid is None:
                # No onclick attribute on this row: there is nothing to build a
                # transcript URL from, so emit the item without a transcript.
                yield item
                continue
            # The slice keeps the part of the onclick value used as the `vid` parameter.
            url = "http://saltanat.org/ajax_transcription.php?vid=" + videoid[21:-2]
            request = scrapy.Request(url, callback=self.parse_transcript)
            # Carry the partially filled item over to the transcript callback.
            request.meta['item'] = item
            yield request

    def parse_transcript(self, response):
        """Complete the item passed via ``meta`` with the transcript link."""
        item = response.meta['item']
        item["transcript"] = response.xpath("//a[contains(@href,'english')]/@href").extract_first()
        yield item
I am trying to build this crawler to get housing data from craigslist,
but the crawler stops after fetching the first page and does not go to the next page.
Here is the code. It works for the first page, but for the love of god I don't understand why it does not get to the next page. Any insight is really appreciated. I followed this part from the Scrapy tutorial.
import scrapy
import re
from scrapy.linkextractors import LinkExtractor
class QuotesSpider(scrapy.Spider):
    """Craigslist Vancouver housing spider (code as posted in the question).

    ``parse`` walks the result rows of a search page and requests every
    posting; ``parse_second`` adds the posting details to the data carried in
    ``meta``.
    """

    name = "craigslistmm"
    start_urls = [
        "https://vancouver.craigslist.ca/search/hhh"
    ]

    def parse_second(self, response):
        """Return the request meta extended with the posting's details."""
        # need all the info in a dict
        meta_dict = response.meta
        for q in response.css("section.page-container"):
            meta_dict["post_details"] = {
                "location": {
                    "longitude": q.css("div.mapAndAttrs div.mapbox div.viewposting::attr(data-longitude)").extract(),
                    "latitude": q.css("div.mapAndAttrs div.mapbox div.viewposting::attr(data-latitude)").extract(),
                },
                "detailed_info": ' '.join(q.css('section#postingbody::text').extract()).strip(),
            }
        return meta_dict

    def parse(self, response):
        """Request every posting on the page, then follow the next page link."""
        # Raw string: same pattern text as before, without invalid string escapes.
        pattern = re.compile(r"\/([a-z]+)\/([a-z]+)\/.+")
        for q in response.css("li.result-row"):
            post_urls = q.css("p.result-info a::attr(href)").extract_first()
            mm = re.match(pattern, post_urls)
            next_url = "https://vancouver.craigslist.ca/" + post_urls
            request = scrapy.Request(next_url, callback=self.parse_second)
            #next_page = response.xpath('.//a[@class="button next"]/@href').extract_first()
            #follow_url = "https://vancouver.craigslist.ca/" + next_page
            #request1 = scrapy.Request(follow_url,callback=self.parse)
            #yield response.follow(next_page,callback = self.parse)
            request.meta['id'] = q.css("li.result-row::attr(data-pid)").extract_first()
            request.meta['pricevaluation'] = q.css("p.result-info span.result-meta span.result-price::text").extract_first()
            request.meta["information"] = q.css("p.result-info span.result-meta span.housing::text").extract_first()
            request.meta["neighborhood"] = q.css("p.result-info span.result-meta span.result-hood::text").extract_first()
            request.meta["area"] = mm.group(1)
            request.meta["adtype"] = mm.group(2)
            yield request
            #yield scrapy.Request(follow_url, callback=self.parse)

        # NOTE: this next-page extraction is kept as posted; it is the part the
        # answer below identifies as the problem.
        next_page = LinkExtractor(allow=r"s=\d+").extract_links(response)[0]
        # = "https://vancouver.craigslist.ca/" + next_page
        yield response.follow(next_page.url, callback=self.parse)
The problem seems to be with the next_page extraction using LinkExtractor. If you look in the log, you'll see duplicate requests being filtered. There are more links on the page that satisfy your extraction rule, and they may not be extracted in any particular order (or not in the order you wish).
I think a better approach is to extract exactly the information you want. Try it with this:
# Take the href of the link whose text contains "next" inside the buttons span.
next_page = response.xpath('//span[@class="buttons"]//a[contains(., "next")]/@href').extract_first()
I created a python scrapy project to extract the prices of some google flights.
I configured the middleware to use PhantomJS instead of a normal browser.
class JSMiddleware(object):
    """Downloader middleware that loads each request in PhantomJS."""

    def process_request(self, request, spider):
        """Load ``request.url`` in PhantomJS and return the page as an HtmlResponse.

        Raises:
            ValueError: if loading the URL in the browser fails.
        """
        driver = webdriver.PhantomJS()
        try:
            try:
                driver.get(request.url)
                # Fixed pause after loading, before the page source is read.
                time.sleep(1.5)
            except Exception as e:
                raise ValueError(
                    "request url failed - \n url: {},\n error: {}".format(request.url, e)
                )
            body = driver.page_source
            current_url = driver.current_url
        finally:
            # Always shut the browser down; otherwise one PhantomJS process
            # is left running per request.
            driver.quit()
        #encoding='utf-8' - add to html response if necessary
        return HtmlResponse(current_url, body=body, encoding='utf-8',
                            request=request)
In the settings.py i added:
DOWNLOADER_MIDDLEWARES = {
    # key: path to the middleware class, value: order of the middleware
    'scraper_module.middlewares.middleware.JSMiddleware': 543,
    # disable the built-in user-agent middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
I also created the following spider class:
import scrapy
from scrapy import Selector
class Gspider(scrapy.Spider):
    """Spider that prints prices and links found on a Google Flights search page.

    NOTE(review): the string handling below (``encode`` followed by
    ``filter(str.isdigit, ...)``) looks written for Python 2 - confirm before
    running it on Python 3.
    """

    name = "google_spider"

    def __init__(self):
        self.start_urls = ["https://www.google.pt/flights/#search;f=LIS;t=POR;d=2017-06-18;r=2017-06-22"]
        self.prices = []
        self.links = []

    def clean_price(self, part):
        """Keep only the digit characters of the first element of ``part``."""
        #part received as a list
        #the encoding is utf-8
        part = part[0]
        part = part.encode('utf-8')
        part = filter(str.isdigit, part)
        return part

    def clean_link(self, part):
        """Return the first element of ``part`` encoded as UTF-8."""
        part = part[0]
        part = part.encode('utf-8')
        return part

    def get_part(self, var_holder, response, marker, inner_marker, amount = 1):
        """Append up to ``amount`` cleaned values to ``var_holder`` and return it.

        ``marker`` selects the containing nodes and ``inner_marker`` the value
        inside each one; '::text' values are cleaned as prices, all others as
        links.
        """
        selector = Selector(response)
        divs = selector.css(marker)
        for n, div in enumerate(divs):
            if n < amount:
                part = div.css(inner_marker).extract()
                if inner_marker == '::text':
                    part = self.clean_price(part)
                else:
                    part = self.clean_link(part)
                var_holder.append(part)
            else:
                break
        return var_holder

    def parse(self, response):
        """Extract and print the prices and the links of the response."""
        prices, links = [], []
        prices = self.get_part(prices, response, 'div.OMOBOQD-d-Ab', '::text')
        # print(x) with a single argument behaves the same on Python 2 and 3.
        print(prices)
        links = self.get_part(links, response, 'a.OMOBOQD-d-X', 'a::attr(href)')
        print(links)
The problem is, I run the code in the shell, and around half of the times I successfully get the prices and links requested, but another half of the time, the final vectors which should contain the extracted data, are empty.
I do not get any errors during execution.
Does anyone have any idea about why this is happening?
here are the logs from the command line:
Google has a very strict policy in terms of crawling. (Pretty hypocritical when you know that they constantly crawl the whole web...)
You should either find an API, as said previously in the comments, or maybe use proxies. An easy way is to use Crawlera. It manages thousands of proxies so you don't have to bother. I personally use it to crawl Google and it works perfectly. The downside is that it is not free.
I have built a spider which gets data from one category. The method it follows is: the category page is specified in start_urls, and start_requests is defined for pagination, iterating over the link provided by the XHR request. Since I wanted to get all the categories at once, I have written the code like this. My logic was to first get all category links and append to them the XHR query string, which is the same for every category (?from=24&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu), then pass these appended URLs to start_requests and iterate over them for pagination and item parsing. But I am not able to run the spider because it throws the "missing scheme" error, since in start_requests I have not provided the http:// part. I am stuck on how I should solve this issue. Please help.
class JabcatSpider(scrapy.Spider):
    """Spider for trendin.com category pages (code as posted in the question).

    NOTE: ``start_requests`` still builds URLs without scheme or host; that is
    the "missing scheme" error the question asks about and is kept as posted.
    """

    name = "jabcat"
    allowed_domains = ["trendin.com"]
    start_urls = [
        'http://www.trendin.com',
    ]
    max_pages = 400

    # NOTE: this method is replaced by the second `parse` defined further down
    # in the class body, so it is never the one that runs.
    def parse(self, response):
        urls = response.xpath('//div[@class = "men"]//@href').extract()
        for url in urls:
            urljoin = (url + "/" "?from=24&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu")
            #yield scrapy.Request(urljoin, callback=self.start_requests)
            # print(x) with a single argument behaves the same on Python 2 and 3.
            print(urljoin)

    def start_requests(self):
        """Yield one request per page offset from 0 up to ``max_pages``."""
        for i in range(self.max_pages):
            yield scrapy.Request('?from=%d&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu' % i, callback=self.parse)

    def parse(self, response):
        """Follow every product link found in the product rows."""
        for href in response.xpath('//*[@id="product_rows"]/div/div/div/a/@href'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_detail_page)

    def parse_detail_page(self, response):
        """Fill a ``Jabongo`` item with the product title (item is not returned yet)."""
        for sel in response.xpath('//*[@id="catalog-product"]/section[2]'):
            item = Jabongo()
            item['title'] = response.xpath('//*[@id="product-details-wrapper"]/div[1]/div[2]/div/div[1]/h1/span[2]/text()').extract()
            # item['price'] = response.xpath('//*[@id="pdp-price-info"]/span[2]/text()').extract()
            # item['image'] = response.xpath('//*[@class="content"]/h1/span[2]/text()').extract()
            # # item['color'] = sel.xpath('//ul/li/label[.="Color"]/following-sibling::Span/text()').extract()
            # return item
            #pattern = response.xpath('//*[@class="content"]/h1/span[2]/text()').extract
As a beginner I'm having a hard time, so I'm here to ask for help.
I'm trying to extract prices from the html page, which are nested deeply:
second price location:
from scrapy.spider import Spider
from scrapy.selector import Selector
from mymarket.items import MymarketItem
class MySpider(Spider):
    """Spider that collects prices from a product table (code as posted in the question)."""

    name = "mymarket"
    allowed_domains = ["url"]
    start_urls = [
        "http://url"
    ]

    def parse(self, response):
        """Return one ``MymarketItem`` per selected table row.

        NOTE: the price query is kept as the asker wrote it; the answer below
        the question explains what is wrong with it.
        """
        sel = Selector(response)
        titles = sel.xpath('//table[@class="tab_product_list"]//tr')
        items = []
        for t in titles:
            item = MymarketItem()
            item["price"] = t.xpath('//tr//span[2]/text()').extract()
            items.append(item)
        return items
I'm trying to export scraped prices to csv. they do export but are being populated like this:
And I want them to be sorted like this in .csv:
etc.
Can anybody point out where is the faulty part of the xpath or how I can make prices be sorted "properly" ?
It's difficult to say what's wrong with the path. Install firepath extension for Firefox to test your xpath queries. One note for now:
titles = sel.xpath('//table[@class="tab_product_list"]//tr')
In your screenshot you have nested tables, so //tr will give trs from nested tables too.
def parse(self, response):
    """Return one ``MymarketItem`` per direct row of the product table.

    Raises:
        IndexError: if a row contains no matching price span.
    """
    sel = Selector(response)
    # '/tr' selects only direct child rows, so rows of nested tables are skipped.
    titles = sel.xpath('//table[@class="tab_product_list"]/tr')  # or with tbody
    items = []
    for t in titles:
        item = MymarketItem()
        # The leading '.' keeps the query relative to the current row.
        item["price"] = t.xpath('.//span[@style="color:red;"]/text()').extract()[0]
        items.append(item)
    return items
.extract() returns a list even if only one element is found, so take the first element of the list with .extract()[0].