Scrapy crawler not recursively crawling next page - python-2.7

I am trying to build this crawler to get housing data from craigslist,
but the crawler stops after fetching the first page and does not go to the next page .
Here is the code , it works for the first page ,but for the love of god I dont understand why it does not get to the next page .Any insight is really appreciated .I followed this part from scrapy tutorial
import scrapy
import re
from scrapy.linkextractors import LinkExtractor
class QuotesSpider(scrapy.Spider):
name = "craigslistmm"
start_urls = [
"https://vancouver.craigslist.ca/search/hhh"
]
def parse_second(self,response):
#need all the info in a dict
meta_dict = response.meta
for q in response.css("section.page-container"):
meta_dict["post_details"]= {
"location":
{"longitude":q.css("div.mapAndAttrs div.mapbox div.viewposting::attr(data-longitude)" ).extract(),
"latitude":q.css("div.mapAndAttrs div.mapbox div.viewposting::attr(data-latitude)" ).extract()},
"detailed_info": ' '.join(q.css('section#postingbody::text').extract()).strip()
}
return meta_dict
def parse(self, response):
pattern = re.compile("\/([a-z]+)\/([a-z]+)\/.+")
for q in response.css("li.result-row"):
post_urls = q.css("p.result-info a::attr(href)").extract_first()
mm = re.match(pattern, post_urls)
neighborhood= q.css("p.result-info span.result-meta span.result-hood::text").extract_first()
next_url = "https://vancouver.craigslist.ca/"+ post_urls
request = scrapy.Request(next_url,callback=self.parse_second)
#next_page = response.xpath('.//a[#class="button next"]/#href').extract_first()
#follow_url = "https://vancouver.craigslist.ca/" + next_page
#request1 = scrapy.Request(follow_url,callback=self.parse)
#yield response.follow(next_page,callback = self.parse)
request.meta['id'] = q.css("li.result-row::attr(data-pid)").extract_first()
request.meta['pricevaluation'] = q.css("p.result-info span.result-meta span.result-price::text").extract_first()
request.meta["information"] = q.css("p.result-info span.result-meta span.housing::text" ).extract_first()
request.meta["neighborhood"] =q.css("p.result-info span.result-meta span.result-hood::text").extract_first()
request.meta["area"] = mm.group(1)
request.meta["adtype"] = mm.group(2)
yield request
#yield scrapy.Request(follow_url, callback=self.parse)
next_page = LinkExtractor(allow="s=\d+").extract_links(response)[0]
# = "https://vancouver.craigslist.ca/" + next_page
yield response.follow(next_page.url,callback=self.parse)

The problem seems to be with the next_page extraction using LinkExtractor. If you look in the look, you'll see duplicate requests being filtered. There are more links on the page that satisfy your extraction rule and maybe they are not extracted in any particular order (or not in the order you wish).
I think better approach is to extract exactly the information you want, try it with this:
next_page = response.xpath('//span[#class="buttons"]//a[contains(., "next")]/#href').extract_first()

Related

How should I be formatting my yield requests?

My scrapy spider is very confused, or I am, but one of us is not working as intended. My spider pulls start url's from a file and is supposed to: Start on an Amazon search page, crawl the page and grab the url's of each search result, follow the link to the items page, crawl the items page for information on the item, once all items have been crawled on the first page follow pagination up to page X, rinse and repeat.
I am using ScraperAPI and Scrapy-user-agent to randomize my middlewares. I have formatted my start_requests with a priority based on their index in the file, so they should be crawled in order. I have checked and ensured that I AM receiving a successful 200 html response with the actual html from the Amazon page. Here is the code for the spider:
class AmazonSpiderSpider(scrapy.Spider):
name = 'amazon_spider'
page_number = 2
current_keyword = 0
keyword_list = []
payload = {'api_key': 'mykey', 'url':'https://httpbin.org/ip'}
r = requests.get('http://api.scraperapi.com', params=payload)
print(r.text)
#/////////////////////////////////////////////////////////////////////
def start_requests(self):
with open("keywords.txt") as f:
for index, line in enumerate(f):
try:
keyword = line.strip()
AmazonSpiderSpider.keyword_list.append(keyword)
formatted_keyword = keyword.replace(' ', '+')
url = "http://api.scraperapi.com/?api_key=mykey&url=https://www.amazon.com/s?k=" + formatted_keyword + "&ref=nb_sb_noss_2"
yield scrapy.Request(url, meta={'priority': index})
except:
continue
#/////////////////////////////////////////////////////////////////////
def parse(self, response):
print("========== starting parse ===========")
for next_page in response.css("h2.a-size-mini a").xpath("#href").extract():
if next_page is not None:
if "https://www.amazon.com" not in next_page:
next_page = "https://www.amazon.com" + next_page
yield scrapy.Request('http://api.scraperapi.com/?api_key=mykey&url=' + next_page, callback=self.parse_dir_contents)
second_page = response.css('li.a-last a').xpath("#href").extract_first()
if second_page is not None and AmazonSpiderSpider.page_number < 3:
AmazonSpiderSpider.page_number += 1
yield scrapy.Request('http://api.scraperapi.com/?api_key=mykey&url=' + second_page, callback=self.parse_pagination)
else:
AmazonSpiderSpider.current_keyword = AmazonSpiderSpider.current_keyword + 1
#/////////////////////////////////////////////////////////////////////
def parse_pagination(self, response):
print("========== starting pagination ===========")
for next_page in response.css("h2.a-size-mini a").xpath("#href").extract():
if next_page is not None:
if "https://www.amazon.com" not in next_page:
next_page = "https://www.amazon.com" + next_page
yield scrapy.Request(
'http://api.scraperapi.com/?api_key=mykey&url=' + next_page,
callback=self.parse_dir_contents)
second_page = response.css('li.a-last a').xpath("#href").extract_first()
if second_page is not None and AmazonSpiderSpider.page_number < 3:
AmazonSpiderSpider.page_number += 1
yield scrapy.Request(
'http://api.scraperapi.com/?api_key=mykey&url=' + second_page,
callback=self.parse_pagination)
else:
AmazonSpiderSpider.current_keyword = AmazonSpiderSpider.current_keyword + 1
#/////////////////////////////////////////////////////////////////////
def parse_dir_contents(self, response):
items = ScrapeAmazonItem()
print("============= parsing page ==============")
temp = response.css('#productTitle::text').extract()
product_name = ''.join(temp)
product_name = product_name.replace('\n', '')
product_name = product_name.strip()
temp = response.css('#priceblock_ourprice::text').extract()
product_price = ''.join(temp)
product_price = product_price.replace('\n', '')
product_price = product_price.strip()
temp = response.css('#SalesRank::text').extract()
product_score = ''.join(temp)
product_score = product_score.strip()
product_score = re.sub(r'\D', '', product_score)
product_ASIN = response.css('li:nth-child(2) .a-text-bold+ span').css('::text').extract()
keyword = AmazonSpiderSpider.keyword_list[AmazonSpiderSpider.current_keyword]
items['product_keyword'] = keyword
items['product_ASIN'] = product_ASIN
items['product_name'] = product_name
items['product_price'] = product_price
items['product_score'] = product_score
yield items
For the FIRST start url, it will crawl three or four items and then it will jump to the SECOND start url. It will skip processing the remaining items and pagination pages, going directly to the second start url. For the second url, it will crawl three or four items, then it again will skip to the THIRD start url. It continues in this way, grabbing three or four items, then skipping to the next URL until it reaches the final start url. It will completely gather all information on this URL. Sometimes the spider COMPLETELY SKIPS the first or second starting url. This happens infrequently, but I have no idea as to what could cause this.
My code for following result item URL's works fine, but I never get the print statement for "starting pagination" so it is not correctly following pages. Also, there is something odd with middlewares. It begins parsing before it has assigned a middleware

How run an spider sequentially to sites that use session in scrapy

I wanna scrape a web page that first send an AjaxFormPost that open a session and next send an _SearchResultGridPopulate to populate a control that I need to scrape, the response is a json.
this is a fragment of my code:
def parse_AjaxFormPost(self, response):
self.logger.info("parse_AjaxFormPost")
page = response.meta['page']
header = {
'Accept':'*/*',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'en-US,en;q=0.8',
'Connection':'keep-alive',
'Content-Length':'14',
'Content-Type':'application/x-www-form-urlencoded',
'Cookie':'ASP.NET_SessionId=gq4dgcsl500y32xb1n2ciexq',
.
.
.
}
url = '<url>/Search/AjaxFormPost'
cities = ['city1','city2',...]
for city in cities:
formData = {
'City':city
}
re = scrapy.FormRequest(
url,
formdata=formData,
headers=header,
dont_filter=True,
callback=self.parse_GridPopulate
)
yield re
def parse_GridPopulate(self,response):
self.logger.info("parse_LookupPermitTypeDetails")
url = '<url>/Search//_SearchResultGridPopulate?Grid-page=2&Grid-size=10&Grid-CERT_KEYSIZE=128&Grid-CERT_SECRETKEYSIZE=2048&Grid-HTTPS_KEYSIZE=128&Grid-HTTPS_SECRETKEYSIZE=2048'
header = {
'Accept':'*/*',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'en-US,en;q=0.8',
'Connection':'keep-alive',
'Content-Length':'23',
'Content-Type':'application/x-www-form-urlencoded',
'Cookie':'ASP.NET_SessionId=gq4dgcsl500y32xb1n2ciexq',
.
.
.
}
formData = {
'page':'1',
'size':'10'
}
re = scrapy.FormRequest(
url,
formdata=formData,
headers=header,
dont_filter=True,
callback=self.parse
)
yield re
def parse(self, response):
self.logger.info("parse_permit")
data_json = json.loads(response.body)
for row in data_json["data"]:
self.logger.info(row)
item = RedmondPermitItem()
item["item1"] = row["item1"]
item["item2"] = row["item2"]
yield item
The problem is that scrapy do request concurrent and when and the request in parse_AjaxFormPost open a session so when pass to the parse_LookupPermitTypeDetails I got the session of the last request do it in parse_AjaxFormPost. So if I have 10 cities at the end I got 10 times the information of the last city.
In settings I changed the configuration:
CONCURRENT_REQUESTS_PER_DOMAIN = 1
CONCURRENT_REQUESTS_PER_IP = 1
And it doesn't work. On other hand I thought in run the spider only for one city every time something like
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
class MySpider(scrapy.Spider):
# Your first spider definition
...
...
configure_logging()
runner = CrawlerRunner()
#defer.inlineCallbacks
def crawl():
cities = ['city1','city2',...]
for city in cities:
yield runner.crawl(MySpider1,city=city)
reactor.stop()
crawl()
reactor.run() # the script will block here until the last crawl call is finished
Maybe this can be the only one solution, but I'm not sure. I would like to create a procedure for every site with this characteristic.
Any suggestion about how solve that, is possible to achive this only configuring settings.
thanks in advance.
Update1
I change the title because is important that is for sites that use session
This is a problem of understanding how concurrency works, as this isn't parallelism you can still work sequentially, but between callbacks. I would suggest something like this:
def parse_AjaxFormPost(self, response):
...
cities = ['city1','city2',...]
formData = {
'City':cities[0]
}
re = scrapy.FormRequest(
url,
formdata=formData,
headers=header,
dont_filter=True,
callback=self.parse_remaining_cities,
meta={'remaining_cities': cities[1:]}, # check the meta argument
)
yield re
def parse_remaining_cities(self, response):
remaining_cities = response.meta['remaining_cities']
current_city = remaining_cities[0]
...
yield Request(
...,
meta={'remaining_cities': remaining_cities[1:]},
callback=self.parse_remaining_cities)
This way you are doing one request at a time and in a row from city to city.

Python scrapy working (only half of the time)

I created a python scrapy project to extract the prices of some google flights.
I configured the middleware to use PhantomJS instead of a normal browser.
class JSMiddleware(object):
def process_request(self, request, spider):
driver = webdriver.PhantomJS()
try:
driver.get(request.url)
time.sleep(1.5)
except e:
raise ValueError("request url failed - \n url: {},\n error:
{}").format(request.url, e)
body = driver.page_source
#encoding='utf-8' - add to html response if necessary
return HtmlResponse(driver.current_url, body=body,encoding='utf-8',
request=request)
In the settings.py i added:
DOWNLOADER_MIDDLEWARES = {
# key path intermediate class, order value of middleware
'scraper_module.middlewares.middleware.JSMiddleware' : 543 ,
# prohibit the built-in middleware
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware' : None , } `
I also created the following spider class:
import scrapy
from scrapy import Selector
class Gspider(scrapy.Spider):
name = "google_spider"
def __init__(self):
self.start_urls = ["https://www.google.pt/flights/#search;f=LIS;t=POR;d=2017-06-18;r=2017-06-22"]
self.prices = []
self.links = []
def clean_price(self, part):
#part received as a list
#the encoding is utf-8
part = part[0]
part = part.encode('utf-8')
part = filter(str.isdigit, part)
return part
def clean_link(self, part):
part = part[0]
part = part.encode('utf-8')
return part
def get_part(self, var_holder, response, marker, inner_marker, amount = 1):
selector = Selector(response)
divs = selector.css(marker)
for n, div in enumerate(divs):
if n < amount:
part = div.css(inner_marker).extract()
if inner_marker == '::text':
part = self.clean_price(part)
else:
part = self.clean_link(part)
var_holder.append(part)
else:
break
return var_holder
def parse(self, response):
prices, links = [], []
prices = self.get_part(prices, response, 'div.OMOBOQD-d-Ab', '::text')
print prices
links = self.get_part(links, response, 'a.OMOBOQD-d-X', 'a::attr(href)')
print links
The problem is, I run the code in the shell, and around half of the times I successfully get the prices and links requested, but another half of the time, the final vectors which should contain the extracted data, are empty.
I do not get any errors during execution.
Does anyone have any idea about why this is happening?
here are the logs from the command line:
Google has a very strict policy in terms of crawling. (Pretty hypocritical when you know that they constently crawl all the web...)
You should either find an API, as said previously in the comments or maybe use proxies. An easy way is to use Crawlera. It manages thousands of proxies so you don't have to bother. I personnaly use it to crawl google and it works perfectly. The downside is that it is not free.

Adding xhr links to scraped categories hrefs missing scheme error

i have built a spider which gets data from one category , the method it follows is when the category page is specified in start url and defining start_requests for pagination which iterates over the link provided by xhr request. Since i wanted to get all the categories at once i have written code like this. my logic was to first get all category links and append those links with xhr links which follows same string for every category which is (?from=24&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu) and parse these appended url to start_request and iterate them for pagination and item parsing . but i am not able to run spider because it throws the missing scheme error since in start request i havenot provided the http:// i am stuck onto how should i solve this issue please help..
class JabcatSpider(scrapy.Spider):
name = "jabcat"
allowed_domains = ["trendin.com"]
start_urls = [
'http://www.trendin.com',
]
max_pages = 400
def parse(self,response):
urls = response.xpath('//div[#class = "men"]//#href').extract()
for url in urls:
urljoin=(url + "/" "?from=24&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu")
#yield scrapy.Request(urljoin, callback=self.start_requests)
print urljoin
def start_requests(self):
for i in range(self.max_pages):
yield scrapy.Request('?from=%d&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu' % i, callback=self.parse)
def parse(self, response):
for href in response.xpath('//*[#id="product_rows"]/div/div/div/a/#href'):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_detail_page)
def parse_detail_page(self, response):
for sel in response.xpath('//*[#id="catalog-product"]/section[2]'):
item = Jabongo()
item['title'] = response.xpath('//*[#id="product-details-wrapper"]/div[1]/div[2]/div/div[1]/h1/span[2]/text()').extract()
# item['price'] = response.xpath('//*[#id="pdp-price-info"]/span[2]/text()').extract()
# item['image'] = response.xpath('//*[#class="content"]/h1/span[2]/text()').extract()
# # item['color'] = sel.xpath('//ul/li/label[.="Color"]/following-sibling::Span/text()').extract()
# return item
#pattern = response.xpath('//*[#class="content"]/h1/span[2]/text()').extract

crawlSpider seems not to follow rule

here's my code. Actually I followed the example in "Recursively Scraping Web Pages With Scrapy" and it seems I have included a mistake somewhere.
Can someone help me find it, please? It's driving me crazy, I only want all the results from all the result pages. Instead it gives me the results from page 1.
Here's my code:
import scrapy
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http.request import Request
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from githubScrape.items import GithubscrapeItem
class GithubSpider(CrawlSpider):
name = "github2"
allowed_domains = ["github.com"]
rules = (
Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//*[contains(#class, "next_page")]')), callback='parse_items', follow=True),
)
def start_requests(self):
baseURL = 'https://github.com/search?utf8=%E2%9C%93&q=eagle+SYSTEM+extension%3Asch+size%3A'
for i in range(10000, 20000, +5000):
url = baseURL+str(i+1)+".."+str(i+5000)+'&type=Code&ref=searchresults'
print "URL:",url
yield Request(url, callback=self.parse_items)
def parse_items(self, response):
hxs = Selector(response)
resultParagraphs = hxs.xpath('//div[contains(#id,"code_search_results")]//p[contains(#class, "title")]')
items = []
for p in resultParagraphs:
hrefs = p.xpath('a/#href').extract()
projectURL = hrefs[0]
schemeURL = hrefs[1]
lastIndexedOn = p.xpath('.//span/time/#datetime').extract()
i = GithubscrapeItem()
i['counter'] = self.count
i['projectURL'] = projectURL
i['schemeURL'] = schemeURL
i['lastIndexedOn'] = lastIndexedOn
items.append(i)
return(items)
I didn't find your code on the link you passed, but I think the problem is that you are never using the rules.
Scrapy starts crawling by calling the start_requests method, but the rules are compiled and used on the parse method, which you are not using because your requests go directly from start_requests to parse_items.
You could remove the callback on the start_requests method if you want the rules to be applied on that level.