Scrapy: Crawls pages but scrapes 0 items - python-2.7

I am trying to scrape baseball-reference.com. In the Scrapy bot I created, I start from the first page, navigate to different links, and from there to a third link. Please find the code below:
class VisitorBattingSpider(InitSpider):
    name = 'VisitorBatting'
    year = str(datetime.datetime.today().year)
    allowed_domains = ["baseball-reference.com"]
    start = 'http://www.baseball-reference.com/boxes/' + year + '.shtml'
    start_urls = [start]
    #rules = [Rule(LinkExtractor(allow=['/play-index/st.cgi?date=\d+-\d+-\d+']), callback='parse_item',)]

    def __init__(self):
        BaseSpider.__init__(self)
        # use any browser you wish
        self.browser = webdriver.Firefox()

    def __del__(self):
        self.browser.close()

    def parse(self, response):
        self.browser.get(response.url)
        # let JavaScript load
        time.sleep(15)
        page = Selector(text=self.browser.page_source)
        #page=Selector(response)
        sites = page.xpath('//*[@id="2016"]/tbody/tr/td/table/tbody/tr/td/a/@href')
        for site in sites:
            tree = site.extract()
            yield Request(url='http://www.baseball-reference.com'+tree, callback=self.parse_new, dont_filter=True)
        self.browser.close()

    def parse_new(self, response):
        hxs = Selector(response)
        loads = hxs.xpath('/html/body/pre/a/@href')
        for load in loads:
            branch = load.extract()
            if 'boxes' in branch:
                yield Request(url='http://www.baseball-reference.com'+branch, callback=self.parse_final, dont_filter=True)

    def parse_final(self, response):
        self.browser.get(response.url)
        fxs = Selector(text=self.browser.page_source)
        vi = fxs.xpath('html/body/div/div[3]/div[1]/div[1]/h3/text()').extract()
        vis = ''.join(vi)
        if "." in vis:
            visitor = vis.replace(".", "")
        else:
            visitor = vis
        visitor_id = visitor.replace(" ", "")
        print visitor_id
        UR = response.url
        URL = ''.join(UR)
        dtt = URL[-15:]
        dt = dtt[:8]
        day = datetime.datetime(int(dt[:4]), int(dt[5:6]), int(dt[-2:]), 01, 01, 01).weekday()
        path = '//*[@id="' + visitor_id + 'batting"]/tfoot/tr'
        webs = fxs.xpath(path)
        items = []
        for web in webs:
            item = VisitorbattingItem()
            item['ID'] = response.url
            item['AWAY_TEAM'] = visitor_id
            item['GAME_DT'] = dt
            item['GAME_DY'] = day
            item['AWAY_GAME'] = 1
            item['AWAY_SCORE_CT'] = web.xpath("td[3]/text()").extract()
            item['MINUTES_GAME_CT'] = fxs.xpath('//*[@id="gametime"]/text()').extract()
            item['AWAY_AB'] = web.xpath("td[2]/span/text()").extract()
            item['AWAY_HITS'] = web.xpath("td[4]/text()").extract()
            item['AWAY_DO'] = fxs.xpath('//*[@id="2Bvisitor"]/text()').extract()
            item['AWAY_TR'] = fxs.xpath('//*[@id="3Bvisitor"]/text()').extract()
            item['AWAY_RBI'] = web.xpath("td[5]/text()").extract()
            item['AWAY_HBP'] = fxs.xpath('//*[@id="HBPvisitor"]/text()').extract()
            item['AWAY_SB'] = fxs.xpath('//*[@id="SBvisitor"]/text()').extract()
            item['AWAY_LOB'] = fxs.xpath('//*[@id="teamlobvisitor"]/text()').extract()
            item['AWAY_PO'] = web.xpath("td[5]/text()").extract()
            item['AWAY_ASS'] = web.xpath("td[5]/text()").extract()
            item['AWAY_ERR'] = fxs.xpath('//*[@id="linescore"]/strong[3]/text()').extract()
            item['AWAY_PB'] = fxs.xpath('//*[@id="PBvisitor"]/text()').extract()
            item['AWAY_DP'] = fxs.xpath('//*[@id="DPvisitor"]/text()').extract()
            item['AWAY_TP'] = fxs.xpath('//*[@id="TPvisitor"]/text()').extract()
            item['AWAY_First_Innings'] = fxs.xpath('//*[@id="linescore"]/text()[3]').extract()
            item['AWAY_IBB'] = fxs.xpath('//*[@id="IBBvisitor"]/text()').extract()
            item['AWAY_BB'] = web.xpath("td[6]/text()").extract()
            item['AWAY_SO'] = web.xpath("td[7]/text()").extract()
            items.append(item)
        self.browser.close()
        return items
The problem is that when I execute the script, the message on my CMD prompt shows pages being crawled but '0 items scraped'. I don't understand why the items are not being scraped. Any help would be appreciated.
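One way to narrow down a "scraped 0 items" result is to check whether the final XPath matches anything at all against the Selenium-rendered source. This is only a debugging sketch, assuming it is dropped into parse_final right after visitor_id is computed (it uses the same self.browser and Selector as the spider above):

# debugging sketch: log how many footer rows the item XPath actually matches
fxs = Selector(text=self.browser.page_source)
rows = fxs.xpath('//*[@id="' + visitor_id + 'batting"]/tfoot/tr')
self.logger.info("visitor_id=%s matched %d footer rows", visitor_id, len(rows))
if not rows:
    # dump the rendered source so the table ids/structure can be inspected offline
    with open('debug_boxscore.html', 'w') as f:
        f.write(self.browser.page_source.encode('utf-8'))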

Related

Scrapy post request form data

I want to get the search result using a Scrapy POST request after entering 16308 as the CP Number on https://www.icsi.in/Student/Default.aspx?TabID=100.
Here is my Scrapy spider code:
def parse(self, response):
    head = response.xpath('//span[@id="dnn_ctlHeader_dnnBreadcrumb_lblBreadCrumb"]/span[@class="SkinObject"]/text()').extract_first()
    view_gen = response.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value').extract_first()
    dnn = response.xpath('//input[@id="__dnnVariable"]/@value').extract_first()
    view_state = response.xpath('//input[@id="__VIEWSTATE"]/@value').extract_first()
    view_val = response.xpath('//input[@id="__EVENTVALIDATION"]/@value').extract_first()
    data = {
        '__VIEWSTATEGENERATOR': view_gen,
        '__dnnVariable': dnn,
        '__VIEWSTATE': view_state,
        '__EVENTVALIDATION': view_val,
        'dnn$ctr410$MemberSearch$txtCpNumber': '16803',
        'dnn$ctr410$MemberSearch$ddlMemberType': '0'
    }
    yield scrapy.FormRequest(response.url, formdata=data, callback=self.fun)
Response:
DEBUG: Crawled (200) <GET https://www.icsi.in/Student/Default.aspx?tabid=100&error=An%20unexpected%20error%20has%20occurred&content=0> (referer: https://www.icsi.in/Student/Default.aspx?TabID=100)
[]
Your question is how to avoid getting this error, right? Try to be more specific in the future.
When you want to scrape a webpage, you have to inspect it in your browser, see all the parameters that are being sent with the request, and make sure your spider sends the same ones. Your code includes many of the parameters, but not all of them.
See my code below, which solves your problem:
import scrapy

class MySpider(scrapy.Spider):
    name = 'icsi'
    start_urls = ['https://www.icsi.in/Student/Default.aspx?TabID=100']
    search_action_url = 'https://www.icsi.in/Student/Default.aspx?TabID=100'

    def parse(self, response):
        formdata = dict()
        for input in response.css('form#Form input'):
            name = input.xpath('./@name').get()
            value = input.xpath('./@value').get()
            formdata[name] = str(value) if value else ''
        formdata['dnn$ctr410$MemberSearch$txtCpNumber'] = '16308'
        formdata['__EVENTTARGET'] = 'dnn$ctr410$MemberSearch$btnSearch'
        return scrapy.FormRequest(self.search_action_url, formdata=formdata, callback=self.parse_search)

    def parse_search(self, response):
        scrapy.shell.inspect_response(response, self)
        return
You were missing the parameter __EVENTTARGET, which tells the site that you clicked the "Search" button.
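For ASP.NET/DNN pages like this one, a shorter variant is to let FormRequest.from_response copy all of the hidden fields (__VIEWSTATE, __EVENTVALIDATION and so on) and only override the fields that matter. This is a sketch rather than the answer's code; the formid='Form' argument assumes the page's main form really has id="Form", as the CSS selector above suggests.

import scrapy
from scrapy.shell import inspect_response

class IcsiFromResponseSpider(scrapy.Spider):
    # hypothetical variant of the spider above, using FormRequest.from_response
    name = 'icsi_from_response'
    start_urls = ['https://www.icsi.in/Student/Default.aspx?TabID=100']

    def parse(self, response):
        # from_response pre-fills every <input> of the matched form,
        # so only the fields we want to change are listed here
        yield scrapy.FormRequest.from_response(
            response,
            formid='Form',  # assumption: the main form has id="Form"
            formdata={
                'dnn$ctr410$MemberSearch$txtCpNumber': '16308',
                '__EVENTTARGET': 'dnn$ctr410$MemberSearch$btnSearch',
            },
            callback=self.parse_search,
        )

    def parse_search(self, response):
        # drop into an interactive shell to inspect the search results
        inspect_response(response, self)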

Scrapy crawler not recursively crawling next page

I am trying to build this crawler to get housing data from Craigslist, but the crawler stops after fetching the first page and does not go to the next page.
Here is the code. It works for the first page, but for the love of god I don't understand why it does not get to the next page. Any insight is really appreciated. I followed this part from the Scrapy tutorial.
import scrapy
import re
from scrapy.linkextractors import LinkExtractor

class QuotesSpider(scrapy.Spider):
    name = "craigslistmm"
    start_urls = [
        "https://vancouver.craigslist.ca/search/hhh"
    ]

    def parse_second(self, response):
        #need all the info in a dict
        meta_dict = response.meta
        for q in response.css("section.page-container"):
            meta_dict["post_details"] = {
                "location":
                    {"longitude": q.css("div.mapAndAttrs div.mapbox div.viewposting::attr(data-longitude)").extract(),
                     "latitude": q.css("div.mapAndAttrs div.mapbox div.viewposting::attr(data-latitude)").extract()},
                "detailed_info": ' '.join(q.css('section#postingbody::text').extract()).strip()
            }
        return meta_dict

    def parse(self, response):
        pattern = re.compile("\/([a-z]+)\/([a-z]+)\/.+")
        for q in response.css("li.result-row"):
            post_urls = q.css("p.result-info a::attr(href)").extract_first()
            mm = re.match(pattern, post_urls)
            neighborhood = q.css("p.result-info span.result-meta span.result-hood::text").extract_first()
            next_url = "https://vancouver.craigslist.ca/" + post_urls
            request = scrapy.Request(next_url, callback=self.parse_second)
            #next_page = response.xpath('.//a[@class="button next"]/@href').extract_first()
            #follow_url = "https://vancouver.craigslist.ca/" + next_page
            #request1 = scrapy.Request(follow_url,callback=self.parse)
            #yield response.follow(next_page,callback = self.parse)
            request.meta['id'] = q.css("li.result-row::attr(data-pid)").extract_first()
            request.meta['pricevaluation'] = q.css("p.result-info span.result-meta span.result-price::text").extract_first()
            request.meta["information"] = q.css("p.result-info span.result-meta span.housing::text").extract_first()
            request.meta["neighborhood"] = q.css("p.result-info span.result-meta span.result-hood::text").extract_first()
            request.meta["area"] = mm.group(1)
            request.meta["adtype"] = mm.group(2)
            yield request
            #yield scrapy.Request(follow_url, callback=self.parse)
        next_page = LinkExtractor(allow="s=\d+").extract_links(response)[0]
        # = "https://vancouver.craigslist.ca/" + next_page
        yield response.follow(next_page.url, callback=self.parse)
The problem seems to be with the next_page extraction using LinkExtractor. If you look in the log, you'll see duplicate requests being filtered. There are more links on the page that satisfy your extraction rule, and maybe they are not extracted in any particular order (or not in the order you wish).
I think a better approach is to extract exactly the link you want; try it with this:
next_page = response.xpath('//span[@class="buttons"]//a[contains(., "next")]/@href').extract_first()
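Wired into the end of parse, that would look roughly like the sketch below; the XPath is the one from this answer, the per-listing loop stays exactly as in the question and is elided here:

import scrapy

class CraigslistNextPageSketch(scrapy.Spider):
    # sketch: only the pagination part of the question's parse() is shown
    name = "craigslistmm_sketch"
    start_urls = ["https://vancouver.craigslist.ca/search/hhh"]

    def parse(self, response):
        # ... yield the per-listing detail requests exactly as in the question ...

        # follow the explicit "next" button instead of the first LinkExtractor match
        next_page = response.xpath('//span[@class="buttons"]//a[contains(., "next")]/@href').extract_first()
        if next_page:
            # response.follow resolves the relative href against the current page URL
            yield response.follow(next_page, callback=self.parse)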

Python scrapy working (only half of the time)

I created a python scrapy project to extract the prices of some google flights.
I configured the middleware to use PhantomJS instead of a normal browser.
class JSMiddleware(object):
    def process_request(self, request, spider):
        driver = webdriver.PhantomJS()
        try:
            driver.get(request.url)
            time.sleep(1.5)
        except Exception as e:
            raise ValueError("request url failed - \n url: {},\n error: {}".format(request.url, e))
        body = driver.page_source
        #encoding='utf-8' - add to html response if necessary
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8',
                            request=request)
In settings.py I added:
DOWNLOADER_MIDDLEWARES = {
    # key: dotted path to the middleware class, value: order of the middleware
    'scraper_module.middlewares.middleware.JSMiddleware': 543,
    # disable the built-in user-agent middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
I also created the following spider class:
import scrapy
from scrapy import Selector

class Gspider(scrapy.Spider):
    name = "google_spider"

    def __init__(self):
        self.start_urls = ["https://www.google.pt/flights/#search;f=LIS;t=POR;d=2017-06-18;r=2017-06-22"]
        self.prices = []
        self.links = []

    def clean_price(self, part):
        #part received as a list
        #the encoding is utf-8
        part = part[0]
        part = part.encode('utf-8')
        part = filter(str.isdigit, part)
        return part

    def clean_link(self, part):
        part = part[0]
        part = part.encode('utf-8')
        return part

    def get_part(self, var_holder, response, marker, inner_marker, amount=1):
        selector = Selector(response)
        divs = selector.css(marker)
        for n, div in enumerate(divs):
            if n < amount:
                part = div.css(inner_marker).extract()
                if inner_marker == '::text':
                    part = self.clean_price(part)
                else:
                    part = self.clean_link(part)
                var_holder.append(part)
            else:
                break
        return var_holder

    def parse(self, response):
        prices, links = [], []
        prices = self.get_part(prices, response, 'div.OMOBOQD-d-Ab', '::text')
        print prices
        links = self.get_part(links, response, 'a.OMOBOQD-d-X', 'a::attr(href)')
        print links
The problem is that when I run the code in the shell, around half of the time I successfully get the prices and links requested, but the other half of the time the final lists, which should contain the extracted data, are empty.
I do not get any errors during execution.
Does anyone have any idea why this is happening?
Here are the logs from the command line:
Google has a very strict policy in terms of crawling. (Pretty hypocritical when you know that they constantly crawl all the web...)
You should either find an API, as said previously in the comments, or maybe use proxies. An easy way is to use Crawlera. It manages thousands of proxies so you don't have to bother. I personally use it to crawl Google and it works perfectly. The downside is that it is not free.
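For reference, a minimal sketch of what the Crawlera route looks like in a Scrapy project, assuming the scrapy-crawlera package is installed (the API key is a placeholder):

# settings.py -- sketch, assumes the scrapy-crawlera plugin is installed
DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawlera.CrawleraMiddleware': 610,
}
CRAWLERA_ENABLED = True
CRAWLERA_APIKEY = '<your-crawlera-api-key>'  # placeholder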

How to handle two consecutive POST calls (302 redirect) in Scrapy?

I am getting the first POST call URL after the redirect.
I am not able to get the second POST URL after clicking the first "select" button.
Please help with this; it is for a mini project.
My code:
import scrapy
import time

class Govreq(scrapy.Spider):
    name = 'Gov-req'
    start_urls = ['http://www.assessment.cot.tn.gov/re_assessment/SelectCounty.aspx?map=true&SelectCounty=003']
    download_delay = 1.5

    def parse(self, response):
        yield scrapy.FormRequest(
            'http://www.assessment.cot.tn.gov/re_assessment/SelectCounty.aspx?map=true&SelectCounty=003',
            formdata={
                '_EVENTTARGET': '',
                '__EVENTARGUMENT': '',
                '__VIEWSTATE': response.css('input#__VIEWSTATE::attr(value)').extract_first(),
                '__VIEWSTATEGENERATOR': 'C7482FC3',
                '__EVENTVALIDATION': response.css('input#__EVENTVALIDATION::attr(value)').extract_first(),
                'ctl00%24MainContent%24countylist': '003',
                'ctl00%24MainContent%24txtOwnerName': 'aa',
                'ctl00%24MainContent%24txtPropertyAddress': '',
                'ctl00%24MainContent%24txtControlMap': '',
                'ctl00%24MainContent%24txtGroup': '',
                'ctl00%24MainContent%24txtParcel': '',
                'ctl00%24MainContent%24txtSubdivisionName': '',
                'ctl00%24MainContent%24ddlClass': '99',
                'ctl00%24MainContent%24txtBegSaleDate': '',
                'ctl00%24MainContent%24txtEndingSaleDate': '',
                'ctl00%24MainContent%24Sort': 'Owner',
                'ctl00%24MainContent%24btnSearch': 'SEARCH'
            }, callback=self.parse_tags
        )

    def parse_tags(self, response):
        print 'parcel'
        yield scrapy.FormRequest.from_response(response, callback=self.pracel_list_next)

    def pracel_list_next(self, response):
        print 'prarcel_list_next'
        time.sleep(5)
        yield scrapy.FormRequest.from_response(response,
            formdata={
                '_EVENTTARGET': 'ctl00%24MainContent%24GridView1',
                '__EVENTARGUMENT': 'select%240',
                '__VIEWSTATE': response.css('input#__VIEWSTATE::attr(value)').extract_first(),
                '__VIEWSTATEGENERATOR': 'F71013A5',
                '__VIEWSTATEENCRYPTED': '',
                '__EVENTVALIDATION': response.css('input#__EVENTVALIDATION::attr(value)').extract_first()
            }, callback=self.parse_results)

    def parse_results(self, response):
        filename = response.url.split("/")[-2] + '.html'
        print filename
        with open(filename, 'wb') as f:
            f.write(response.body)
        yield {
            'quote': response.xpath('//title//text()').extract()
        }
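One detail worth checking when chaining ASP.NET postbacks like this: FormRequest URL-encodes the form data itself, so field names and values are normally written with literal characters ('ctl00$MainContent$...', 'Select$0') rather than hand-encoded '%24' sequences. A hedged sketch of the second postback under that assumption, written as a drop-in replacement for pracel_list_next above (the '$' spellings are the assumption; the field names come from the question):

# sketch: replacement for pracel_list_next in the spider above
def pracel_list_next(self, response):
    # from_response copies the remaining hidden inputs from the current page
    yield scrapy.FormRequest.from_response(
        response,
        formdata={
            '__EVENTTARGET': 'ctl00$MainContent$GridView1',  # assumption: literal '$', not '%24'
            '__EVENTARGUMENT': 'Select$0',                   # assumption: grid "select" command, unencoded
        },
        callback=self.parse_results)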

Adding XHR links to scraped category hrefs: missing scheme error

I have built a spider which gets data from one category. The method it follows: the category page is given in the start URL, and start_requests handles pagination by iterating over the link provided by the XHR request. Since I wanted to get all the categories at once, I wrote the code like this: my logic was to first get all the category links, append to them the XHR query string that is the same for every category (?from=24&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu), pass these appended URLs to start_requests, and iterate over them for pagination and item parsing. But I am not able to run the spider because it throws the missing scheme error, since in start_requests I have not provided the http://. I am stuck on how to solve this issue; please help.
class JabcatSpider(scrapy.Spider):
    name = "jabcat"
    allowed_domains = ["trendin.com"]
    start_urls = [
        'http://www.trendin.com',
    ]
    max_pages = 400

    def parse(self, response):
        urls = response.xpath('//div[@class = "men"]//@href').extract()
        for url in urls:
            urljoin = (url + "/" "?from=24&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu")
            #yield scrapy.Request(urljoin, callback=self.start_requests)
            print urljoin

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('?from=%d&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu' % i, callback=self.parse)

    def parse(self, response):
        for href in response.xpath('//*[@id="product_rows"]/div/div/div/a/@href'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_detail_page)

    def parse_detail_page(self, response):
        for sel in response.xpath('//*[@id="catalog-product"]/section[2]'):
            item = Jabongo()
            item['title'] = response.xpath('//*[@id="product-details-wrapper"]/div[1]/div[2]/div/div[1]/h1/span[2]/text()').extract()
            # item['price'] = response.xpath('//*[@id="pdp-price-info"]/span[2]/text()').extract()
            # item['image'] = response.xpath('//*[@class="content"]/h1/span[2]/text()').extract()
            # # item['color'] = sel.xpath('//ul/li/label[.="Color"]/following-sibling::Span/text()').extract()
            # return item
            #pattern = response.xpath('//*[@class="content"]/h1/span[2]/text()').extract
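The "missing scheme" error usually just means a Request was built from a relative URL, which is what happens with the bare query string in start_requests above. The sketch below shows one way this could be restructured so that pagination is driven from the category pages and every URL is made absolute with response.urljoin; the 24-items-per-page step and the parse_category name are assumptions, everything else is taken from the question.

import scrapy

class JabcatSketchSpider(scrapy.Spider):
    # hypothetical rework of the spider above: no relative URLs ever reach Request
    name = "jabcat_sketch"
    allowed_domains = ["trendin.com"]
    start_urls = ['http://www.trendin.com']
    max_pages = 400

    def parse(self, response):
        for href in response.xpath('//div[@class = "men"]//@href').extract():
            for page in range(self.max_pages):
                # assumption: the "from" offset advances 24 items per page
                query = ("/?from=%d&ajax=true&search_query=&orderby=popular"
                         "&orderway=asc&latestfilter=&source=menu" % (page * 24))
                # urljoin resolves relative hrefs against http://www.trendin.com,
                # which avoids the "missing scheme" ValueError
                yield scrapy.Request(response.urljoin(href + query),
                                     callback=self.parse_category)

    def parse_category(self, response):
        for href in response.xpath('//*[@id="product_rows"]/div/div/div/a/@href'):
            yield scrapy.Request(response.urljoin(href.extract()),
                                 callback=self.parse_detail_page)

    def parse_detail_page(self, response):
        # item parsing as in the question
        pass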