Scrapy post request form data - python-2.7

I want to get the search result using a Scrapy POST request after entering 16308 as the CP Number on https://www.icsi.in/Student/Default.aspx?TabID=100.
Here is my Scrapy spider code:
def parse(self, response):
    head = response.xpath('//span[@id="dnn_ctlHeader_dnnBreadcrumb_lblBreadCrumb"]/span[@class="SkinObject"]/text()').extract_first()
    view_gen = response.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value').extract_first()
    dnn = response.xpath('//input[@id="__dnnVariable"]/@value').extract_first()
    view_state = response.xpath('//input[@id="__VIEWSTATE"]/@value').extract_first()
    view_val = response.xpath('//input[@id="__EVENTVALIDATION"]/@value').extract_first()
    data = {
        '__VIEWSTATEGENERATOR': view_gen,
        '__dnnVariable': dnn,
        '__VIEWSTATE': view_state,
        '__EVENTVALIDATION': view_val,
        'dnn$ctr410$MemberSearch$txtCpNumber': '16803',
        'dnn$ctr410$MemberSearch$ddlMemberType': '0'
    }
    yield scrapy.FormRequest(response.url, formdata=data, callback=self.fun)
Response:
DEBUG: Crawled (200) <GET https://www.icsi.in/Student/Default.aspx?tabid=100&error=An%20unexpected%20error%20has%20occurred&content=0> (referer: https://www.icsi.in/Student/Default.aspx?TabID=100)
[]

Your question is how to avoid getting this error, right? Try to be more specific in the future.
When you want to scrape a web page, inspect it in your browser first: look at every parameter that is sent with the request and make sure your spider sends the same ones. Your code includes many of the parameters, but not all of them.
See my code below, which solves your problem:
import scrapy
from scrapy.shell import inspect_response


class MySpider(scrapy.Spider):
    name = 'icsi'
    start_urls = ['https://www.icsi.in/Student/Default.aspx?TabID=100']
    search_action_url = 'https://www.icsi.in/Student/Default.aspx?TabID=100'

    def parse(self, response):
        formdata = dict()
        for input in response.css('form#Form input'):
            name = input.xpath('./@name').get()
            value = input.xpath('./@value').get()
            formdata[name] = str(value) if value else ''
        formdata['dnn$ctr410$MemberSearch$txtCpNumber'] = '16308'
        formdata['__EVENTTARGET'] = 'dnn$ctr410$MemberSearch$btnSearch'
        return scrapy.FormRequest(self.search_action_url, formdata=formdata, callback=self.parse_search)

    def parse_search(self, response):
        inspect_response(response, self)
        return
You were missing the __EVENTTARGET parameter, which tells the site that you clicked the "Search" button.
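As an aside, Scrapy's FormRequest.from_response can collect those hidden ASP.NET fields for you. A minimal sketch, assuming the search form is the first form on the page (the field names are the ones from the question):

import scrapy


class IcsiSpider(scrapy.Spider):
    name = 'icsi_from_response'
    start_urls = ['https://www.icsi.in/Student/Default.aspx?TabID=100']

    def parse(self, response):
        # from_response() pre-fills __VIEWSTATE, __EVENTVALIDATION, etc. from
        # the page's <form>; we only override the fields we care about.
        yield scrapy.FormRequest.from_response(
            response,
            formdata={
                'dnn$ctr410$MemberSearch$txtCpNumber': '16308',
                '__EVENTTARGET': 'dnn$ctr410$MemberSearch$btnSearch',
            },
            callback=self.parse_search,
        )

    def parse_search(self, response):
        self.logger.info('Search result page: %s', response.url)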

Related

How to run a spider sequentially against sites that use sessions in Scrapy

I want to scrape a web page that first sends an AjaxFormPost that opens a session, and then sends an _SearchResultGridPopulate request to populate a control that I need to scrape; the response is JSON.
This is a fragment of my code:
def parse_AjaxFormPost(self, response):
    self.logger.info("parse_AjaxFormPost")
    page = response.meta['page']
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive',
        'Content-Length': '14',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': 'ASP.NET_SessionId=gq4dgcsl500y32xb1n2ciexq',
        .
        .
        .
    }
    url = '<url>/Search/AjaxFormPost'
    cities = ['city1', 'city2', ...]
    for city in cities:
        formData = {
            'City': city
        }
        re = scrapy.FormRequest(
            url,
            formdata=formData,
            headers=header,
            dont_filter=True,
            callback=self.parse_GridPopulate
        )
        yield re

def parse_GridPopulate(self, response):
    self.logger.info("parse_LookupPermitTypeDetails")
    url = '<url>/Search//_SearchResultGridPopulate?Grid-page=2&Grid-size=10&Grid-CERT_KEYSIZE=128&Grid-CERT_SECRETKEYSIZE=2048&Grid-HTTPS_KEYSIZE=128&Grid-HTTPS_SECRETKEYSIZE=2048'
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive',
        'Content-Length': '23',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': 'ASP.NET_SessionId=gq4dgcsl500y32xb1n2ciexq',
        .
        .
        .
    }
    formData = {
        'page': '1',
        'size': '10'
    }
    re = scrapy.FormRequest(
        url,
        formdata=formData,
        headers=header,
        dont_filter=True,
        callback=self.parse
    )
    yield re

def parse(self, response):
    self.logger.info("parse_permit")
    data_json = json.loads(response.body)
    for row in data_json["data"]:
        self.logger.info(row)
        item = RedmondPermitItem()
        item["item1"] = row["item1"]
        item["item2"] = row["item2"]
        yield item
The problem is that Scrapy makes the requests concurrently, and each request in parse_AjaxFormPost opens a session, so by the time execution passes to parse_GridPopulate I get the session of the last request made in parse_AjaxFormPost. If I have 10 cities, at the end I get the information of the last city 10 times.
In settings I changed the configuration:
CONCURRENT_REQUESTS_PER_DOMAIN = 1
CONCURRENT_REQUESTS_PER_IP = 1
and it doesn't work. On the other hand, I thought about running the spider for only one city at a time, something like:
import scrapy
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging


class MySpider(scrapy.Spider):
    # Your first spider definition
    ...
    ...

configure_logging()
runner = CrawlerRunner()

@defer.inlineCallbacks
def crawl():
    cities = ['city1', 'city2', ...]
    for city in cities:
        yield runner.crawl(MySpider, city=city)
    reactor.stop()

crawl()
reactor.run()  # the script will block here until the last crawl call is finished
Maybe this is the only solution, but I'm not sure, and I would like a general procedure for every site with this characteristic. Any suggestion on how to solve this? Is it possible to achieve it just by configuring settings?
Thanks in advance.
Update 1
I changed the title because it is important that this is about sites that use sessions.
This is a matter of understanding how concurrency works: since this isn't parallelism, you can still work sequentially, but you have to chain the work across callbacks. I would suggest something like this:
def parse_AjaxFormPost(self, response):
    ...
    cities = ['city1', 'city2', ...]
    formData = {
        'City': cities[0]
    }
    re = scrapy.FormRequest(
        url,
        formdata=formData,
        headers=header,
        dont_filter=True,
        callback=self.parse_remaining_cities,
        meta={'remaining_cities': cities[1:]},  # check the meta argument
    )
    yield re

def parse_remaining_cities(self, response):
    remaining_cities = response.meta['remaining_cities']
    current_city = remaining_cities[0]
    ...
    yield Request(
        ...,
        meta={'remaining_cities': remaining_cities[1:]},
        callback=self.parse_remaining_cities)
This way you make one request at a time, moving through the cities one after another.
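For completeness, here is a self-contained sketch of the same chaining pattern with a stop condition; the endpoint URL and the JSON handling are placeholders, not the asker's real site:

import json

import scrapy


class CityChainSpider(scrapy.Spider):
    """Sketch only: request cities strictly one after another."""
    name = 'city_chain'
    search_url = 'https://example.com/Search/AjaxFormPost'  # hypothetical endpoint

    def start_requests(self):
        cities = ['city1', 'city2', 'city3']
        yield self.city_request(cities)

    def city_request(self, remaining):
        # Build the request for the first city in `remaining` and carry the
        # rest of the list along in meta.
        return scrapy.FormRequest(
            self.search_url,
            formdata={'City': remaining[0]},
            dont_filter=True,
            callback=self.parse_city,
            meta={'remaining_cities': remaining[1:]},
        )

    def parse_city(self, response):
        # Handle the current city's JSON response here.
        for row in json.loads(response.body).get('data', []):
            yield row

        # Only after this response has arrived do we request the next city,
        # so the server-side session always belongs to the city being parsed.
        remaining = response.meta['remaining_cities']
        if remaining:
            yield self.city_request(remaining)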

Python scrapy working (only half of the time)

I created a python scrapy project to extract the prices of some google flights.
I configured the middleware to use PhantomJS instead of a normal browser.
import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class JSMiddleware(object):
    def process_request(self, request, spider):
        driver = webdriver.PhantomJS()
        try:
            driver.get(request.url)
            time.sleep(1.5)
        except Exception as e:
            raise ValueError("request url failed - \n url: {},\n error: {}".format(request.url, e))
        body = driver.page_source
        # encoding='utf-8' - add to html response if necessary
        return HtmlResponse(driver.current_url, body=body, encoding='utf-8',
                            request=request)
In settings.py I added:
DOWNLOADER_MIDDLEWARES = {
    # key: path to the middleware class, value: order of the middleware
    'scraper_module.middlewares.middleware.JSMiddleware': 543,
    # disable the built-in middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
I also created the following spider class:
import scrapy
from scrapy import Selector


class Gspider(scrapy.Spider):
    name = "google_spider"

    def __init__(self):
        self.start_urls = ["https://www.google.pt/flights/#search;f=LIS;t=POR;d=2017-06-18;r=2017-06-22"]
        self.prices = []
        self.links = []

    def clean_price(self, part):
        # part received as a list
        # the encoding is utf-8
        part = part[0]
        part = part.encode('utf-8')
        part = filter(str.isdigit, part)
        return part

    def clean_link(self, part):
        part = part[0]
        part = part.encode('utf-8')
        return part

    def get_part(self, var_holder, response, marker, inner_marker, amount=1):
        selector = Selector(response)
        divs = selector.css(marker)
        for n, div in enumerate(divs):
            if n < amount:
                part = div.css(inner_marker).extract()
                if inner_marker == '::text':
                    part = self.clean_price(part)
                else:
                    part = self.clean_link(part)
                var_holder.append(part)
            else:
                break
        return var_holder

    def parse(self, response):
        prices, links = [], []
        prices = self.get_part(prices, response, 'div.OMOBOQD-d-Ab', '::text')
        print prices
        links = self.get_part(links, response, 'a.OMOBOQD-d-X', 'a::attr(href)')
        print links
The problem is that when I run the code in the shell, about half of the time I successfully get the requested prices and links, but the other half of the time the final lists that should contain the extracted data are empty. I do not get any errors during execution.
Does anyone have any idea why this is happening?
Here are the logs from the command line:
Google has a very strict policy in terms of crawling. (Pretty hypocritical when you know that they constantly crawl the whole web...)
You should either find an API, as said previously in the comments, or use proxies. An easy way is to use Crawlera: it manages thousands of proxies so you don't have to. I personally use it to crawl Google and it works perfectly. The downside is that it is not free.
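For reference, a rough sketch of what that setup can look like with the scrapy-crawlera plugin; the API key is a placeholder, and you should check the plugin's docs for the settings recommended for your account:

# settings.py -- sketch assuming the scrapy-crawlera plugin is installed
# (pip install scrapy-crawlera); the API key below is a placeholder.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawlera.CrawleraMiddleware': 610,
}

CRAWLERA_ENABLED = True
CRAWLERA_APIKEY = '<your-crawlera-api-key>'

With the middleware enabled, every request is routed through the proxy pool, so the spider code itself does not need to change.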

Scrapy: Crawls pages but scrapes 0 items

I am trying to scrape baseball-reference.com. In the Scrapy bot I created, I start from the first page, navigate to different links, and from there to a third link. Please find the code below:
class VisitorBattingSpider(InitSpider):
    name = 'VisitorBatting'
    year = str(datetime.datetime.today().year)
    allowed_domains = ["baseball-reference.com"]
    start = 'http://www.baseball-reference.com/boxes/' + year + '.shtml'
    start_urls = [start]
    #rules = [Rule(LinkExtractor(allow=['/play-index/st.cgi?date=\d+-\d+-\d+']), callback='parse_item',)]

    def __init__(self):
        BaseSpider.__init__(self)
        # use any browser you wish
        self.browser = webdriver.Firefox()

    def __del__(self):
        self.browser.close()

    def parse(self, response):
        self.browser.get(response.url)
        # let JavaScript load
        time.sleep(15)
        page = Selector(text=self.browser.page_source)
        #page = Selector(response)
        sites = page.xpath('//*[@id="2016"]/tbody/tr/td/table/tbody/tr/td/a/@href')
        for site in sites:
            tree = site.extract()
            yield Request(url='http://www.baseball-reference.com' + tree, callback=self.parse_new, dont_filter=True)
        self.browser.close()

    def parse_new(self, response):
        hxs = Selector(response)
        loads = hxs.xpath('/html/body/pre/a/@href')
        for load in loads:
            branch = load.extract()
            if 'boxes' in branch:
                yield Request(url='http://www.baseball-reference.com' + branch, callback=self.parse_final, dont_filter=True)

    def parse_final(self, response):
        self.browser.get(response.url)
        fxs = Selector(text=self.browser.page_source)
        vi = fxs.xpath('html/body/div/div[3]/div[1]/div[1]/h3/text()').extract()
        vis = ''.join(vi)
        if "." in vis:
            visitor = vis.replace(".", "")
        else:
            visitor = vis
        visitor_id = visitor.replace(" ", "")
        print visitor_id
        UR = response.url
        URL = ''.join(UR)
        dtt = URL[-15:]
        dt = dtt[:8]
        day = datetime.datetime(int(dt[:4]), int(dt[5:6]), int(dt[-2:]), 01, 01, 01).weekday()
        path = '//*[@id="' + visitor_id + 'batting"]/tfoot/tr'
        webs = fxs.xpath(path)
        items = []
        for web in webs:
            item = VisitorbattingItem()
            item['ID'] = response.url
            item['AWAY_TEAM'] = visitor_id
            item['GAME_DT'] = dt
            item['GAME_DY'] = day
            item['AWAY_GAME'] = 1
            item['AWAY_SCORE_CT'] = web.xpath("td[3]/text()").extract()
            item['MINUTES_GAME_CT'] = fxs.xpath('//*[@id="gametime"]/text()').extract()
            item['AWAY_AB'] = web.xpath("td[2]/span/text()").extract()
            item['AWAY_HITS'] = web.xpath("td[4]/text()").extract()
            item['AWAY_DO'] = fxs.xpath('//*[@id="2Bvisitor"]/text()').extract()
            item['AWAY_TR'] = fxs.xpath('//*[@id="3Bvisitor"]/text()').extract()
            item['AWAY_RBI'] = web.xpath("td[5]/text()").extract()
            item['AWAY_HBP'] = fxs.xpath('//*[@id="HBPvisitor"]/text()').extract()
            item['AWAY_SB'] = fxs.xpath('//*[@id="SBvisitor"]/text()').extract()
            item['AWAY_LOB'] = fxs.xpath('//*[@id="teamlobvisitor"]/text()').extract()
            item['AWAY_PO'] = web.xpath("td[5]/text()").extract()
            item['AWAY_ASS'] = web.xpath("td[5]/text()").extract()
            item['AWAY_ERR'] = fxs.xpath('//*[@id="linescore"]/strong[3]/text()').extract
            item['AWAY_PB'] = fxs.xpath('//*[@id="PBvisitor"]/text()').extract()
            item['AWAY_DP'] = fxs.xpath('//*[@id="DPvisitor"]/text()').extract()
            item['AWAY_TP'] = fxs.xpath('//*[@id="TPvisitor"]/text()').extract()
            item['AWAY_First_Innings'] = fxs.xpath('//*[@id="linescore"]/text()[3]').extract
            item['AWAY_IBB'] = fxs.xpath('//*[@id="IBBvisitor"]/text()').extract()
            item['AWAY_BB'] = web.xpath("td[6]/text()").extract()
            item['AWAY_SO'] = web.xpath("td[7]/text()").extract()
            items.append(item)
        self.browser.close()
        return items
The problem is that when I execute the script, the message I get on my CMD prompt says it crawled the pages but scraped 0 items. I don't understand why the items are not being scraped. Any help would be appreciated.

Adding XHR links to scraped category hrefs: missing scheme error

I have built a spider which gets data from one category. The method it follows is: the category page is specified in start_urls, and start_requests handles pagination by iterating over the link used by the XHR request. Since I wanted to get all the categories at once, I wrote the code below. My logic was to first get all the category links, append to each of them the XHR query string that is the same for every category (?from=24&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu), pass these appended URLs to start_requests, and iterate over them for pagination and item parsing. But I am not able to run the spider because it throws a "missing scheme" error, since in start_requests I have not provided the http:// prefix. I am stuck on how to solve this issue; please help.
class JabcatSpider(scrapy.Spider):
    name = "jabcat"
    allowed_domains = ["trendin.com"]
    start_urls = [
        'http://www.trendin.com',
    ]
    max_pages = 400

    def parse(self, response):
        urls = response.xpath('//div[@class = "men"]//@href').extract()
        for url in urls:
            urljoin = (url + "/" "?from=24&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu")
            #yield scrapy.Request(urljoin, callback=self.start_requests)
            print urljoin

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('?from=%d&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu' % i, callback=self.parse)

    def parse(self, response):
        for href in response.xpath('//*[@id="product_rows"]/div/div/div/a/@href'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_detail_page)

    def parse_detail_page(self, response):
        for sel in response.xpath('//*[@id="catalog-product"]/section[2]'):
            item = Jabongo()
            item['title'] = response.xpath('//*[@id="product-details-wrapper"]/div[1]/div[2]/div/div[1]/h1/span[2]/text()').extract()
            # item['price'] = response.xpath('//*[@id="pdp-price-info"]/span[2]/text()').extract()
            # item['image'] = response.xpath('//*[@class="content"]/h1/span[2]/text()').extract()
            # # item['color'] = sel.xpath('//ul/li/label[.="Color"]/following-sibling::Span/text()').extract()
            # return item
            #pattern = response.xpath('//*[@class="content"]/h1/span[2]/text()').extract
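One way to avoid the missing scheme error is to build absolute URLs with response.urljoin before yielding the requests. A minimal sketch under that assumption; the query string is the one from the question, but the pagination handling, spider name, and selectors are illustrative only:

import scrapy

AJAX_QUERY = "?from=%d&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu"


class JabcatSketchSpider(scrapy.Spider):
    """Sketch only: resolve relative category hrefs to absolute URLs."""
    name = "jabcat_sketch"
    allowed_domains = ["trendin.com"]
    start_urls = ['http://www.trendin.com']
    max_pages = 400

    def parse(self, response):
        # Category hrefs may be relative; urljoin() resolves them against the
        # page URL, so every request has an http:// scheme.
        for href in response.xpath('//div[@class = "men"]//@href').extract():
            category_url = response.urljoin(href)
            for page in range(self.max_pages):
                # The pagination parameter mirrors the question's start_requests.
                yield scrapy.Request(category_url + AJAX_QUERY % page,
                                     callback=self.parse_category)

    def parse_category(self, response):
        for href in response.xpath('//*[@id="product_rows"]/div/div/div/a/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_detail_page)

    def parse_detail_page(self, response):
        yield {'url': response.url}  # placeholder for the real item parsing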

Scrapy get request url in parse

How can I get the request URL in Scrapy's parse() function? I have a lot of URLs in start_urls and some of them redirect my spider to the homepage, and as a result I get an empty item. So I need something like item['start_url'] = request.url to store these URLs. I'm using BaseSpider.
The 'response' variable that's passed to parse() has the info you want. You shouldn't need to override anything.
For example:
def parse(self, response):
    print "URL: " + response.request.url
The request object is accessible from the response object, therefore you can do the following:
def parse(self, response):
    item['start_url'] = response.request.url
Rather than storing the requested URLs somewhere (and note that Scrapy does not process URLs in the same order as they appear in start_urls), you can use:
response.request.meta['redirect_urls']
which gives you the list of redirects that happened, like ['http://requested_url', 'https://redirected_url', 'https://final_redirected_url'].
To access the first URL in that list, you can use:
response.request.meta['redirect_urls'][0]
For more, see the description of RedirectMiddleware at doc.scrapy.org:
RedirectMiddleware
This middleware handles redirection of requests based on response status.
The urls which the request goes through (while being redirected) can be found in the redirect_urls Request.meta key.
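Note that the redirect_urls key is only present when a redirect actually occurred, so a defensive way to record the originally requested URL looks roughly like this (the item fields are placeholders):

def parse(self, response):
    # 'redirect_urls' only exists in meta if RedirectMiddleware followed at
    # least one redirect for this request; otherwise fall back to the final
    # URL, which in that case is also the originally requested one.
    redirects = response.request.meta.get('redirect_urls', [])
    item = {}  # placeholder item
    item['start_url'] = redirects[0] if redirects else response.url
    item['final_url'] = response.url
    yield item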
Hope this helps.
You need to override BaseSpider's make_requests_from_url(url) method to assign the start_url to the item, and then use the Request.meta special keys to pass that item to the parse function.
from scrapy.http import Request

# override method
def make_requests_from_url(self, url):
    item = MyItem()
    # assign url
    item['start_url'] = url
    request = Request(url, dont_filter=True)
    # set the meta['item'] to use the item in the next callback
    request.meta['item'] = item
    return request

def parse(self, response):
    # access and do something with the item in parse
    item = response.meta['item']
    item['other_url'] = response.url
    return item
Hope that helps.
Python 3.5 / Scrapy 1.5.0:
from scrapy.http import Request

# override method
def start_requests(self):
    for url in self.start_urls:
        item = {'start_url': url}
        request = Request(url, dont_filter=True)
        # set the meta['item'] to use the item in the next callback
        request.meta['item'] = item
        yield request

# use meta variable
def parse(self, response):
    url = response.meta['item']['start_url']