How to handle two consecutive POST calls (302 redirect) in Scrapy? - python-2.7

I am able to get the URL of the first POST call after the redirect, but I am not able to get the URL of the second POST call, which happens after clicking the first "select" button.
Please help me with this for my mini project.
My code:
import scrapy
import time

class Govreq(scrapy.Spider):
    name = 'Gov-req'
    start_urls = ['http://www.assessment.cot.tn.gov/re_assessment/SelectCounty.aspx?map=true&SelectCounty=003']
    download_delay = 1.5

    def parse(self, response):
        yield scrapy.FormRequest(
            'http://www.assessment.cot.tn.gov/re_assessment/SelectCounty.aspx?map=true&SelectCounty=003',
            formdata={
                '_EVENTTARGET': '',
                '__EVENTARGUMENT': '',
                '__VIEWSTATE': response.css('input#__VIEWSTATE::attr(value)').extract_first(),
                '__VIEWSTATEGENERATOR': 'C7482FC3',
                '__EVENTVALIDATION': response.css('input#__EVENTVALIDATION::attr(value)').extract_first(),
                'ctl00%24MainContent%24countylist': '003',
                'ctl00%24MainContent%24txtOwnerName': 'aa',
                'ctl00%24MainContent%24txtPropertyAddress': '',
                'ctl00%24MainContent%24txtControlMap': '',
                'ctl00%24MainContent%24txtGroup': '',
                'ctl00%24MainContent%24txtParcel': '',
                'ctl00%24MainContent%24txtSubdivisionName': '',
                'ctl00%24MainContent%24ddlClass': '99',
                'ctl00%24MainContent%24txtBegSaleDate': '',
                'ctl00%24MainContent%24txtEndingSaleDate': '',
                'ctl00%24MainContent%24Sort': 'Owner',
                'ctl00%24MainContent%24btnSearch': 'SEARCH'
            },
            callback=self.parse_tags
        )

    def parse_tags(self, response):
        print 'parcel'
        yield scrapy.FormRequest.from_response(response, callback=self.pracel_list_next)

    def pracel_list_next(self, response):
        print 'prarcel_list_next'
        time.sleep(5)
        yield scrapy.FormRequest.from_response(
            response,
            formdata={
                '_EVENTTARGET': 'ctl00%24MainContent%24GridView1',
                '__EVENTARGUMENT': 'select%240',
                '__VIEWSTATE': response.css('input#__VIEWSTATE::attr(value)').extract_first(),
                '__VIEWSTATEGENERATOR': 'F71013A5',
                '__VIEWSTATEENCRYPTED': '',
                '__EVENTVALIDATION': response.css('input#__EVENTVALIDATION::attr(value)').extract_first()
            },
            callback=self.parse_results)

    def parse_results(self, response):
        filename = response.url.split("/")[-2] + '.html'
        print filename
        with open(filename, 'wb') as f:
            f.write(response.body)
        yield {
            'quote': response.xpath('//title//text()').extract()
        }
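For reference, a minimal sketch of how the two chained ASP.NET postbacks are typically issued, assuming the field names from the code above; note that FormRequest URL-encodes the form data itself, so field names should be passed with a literal $ (not %24), and the hidden field is __EVENTTARGET with two leading underscores. This is an untested outline, not a verified fix.

import scrapy

class GovReqSketch(scrapy.Spider):
    # Untested sketch: chain the two ASP.NET postbacks with
    # FormRequest.from_response, which copies every hidden field
    # (__VIEWSTATE, __EVENTVALIDATION, ...) from the current page.
    # Field names come from the question's code.
    name = 'gov-req-sketch'
    start_urls = ['http://www.assessment.cot.tn.gov/re_assessment/SelectCounty.aspx?map=true&SelectCounty=003']
    download_delay = 1.5

    def parse(self, response):
        # First postback: submit the search form; keys use a literal '$',
        # FormRequest performs the URL-encoding itself.
        yield scrapy.FormRequest.from_response(
            response,
            formdata={
                'ctl00$MainContent$countylist': '003',
                'ctl00$MainContent$txtOwnerName': 'aa',
                'ctl00$MainContent$ddlClass': '99',
                'ctl00$MainContent$Sort': 'Owner',
                'ctl00$MainContent$btnSearch': 'SEARCH',
            },
            dont_click=True,
            callback=self.parse_results_page)

    def parse_results_page(self, response):
        # Second postback: simulate clicking the "select" link of the first
        # grid row. '__EVENTTARGET' has two leading underscores and
        # '__EVENTARGUMENT' is the plain (not %-encoded) value.
        yield scrapy.FormRequest.from_response(
            response,
            formdata={
                '__EVENTTARGET': 'ctl00$MainContent$GridView1',
                '__EVENTARGUMENT': 'select$0',
            },
            dont_click=True,
            callback=self.parse_detail)

    def parse_detail(self, response):
        yield {'title': response.xpath('//title/text()').extract_first()}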

Related

Scrapy post request form data

I want to get the search result using a Scrapy POST request after entering 16308 into the CP Number field on https://www.icsi.in/Student/Default.aspx?TabID=100.
Here is my Scrapy spider code:
def parse(self, response):
    head = response.xpath('//span[@id="dnn_ctlHeader_dnnBreadcrumb_lblBreadCrumb"]/span[@class="SkinObject"]/text()').extract_first()
    view_gen = response.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value').extract_first()
    dnn = response.xpath('//input[@id="__dnnVariable"]/@value').extract_first()
    view_state = response.xpath('//input[@id="__VIEWSTATE"]/@value').extract_first()
    view_val = response.xpath('//input[@id="__EVENTVALIDATION"]/@value').extract_first()
    data = {
        '__VIEWSTATEGENERATOR': view_gen,
        '__dnnVariable': dnn,
        '__VIEWSTATE': view_state,
        '__EVENTVALIDATION': view_val,
        'dnn$ctr410$MemberSearch$txtCpNumber': '16803',
        'dnn$ctr410$MemberSearch$ddlMemberType': '0'
    }
    yield scrapy.FormRequest(response.url, formdata=data, callback=self.fun)
Response:
DEBUG: Crawled (200) <GET https://www.icsi.in/Student/Default.aspx?tabid=100&error=An%20unexpected%20error%20has%20occurred&content=0> (referer: https://www.icsi.in/Student/Default.aspx?TabID=100)
[]
Your question is how to avoid getting this error, right? Try to be more specific in the future.
When you want to scrape a web page you have to inspect it in your browser, see all the parameters that are sent with the request, and make sure your spider sends the same ones. You have a lot of the parameters in your code, but not all of them.
See my code below, which actually solves your problem:
import scrapy
from scrapy.shell import inspect_response

class MySpider(scrapy.Spider):
    name = 'icsi'
    start_urls = ['https://www.icsi.in/Student/Default.aspx?TabID=100']
    search_action_url = 'https://www.icsi.in/Student/Default.aspx?TabID=100'

    def parse(self, response):
        formdata = dict()
        for input in response.css('form#Form input'):
            name = input.xpath('./@name').get()
            value = input.xpath('./@value').get()
            formdata[name] = str(value) if value else ''
        formdata['dnn$ctr410$MemberSearch$txtCpNumber'] = '16308'
        formdata['__EVENTTARGET'] = 'dnn$ctr410$MemberSearch$btnSearch'
        return scrapy.FormRequest(self.search_action_url, formdata=formdata, callback=self.parse_search)

    def parse_search(self, response):
        inspect_response(response, self)
        return
You were missing the parameter __EVENTTARGET, which informs the site you hit the button "Search".
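A possible simplification, sketched under the assumption that the page's main form keeps the id "Form" used in the loop above: FormRequest.from_response collects every input of the form automatically, so only the values that change need to be set explicitly.

import scrapy
from scrapy.shell import inspect_response

class MySpiderFromResponse(scrapy.Spider):
    # Untested variant of the approach above: from_response() copies all the
    # hidden ASP.NET/DNN fields for us instead of looping over the inputs.
    name = 'icsi_from_response'
    start_urls = ['https://www.icsi.in/Student/Default.aspx?TabID=100']

    def parse(self, response):
        return scrapy.FormRequest.from_response(
            response,
            formid='Form',  # assumption: the main form's id is "Form"
            formdata={
                'dnn$ctr410$MemberSearch$txtCpNumber': '16308',
                '__EVENTTARGET': 'dnn$ctr410$MemberSearch$btnSearch',
            },
            dont_click=True,
            callback=self.parse_search)

    def parse_search(self, response):
        # Drop into the Scrapy shell to inspect the result page interactively.
        inspect_response(response, self)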

Django sometimes gives 404 and sometimes gives 200

With the same configuration in Django, visiting the same URL
http://<URL>/basic/insert/adafd
sometimes gives me a 404, while later it gives me a 200. Any idea how this could happen and how to solve it?
(It looks like it started after I tried to use Channels.)
from django.conf.urls import url
from . import views

urlpatterns = [
    url(r'^insert/(.*)$', views.insert),
    url(r'^list$', views.list),
]
Other code cut down as follows:
def listfrom(request, last):
    stack = infoStack.objects.all()
    listing = []
    for info in stack:
        if info.id() > last:
            listing.append(info.as_dict())
            request.session['last'] = info.id()
    return render(request, "basic/list.html", {"listings": json.dumps(listing)})

def list(request):
    last = 0
    if request.session.keys():
        sid = request.session.session_key
        s = Session.objects.get(pk=sid)
        last = s.get_decoded()['last']
    return listfrom(request, last)

def insert(request, info):
    ip = get_client_ip(request)
    q = infoStack()
    q.infoIP = ip
    q.infoText = info
    q.save()
    response = HttpResponse()
    response.status_code = 200
    return response
Channel related config/code as follows:
@channel_session
def ws_message(message):
    print "message"

@channel_session
def ws_connect(message):
    print "connect"
    Group("wsg").add(message.reply_channel)
    message.reply_channel.send({"accept": True})

@channel_session
def ws_disconnect(message):
    print "disconnect"
    Group("wsg").discard(message.reply_channel)

# routing.py
from basic.views import ws_message, ws_connect, ws_disconnect

channel_routing = {
    'websocket.connect': ws_connect,
    'websocket.receive': ws_message,
    'websocket.disconnect': ws_disconnect,
}
Eventually I found a post about this: https://github.com/django/channels/issues/489
Basically, the issue went away after I uninstalled redis-server and installed it again.

scrapy: request url must be str or unicode, got Selector

I am writing a spider using Scrapy to scrape user details from Pinterest. I am trying to get the details of a user and their followers (and so on until the last node).
Below is the spider code:
from scrapy.spider import BaseSpider
import scrapy
from pinners.items import PinterestItem
from scrapy.http import FormRequest
from urlparse import urlparse

class Sample(BaseSpider):
    name = 'sample'
    allowed_domains = ['pinterest.com']
    start_urls = ['https://www.pinterest.com/banka/followers', ]

    def parse(self, response):
        for base_url in response.xpath('//div[@class="Module User gridItem"]/a/@href'):
            list_a = response.urljoin(base_url.extract())
            for new_urls in response.xpath('//div[@class="Module User gridItem"]/a/@href'):
                yield scrapy.Request(new_urls, callback=self.Next)
            yield scrapy.Request(list_a, callback=self.Next)

    def Next(self, response):
        href_base = response.xpath('//div[@class = "tabs"]/ul/li/a')
        href_board = href_base.xpath('//div[@class="BoardCount Module"]')
        href_pin = href_base.xpath('.//div[@class="Module PinCount"]')
        href_like = href_base.xpath('.//div[@class="LikeCount Module"]')
        href_followers = href_base.xpath('.//div[@class="FollowerCount Module"]')
        href_following = href_base.xpath('.//div[@class="FollowingCount Module"]')
        item = PinterestItem()
        item["Board_Count"] = href_board.xpath('.//span[@class="value"]/text()').extract()[0]
        item["Pin_Count"] = href_pin.xpath('.//span[@class="value"]/text()').extract()
        item["Like_Count"] = href_like.xpath('.//span[@class="value"]/text()').extract()
        item["Followers_Count"] = href_followers.xpath('.//span[@class="value"]/text()').extract()
        item["Following_Count"] = href_following.xpath('.//span[@class="value"]/text()').extract()
        item["User_ID"] = response.xpath('//link[@rel="canonical"]/@href').extract()[0]
        yield item
I get the following error:
raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
TypeError: Request url must be str or unicode, got Selector:
I did check the type of list_a (the extracted URLs); it gives me unicode.
The error is generated by the inner for loop in the parse method:

    for new_urls in response.xpath('//div[@class="Module User gridItem"]/a/@href'):
        yield scrapy.Request(new_urls, callback=self.Next)

The new_urls variable is actually a Selector; please try something like this:

    for base_url in response.xpath('//div[@class="Module User gridItem"]/a/@href'):
        list_a = response.urljoin(base_url.extract())
        yield scrapy.Request(list_a, callback=self.Next)
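To make the Selector-versus-string distinction concrete, here is a minimal sketch assuming the same markup: extract the href strings first, then build the requests from them.

    def parse(self, response):
        # .extract() (or .getall() in newer Scrapy) turns the Selectors into plain
        # strings; scrapy.Request only accepts a string URL, not a Selector.
        for href in response.xpath('//div[@class="Module User gridItem"]/a/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.Next)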

Scrapy: Crawls pages but scrapes 0 items

I am trying to scrape baseball-reference.com. In the Scrapy bot I created, I start from the first page, navigate to different links, and from there to a third link. Please find the code below:
class VisitorBattingSpider(InitSpider):
    name = 'VisitorBatting'
    year = str(datetime.datetime.today().year)
    allowed_domains = ["baseball-reference.com"]
    start = 'http://www.baseball-reference.com/boxes/' + year + '.shtml'
    start_urls = [start]
    #rules = [Rule(LinkExtractor(allow=['/play-index/st.cgi?date=\d+-\d+-\d+']), callback='parse_item',)]

    def __init__(self):
        BaseSpider.__init__(self)
        # use any browser you wish
        self.browser = webdriver.Firefox()

    def __del__(self):
        self.browser.close()

    def parse(self, response):
        self.browser.get(response.url)
        # let JavaScript load
        time.sleep(15)
        page = Selector(text=self.browser.page_source)
        #page=Selector(response)
        sites = page.xpath('//*[@id="2016"]/tbody/tr/td/table/tbody/tr/td/a/@href')
        for site in sites:
            tree = site.extract()
            yield Request(url='http://www.baseball-reference.com' + tree, callback=self.parse_new, dont_filter=True)
        self.browser.close()

    def parse_new(self, response):
        hxs = Selector(response)
        loads = hxs.xpath('/html/body/pre/a/@href')
        for load in loads:
            branch = load.extract()
            if 'boxes' in branch:
                yield Request(url='http://www.baseball-reference.com' + branch, callback=self.parse_final, dont_filter=True)

    def parse_final(self, response):
        self.browser.get(response.url)
        fxs = Selector(text=self.browser.page_source)
        vi = fxs.xpath('html/body/div/div[3]/div[1]/div[1]/h3/text()').extract()
        vis = ''.join(vi)
        if "." in vis:
            visitor = vis.replace(".", "")
        else:
            visitor = vis
        visitor_id = visitor.replace(" ", "")
        print visitor_id
        UR = response.url
        URL = ''.join(UR)
        dtt = URL[-15:]
        dt = dtt[:8]
        day = datetime.datetime(int(dt[:4]), int(dt[5:6]), int(dt[-2:]), 01, 01, 01).weekday()
        path = '//*[@id="' + visitor_id + 'batting"]/tfoot/tr'
        webs = fxs.xpath(path)
        items = []
        for web in webs:
            item = VisitorbattingItem()
            item['ID'] = response.url
            item['AWAY_TEAM'] = visitor_id
            item['GAME_DT'] = dt
            item['GAME_DY'] = day
            item['AWAY_GAME'] = 1
            item['AWAY_SCORE_CT'] = web.xpath("td[3]/text()").extract()
            item['MINUTES_GAME_CT'] = fxs.xpath('//*[@id="gametime"]/text()').extract()
            item['AWAY_AB'] = web.xpath("td[2]/span/text()").extract()
            item['AWAY_HITS'] = web.xpath("td[4]/text()").extract()
            item['AWAY_DO'] = fxs.xpath('//*[@id="2Bvisitor"]/text()').extract()
            item['AWAY_TR'] = fxs.xpath('//*[@id="3Bvisitor"]/text()').extract()
            item['AWAY_RBI'] = web.xpath("td[5]/text()").extract()
            item['AWAY_HBP'] = fxs.xpath('//*[@id="HBPvisitor"]/text()').extract()
            item['AWAY_SB'] = fxs.xpath('//*[@id="SBvisitor"]/text()').extract()
            item['AWAY_LOB'] = fxs.xpath('//*[@id="teamlobvisitor"]/text()').extract()
            item['AWAY_PO'] = web.xpath("td[5]/text()").extract()
            item['AWAY_ASS'] = web.xpath("td[5]/text()").extract()
            item['AWAY_ERR'] = fxs.xpath('//*[@id="linescore"]/strong[3]/text()').extract
            item['AWAY_PB'] = fxs.xpath('//*[@id="PBvisitor"]/text()').extract()
            item['AWAY_DP'] = fxs.xpath('//*[@id="DPvisitor"]/text()').extract()
            item['AWAY_TP'] = fxs.xpath('//*[@id="TPvisitor"]/text()').extract()
            item['AWAY_First_Innings'] = fxs.xpath('//*[@id="linescore"]/text()[3]').extract
            item['AWAY_IBB'] = fxs.xpath('//*[@id="IBBvisitor"]/text()').extract()
            item['AWAY_BB'] = web.xpath("td[6]/text()").extract()
            item['AWAY_SO'] = web.xpath("td[7]/text()").extract()
            items.append(item)
        self.browser.close()
        return items
The problem is that when I execute the script, the message on my CMD prompt says the pages were crawled but 0 items were scraped. I don't understand why the items are not being scraped. Any help would be appreciated.
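A small debugging sketch, reusing the names and selectors from the question's parse method above: log how many nodes the first XPath actually matches, since a selector that matches nothing yields no follow-up requests and therefore no items.

    def parse(self, response):
        self.browser.get(response.url)
        time.sleep(15)  # let JavaScript load
        page = Selector(text=self.browser.page_source)
        sites = page.xpath('//*[@id="2016"]/tbody/tr/td/table/tbody/tr/td/a/@href')
        # If this logs 0, the XPath never matches the rendered page and the
        # spider will report "scraped 0 items" even though pages were crawled.
        self.logger.info('matched %d box-score links', len(sites))
        for site in sites:
            yield Request(url='http://www.baseball-reference.com' + site.extract(),
                          callback=self.parse_new, dont_filter=True)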

Adding XHR links to scraped category hrefs gives a missing scheme error

I have built a spider which gets data from one category. The method it follows: the category page is given in the start URL, and start_requests handles pagination by iterating over the link provided by the XHR request. Since I wanted to get all the categories at once, I wrote the code below. My logic was to first get all the category links, append to each of them the XHR query string that is the same for every category (?from=24&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu), pass these appended URLs to start_requests, and iterate over them for pagination and item parsing. But I am not able to run the spider because it throws a missing scheme error, since in start_requests I have not provided the http://. I am stuck on how to solve this issue, please help.
class JabcatSpider(scrapy.Spider):
    name = "jabcat"
    allowed_domains = ["trendin.com"]
    start_urls = [
        'http://www.trendin.com',
    ]
    max_pages = 400

    def parse(self, response):
        urls = response.xpath('//div[@class = "men"]//@href').extract()
        for url in urls:
            urljoin = (url + "/" "?from=24&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu")
            #yield scrapy.Request(urljoin, callback=self.start_requests)
            print urljoin

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('?from=%d&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu' % i, callback=self.parse)

    def parse(self, response):
        for href in response.xpath('//*[@id="product_rows"]/div/div/div/a/@href'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_detail_page)

    def parse_detail_page(self, response):
        for sel in response.xpath('//*[@id="catalog-product"]/section[2]'):
            item = Jabongo()
            item['title'] = response.xpath('//*[@id="product-details-wrapper"]/div[1]/div[2]/div/div[1]/h1/span[2]/text()').extract()
            # item['price'] = response.xpath('//*[@id="pdp-price-info"]/span[2]/text()').extract()
            # item['image'] = response.xpath('//*[@class="content"]/h1/span[2]/text()').extract()
            # # item['color'] = sel.xpath('//ul/li/label[.="Color"]/following-sibling::Span/text()').extract()
            # return item
            #pattern = response.xpath('//*[@class="content"]/h1/span[2]/text()').extract
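A rough sketch of one way around the missing scheme error, assuming the selectors above and that the ?from= offset simply increments: make the category URLs absolute with response.urljoin and yield the paginated requests from parse instead of from start_requests, since scrapy.Request requires a full http(s) URL.

import scrapy

class JabcatSketchSpider(scrapy.Spider):
    # Untested outline: the "Missing scheme in request url" error comes from
    # yielding a bare query string; every scrapy.Request needs an absolute URL.
    name = "jabcat_sketch"
    allowed_domains = ["trendin.com"]
    start_urls = ['http://www.trendin.com']
    max_pages = 400

    def parse(self, response):
        # Category hrefs are relative, so make them absolute first, then
        # append the XHR query string for each page offset. (Whether the
        # offset should step by 1 or by the page size is an assumption to
        # verify against the site.)
        for url in response.xpath('//div[@class="men"]//@href').extract():
            category = response.urljoin(url)
            for i in range(self.max_pages):
                page_url = ('%s/?from=%d&ajax=true&search_query=&orderby=popular'
                            '&orderway=asc&latestfilter=&source=menu' % (category, i))
                yield scrapy.Request(page_url, callback=self.parse_page)

    def parse_page(self, response):
        for href in response.xpath('//*[@id="product_rows"]/div/div/div/a/@href'):
            yield scrapy.Request(response.urljoin(href.extract()),
                                 callback=self.parse_detail_page)

    def parse_detail_page(self, response):
        # Same title XPath as in the question; yields a plain dict instead of
        # the (not shown) Jabongo item class so the sketch is self-contained.
        yield {
            'title': response.xpath('//*[@id="product-details-wrapper"]/div[1]/div[2]'
                                    '/div/div[1]/h1/span[2]/text()').extract_first(),
        }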