How can I get the request url in Scrapy's parse() function? I have a lot of urls in start_urls and some of them redirect my spider to homepage and as result I have an empty item. So I need something like item['start_url'] = request.url to store these urls. I'm using the BaseSpider.
The 'response' variable that's passed to parse() has the info you want. You shouldn't need to override anything.
eg. (EDITED)
def parse(self, response):
print "URL: " + response.request.url
The request object is accessible from the response object, therefore you can do the following:
def parse(self, response):
item['start_url'] = response.request.url
Instead of storing requested URL's somewhere and also scrapy processed URL's are not in same sequence as provided in start_urls.
By using below,
response.request.meta['redirect_urls']
will give you the list of redirect happened like ['http://requested_url','https://redirected_url','https://final_redirected_url']
To access first URL from above list, you can use
response.request.meta['redirect_urls'][0]
For more, see doc.scrapy.org mentioned as :
RedirectMiddleware
This middleware handles redirection of requests based on response status.
The urls which the request goes through (while being redirected) can be found in the redirect_urls Request.meta key.
Hope this helps you
You need to override BaseSpider's make_requests_from_url(url) function to assign the start_url to the item and then use the Request.meta special keys to pass that item to the parse function
from scrapy.http import Request
# override method
def make_requests_from_url(self, url):
item = MyItem()
# assign url
item['start_url'] = url
request = Request(url, dont_filter=True)
# set the meta['item'] to use the item in the next call back
request.meta['item'] = item
return request
def parse(self, response):
# access and do something with the item in parse
item = response.meta['item']
item['other_url'] = response.url
return item
Hope that helps.
Python 3.5
Scrapy 1.5.0
from scrapy.http import Request
# override method
def start_requests(self):
for url in self.start_urls:
item = {'start_url': url}
request = Request(url, dont_filter=True)
# set the meta['item'] to use the item in the next call back
request.meta['item'] = item
yield request
# use meta variable
def parse(self, response):
url = response.meta['item']['start_url']
Related
I am implementing magic tokens and would like clean URLs. As a consequence, I would like to remove the token from the URL upon a successful user authentication. This is my attempt:
def authenticate_via_token(get_response):
def middleware(request):
if request.session.get('authenticated', None):
pass
else:
token = request.GET.get('token', None)
if token:
mt = MagicToken.fetch_by_token(token)
if mt:
request.session['authenticated'] = mt.email
if not request.GET._mutable:
request.GET._mutable = True
request.GET['token'] = None
request.GET._mutable = False
else:
print("invalid token")
response = get_response(request)
return response
return middleware
IE, I would like to send /products/product-detail/3?token=piyMlVMrmYblRwHcgwPEee --> /products/product-detail/3
It's possible that there may be additional GET parameters and I would like to keep them. Any input would be appreciated!
This is the solution I ended up going for:
from django.urls import resolve, reverse
import urllib
def drop_get_param(request, param):
'helpful for redirecting while dropping a specific parameter'
resolution = resolve(request.path_info) #simulate resolving the request
new_params = request.GET.copy() # copy the parameters
del new_params[param] # drop the specified parameter
reversed = reverse(resolution.url_name, kwargs=resolution.kwargs) # create a base url
if new_params: #append the remaining parameters
reversed += '?' + urllib.parse.urlencode(new_params)
return reversed
I am parsing two lists of URLs with the latter being populated from the first. I am able to get all the needed URLs, but having trouble pulling data from the last list of URLs. I am only able to get one URL's data
I have tried using scrapy.Request, response.follow, While statement, but only getting a response from one URL. I am new to scrapy/python not to sure how to fix this issue.
import scrapy
class DBSpider(scrapy.Spider):
name = "Scrape"
allowed_domains = ["192.168.3.137"]
start_urls = [
'http://192.168.3.137/api/?controller_id=' + str(x)
for x in range(0,8)
]
def parse(self, response):
for sites in response.xpath('//*
[#id="siteslist"]//li/a/#href').extract():
yield response.follow(sites, self.parse_sites)
def parse_sites(self, response):
for device_pages in response.xpath('//*
[#id="list_devices"]/a/#href').extract():
yield scrapy.Request(response.urljoin(device_pages),
self.parse_macs)
def parse_macs(self, response):
print response.url #only gets one response url
parse_mac only print one response url. should be equivalent to the amount of urls in devices_pages loop in parse_sites def
I try to find a way to scrape and parse more pages in the signed in area.
These example links accesible from signed in I want to parse.
#http://example.com/seller/demand/?id=305554
#http://example.com/seller/demand/?id=305553
#http://example.com/seller/demand/?id=305552
#....
I want to create spider that can open each one of these links and then parse them.
I have created another spider which can open and parse only one of them.
When I tried to create "for" or "while" to call more requests with other links it allowed me not because I cannot put more returns into generator, it returns error. I also tried link extractors, but it didn't work for me.
Here is my code:
#!c:/server/www/scrapy
# -*- coding: utf-8 -*-
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http import FormRequest
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from array import *
from stack.items import StackItem
from scrapy.linkextractors import LinkExtractor
class Spider3(Spider):
name = "Spider3"
allowed_domains = ["example.com"]
start_urls = ["http://example.com/login"] #this link lead to login page
When I am signed in it returns page with url, that contains "stat", that is why I put here first "if" condition.
When I am signed in, I request one link and call function parse_items.
def parse(self, response):
#when "stat" is in url it means that I just signed in
if "stat" in response.url:
return Request("http://example.com/seller/demand/?id=305554", callback = self.parse_items)
else:
#this succesful login turns me to page, it's url contains "stat"
return [FormRequest.from_response(response,
formdata={'ctl00$ContentPlaceHolder1$lMain$tbLogin': 'my_login', 'ctl00$ContentPlaceHolder1$lMain$tbPass': 'my_password'},callback=self.parse)]
Function parse_items simply parse desired content from one desired page:
def parse_items(self,response):
questions = Selector(response).xpath('//*[#id="ctl00_ContentPlaceHolder1_cRequest_divAll"]/table/tr')
for question in questions:
item = StackItem()
item['name'] = question.xpath('th/text()').extract()[0]
item['value'] = question.xpath('td/text()').extract()[0]
yield item
Can you help me please to update this code to open and parse more than one page in each sessions?
I don't want to sign in over and over for each request.
The session most likely depends on the cookies and scrapy manages that by itself. I.e:
def parse_items(self,response):
questions = Selector(response).xpath('//*[#id="ctl00_ContentPlaceHolder1_cRequest_divAll"]/table/tr')
for question in questions:
item = StackItem()
item['name'] = question.xpath('th/text()').extract()[0]
item['value'] = question.xpath('td/text()').extract()[0]
yield item
next_url = '' # find url to next page in the current page
if next_url:
yield Request(next_url, self.parse_items)
# scrapy will retain the session for the next page if it's managed by cookies
I am currently working on the same problem. I use InitSpider so I can overwrite __init__ and init_request. The first is just for initialisation of custom stuff and the actual magic happens in my init_request:
def init_request(self):
"""This function is called before crawling starts."""
# Do not start a request on error,
# simply return nothing and quit scrapy
if self.abort:
return
# Do a login
if self.login_required:
# Start with login first
return Request(url=self.login_page, callback=self.login)
else:
# Start with pase function
return Request(url=self.base_url, callback=self.parse)
My login looks like this
def login(self, response):
"""Generate a login request."""
self.log('Login called')
return FormRequest.from_response(
response,
formdata=self.login_data,
method=self.login_method,
callback=self.check_login_response
)
self.login_data is a dict with post values.
I am still a beginner with python and scrapy, so I might be doing it the wrong way. Anyway, so far I have produced a working version that can be viewed on github.
HTH:
https://github.com/cytopia/crawlpy
i have built a spider which gets data from one category , the method it follows is when the category page is specified in start url and defining start_requests for pagination which iterates over the link provided by xhr request. Since i wanted to get all the categories at once i have written code like this. my logic was to first get all category links and append those links with xhr links which follows same string for every category which is (?from=24&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu) and parse these appended url to start_request and iterate them for pagination and item parsing . but i am not able to run spider because it throws the missing scheme error since in start request i havenot provided the http:// i am stuck onto how should i solve this issue please help..
class JabcatSpider(scrapy.Spider):
name = "jabcat"
allowed_domains = ["trendin.com"]
start_urls = [
'http://www.trendin.com',
]
max_pages = 400
def parse(self,response):
urls = response.xpath('//div[#class = "men"]//#href').extract()
for url in urls:
urljoin=(url + "/" "?from=24&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu")
#yield scrapy.Request(urljoin, callback=self.start_requests)
print urljoin
def start_requests(self):
for i in range(self.max_pages):
yield scrapy.Request('?from=%d&ajax=true&search_query=&orderby=popular&orderway=asc&latestfilter=&source=menu' % i, callback=self.parse)
def parse(self, response):
for href in response.xpath('//*[#id="product_rows"]/div/div/div/a/#href'):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_detail_page)
def parse_detail_page(self, response):
for sel in response.xpath('//*[#id="catalog-product"]/section[2]'):
item = Jabongo()
item['title'] = response.xpath('//*[#id="product-details-wrapper"]/div[1]/div[2]/div/div[1]/h1/span[2]/text()').extract()
# item['price'] = response.xpath('//*[#id="pdp-price-info"]/span[2]/text()').extract()
# item['image'] = response.xpath('//*[#class="content"]/h1/span[2]/text()').extract()
# # item['color'] = sel.xpath('//ul/li/label[.="Color"]/following-sibling::Span/text()').extract()
# return item
#pattern = response.xpath('//*[#class="content"]/h1/span[2]/text()').extract
I have the url and corresponding view as follows.
url(r'^(?P<token>.*?)/ack$', views.api_ACK, name='device-api_ack')
def api_ACK(request, token):
"""
Process the ACK request comming from the device
"""
logger.info('-> api_ACK', extra={'request': request, 'token' : token, 'url': request.get_full_path()})
logger.debug(request)
if request.method == 'GET':
# verify the request
action, err_msg = api_verify_request(token=token, action_code=Action.AC_ACKNOWLEDGE)
return api_send_answer(action, err_msg)
I want to call api_ACK function with request method as GET from another view api_send_answer
I am creating one url in /device/LEAB86JFOZ6R7W4F69CBIMVBYB9SFZVC/ack in api_send_answer view as follows..
def api_send_answer(action, err_msg, provisional_answer=None):
last_action = create_action(session,action=Action.AC_ACKNOWLEDGE,token=last_action.next_token,timer=500)
url = ''.join (['/device/',last_action.next_token ,'/',Action.AC_ACKNOWLEDGE])
logger.debug('Request Url')
logger.debug(url)
response = api_ACK(request=url,token=last_action.next_token) # This is wrong
Now from api_send_answer it is redirecting to api_ACK view, but how to call api_ACK with request method as GET?
Please help..Any suggestions would be helpful to me
This line
response = api_ACK(request=url,token=last_action.next_token) is wrong because view expects HttpRequest object and you give him url instead.
if you need to return view response to user, you can use redirect:
def api_send_answer(action, err_msg, provisional_answer=None):
last_action = create_action(session,action=Action.AC_ACKNOWLEDGE,token=last_action.next_token,timer=500)
url = ''.join (['/device/',last_action.next_token ,'/',Action.AC_ACKNOWLEDGE])
logger.debug('Request Url')
logger.debug(url)
return HttpResponseRedirect(url)
if you need to do something else with view response you have to use HttpRequest object not url as parameter.