How to scrape pages after login - python-2.7

I am trying to find a way to scrape and parse multiple pages in the signed-in area.
These are examples of the links, accessible only when signed in, that I want to parse:
#http://example.com/seller/demand/?id=305554
#http://example.com/seller/demand/?id=305553
#http://example.com/seller/demand/?id=305552
#....
I want to create a spider that can open each one of these links and then parse them.
I have already created another spider which can open and parse only one of them.
When I tried to add a "for" or "while" loop to issue more requests for the other links, it did not let me, because I cannot put more returns into a generator; it raises an error. I also tried link extractors, but they didn't work for me.
Here is my code:
#!c:/server/www/scrapy
# -*- coding: utf-8 -*-
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http import FormRequest
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from array import *
from stack.items import StackItem
from scrapy.linkextractors import LinkExtractor
class Spider3(Spider):
    """Spider for example.com whose crawl starts at the login page."""

    name = "Spider3"
    allowed_domains = ["example.com"]
    start_urls = ["http://example.com/login"]  # this link leads to the login page
When I am signed in, the site returns a page whose URL contains "stat"; that is why I put the first "if" condition here.
Once I am signed in, I request one link and use the function parse_items as its callback.
def parse(self, response):
    """Submit the login form, then request one demand page once signed in.

    A response whose URL contains "stat" is treated as the post-login
    page and answered with a request for the demand page (handled by
    ``parse_items``). Any other response is answered with a login form
    submission whose callback is this method again.
    """
    # when "stat" is in url it means that I just signed in
    if "stat" in response.url:
        return Request(
            "http://example.com/seller/demand/?id=305554",
            callback=self.parse_items,
        )
    # a successful login redirects to a page whose url contains "stat"
    return [
        FormRequest.from_response(
            response,
            formdata={
                'ctl00$ContentPlaceHolder1$lMain$tbLogin': 'my_login',
                'ctl00$ContentPlaceHolder1$lMain$tbPass': 'my_password',
            },
            callback=self.parse,
        )
    ]
The function parse_items simply parses the desired content from one desired page:
def parse_items(self, response):
    """Yield one StackItem per table row under the cRequest_divAll element.

    Each item's ``name`` is the first text node of the row's <th> and
    its ``value`` is the first text node of the row's <td>.
    """
    # "@id" tests the id attribute; "#id" is not valid XPath.
    rows = Selector(response).xpath(
        '//*[@id="ctl00_ContentPlaceHolder1_cRequest_divAll"]/table/tr'
    )
    for row in rows:
        item = StackItem()
        item['name'] = row.xpath('th/text()').extract()[0]
        item['value'] = row.xpath('td/text()').extract()[0]
        yield item
Can you please help me update this code so that it opens and parses more than one page in each session?
I don't want to sign in over and over again for each request.

The session most likely depends on the cookies and scrapy manages that by itself. I.e:
def parse_items(self, response):
    """Yield the items on this page, then request the next page, if any.

    Each item's ``name`` is the first text node of a row's <th> and its
    ``value`` is the first text node of the row's <td>.
    """
    # "@id" tests the id attribute; "#id" is not valid XPath.
    rows = Selector(response).xpath(
        '//*[@id="ctl00_ContentPlaceHolder1_cRequest_divAll"]/table/tr'
    )
    for row in rows:
        item = StackItem()
        item['name'] = row.xpath('th/text()').extract()[0]
        item['value'] = row.xpath('td/text()').extract()[0]
        yield item

    next_url = ''  # find url to next page in the current page
    if next_url:
        # scrapy will retain the session for the next page if it's managed by cookies
        yield Request(next_url, self.parse_items)

I am currently working on the same problem. I use InitSpider so I can overwrite __init__ and init_request. The first is just for initialisation of custom stuff and the actual magic happens in my init_request:
def init_request(self):
    """This function is called before crawling starts.

    Returns None when ``self.abort`` is set, a request for
    ``self.login_page`` (callback ``self.login``) when
    ``self.login_required`` is set, and otherwise a request for
    ``self.base_url`` (callback ``self.parse``).
    """
    # Do not start a request on error,
    # simply return nothing and quit scrapy
    if self.abort:
        return None

    # Do a login
    if self.login_required:
        # Start with login first
        return Request(url=self.login_page, callback=self.login)

    # Start with parse function
    return Request(url=self.base_url, callback=self.parse)
My login looks like this
def login(self, response):
    """Generate a login request.

    The form found in ``response`` is submitted with ``self.login_data``
    using ``self.login_method``; the result is passed to
    ``self.check_login_response``.
    """
    self.log('Login called')
    return FormRequest.from_response(
        response,
        formdata=self.login_data,
        method=self.login_method,
        callback=self.check_login_response,
    )
self.login_data is a dict with post values.
I am still a beginner with python and scrapy, so I might be doing it the wrong way. Anyway, so far I have produced a working version that can be viewed on github.
HTH:
https://github.com/cytopia/crawlpy

Related

Beaker session in bottle

While using a Beaker session, I needed to use the same session object throughout the whole application.
I came across this question: Bottle.py session with Beaker
But I am still getting a 'KeyError' when I try to access, in one function, a session value saved by another function.
my rest.py file looks like:
import bottle
from bottle import route,default_app
from beaker.middleware import SessionMiddleware
app = bottle.default_app()


# NOTE(review): `request` is used below, but only `route` and `default_app`
# are imported from bottle above; it needs `from bottle import request`.
@bottle.hook('before_request')
def setup_request():
    """Expose the Beaker session of the current request as ``request.session``."""
    request.session = request.environ['beaker.session']


@app.route('/login')
def login():
    """Store the user name in the session."""
    # NOTE(review): Beaker presumably persists changes only after
    # session.save() (or with 'session.auto': True) -- confirm; that may be
    # why the value is missing in logout().
    request.session['uname'] = 'user'


@app.route('/logout')
def logout():
    """Print the user name stored by login()."""
    print(request.session['uname'])
    # expecting to print user


session_opts = {
    'session.type': 'file',
    'session.data_dir': '/tmp/',
    'session.cookie_expires': True,
}
app = SessionMiddleware(bottle.default_app(), session_opts)
I have put the SessionMiddleware at the end because I was getting errors otherwise; I did this with the help of this link https://groups.google.com/forum/#!topic/bottlepy/m0akSbWRpZg
But when I access request.session in the logout function I get
'KeyError': Uname not found
Can anyone give a clear example of how to adjust the code in order to maintain the same session across the whole application?

How to follow next pages in Scrapy Crawler to scrape content

I am able to scrape all the stories from the first page. My problem is how to move to the next page and continue scraping the stories and names. Kindly check my code below:
# -*- coding: utf-8 -*-
import scrapy
from cancerstories.items import CancerstoriesItem
class MyItem(scrapy.Item):
    """Scraped item with a ``name`` and a ``story`` field."""

    name = scrapy.Field()
    story = scrapy.Field()
class MySpider(scrapy.Spider):
    """Collect story names from the listing page and story text from detail pages."""

    name = 'cancerstories'
    allowed_domains = ['thebreastcancersite.greatergood.com']
    start_urls = ['http://thebreastcancersite.greatergood.com/clickToGive/bcs/stories/']

    def parse(self, response):
        """Request the detail page of every story link found in the response.

        The partially filled item (holding the link text as ``name``)
        travels with the request in ``meta['myItem']``.
        """
        # "@href" tests the href attribute; "#href" is not valid XPath.
        rows = response.xpath('//a[contains(@href,"story")]')
        # loop over all links to stories
        for row in rows:
            myItem = MyItem()  # Create a new item
            myItem['name'] = row.xpath('./text()').extract()  # assign name from link
            story_url = response.urljoin(row.xpath('./@href').extract()[0])  # extract url from link
            # create request for detail page with story
            request = scrapy.Request(url=story_url, callback=self.parse_detail)
            request.meta['myItem'] = myItem  # pass the item with the request
            yield request

    def parse_detail(self, response):
        """Add the story text to the item received via ``meta`` and yield it."""
        myItem = response.meta['myItem']  # extract the item (with the name) from the response
        #myItem['name']=response.xpath('//h1[@class="headline"]/text()').extract()
        # extract the story (text)
        text_raw = response.xpath('//div[@class="photoStoryBox"]/div/p/text()').extract()
        # clean up the text and assign to item
        myItem['story'] = ' '.join(map(unicode.strip, text_raw))
        yield myItem  # return the item
You could change your scrapy.Spider for a CrawlSpider, and use Rule and LinkExtractor to follow the link to the next page.
For this approach you have to include the code below:
...
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
...
rules = (
    # "\?" matches the literal "?" that starts the query string; a bare "?"
    # after "+" would only make the preceding quantifier lazy.
    Rule(LinkExtractor(allow=r'\.\./stories;jsessionid=[0-9A-Z]+\?page=[0-9]+')),
)
...
class MySpider(CrawlSpider):
...
This way, for each page you visit the spider will create a request for the next page (if present), follow it when finishes the execution for the parse method, and repeat the process again.
EDIT:
The rule I wrote is just to follow the next page link not to extract the stories, if your first approach works it's not necessary to change it.
Also, regarding the rule in your comment, SgmlLinkExtractor is deprecated so I recommend you to use the default link extractor, and the rule itself is not well defined.
When the parameter attrs in the extractor is not defined, it searches for links by looking at the href attributes in the body, which in this case look like ../story/mother-of-4435 and not /clickToGive/bcs/story/mother-of-4435. That's the reason it doesn't find any link to follow.
You can also follow the next pages manually if you use the scrapy.Spider class, for example:
next_page = response.css('a.pageLink ::attr(href)').extract_first()
if next_page:
absolute_next_page_url = response.urljoin(next_page)
yield scrapy.Request(url=absolute_next_page_url, callback=self.parse)
Do not forget to rename your parse method to parse_start_url if you want to use the CrawlSpider class.

Using scrapy recursively to scrape a phpBB forum

I'm trying to use scrapy to crawl a phpBB-based forum. My knowledge of scrapy is quite basic (but improving).
Extracting the contents of a forum thread's first page was more or less easy. My successful scraper was this:
import scrapy
from ptmya1.items import Ptmya1Item
class bastospider3(scrapy.Spider):
    """Scrape author, date and body from the first page of one forum thread."""

    name = "basto3"
    allowed_domains = ["portierramaryaire.com"]
    start_urls = [
        "http://portierramaryaire.com/foro/viewtopic.php?f=3&t=3821&st=0&sk=t&sd=a"
    ]

    def parse(self, response):
        """Yield one Ptmya1Item per element matched by ``//div[2]/div``."""
        for sel in response.xpath('//div[2]/div'):
            item = Ptmya1Item()
            item['author'] = sel.xpath('div/div[1]/p/strong/a/text()').extract()
            item['date'] = sel.xpath('div/div[1]/p/text()').extract()
            item['body'] = sel.xpath('div/div[1]/div/text()').extract()
            yield item
However, when I tried to crawl using the "next page" link, I failed after a lot of frustrating hours. I would like to show you my attempts in order to ask for advice. Note: I would prefer a solution based on the SgmlLinkExtractor variants, since they are more flexible and powerful, but I prioritize success after so many attempts.
First one, SgmlLinkExtractor with restricted path. 'Next page xpath' is
/html/body/div[1]/div[2]/form[1]/fieldset/a
Indeed, I tested with the shell that
response.xpath('//div[2]/form[1]/fieldset/a/@href')[1].extract()
returns a correct value for the "next page" link. However, I want to note that the cited XPath yields TWO links:
>>> response.xpath('//div[2]/form[1]/fieldset/a/@href').extract()
[u'./search.php?sid=5aa2b92bec28a93c85956e83f2f62c08', u'./viewtopic.php?f=3&t=3821&st=0&sk=t&sd=a&sid=5aa2b92bec28a93c85956e83f2f62c08&start=15']
thus, my failed scraper was
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from ptmya1.items import Ptmya1Item
class bastospider3(scrapy.Spider):
    """First failed attempt: SgmlLinkExtractor with restrict_xpaths."""

    # NOTE(review): kept as posted apart from layout and the restored "@";
    # this is the failing version the question asks about.
    name = "basto7"
    allowed_domains = ["portierramaryaire.com"]
    start_urls = [
        "http://portierramaryaire.com/foro/viewtopic.php?f=3&t=3821&st=0&sk=t&sd=a"
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//div[2]/form[1]/fieldset/a/@href')[1],), callback="parse_items", follow= True)
    )

    def parse_item(self, response):
        """Yield one Ptmya1Item per element matched by ``//div[2]/div``."""
        for sel in response.xpath('//div[2]/div'):
            item = Ptmya1Item()
            item['author'] = sel.xpath('div/div[1]/p/strong/a/text()').extract()
            item['date'] = sel.xpath('div/div[1]/p/text()').extract()
            item['body'] = sel.xpath('div/div[1]/div/text()').extract()
            yield item
Second one, SgmlLinkExtractor with allow. More primitive and unsuccessful too
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from ptmya1.items import Ptmya1Item
class bastospider3(scrapy.Spider):
    """Second failed attempt: SgmlLinkExtractor with an ``allow`` pattern."""

    # NOTE(review): kept as posted apart from layout; this is the failing
    # version the question asks about.
    name = "basto7"
    allowed_domains = ["portierramaryaire.com"]
    start_urls = [
        "http://portierramaryaire.com/foro/viewtopic.php?f=3&t=3821&st=0&sk=t&sd=a"
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'viewtopic.php?f=3&t=3821&st=0&sk=t&sd=a&start.',),), callback="parse_items", follow= True)
    )

    def parse_item(self, response):
        """Yield one Ptmya1Item per element matched by ``//div[2]/div``."""
        for sel in response.xpath('//div[2]/div'):
            item = Ptmya1Item()
            item['author'] = sel.xpath('div/div[1]/p/strong/a/text()').extract()
            item['date'] = sel.xpath('div/div[1]/p/text()').extract()
            item['body'] = sel.xpath('div/div[1]/div/text()').extract()
            yield item
Finally, I returned to the damn paleolithic age, or to its first tutorial equivalent. I try to use the loop included at the end of the beginner's tutorial. Another failure
import scrapy
import urlparse
from ptmya1.items import Ptmya1Item
class bastospider5(scrapy.Spider):
    """Third failed attempt: follow the "next page" link by hand."""

    # NOTE(review): kept as posted apart from layout and the restored "@";
    # this is the failing version the question asks about.
    name = "basto5"
    allowed_domains = ["portierramaryaire.com"]
    start_urls = [
        "http://portierramaryaire.com/foro/viewtopic.php?f=3&t=3821&st=0&sk=t&sd=a"
    ]

    def parse_articles_follow_next_page(self, response):
        """Yield the posts on this page, then request the next page if present."""
        item = Ptmya1Item()
        item['cacho'] = response.xpath('//div[2]/form[1]/fieldset/a/@href').extract()[1][1:] + "http://portierramaryaire.com/foro"
        for sel in response.xpath('//div[2]/div'):
            item['author'] = sel.xpath('div/div[1]/p/strong/a/text()').extract()
            item['date'] = sel.xpath('div/div[1]/p/text()').extract()
            item['body'] = sel.xpath('div/div[1]/div/text()').extract()
            yield item

        next_page = response.xpath('//fieldset/a[@class="right-box right"]')
        if next_page:
            # Second link of the fieldset, with its leading "." removed.
            cadenanext = response.xpath('//div[2]/form[1]/fieldset/a/@href').extract()[1][1:]
            url = urlparse.urljoin("http://portierramaryaire.com/foro", cadenanext)
            yield scrapy.Request(url, self.parse_articles_follow_next_page)
In all the cases, what I have obtained is a cryptic error message from which I cannot obtain a hint for the solution of my problem.
2015-10-08 21:24:46 [scrapy] DEBUG: Crawled (200) <GET http://portierramaryaire.com/foro/viewtopic.php?f=3&t=3821&st=0&sk=t&sd=a> (referer: None)
2015-10-08 21:24:46 [scrapy] ERROR: Spider error processing <GET http://portierramaryaire.com/foro/viewtopic.php?f=3&t=3821&st=0&sk=t&sd=a> (referer: None)
Traceback (most recent call last):
File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 577, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/usr/local/lib/python2.7/dist-packages/scrapy/spiders/__init__.py", line 76, in parse
raise NotImplementedError
NotImplementedError
2015-10-08 21:24:46 [scrapy] INFO: Closing spider (finished)
I really would appreciate any advice (or better, a working solution) for the problem. I'm utterly stuck on this and no matter how much I read, I am not able to find a solution :(
The cryptic error message occurs because you did not define a parse method. That's the default entry point Scrapy uses when it wants to parse a response.
However, you only defined a parse_articles_follow_next_page or a parse_item function -- neither of which is the parse method.
And this is not because of the next page but because of the first one: Scrapy cannot parse the start_url, so your attempts are never reached in any case. For the palaeolithic solution, try renaming your parse_items to parse and run your approach again.
If you are using a Rule then you need to use a different spider class. For those, use CrawlSpider, which you can see in the tutorials. In this case do not override the parse method but use parse_items as you do. That's because CrawlSpider uses parse to forward the responses to the callback method.
Thanks to GHajba, the problem is solved. The solution is developed on the commentaries.
However, the spider doesn't return the results in order. It starts on http://portierramaryaire.com/foro/viewtopic.php?f=3&t=3821&st=0&sk=t&sd=a
and it should walk through "next page" urls, which are like this: http://portierramaryaire.com/foro/viewtopic.php?f=3&t=3821&st=0&sk=t&sd=a&start=15
incrementing the 'start' variable with 15 post each time.
Indeed, the spider returns first the page produced 'start=15', then 'start=30', then 'start=0', then again 'start=15', then 'start=45'...
I am not sure if I have to create a new question or if it would be better for future readers to develop the question here. What do you think?
since this is 5 year old - many many new approaches are out there.
btw: see https://github.com/Dascienz/phpBB-forum-scraper
Python-based web scraper for phpBB forums. Project can be used as a
template for building your own custom Scrapy spiders or for one-off
crawls on designated forums. Please keep in mind that aggressive
crawls can produce significant strain on web servers, so please
throttle your request rates.
The phpBB.py spider scrapes the following information from forum
posts: Username User Post Count Post Date & Time Post Text Quoted Text
If you need additional data scraped, you will have to create
additional spiders or edit the existing spider.
Edit phpBB.py and Specify: allowed_domains start_urls username &
password forum_login=False or forum_login=True
see also
import requests
# Placeholder value. It is concatenated with the login path below, so it
# presumably has to be the forum's base URL ending in "/" -- confirm.
forum = "the forum name"
headers = {'User-Agent': 'Mozilla/5.0'}
# Form fields sent with the login POST.
payload = {'username': 'username', 'password': 'password', 'redirect':'index.php', 'sid':'', 'login':'Login'}
# The POST goes through a Session object rather than a bare requests.post().
session = requests.Session()
r = session.post(forum + "ucp.php?mode=login", headers=headers, data=payload)
# Print the body of the login response.
print(r.text)
But wait: instead of manipulating the website using requests, we can
also make use of browser automation, such as mechanize offers.
This way we don't have to manage our own session and need only a few lines of code to craft each request.
An interesting example is on GitHub: https://github.com/winny-/sirsi/blob/317928f23847f4fe85e2428598fbe44c4dae2352/sirsi/sirsi.py#L74-L211

Scrapy Cookie Manipulation How to?

I have to crawl a web site, so I use Scrapy to do it, but I need to pass a cookie to bypass the first page (which is a kind of login page, where you choose your location).
I read on the web that you need to do this with a base Spider (not a CrawlSpider), but I need to use a CrawlSpider to do my crawling, so what do I need to do?
Do I first run a base Spider and then launch my CrawlSpider? But I don't know whether the cookie will be passed between them, or how to do that. How do I launch a spider from another spider?
How do I handle the cookie? I tried this:
def start_requests(self):
    """Start the crawl with a single request carrying the auchanCook cookie."""
    yield Request(
        url='http://www.auchandrive.fr/drive/St-Quentin-985/',
        cookies={'auchanCook': '"985|"'},
    )
But it is not working.
My answer should be here, but the guy is really evasive and I don't know what to do.
First, you need to enable cookies in the settings.py file
COOKIES_ENABLED = True
Here is my test spider code for your reference. I tested it and it passed.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy import log
class Stackoverflow23370004Spider(CrawlSpider):
    """Test spider that sends the auchanCook cookie with its first request."""

    name = 'auchandrive.fr'
    allowed_domains = ["auchandrive.fr"]
    target_url = "http://www.auchandrive.fr/drive/St-Quentin-985/"

    def start_requests(self):
        """Request ``target_url`` with the cookie attached."""
        yield Request(
            self.target_url,
            cookies={'auchanCook': "985|"},
            callback=self.parse_page,
        )

    def parse_page(self, response):
        """Log "Passed" if the response URL contains 'St-Quentin-985', else "Failed"."""
        if 'St-Quentin-985' in response.url:
            self.log("Passed : %r" % response.url, log.DEBUG)
        else:
            self.log("Failed : %r" % response.url, log.DEBUG)
You can run command to test and watch the console output:
scrapy crawl auchandrive.fr
I noticed that in your code snippet, you were using cookies={'auchanCook': '"985|"'}, instead of cookies={'auchanCook': "985|"}.
This should get you started:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
class AuchanDriveSpider(CrawlSpider):
    """Crawl auchandrive.fr after selecting a shop through a cookie."""

    name = 'auchandrive'
    allowed_domains = ["auchandrive.fr"]

    # pseudo-start_url
    begin_url = "http://www.auchandrive.fr/"

    # start URL used as shop selection
    select_shop_url = "http://www.auchandrive.fr/drive/St-Quentin-985/"

    # "@class" tests the class attribute; "#class" is not valid XPath.
    rules = (
        Rule(SgmlLinkExtractor(restrict_xpaths=('//ul[@class="header-menu"]',))),
        Rule(SgmlLinkExtractor(restrict_xpaths=('//div[contains(@class, "vignette-content")]',)),
             callback='parse_product'),
    )

    def start_requests(self):
        """Open ``begin_url`` first and hand its response to ``select_shop``."""
        yield Request(self.begin_url, callback=self.select_shop)

    def select_shop(self, response):
        """Request the shop URL with the auchanCook cookie set."""
        return Request(url=self.select_shop_url, cookies={'auchanCook': "985|"})

    def parse_product(self, response):
        """Log the URL of a page reached through the second rule."""
        self.log("parse_product: %r" % response.url)
Pagination might be tricky.

Scrapy get request url in parse

How can I get the request url in Scrapy's parse() function? I have a lot of urls in start_urls and some of them redirect my spider to homepage and as result I have an empty item. So I need something like item['start_url'] = request.url to store these urls. I'm using the BaseSpider.
The 'response' variable that's passed to parse() has the info you want. You shouldn't need to override anything.
eg. (EDITED)
def parse(self, response):
    """Print the URL of the request that produced ``response``."""
    # Call form with a single argument works under Python 2 and Python 3.
    print("URL: " + response.request.url)
The request object is accessible from the response object, therefore you can do the following:
def parse(self, response):
    """Record the URL of the originating request on the item."""
    # NOTE(review): `item` is not defined in this fragment; it is assumed
    # to be created before this line in the real callback.
    item['start_url'] = response.request.url
Scrapy does not process URLs in the same sequence as they are listed in start_urls, so instead of storing the requested URLs somewhere yourself,
you can use the following:
response.request.meta['redirect_urls']
This gives you the list of redirects that happened, like ['http://requested_url','https://redirected_url','https://final_redirected_url']
To access first URL from above list, you can use
response.request.meta['redirect_urls'][0]
For more, see doc.scrapy.org mentioned as :
RedirectMiddleware
This middleware handles redirection of requests based on response status.
The urls which the request goes through (while being redirected) can be found in the redirect_urls Request.meta key.
Hope this helps you
You need to override BaseSpider's make_requests_from_url(url) function to assign the start_url to the item and then use the Request.meta special keys to pass that item to the parse function
from scrapy.http import Request
# override method
def make_requests_from_url(self, url):
    """Build the request for ``url`` and attach an item recording that URL.

    The item is stored under ``meta['item']`` so the callback can read it
    back from the response.
    """
    item = MyItem()
    # assign url
    item['start_url'] = url
    request = Request(url, dont_filter=True)
    # set the meta['item'] to use the item in the next call back
    request.meta['item'] = item
    return request
def parse(self, response):
    """Return the item carried in ``meta``, with the response URL added."""
    # access and do something with the item in parse
    item = response.meta['item']
    item['other_url'] = response.url
    return item
Hope that helps.
Python 3.5
Scrapy 1.5.0
from scrapy.http import Request
# override method
def start_requests(self):
    """Yield one request per start URL, each carrying that URL in ``meta``."""
    for url in self.start_urls:
        item = {'start_url': url}
        request = Request(url, dont_filter=True)
        # set the meta['item'] to use the item in the next call back
        request.meta['item'] = item
        yield request
# use meta variable
def parse(self, response):
    """Read back the start URL stored in ``meta['item']``."""
    url = response.meta['item']['start_url']