How to follow next pages in Scrapy Crawler to scrape content - python-2.7

I am able to scrape all the stories from the first page, but my problem is how to move to the next page and continue scraping the stories and names. Kindly check my code below:
# -*- coding: utf-8 -*-
import scrapy
from cancerstories.items import CancerstoriesItem

class MyItem(scrapy.Item):
    name = scrapy.Field()
    story = scrapy.Field()

class MySpider(scrapy.Spider):
    name = 'cancerstories'
    allowed_domains = ['thebreastcancersite.greatergood.com']
    start_urls = ['http://thebreastcancersite.greatergood.com/clickToGive/bcs/stories/']

    def parse(self, response):
        rows = response.xpath('//a[contains(@href, "story")]')
        # loop over all links to stories
        for row in rows:
            myItem = MyItem()  # create a new item
            myItem['name'] = row.xpath('./text()').extract()  # assign name from link
            story_url = response.urljoin(row.xpath('./@href').extract()[0])  # extract url from link
            request = scrapy.Request(url=story_url, callback=self.parse_detail)  # create request for detail page with story
            request.meta['myItem'] = myItem  # pass the item with the request
            yield request

    def parse_detail(self, response):
        myItem = response.meta['myItem']  # extract the item (with the name) from the response
        #myItem['name'] = response.xpath('//h1[@class="headline"]/text()').extract()
        text_raw = response.xpath('//div[@class="photoStoryBox"]/div/p/text()').extract()  # extract the story (text)
        myItem['story'] = ' '.join(map(unicode.strip, text_raw))  # clean up the text and assign to item
        yield myItem  # return the item

You could change your scrapy.Spider to a CrawlSpider and use Rule and LinkExtractor to follow the link to the next page.
For this approach you have to include the code below:
...
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
...
rules = (
    Rule(LinkExtractor(allow='\.\./stories;jsessionid=[0-9A-Z]+?page=[0-9]+')),
)
...
class MySpider(CrawlSpider):
...
This way, for each page you visit, the spider will create a request for the next page (if one is present), follow it once the parse method has finished executing, and repeat the process.
EDIT:
The rule I wrote is only for following the next-page link, not for extracting the stories; if your first approach works, it's not necessary to change it.
Also, regarding the rule in your comment: SgmlLinkExtractor is deprecated, so I recommend you use the default link extractor, and the rule itself is not well defined.
When the attrs parameter of the extractor is not set, it searches for links in the href attributes in the body, which in this case look like ../story/mother-of-4435 and not /clickToGive/bcs/story/mother-of-4435. That's the reason it doesn't find any link to follow.
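Putting those fragments together, the CrawlSpider version could look roughly like this. This is only a sketch: the allow pattern is an assumption adapted from the rule above and may need adjusting to the site's real pagination URLs, MyItem and parse_detail are reused unchanged from the question, and the extraction logic moves out of parse(), which CrawlSpider reserves for itself.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class MySpider(CrawlSpider):
    name = 'cancerstories'
    allowed_domains = ['thebreastcancersite.greatergood.com']
    start_urls = ['http://thebreastcancersite.greatergood.com/clickToGive/bcs/stories/']

    rules = (
        # follow the pagination links; every followed page is handled by parse_page
        Rule(LinkExtractor(allow=r'stories;jsessionid=[0-9A-Z]+\?page=[0-9]+'),
             callback='parse_page', follow=True),
    )

    def parse_start_url(self, response):
        # CrawlSpider uses parse() internally, so the first page lands here instead
        return self.parse_page(response)

    def parse_page(self, response):
        # same extraction as the question's original parse()
        for row in response.xpath('//a[contains(@href, "story")]'):
            myItem = MyItem()
            myItem['name'] = row.xpath('./text()').extract()
            story_url = response.urljoin(row.xpath('./@href').extract()[0])
            request = scrapy.Request(story_url, callback=self.parse_detail)
            request.meta['myItem'] = myItem
            yield request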

You can follow the next pages manually if you use the scrapy.Spider class, for example:
next_page = response.css('a.pageLink ::attr(href)').extract_first()
if next_page:
    absolute_next_page_url = response.urljoin(next_page)
    yield scrapy.Request(url=absolute_next_page_url, callback=self.parse)
Do not forget to rename your parse method to parse_start_url if you want to use the CrawlSpider class.
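In the question's spider this just means appending the pagination check at the end of parse(); a sketch, assuming the a.pageLink selector above actually matches the next-page link on this site:
def parse(self, response):
    # per-story requests, exactly as in the question
    for row in response.xpath('//a[contains(@href, "story")]'):
        myItem = MyItem()
        myItem['name'] = row.xpath('./text()').extract()
        story_url = response.urljoin(row.xpath('./@href').extract()[0])
        request = scrapy.Request(url=story_url, callback=self.parse_detail)
        request.meta['myItem'] = myItem
        yield request

    # then queue the next listing page, if there is one
    next_page = response.css('a.pageLink ::attr(href)').extract_first()
    if next_page:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)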

Related

unable to scrape a text file through a given link and store it in an output file

I'm trying to collect the weather data for the year 2000 from this site.
My code for the spider is:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor

class weather(CrawlSpider):
    name = 'data'
    start_urls = [
        'https://www1.ncdc.noaa.gov/pub/data/uscrn/products/daily01/'
    ]
    custom_settings = {
        'DEPTH_LIMIT': '2',
    }
    rules = (
        Rule(LinkExtractor(restrict_xpaths=('//table/tr[2]/td[1]/a',)),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        for b in response.xpath('//table'):
            yield scrapy.request('/tr[4]/td[2]/a/@href').extract()
            yield scrapy.request('/tr[5]/td[2]/a/@href').extract()
The paths with 'yield' are the links to two text files. I want to scrape the data from these text files and store it separately in two different files, but I don't know how to continue.
I don't usually use CrawlSpider so I'm unfamiliar with it, but it seems like you should be able to create another xpath (preferably something more specific than "/tr[4]/td[2]/a/@href") and supply a callback function.
However, in a "typical" scrapy project using Spider instead of CrawlSpider, you would simply yield a Request with another callback function to handle extracting and storing to the database. For example:
def parse_item(self, response):
    for b in response.xpath('//table'):
        url = response.urljoin(b.xpath('./tr[4]/td[2]/a/@href').extract_first())
        yield scrapy.Request(url=url, callback=self.extract_and_store)
        url = response.urljoin(b.xpath('./tr[5]/td[2]/a/@href').extract_first())
        yield scrapy.Request(url=url, callback=self.extract_and_store)

def extract_and_store(self, response):
    """
    Scrape data and store it separately
    """

How to scrape pages after login

I'm trying to find a way to scrape and parse more pages in the signed-in area.
These are example links, accessible once signed in, that I want to parse.
#http://example.com/seller/demand/?id=305554
#http://example.com/seller/demand/?id=305553
#http://example.com/seller/demand/?id=305552
#....
I want to create a spider that can open each one of these links and then parse them.
I have created another spider which can open and parse only one of them.
When I tried to create a "for" or "while" loop to issue more requests with other links, it didn't let me, because I cannot put more returns into a generator; it returns an error. I also tried link extractors, but that didn't work for me either.
Here is my code:
#!c:/server/www/scrapy
# -*- coding: utf-8 -*-
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http import FormRequest
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from array import *
from stack.items import StackItem
from scrapy.linkextractors import LinkExtractor

class Spider3(Spider):
    name = "Spider3"
    allowed_domains = ["example.com"]
    start_urls = ["http://example.com/login"]  # this link leads to the login page
When I am signed in, it returns a page whose url contains "stat"; that is why I put the first "if" condition there.
Once signed in, I request one link and call the function parse_items.
def parse(self, response):
    # when "stat" is in the url it means that I just signed in
    if "stat" in response.url:
        return Request("http://example.com/seller/demand/?id=305554", callback=self.parse_items)
    else:
        # a successful login takes me to a page whose url contains "stat"
        return [FormRequest.from_response(response,
            formdata={'ctl00$ContentPlaceHolder1$lMain$tbLogin': 'my_login',
                      'ctl00$ContentPlaceHolder1$lMain$tbPass': 'my_password'},
            callback=self.parse)]
The function parse_items simply parses the desired content from one desired page:
def parse_items(self, response):
    questions = Selector(response).xpath('//*[@id="ctl00_ContentPlaceHolder1_cRequest_divAll"]/table/tr')
    for question in questions:
        item = StackItem()
        item['name'] = question.xpath('th/text()').extract()[0]
        item['value'] = question.xpath('td/text()').extract()[0]
        yield item
Can you please help me update this code so it opens and parses more than one page in each session?
I don't want to sign in over and over for each request.
The session most likely depends on cookies, and scrapy manages that by itself, i.e.:
def parse_items(self, response):
    questions = Selector(response).xpath('//*[@id="ctl00_ContentPlaceHolder1_cRequest_divAll"]/table/tr')
    for question in questions:
        item = StackItem()
        item['name'] = question.xpath('th/text()').extract()[0]
        item['value'] = question.xpath('td/text()').extract()[0]
        yield item
    next_url = ''  # find the url of the next page in the current page
    if next_url:
        yield Request(next_url, self.parse_items)
        # scrapy will retain the session for the next page if it's managed by cookies
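The same yield-instead-of-return idea also covers the original problem of requesting several detail pages in one session: instead of returning a single Request after login, the callback can yield one request per page, and all of them reuse the cookie-based session. A sketch using the example ids quoted in the question (the id list is illustrative only):
def parse(self, response):
    # when "stat" is in the url we are signed in
    if "stat" in response.url:
        # yield (not return) one request per detail page; they all share the session cookies
        for demand_id in (305554, 305553, 305552):
            yield Request("http://example.com/seller/demand/?id=%d" % demand_id,
                          callback=self.parse_items)
    else:
        yield FormRequest.from_response(
            response,
            formdata={'ctl00$ContentPlaceHolder1$lMain$tbLogin': 'my_login',
                      'ctl00$ContentPlaceHolder1$lMain$tbPass': 'my_password'},
            callback=self.parse)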
I am currently working on the same problem. I use InitSpider so I can overwrite __init__ and init_request. The first is just for initialisation of custom stuff and the actual magic happens in my init_request:
def init_request(self):
    """This function is called before crawling starts."""
    # Do not start a request on error,
    # simply return nothing and quit scrapy
    if self.abort:
        return
    # Do a login
    if self.login_required:
        # Start with the login first
        return Request(url=self.login_page, callback=self.login)
    else:
        # Start with the parse function
        return Request(url=self.base_url, callback=self.parse)
My login looks like this:
def login(self, response):
    """Generate a login request."""
    self.log('Login called')
    return FormRequest.from_response(
        response,
        formdata=self.login_data,
        method=self.login_method,
        callback=self.check_login_response
    )
self.login_data is a dict with post values.
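check_login_response is referenced above but not shown. A minimal sketch, assuming the logged-in page contains some marker text (the "Logout" string below is just a placeholder) and following the usual InitSpider pattern of calling self.initialized() once the login has been verified:
def check_login_response(self, response):
    """Check the response returned by the login request."""
    if "Logout" in response.body:  # placeholder success marker, adjust for your site
        self.log('Login successful')
        # hand control back to InitSpider so the normal crawl can start
        return self.initialized()
    self.log('Login failed')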
I am still a beginner with python and scrapy, so I might be doing it the wrong way. Anyway, so far I have produced a working version that can be viewed on github.
HTH:
https://github.com/cytopia/crawlpy

CrawlSpider seems not to follow rule

Actually, I followed the example in "Recursively Scraping Web Pages With Scrapy" and it seems I have included a mistake somewhere.
Can someone help me find it, please? It's driving me crazy; I only want all the results from all the result pages, but instead it gives me only the results from page 1.
Here's my code:
import scrapy
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http.request import Request
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from githubScrape.items import GithubscrapeItem

class GithubSpider(CrawlSpider):
    name = "github2"
    allowed_domains = ["github.com"]
    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//*[contains(@class, "next_page")]')),
             callback='parse_items', follow=True),
    )

    def start_requests(self):
        baseURL = 'https://github.com/search?utf8=%E2%9C%93&q=eagle+SYSTEM+extension%3Asch+size%3A'
        for i in range(10000, 20000, 5000):
            url = baseURL + str(i + 1) + ".." + str(i + 5000) + '&type=Code&ref=searchresults'
            print "URL:", url
            yield Request(url, callback=self.parse_items)

    def parse_items(self, response):
        hxs = Selector(response)
        resultParagraphs = hxs.xpath('//div[contains(@id, "code_search_results")]//p[contains(@class, "title")]')
        items = []
        for p in resultParagraphs:
            hrefs = p.xpath('a/@href').extract()
            projectURL = hrefs[0]
            schemeURL = hrefs[1]
            lastIndexedOn = p.xpath('.//span/time/@datetime').extract()
            i = GithubscrapeItem()
            i['counter'] = self.count
            i['projectURL'] = projectURL
            i['schemeURL'] = schemeURL
            i['lastIndexedOn'] = lastIndexedOn
            items.append(i)
        return items
I didn't find your code on the link you passed, but I think the problem is that you are never using the rules.
Scrapy starts crawling by calling the start_requests method, but the rules are compiled and applied in the parse method, which you are not using because your requests go directly from start_requests to parse_items.
You could remove the callback in the start_requests method if you want the rules to be applied at that level.
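A sketch of that change, keeping the question's URLs (without an explicit callback the responses go through CrawlSpider's built-in parse(), which applies the rules; parse_items is then reached via the Rule's callback, and the first result page of each request ends up in parse_start_url, which can be overridden if its results are needed too):
def start_requests(self):
    baseURL = 'https://github.com/search?utf8=%E2%9C%93&q=eagle+SYSTEM+extension%3Asch+size%3A'
    for i in range(10000, 20000, 5000):
        url = baseURL + str(i + 1) + ".." + str(i + 5000) + '&type=Code&ref=searchresults'
        # no callback: CrawlSpider.parse handles the response and applies the rules
        yield Request(url)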

Issues on following links in scrapy

I want to crawl a blog which has several categories of websites. Starting navigation from the first category, my goal is to collect every webpage by following the categories. I have collected the websites from the first category, but the spider stops there and can't reach the second category.
My code:
import scrapy
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.contrib.linkextractors import LinkExtractor
from final.items import DmozItem

class my_spider(CrawlSpider):
    name = 'heart'
    allowed_domains = ['greek-sites.gr']
    start_urls = ['http://www.greek-sites.gr/categories/istoselides-athlitismos']
    rules = (Rule(LinkExtractor(allow=(r'.*categories/.*', )), callback='parse', follow=True),)

    def parse(self, response):
        self.logger.info('Hi, this is an item page! %s', response.url)
        categories = response.xpath('//a[contains(@href, "categories")]/text()').extract()
        for category in categories:
            item = DmozItem()
            item['title'] = response.xpath('//a[contains(text(), "gr")]/text()').extract()
            item['category'] = response.xpath('//div/strong/text()').extract()
            return item
The problem is simple: the callback has to be different from parse, so I suggest you name your method parse_site, for example, and then you are ready to continue scraping.
If you make the change below, it will work:
rules = (Rule(LinkExtractor(allow=(r'.*categories/.*', )), callback='parse_site', follow=True),)
def parse_site(self, response):
The reason for this is described in the docs:
When writing crawl spider rules, avoid using parse as callback, since the CrawlSpider uses the parse method itself to implement its logic. So if you override the parse method, the crawl spider will no longer work.

xpath not getting selected

I have just started using Scrapy.
Here is an example of a website that I want to crawl:
http://www.thefreedictionary.com/shame
The code for my spider:
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from dic_crawler.items import DicCrawlerItem
from urlBuilder import *

class Dic_crawler(BaseSpider):
    name = "dic"
    allowed_domains = ["www.thefreedictionary.com"]
    start_urls = listmaker()[:]
    print start_urls

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//*[@id="MainTxt"]/table/tbody')
        print 'SITES:\n', sites
        item = DicCrawlerItem()
        item["meanings"] = sites.select('//*[@id="MainTxt"]/table/tbody/tr/td/div[1]/div[1]/div[1]/text()').extract()
        print item
        return item
The listmaker() returns a list of urls to scrape.
My problem is that the sites variable comes up empty if I select down to 'tbody' in the xpath, whereas if I select only down to 'table' I get the part of the site I want.
As a result, I am not able to retrieve the meaning of a word into item["meanings"], since the selection does not go beyond tbody.
Also, while at it: the site gives multiple meanings, which I would like to extract, but I only know how to extract a single one.
Thanks
A note on the empty selection first: browsers insert a tbody element when rendering tables, but it is usually not present in the raw HTML that Scrapy downloads, which is why an XPath that goes through tbody matches nothing. Here's a spider skeleton to get you started:
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

class Dic_crawler(BaseSpider):
    name = "thefreedictionary"
    allowed_domains = ["www.thefreedictionary.com"]
    start_urls = ['http://www.thefreedictionary.com/shame']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # loop on each "noun" or "verb" or something... section
        for category in hxs.select('id("MainTxt")//div[@class="pseg"]'):
            # this is simply to get what's in the <i> tag
            category_name = u''.join(category.select('./i/text()').extract())
            self.log("category: %s" % category_name)
            # for each category, a term can have multiple definitions;
            # "category" from .select() is a selector,
            # so you can call .select() on it also,
            # here with a relative XPath expression selecting all definitions
            for definition in category.select('div[@class="ds-list"]'):
                definition_text = u'\n'.join(
                    definition.select('.//text()').extract())
                self.log(" - definition: %s" % definition_text)