scrapy: request url must be str or unicode, got Selector - python-2.7

I am writing a spider using Scrapy to scrape user details from Pinterest. I am trying to get the details of a user and their followers (and so on, until the last node).
Below is the spider code:
from scrapy.spider import BaseSpider
import scrapy
from pinners.items import PinterestItem
from scrapy.http import FormRequest
from urlparse import urlparse

class Sample(BaseSpider):
    name = 'sample'
    allowed_domains = ['pinterest.com']
    start_urls = ['https://www.pinterest.com/banka/followers', ]

    def parse(self, response):
        for base_url in response.xpath('//div[@class="Module User gridItem"]/a/@href'):
            list_a = response.urljoin(base_url.extract())
            for new_urls in response.xpath('//div[@class="Module User gridItem"]/a/@href'):
                yield scrapy.Request(new_urls, callback=self.Next)
            yield scrapy.Request(list_a, callback=self.Next)

    def Next(self, response):
        href_base = response.xpath('//div[@class="tabs"]/ul/li/a')
        href_board = href_base.xpath('//div[@class="BoardCount Module"]')
        href_pin = href_base.xpath('.//div[@class="Module PinCount"]')
        href_like = href_base.xpath('.//div[@class="LikeCount Module"]')
        href_followers = href_base.xpath('.//div[@class="FollowerCount Module"]')
        href_following = href_base.xpath('.//div[@class="FollowingCount Module"]')
        item = PinterestItem()
        item["Board_Count"] = href_board.xpath('.//span[@class="value"]/text()').extract()[0]
        item["Pin_Count"] = href_pin.xpath('.//span[@class="value"]/text()').extract()
        item["Like_Count"] = href_like.xpath('.//span[@class="value"]/text()').extract()
        item["Followers_Count"] = href_followers.xpath('.//span[@class="value"]/text()').extract()
        item["Following_Count"] = href_following.xpath('.//span[@class="value"]/text()').extract()
        item["User_ID"] = response.xpath('//link[@rel="canonical"]/@href').extract()[0]
        yield item
I get the following error:
raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
TypeError: Request url must be str or unicode, got Selector:
I did check the type of list_a (the extracted URLs); it is unicode.

The error is generated by the inner for loop in the parse method:
for new_urls in response.xpath('//div[@class="Module User gridItem"]/a/@href'):
    yield scrapy.Request(new_urls, callback=self.Next)
The new_urls variable is actually a Selector, not a string. Try something like this:
for base_url in response.xpath('//div[@class="Module User gridItem"]/a/@href'):
    list_a = response.urljoin(base_url.extract())
    yield scrapy.Request(list_a, callback=self.Next)
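As a variant, you can also let extract() give you the URL strings directly and drop the redundant inner loop entirely. A minimal sketch of the whole parse method, assuming the same page structure:
def parse(self, response):
    # extract() on the @href results yields unicode strings, which
    # scrapy.Request accepts; response.urljoin makes them absolute
    for href in response.xpath('//div[@class="Module User gridItem"]/a/@href').extract():
        yield scrapy.Request(response.urljoin(href), callback=self.Next)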

Related

Unable to convert unicode into json in scrapy

import scrapy
import json

class GettingtonDSpider(scrapy.Spider):
    name = "gettington_d"
    allowed_domains = ["gettington.com"]
    start_urls = ['https://api.gettington.com/v1/products?showMPP=false&rows=24&q=Keyword:south%20shore%20furniture&productfilter=null&callback=searchCallback']

    def parse(self, response):
        jsonresp = json.dumps(response.body)
        jsonresp = json.loads(jsonresp)
I have tried many methods, but they all failed:
response.text
encode('utf-8')
response_body_as_unicode
None of the above worked. How can the error be solved?
You have to first remove the JSONP wrapper from response.body; with the wrapper in place, the body is not valid JSON and cannot be parsed:
import re
...
json_string = re.search(r'searchCallback\((.*)\)', response.body).group(1)
jsonresp = json.loads(json_string)
Now you have a dict in jsonresp
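Putting it together, the parse method might look like this (a minimal sketch; it assumes the callback parameter stays searchCallback, and re.DOTALL is added in case the JSON spans multiple lines):
import re
import json

def parse(self, response):
    # the body is JSONP: searchCallback({...}) - strip the wrapper to get valid JSON
    json_string = re.search(r'searchCallback\((.*)\)', response.body, re.DOTALL).group(1)
    jsonresp = json.loads(json_string)
    # jsonresp is now a regular dict
    self.log("parsed keys: %s" % jsonresp.keys())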

can't crawl all pages in a website

I am trying to crawl all the data on all the pages. When I try to join the URL, I can't. I want to know what mistake I am making.
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
import urlparse
from data.items import TextPostItem
from scrapy import optional_features
optional_features.remove('boto')

class RedditCrawler(CrawlSpider):
    name = 'reddit_crawler'
    allowed_domains = ['yellowpages.com']
    start_urls = ['http://www.yellowpages.com/search?search_terms=restaurant&geo_location_terms=California%2C%20KY']
    custom_settings = {
        'BOT_NAME': 'reddit-scraper',
        'DEPTH_LIMIT': 7,
        'DOWNLOAD_DELAY': 3
    }

    def parse(self, response):
        s = Selector(response)
        next_link = s.xpath('//a[@class="next ajax-page"]/@href').extract()[0]
        full_link = urlparse.urljoin('http://www.yellowpages.com', next_link)
        yield self.make_requests_from_url(full_link)
        posts = Selector(response).xpath('//div[@class="search-results organic"]')
        for post in posts:
            item = TextPostItem()
            item['address'] = post.xpath("//p[@class='adr']//text()").extract()
            item['business_name'] = post.xpath("//a[@class='business-name']//text()").extract()
            item['phonenumber'] = post.xpath("//div[@class='phones phone primary']//text()").extract()
            item['categories'] = post.xpath("//div[@class='categories']//text()").extract()
            item['next_link'] = post.xpath("//div[@class='pagination']//a[@class='next ajax-page']//@href").extract()
            yield item
I think your xpath '//div[@class="next ajax-page"]//ul//li[6]//a/@href' is incorrect. It doesn't work for me.
Try something simpler: '//a[@class="next ajax-page"]/@href'
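For instance, the pagination part of parse could become the following (a sketch; the extract()/if guard is added so the spider doesn't raise IndexError on the last results page, which has no next link):
def parse(self, response):
    # ... item extraction as before ...
    next_link = response.xpath('//a[@class="next ajax-page"]/@href').extract()
    if next_link:  # empty list on the last page
        full_link = urlparse.urljoin('http://www.yellowpages.com', next_link[0])
        yield self.make_requests_from_url(full_link)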

Why is Scrapy not iterating over all the links on the page even though the XPaths are correct?

This code works when I use extract()[0] or extract(): it gives me output for the first link it parses. I am not able to understand why, because when I crawled other websites with this code it worked perfectly fine.
With this website it scrapes only the first link. If I change it to extract()[1], it gives me the second link, and so on. Why does the for loop not iterate automatically?
import scrapy
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin

class CompItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    data = scrapy.Field()
    name = scrapy.Field()
    date = scrapy.Field()

class criticspider(BaseSpider):
    name = "mmt_mouth"
    allowed_domains = ["mouthshut.com"]
    start_urls = ["http://www.mouthshut.com/websites/makemytripcom-reviews-925031929"]

    # rules = (
    #     Rule(
    #         SgmlLinkExtractor(allow=("search=make-my-trip&page=1/+",)),
    #         callback="parse",
    #         follow=True),
    # )

    def parse(self, response):
        sites = response.xpath('//div[@id="allreviews"]')
        items = []
        for site in sites:
            item = CompItem()
            item['name'] = site.xpath('.//li[@class="profile"]/div/a/span/text()').extract()[0]
            item['title'] = site.xpath('.//div[@class="reviewtitle fl"]/strong/a/text()').extract()[0]
            item['date'] = site.xpath('.//div[@class="reviewrate"]//span[@class="datetime"]/span/span/span/text()').extract()[0]
            item['link'] = site.xpath('.//div[@class="reviewtitle fl"]/strong/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)
            items.append(item)

    def anchor_page(self, response):
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//div[@itemprop="description"]/p/text()').extract()
        yield old_item
Because your for loop has only a single node to iterate over on this website: //div[@id="allreviews"] matches just one div, so the loop body runs once and always takes the first review. Change your statement
sites = response.xpath('//div[@id="allreviews"]')
to
sites = response.xpath('//div[@id="allreviews"]/ul/li')
Then your for loop can iterate over the individual list elements, one per review.
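In context, the start of parse would then look like this (a sketch; the relative .xpath('.//...') extractions inside the loop stay as they were):
def parse(self, response):
    # one node per review instead of one node for the whole block
    sites = response.xpath('//div[@id="allreviews"]/ul/li')
    for site in sites:
        item = CompItem()
        # ... the same relative extractions as before ...
        yield item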

Need help understanding the output of the program

I was working on my project XYZ and I got stuck extracting text from a source element like:
<a data-tracking-id="mdd-heading" href="/gifts">gifts</a>
I want to extract the href as content.
I tried this:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from XYZ.items import XYZ

class MySpider(BaseSpider):
    name = "main"
    allowed_domains = ["XYZ"]
    start_urls = ["XYZ"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//a[@data-tracking-id='mdd-heading']")
        items = []
        for titles in titles:
            item = XYZ()
            item["title"] = titles.select("text()").extract()
            item["link"] = titles.select("@href").extract()
            items.append(item)
            print "www.xyz.com" + str(item["link"])
        return items
and the output was:
www.xyz.com[u'/gifts']
I was expecting:
www.xyz.com/gifts
What did I do wrong?
According to the documentation for Selector's extract():
extract()
Serialize and return the matched nodes as a list of unicode
strings. Percent encoded content is unquoted.
So, extract() returns a list and you need the first item from it. Use item['link'][0].
Also, there are other problems in the code:
for titles in titles loop doesn't make sense, you need a separate loop variable
HtmlXPathSelector is deprecated, use Selector
use urljoin() to join the parts of a url
Here's the complete code with fixes and other improvements:
from urlparse import urljoin
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from XYZ.items import XYZ

class MySpider(BaseSpider):
    name = "main"
    allowed_domains = ["XYZ"]
    start_urls = ["XYZ"]

    def parse(self, response):
        titles = response.xpath("//a[@data-tracking-id='mdd-heading']")
        for title in titles:
            item = XYZ()
            item["title"] = title.xpath("text()").extract()[0]
            item["link"] = title.xpath("@href").extract()[0]
            # urljoin() needs a base URL with a scheme to build an absolute URL
            print urljoin("http://www.xyz.com", item["link"])
            yield item
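One detail worth noting: urljoin() only produces an absolute URL when the base includes a scheme, which is why the base above is http://www.xyz.com rather than www.xyz.com:
from urlparse import urljoin
urljoin("http://www.xyz.com", "/gifts")  # 'http://www.xyz.com/gifts'
urljoin("www.xyz.com", "/gifts")         # '/gifts' - the base has no scheme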

xpath not getting selected

I have just started using Scrapy.
Here is an example of a website that I want to crawl :
http://www.thefreedictionary.com/shame
The code for my Spider :
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from dic_crawler.items import DicCrawlerItem
from urlBuilder import *

class Dic_crawler(BaseSpider):
    name = "dic"
    allowed_domains = ["www.thefreedictionary.com"]
    start_urls = listmaker()[:]
    print start_urls

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//*[@id="MainTxt"]/table/tbody')
        print 'SITES:\n', sites
        item = DicCrawlerItem()
        item["meanings"] = sites.select('//*[@id="MainTxt"]/table/tbody/tr/td/div[1]/div[1]/div[1]/text()').extract()
        print item
        return item
listmaker() returns a list of URLs to scrape.
My problem is that the sites variable comes up empty if I select down to 'tbody' in the XPath, whereas if I select only up to 'table' I get the part of the page I want. As a result, I cannot retrieve the meaning of a word into item["meanings"], since nothing beyond 'tbody' gets selected.
Also, while at it: the site gives multiple meanings, which I would all like to extract, but I only know how to extract a single one.
Thanks
The tbody element is typically inserted by the browser when it renders a table and is usually absent from the raw HTML that Scrapy downloads, which is why an XPath copied from the browser's developer tools that includes tbody selects nothing. Here's a spider skeleton to get you started:
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector

class Dic_crawler(BaseSpider):
    name = "thefreedictionary"
    allowed_domains = ["www.thefreedictionary.com"]
    start_urls = ['http://www.thefreedictionary.com/shame']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # loop on each "noun" or "verb" or something... section
        for category in hxs.select('id("MainTxt")//div[@class="pseg"]'):
            # this is simply to get what's in the <i> tag
            category_name = u''.join(category.select('./i/text()').extract())
            self.log("category: %s" % category_name)
            # for each category, a term can have multiple definitions;
            # category from .select() is a selector,
            # so you can call .select() on it also,
            # here with a relative XPath expression selecting all definitions
            for definition in category.select('div[@class="ds-list"]'):
                definition_text = u'\n'.join(
                    definition.select('.//text()').extract())
                self.log(" - definition: %s" % definition_text)