Can't get additional items from url - regex

I'm scraping a few items from this site, but it grabs items only from the first product and doesn't loop any further. I know I'm making a simple, stupid mistake, but if you could just point out where I got this wrong, I'd appreciate it.
Here is the spider:
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
import re
from zoomer.items import ZoomerItem
class ZoomSpider(BaseSpider):
    """Spider that fills ``brand``, ``price`` and ``model`` on ``ZoomerItem``s
    from the zoomer.ge mobile search page."""

    name = "zoomSp"
    allowed_domains = ["zoomer.ge"]
    start_urls = [
        "http://zoomer.ge/index.php?cid=35&act=search&category=1&search_type=mobile"
    ]

    def parse(self, response):
        """Build one ``ZoomerItem`` per node matched by the ``titles`` XPath.

        Returns:
            The list of populated items.
        """
        sel = Selector(response)
        titles = sel.xpath('//div[@class="productContainer"]/div[5]')
        items = []
        for t in titles:
            item = ZoomerItem()
            # The expressions below start with "//", so XPath evaluates them
            # from the document root rather than relative to ``t``.
            item["brand"] = t.xpath('//div[@class="productListContainer"]/div[3]/text()').re(r'^([\w, ]+)')
            item["price"] = t.xpath('//div[@class="productListContainer"]/div[4]/text()').extract()[0].strip()
            item["model"] = t.xpath('//div[@class="productListContainer"]/div[3]/text()').re(r'\s+(.*)$')[0].strip()
            items.append(item)
        return(items)
P.S. I also can't get the regex for the "brand" string to return only the first word, "BlackBerry", from the string:
"BlackBerry P9981 Porsche Design".

The <div/> element with the class productContainer is just a container and only appears one time, thus it is not repeating. The repeating element which you want to iterate over is the one with the class productListContainer.
def parse(self, response):
    """Build one ``ZoomerItem`` per repeating ``productListContainer`` element.

    Args:
        response: The downloaded listing page.

    Returns:
        A list with exactly one item per product container.
    """
    sel = Selector(response)
    titles = sel.xpath('//div[@class="productContainer"]/div[5]/div[@class="productListContainer"]')
    items = []
    for t in titles:
        item = ZoomerItem()
        # Relative XPaths (no leading "//") keep every lookup inside ``t``.
        product_name = t.xpath('div[3]/text()')
        item["brand"] = product_name.re(r'^([\w\-]+)')
        item["price"] = t.xpath('div[@class="productListPrice"]/div/text()').extract()
        # A one-word product name has no model part; avoid an IndexError
        # that would abort the whole page.
        model = product_name.re(r'\s+(.*)$')
        item["model"] = model[0].strip() if model else ''
        # Append once per product.
        items.append(item)
    return items
This function is untested as I am not a python guy, so you might have to fiddle around a bit.

Related

Scrapy crawler not recursively crawling next page

I am trying to build this crawler to get housing data from Craigslist,
but the crawler stops after fetching the first page and does not go on to the next page.
Here is the code. It works for the first page, but for the love of God I don't understand why it does not get to the next page. Any insight is really appreciated. I followed this part of the Scrapy tutorial.
import scrapy
import re
from scrapy.linkextractors import LinkExtractor
class QuotesSpider(scrapy.Spider):
    """Spider that walks the Vancouver Craigslist housing search results and
    requests every posting's detail page."""

    name = "craigslistmm"
    start_urls = [
        "https://vancouver.craigslist.ca/search/hhh"
    ]

    def parse_second(self, response):
        """Add the posting's coordinates and body text to the request meta.

        Returns:
            ``response.meta`` with a ``"post_details"`` entry written for each
            ``section.page-container`` found on the page.
        """
        # All collected information is kept in one dict.
        details = response.meta
        for section in response.css("section.page-container"):
            longitude = section.css("div.mapAndAttrs div.mapbox div.viewposting::attr(data-longitude)").extract()
            latitude = section.css("div.mapAndAttrs div.mapbox div.viewposting::attr(data-latitude)").extract()
            body_text = ' '.join(section.css('section#postingbody::text').extract()).strip()
            details["post_details"] = {
                "location": {"longitude": longitude, "latitude": latitude},
                "detailed_info": body_text,
            }
        return details

    def parse(self, response):
        """Yield one detail-page request per result row, then a request for
        the first link matching ``s=\\d+``."""
        pattern = re.compile("\/([a-z]+)\/([a-z]+)\/.+")
        for row in response.css("li.result-row"):
            post_urls = row.css("p.result-info a::attr(href)").extract_first()
            match = pattern.match(post_urls)
            request = scrapy.Request(
                "https://vancouver.craigslist.ca/" + post_urls,
                callback=self.parse_second,
            )
            # Listing-level data travels with the request to parse_second.
            request.meta.update({
                'id': row.css("li.result-row::attr(data-pid)").extract_first(),
                'pricevaluation': row.css("p.result-info span.result-meta span.result-price::text").extract_first(),
                "information": row.css("p.result-info span.result-meta span.housing::text").extract_first(),
                "neighborhood": row.css("p.result-info span.result-meta span.result-hood::text").extract_first(),
                "area": match.group(1),
                "adtype": match.group(2),
            })
            yield request
        # Earlier pagination attempts, left disabled:
        #   next_page = response.xpath('.//a[@class="button next"]/@href').extract_first()
        #   follow_url = "https://vancouver.craigslist.ca/" + next_page
        #   yield scrapy.Request(follow_url, callback=self.parse)
        #   yield response.follow(next_page, callback=self.parse)
        next_page = LinkExtractor(allow="s=\d+").extract_links(response)[0]
        yield response.follow(next_page.url, callback=self.parse)
The problem seems to be with the next_page extraction using LinkExtractor. If you look in the log, you'll see duplicate requests being filtered. There are more links on the page that satisfy your extraction rule, and they may not be extracted in any particular order (or not in the order you want).
I think a better approach is to extract exactly the information you want; try it with this:
# Take the href of the "next" button only; "@" is the XPath attribute axis.
next_page = response.xpath('//span[@class="buttons"]//a[contains(., "next")]/@href').extract_first()

Why scrapy not iterating over all the links on the page even the xpaths are correct?

This code works perfectly fine when I pass extract()[0] or extract() - it gives me output for the first link it parsed. I am not able to understand why it's doing so, because when I was crawling other websites with this code it worked perfectly fine.
With this website it scrapes only the first link. If I change it to extract()[1], it gives me the second link, and so on. Why is it not working automatically in the for loop?
import scrapy
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
class CompItem(scrapy.Item):
    """Item with the five fields the spider fills in: name, title, date,
    link and data."""

    # Declared one per line; each is a plain scrapy.Field with no options.
    name = scrapy.Field()
    title = scrapy.Field()
    date = scrapy.Field()
    link = scrapy.Field()
    data = scrapy.Field()
class criticspider(BaseSpider):
    """Spider for the MakeMyTrip review page on mouthshut.com; follows each
    review link to fetch the review body."""

    name = "mmt_mouth"
    allowed_domains = ["mouthshut.com"]
    start_urls = ["http://www.mouthshut.com/websites/makemytripcom-reviews-925031929"]
    # rules = (
    #     Rule(
    #         SgmlLinkExtractor(allow=("search=make-my-trip&page=1/+",)),
    #         callback="parse",
    #         follow=True),
    # )

    def parse(self, response):
        """Create a ``CompItem`` per node in ``sites`` and request its link.

        Yields:
            One ``scrapy.Request`` per item that has a link; the item rides
            along in ``meta['item']`` and is completed in ``anchor_page``.
        """
        sites = response.xpath('//div[@id="allreviews"]')
        items = []
        for site in sites:
            item = CompItem()
            item['name'] = site.xpath('.//li[@class="profile"]/div/a/span/text()').extract()[0]
            item['title'] = site.xpath('.//div[@class="reviewtitle fl"]/strong/a/text()').extract()[0]
            item['date'] = site.xpath('.//div[@class="reviewrate"]//span[@class="datetime"]/span/span/span/text()').extract()[0]
            item['link'] = site.xpath('.//div[@class="reviewtitle fl"]/strong/a/@href').extract()[0]
            if item['link']:
                # Relative links are resolved against the current page URL.
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)
            items.append(item)

    def anchor_page(self, response):
        """Add the review text to the item carried in the request meta and
        yield the completed item."""
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//div[@itemprop="description"]/p/text()').extract()
        yield old_item
Because your for loop has nothing to loop on the given website. Change your statement
sites = response.xpath('//div[@id="allreviews"]')
to
sites = response.xpath('//div[@id="allreviews"]/ul/li')
Then your for loop can loop over the list elements.

Need help understanding the output of the program

I was working on my project XYZ
and I got stuck extracting text from the source
gifts
I want to extract the href as content
I tried this
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from XYZ.items import XYZ
class MySpider(BaseSpider):
    """Spider that collects the text and href of the 'mdd-heading' anchors."""

    name = "main"
    allowed_domains = ["XYZ"]
    start_urls = ["XYZ"]

    def parse(self, response):
        """Collect an ``XYZ`` item per matched anchor and print its link.

        Returns:
            The list of items; ``title`` and ``link`` hold the lists
            returned by ``extract()``.
        """
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//a[@data-tracking-id='mdd-heading']")
        items = []
        for titles in titles:
            item = XYZ()
            item ["title"] = titles.select("text()").extract()
            item ["link"] = titles.select("@href").extract()
            items.append(item)
            # str() of the extracted list is what gets concatenated here.
            print("www.xyz.com"+str(item["link"]))
        return items
and the output was
www.xyz.com[u'/gifts']
I was expecting output as
www.xyz.com/gifts
What did I do wrong?
According to the documentation for Selector's extract():
extract()
Serialize and return the matched nodes as a list of unicode
strings. Percent encoded content is unquoted.
So, extract() returns a list and you need the first item from it. Use item['link'][0].
Also, there are other problems in the code:
for titles in titles loop doesn't make sense, you need a separate loop variable
HtmlXPathSelector is deprecated, use Selector
use urljoin() to join the parts of a url
Here's the complete code with fixes and other improvements:
from urlparse import urljoin
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from XYZ.items import XYZ
class MySpider(BaseSpider):
    """Spider that yields one item per 'mdd-heading' anchor on the page."""

    name = "main"
    allowed_domains = ["XYZ"]
    start_urls = ["XYZ"]

    def parse(self, response):
        """Yield an ``XYZ`` item for each matched anchor.

        ``title`` and ``link`` are the first extracted string of the anchor's
        text and ``href`` attribute; the absolute link is printed as well.
        """
        titles = response.xpath("//a[@data-tracking-id='mdd-heading']")
        for title in titles:
            item = XYZ()
            item ["title"] = title.xpath("text()").extract()[0]
            item ["link"] = title.xpath("@href").extract()[0]
            # urljoin needs a base with a scheme: urljoin("www.xyz.com", "/gifts")
            # returns just "/gifts", so resolve against the page URL instead.
            print(urljoin(response.url, item["link"]))
            yield item

Trying to extract from the deep node with scrapy, results are bad

As a beginner I'm having a hard time, so I'm here to ask for help.
I'm trying to extract prices from the html page, which are nested deeply:
second price location:
from scrapy.spider import Spider
from scrapy.selector import Selector
from mymarket.items import MymarketItem
class MySpider(Spider):
    """Spider that stores price text from the ``tab_product_list`` table."""

    name = "mymarket"
    allowed_domains = ["url"]
    start_urls = [
        "http://url"
    ]

    def parse(self, response):
        """Build one ``MymarketItem`` per ``tr`` matched under the table.

        Returns:
            The list of items; each ``price`` is the list returned by
            ``extract()``.
        """
        sel = Selector(response)
        titles = sel.xpath('//table[@class="tab_product_list"]//tr')
        items = []
        for t in titles:
            item = MymarketItem()
            # Leading "//" makes this expression document-wide, not relative to ``t``.
            item["price"] = t.xpath('//tr//span[2]/text()').extract()
            items.append(item)
        return items
I'm trying to export scraped prices to csv. they do export but are being populated like this:
And I want them to be sorted like this in .csv:
etc.
Can anybody point out where is the faulty part of the xpath or how I can make prices be sorted "properly" ?
It's difficult to say what's wrong with the path. Install firepath extension for Firefox to test your xpath queries. One note for now:
titles = sel.xpath('//table[@class="tab_product_list"]//tr')
In your screenshot you have nested tables, so //tr will give trs from nested tables too.
def parse(self, response):
    """Build one ``MymarketItem`` per direct row of the product table.

    Args:
        response: The downloaded product-list page.

    Returns:
        A list of items whose ``price`` is the text of the row's red span.
        Rows that contain no such span are skipped.
    """
    sel = Selector(response)
    titles = sel.xpath('//table[@class="tab_product_list"]/tr')  # or with tbody
    items = []
    for t in titles:
        # ".//" keeps the lookup relative to the current row.
        prices = t.xpath('.//span[@style="color:red;"]/text()').extract()
        if not prices:
            # Row without a price span: indexing [0] would raise IndexError.
            continue
        item = MymarketItem()
        item["price"] = prices[0]
        items.append(item)
    return items
.extract() returns a list even if only one element is found, so take the first element of the list with .extract()[0].

xpath not getting selected

I have just started using Scrapy:
Here is an example of a website that I want to crawl :
http://www.thefreedictionary.com/shame
The code for my Spider :
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from dic_crawler.items import DicCrawlerItem
from urlBuilder import *
class Dic_crawler(BaseSpider):
    """Spider that stores the text found under the ``MainTxt`` table of each
    start URL in a ``DicCrawlerItem``."""

    name = "dic"
    allowed_domains = ["www.thefreedictionary.com"]
    # Copy of the URL list produced by listmaker().
    start_urls = listmaker()[:]
    print(start_urls)

    def parse(self, response):
        """Return a ``DicCrawlerItem`` whose ``meanings`` is the extracted
        text list; prints the intermediate selection and the item."""
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//*[@id="MainTxt"]/table/tbody')
        # Single formatted string so the output is the same on Python 2 and 3.
        print('SITES:\n%s' % (sites,))
        item = DicCrawlerItem()
        item["meanings"] = sites.select('//*[@id="MainTxt"]/table/tbody/tr/td/div[1]/div[1]/div[1]/text()').extract()
        print(item)
        return item
The listmaker() returns a list of urls to scrap.
My problem is that the sites variable comes up empty if I select down to 'tbody' in the XPath, whereas if I select only the table I get the part of the site I want.
As a result, I am not able to retrieve the meaning of a word into item["meanings"], since nothing beyond tbody gets selected.
Also, while at it, the site gives multiple meanings, which I would like to extract, but I only know how to extract a single one.
Thanks
Here's a spider skeleton to get you started:
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
class Dic_crawler(BaseSpider):
    """Skeleton spider that logs each category of a word and the text of
    every definition listed under it."""

    name = "thefreedictionary"
    allowed_domains = ["www.thefreedictionary.com"]
    start_urls = ['http://www.thefreedictionary.com/shame']

    def parse(self, response):
        """Log the category name and every definition found on the page."""
        hxs = HtmlXPathSelector(response)

        # loop on each "noun" or "verb" or something... section
        for category in hxs.select('id("MainTxt")//div[@class="pseg"]'):

            # this is simply to get what's in the <i> tag
            category_name = u''.join(category.select('./i/text()').extract())
            self.log("category: %s" % category_name)

            # for each category, a term can have multiple definitions;
            # category from .select() is a selector,
            # so you can call .select() on it also,
            # here with a relative XPath expression selecting all definitions
            for definition in category.select('div[@class="ds-list"]'):
                definition_text = u'\n'.join(
                    definition.select('.//text()').extract())
                self.log(" - definition: %s" % definition_text)