I want to prepend a base URL string to the value I assign to the item field:
item['urls'] = sel.xpath('a/@href').extract()
Example:
item['urls'] = "http://lakmeindia.com" + sel.xpath('a/@href').extract()
# Item class
import scrapy

class LakmeSampleItem(scrapy.Item):
    urls = scrapy.Field()
    catagory = scrapy.Field()
    sub_category = scrapy.Field()
# lakme Spider
import scrapy
from LakmeProject.items import LakmeSampleItem

class LakmeSpider(scrapy.Spider):
    name = "lakme"
    allowed_domains = ["lakmeindia.com"]
    start_urls = [
        "http://www.lakmeindia.com/sitemap"
    ]

    def parse(self, response):
        for sel in response.xpath("//div[@class='make-up']/ul[1]/li"):
            item = LakmeSampleItem()
            item['sub_category'] = sel.xpath('span/text()').extract()
            # here I want to prepend the base URL (because the href comes back like [/sitemap])
            item['urls'] = sel.xpath('a/@href').extract()
            item['catagory'] = "Lakme Absolute"
            yield item
You are on the right track. You just need to note that extract() returns a list. So what you actually need to do is:
item['urls'] = "http://lakmeindia.com" + sel.xpath('a/@href').extract()[0]
i.e., get the first item in the list by using [0] on the result returned by extract().
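Putting that fix back into the spider's parse method from the question, a minimal sketch looks like this (assuming every li actually contains an a element with an href; in newer Scrapy versions, extract_first() is an alternative that returns None instead of raising IndexError when nothing matches):

    def parse(self, response):
        base_url = "http://lakmeindia.com"
        for sel in response.xpath("//div[@class='make-up']/ul[1]/li"):
            item = LakmeSampleItem()
            item['sub_category'] = sel.xpath('span/text()').extract()
            # extract() returns a list, so take the first element before concatenating
            item['urls'] = base_url + sel.xpath('a/@href').extract()[0]
            item['catagory'] = "Lakme Absolute"
            yield item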
Related
I want to scrape a site with Scrapy that lists its products in categories. I'm new to Scrapy and just getting my head around it today, but I thought I was getting the gist of it on simple scrapes, so I attempted to scrape URLs and return them for further scraping, but it appears I'm missing something.
Someone answered with a fix to my code; here is the latest version, as I thought I'd have another go at learning Scrapy today. But it's still not scanning recursively: it just seems to loop through all the pages and never gets into parsing the items.
It never seems to enter the else statement:
yield scrapy.Request(url=response.url, callback=self.parse_item)
I can debug it and check that the items are parsed correctly if I force it to output items without looping.
That is, if I change the following:
if product_pages:
    for product_url in product_pages:
        product_url2 = str(self.base_url + product_url)
        self.log("Queued up: %s" % product_url2)
        yield scrapy.Request(url=product_url2, callback=self.parse_product_pages)
else:
    yield scrapy.Request(url=response.url, callback=self.parse_item)
to:
if product_pages:
    for product_url in product_pages:
        product_url2 = str(self.base_url + product_url)
        self.log("Queued up: %s" % product_url2)
        yield scrapy.Request(url=product_url2, callback=self.parse_item)
else:
    yield scrapy.Request(url=response.url, callback=self.parse_product_pages)
Here is my code. I'm working in Python 2.7.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from ybscrape.items import Product
from scrapy.linkextractors import LinkExtractor
from scrapy.linkextractors.sgml import SgmlLinkExtractor

class ybracingSpider(CrawlSpider):
    name = 'ybscrape2'
    download_delay = 0.75

    def __init__(self, *args, **kwargs):
        super(ybracingSpider, self).__init__(*args, **kwargs)
        self.allowed_domains = ['http://www.ybracing.com/', 'www.ybracing.com', 'www.esellepro.com']
        self.base_url = 'http://www.ybracing.com'
        self.start_urls = ['http://www.ybracing.com/karting/']

    def parse_start_url(self, response):
        category = response.xpath("//h2/a/@href").extract()
        # loop over category pages, take the product link and add the all-pages query string
        for product in category:
            all_pages = '?itemsperpage=99999'
            category_url = str(self.base_url + product + all_pages)
            self.log("Queued up: %s" % category_url)
            yield scrapy.Request(url=category_url, callback=self.parse_product_pages)

    def parse_product_pages(self, response):
        product_pages = response.xpath("//li/div/div/h3/a/@href").extract()
        #print("debug pause")
        #print(product_pages)
        #wait = input("PRESS ENTER TO CONTINUE.")
        #print("continue")
        if product_pages:
            for product_url in product_pages:
                product_url2 = str(self.base_url + product_url)
                self.log("Queued up: %s" % product_url2)
                yield scrapy.Request(url=product_url2, callback=self.parse_product_pages)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_item)

    def parse_item(self, response):
        item = Product()
        item['description'] = response.xpath("//div[@id='Tabbed-Container-Details']/div[2]/div/text()").extract()
        item['product_title'] = response.xpath("//h3[@class='Product-Heading']/text()").extract()
        item['price'] = response.xpath("//div[@id='Product-Price']/text()").extract()
        table_rows = response.xpath("//table[@id='SpecificationTab']/tr[*]/td[1]//text()").extract()
        yield item
My items.py:
from scrapy.item import Item, Field

class Product(Item):
    product_title = Field()
    description = Field()
    price = Field()
What I'm expecting my code to do, in steps:
1. Grab all the links within the first export (categories) (this works).
2. Look at all 9999 products inside each category and export the list (this works).
3. Take the product URL from the export and append it to the base URL to get to the product page for each (this works).
4. Then read data from the product page to add to items (never gets here), unless I skip the if statement, but that's not recursive and won't handle sub-categories that way.
Here, I have made some changes to your code and now it's working:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from demo.items import DemoItem
from scrapy.linkextractors import LinkExtractor
from scrapy.linkextractors.sgml import SgmlLinkExtractor

class DemoSpider(CrawlSpider):
    name = 'ybracing2'

    def __init__(self, *args, **kwargs):
        super(DemoSpider, self).__init__(*args, **kwargs)
        self.allowed_domains = ['http://www.ybracing.com/', 'www.ybracing.com', 'www.esellepro.com']
        self.base_url = 'http://www.ybracing.com'
        self.start_urls = ['http://www.ybracing.com/racewear/']

    def parse_start_url(self, response):
        category = response.xpath("//h2/a/@href").extract()
        # loop over category pages, take the product link and add the all-pages query string
        for product in category:
            all_pages = '?itemsperpage=99999'
            category_url = str(self.base_url + product + all_pages)
            self.log("Queued up: %s" % category_url)
            yield scrapy.Request(url=category_url, callback=self.parse_product_pages)

    def parse_product_pages(self, response):
        product_pages = response.xpath("//div[@class='Product']/a/@href").extract()
        if product_pages:
            for product_url in product_pages:
                product_url2 = str(self.base_url + product_url)
                self.log("Queued up: %s" % product_url2)
                yield scrapy.Request(url=product_url2, callback=self.parse_item)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_product_pages)

    def parse_item(self, response):
        item = DemoItem()
        dirty_data = {}
        item['product_title'] = response.xpath("//h3[@class='Product-Heading']/text()").extract()
        item['price'] = response.xpath("//div[@id='Product-Price']/text()").extract()
        item['description'] = response.xpath("//div[@id='Tabbed-Container-Details']/div[2]/div/text()").extract()
        #image['product_image'] =
        # for variable in dirty_data.keys():
        #     if dirty_data[variable]:
        #         if variable == 'price':
        #             item[variable] = float(''.join(dirty_data[variable]).strip().replace('$', '').replace(',', ''))
        #         else:
        #             item[variable] = ''.join(dirty_data[variable]).strip()
        yield item
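For reference, the main differences from the original spider appear to be the product-link XPath (//div[@class='Product']/a/@href instead of //li/div/div/h3/a/@href) and the swapped callbacks: product links are now queued straight to parse_item, and the else branch only falls back to parse_product_pages when no product links are found on the page.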
I've tried everything, but I can't seem to figure out how I can request the next page from parse_category.
I've tried LinkExtractor, as I do when I go directly to a category page, but this didn't work.
import scrapy.selector
import urlparse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request
from msh_final.items import CrawlerMshFinalItem

def complete_url(string):
    return "http://www.mediamarkt.be" + string

def get_base_url(url):
    if url != "":
        u = urlparse.urlparse(url)
        return "%s://%s" % (u.scheme, u.netloc)
    else:
        return ""

def encode(str):
    return str.encode('utf8', 'ignore')

class msh_finalSpider(CrawlSpider):
    name = 'msh_final'
    start_urls = ['http://www.mediamarkt.be/mcs/productlist/_Telefoon-Navigatie,98952,509451.html?langId=-17']

    def parse(self, response):
        items = response.xpath('//ul[@class="infield cf"]//div[@class="infield-wrapper"]/h2/a/@href')
        for item in items:
            link = item.extract()
            yield Request(complete_url(link), callback=self.parse_category)

    def parse_category(self, response):
        items = response.xpath("//ul[@class='products-list']/li/div")
        for item in items:
            msh = CrawlerMshFinalItem()
            msh['item_price'] = encode(item.xpath('normalize-space(.//aside/div/div/div/text())').extract()[0])
            msh['item_name'] = encode(item.xpath('normalize-space(.//div/h2/a/text())').extract()[0])
            yield msh
You should inherit your spider from Spider instead of CrawlSpider and use the following code:
class msh_finalSpider(Spider):
    name = 'msh_final'
    start_urls = ['http://www.mediamarkt.be/mcs/productlist/_Telefoon-Navigatie,98952,509451.html?langId=-17']

    def parse(self, response):
        items = response.xpath('//ul[@class="infield cf"]//div[@class="infield-wrapper"]/h2/a/@href')
        for item in items:
            link = item.extract()
            yield Request(complete_url(link), callback=self.parse_category)

    def parse_category(self, response):
        items = response.xpath("//ul[@class='products-list']/li/div")
        for item in items:
            msh = CrawlerMshFinalItem()
            msh['item_price'] = encode(item.xpath('normalize-space(.//aside/div/div/div/text())').extract()[0])
            msh['item_name'] = encode(item.xpath('normalize-space(.//div/h2/a/text())').extract()[0])
            yield msh

        new_link = response.xpath('//li[@class="pagination-next"]/a/@href').extract()[0]
        yield Request(
            complete_url(new_link),
            callback=self.parse_category
        )
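One caveat to this answer (my addition, not part of the original): on the last page the pagination-next link is likely absent, in which case extract()[0] raises an IndexError. A small guard at the end of parse_category avoids that, assuming the same XPath:

        next_links = response.xpath('//li[@class="pagination-next"]/a/@href').extract()
        if next_links:
            # only follow the next page when the link actually exists
            yield Request(complete_url(next_links[0]), callback=self.parse_category)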
So I have this model set up with Django and mongoengine.
class Product(Document):
    product_id = IntField()
    title = StringField(max_length=255)
    sources = ListField(ReferenceField(Source, dbref=True))

class Source(Document):
    source_id = IntField()
    source_type = StringField(choices=settings.PARENT_TYPE_CHOICES, max_length=50)
    name = StringField(max_length=255)
    url = URLField(max_length=2000)

    meta = {"allow_inheritance": True}
And in my scrapy pipeline I save the following data:
class SaveItemPipeline(object):
    def process_item(self, item, spider):
        product = item["product"]
        product["sources"] = self.create_sources(product)
        saved_product, created = Product.objects.get_or_create(**product)
        return item

    def create_sources(self, product):
        temp_sources = []
        for source in product["sources"]:
            print source
            if source["source_type"] == "user":
                temp_source, created = UserSource.objects.get_or_create(**source)
            elif source["source_type"] == "store":
                temp_source, created = StoreSource.objects.get_or_create(**source)
            elif source["source_type"] == "collection":
                temp_source, created = CollectionSource.objects.get_or_create(**source)
            temp_sources.append(temp_source.id)
        return temp_sources
However, when I run the scraper, it gives me this error on save:
raise ValidationError(message, errors=errors, field_name=field_name)
mongoengine.errors.ValidationError:
[ObjectId('55787a07516ddcf4d93cd4c6'),
ObjectId('55787b07516ddcf5aff06fa9'),
ObjectId('55787b07516ddcf5aff06faa')] is not a valid ObjectId
By the way, UserSource, StoreSource, etc. all inherit from Source, so they are just subclasses. However, am I doing anything wrong here? I don't understand why it is giving me that error when the product gets created.
Thanks!
You can use this:
class Source(Document):
    source_id = IntField()

class Product(Document):
    sources = ListField(ReferenceField(Source, dbref=True))

# objects.create() returns the document itself (unlike get_or_create, which returns a tuple)
src = Source.objects.create(source_id=1)
pd = Product.objects.create(sources=[src])
It works for me. I am using mongoengine 0.8.7 and pymongo 2.8.
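Applied back to the pipeline in the question, the point is that ListField(ReferenceField(Source, dbref=True)) expects Source documents rather than raw ObjectIds, so appending the document itself instead of temp_source.id should avoid the ValidationError. A minimal sketch of that change (my reading of the answer, not code taken from it):

    def create_sources(self, product):
        temp_sources = []
        for source in product["sources"]:
            if source["source_type"] == "user":
                temp_source, created = UserSource.objects.get_or_create(**source)
            elif source["source_type"] == "store":
                temp_source, created = StoreSource.objects.get_or_create(**source)
            elif source["source_type"] == "collection":
                temp_source, created = CollectionSource.objects.get_or_create(**source)
            # append the document itself, not temp_source.id, so the
            # ReferenceField gets a Source instance it can dereference
            temp_sources.append(temp_source)
        return temp_sources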
How do I re-download and re-parse the same URL while the needed data has not been received, stopping once the recursion count exceeds 5?
For example, I need to get the h1 from example.com, but the server returns different pages: some with the h1 and some without it. I want the code to keep downloading and parsing the URL until the h1 is found or the recursion count exceeds 5.
For example:
import re

from kupito.items import KupitoItem
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector

class idpkzSpider(CrawlSpider):
    name = 'idp.kz'
    allowed_domains = ['idp.kz']
    start_urls = [
        'http://idp.kz/index.php/katalog-tovarov/monitors'
    ]

    rules = (
        Rule(LinkExtractor(allow=('.*start=\d+$')), callback='parse_start_url', follow=True),
    )

    def parse_start_url(self, response):
        items = []
        hxs = Selector(response)
        dirEl = hxs.xpath('//h1/text()').extract()
        goods = hxs.xpath("//div[@class='jshop list_product']/div[@class='block_product']")
        if dirEl and len(goods) > 0:
            dirName = dirEl[0].encode('utf-8').strip()
            for good in goods:
                name = good.xpath("div//div[@class='name']/a/text()").extract()
                price = good.xpath("div//div[@class='jshop_price']/span/text()").extract()
                url = good.xpath("div//div[@class='name']/a/@href").extract()
                if name and price and url:
                    item = KupitoItem()
                    item['name'] = name[0].encode('utf-8').strip()
                    item['price'] = price[0].encode('utf-8').strip()
                    item['url'] = url[0].encode('utf-8').strip()
                    item['dirName'] = dirName
                    items.append(item)
        return items
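As a sketch of one common approach to the retry part (an assumption on my part, not taken from this thread), the attempt count can be carried in request.meta and the same URL re-requested with dont_filter=True until the h1 appears or the limit of 5 is reached:

import scrapy

class RetryH1Spider(scrapy.Spider):
    # hypothetical spider name and URL, for illustration only
    name = 'retry_h1'
    start_urls = ['http://example.com']
    max_retries = 5

    def parse(self, response):
        h1 = response.xpath('//h1/text()').extract()
        if h1:
            yield {'h1': h1[0]}
            return
        retries = response.meta.get('retries', 0)
        if retries < self.max_retries:
            # dont_filter=True lets the duplicate URL past the dupefilter
            yield scrapy.Request(response.url, callback=self.parse,
                                 dont_filter=True,
                                 meta={'retries': retries + 1})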
I am using Scrapy for a project; in this project I am extracting information from XML.
This is the part of the XML document where I would like to implement the for loop:
<relatedPersonsList>
    <relatedPersonInfo>...</relatedPersonInfo>
    <relatedPersonInfo>
        <relatedPersonName>
            <firstName>Mark</firstName>
            <middleName>E.</middleName>
            <lastName>Lucas</lastName>
        </relatedPersonName>
        <relatedPersonAddress>
            <street1>1 IMATION WAY</street1>
            <city>OAKDALE</city>
            <stateOrCountry>MN</stateOrCountry>
            <stateOrCountryDescription>MINNESOTA</stateOrCountryDescription>
            <zipCode>55128</zipCode>
        </relatedPersonAddress>
        <relatedPersonRelationshipList>
            <relationship>Executive Officer</relationship>
            <relationship>Director</relationship>
        </relatedPersonRelationshipList>
        <relationshipClarification/>
    </relatedPersonInfo>
    <relatedPersonInfo>...</relatedPersonInfo>
    <relatedPersonInfo>...</relatedPersonInfo>
    <relatedPersonInfo>...</relatedPersonInfo>
    <relatedPersonInfo>...</relatedPersonInfo>
    <relatedPersonInfo>...</relatedPersonInfo>
</relatedPersonsList>
As you can see, the <relatedPersonsList> can contain multiple <relatedPersonInfo> elements, but when I try to make a for loop, I still only get the information of the first person.
This is my actual code:
for person in xxs.select('./relatedPersonsList/relatedPersonInfo'):
    item = Myform()  # even if I get rid of it I get the same result
    item["firstName"] = person.select('./relatedPersonName/firstName/text()').extract()[0]
    item["middleName"] = person.select('./relatedPersonName/middleName/text()')
    if item["middleName"]:
        item["middleName"] = item["middleName"].extract()[0]
    else:
        item["middleName"] = "NA"
Here is the code that I use in my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import XmlXPathSelector
from scrapy.http import Request
import urlparse
from formds.items import SecformD

class SecDform(CrawlSpider):
    name = "DFORM"
    allowed_domain = ["http://www..gov"]
    start_urls = [
        ""
    ]

    rules = (
        Rule(
            SgmlLinkExtractor(restrict_xpaths=["/html/body/div/table/tr/td[3]/a[2]"]),
            callback='parse_formd',
            #follow= True no need of follow thing
        ),
        Rule(
            SgmlLinkExtractor(restrict_xpaths=('/html/body/div/center[1]/a[contains(., "[NEXT]")]')),
            follow=True
        ),
    )

    def parse_formd(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//*[@id="formDiv"]/div/table/tr[3]/td[3]/a/@href').extract()
        for site in sites:
            yield Request(url=urlparse.urljoin(response.url, site), callback=self.parse_xml_document)

    def parse_xml_document(self, response):
        xxs = XmlXPathSelector(response)
        item = SecformD()
        item["stateOrCountryDescription"] = xxs.select('./primaryIssuer/issuerAddress/stateOrCountryDescription/text()').extract()[0]
        item["zipCode"] = xxs.select('./primaryIssuer/issuerAddress/zipCode/text()').extract()[0]
        item["issuerPhoneNumber"] = xxs.select('./primaryIssuer/issuerPhoneNumber/text()').extract()[0]
        for person in xxs.select('./relatedPersonsList//relatedPersonInfo'):
            #item = SecDform()
            item["firstName"] = person.select('./relatedPersonName/firstName/text()').extract()[0]
            item["middleName"] = person.select('./relatedPersonName/middleName/text()')
            if item["middleName"]:
                item["middleName"] = item["middleName"].extract()[0]
            else:
                item["middleName"] = "NA"
        return item
I extract the information to a .json file using this command:
scrapy crawl DFORM -o tes4.json -t json
Try something like this:
def parse_xml_document(self, response):
    xxs = XmlXPathSelector(response)
    items = []

    # common field values
    stateOrCountryDescription = xxs.select('./primaryIssuer/issuerAddress/stateOrCountryDescription/text()').extract()[0]
    zipCode = xxs.select('./primaryIssuer/issuerAddress/zipCode/text()').extract()[0]
    issuerPhoneNumber = xxs.select('./primaryIssuer/issuerPhoneNumber/text()').extract()[0]

    for person in xxs.select('./relatedPersonsList//relatedPersonInfo'):
        # instantiate one item per loop iteration
        item = SecformD()

        # save common parameters
        item["stateOrCountryDescription"] = stateOrCountryDescription
        item["zipCode"] = zipCode
        item["issuerPhoneNumber"] = issuerPhoneNumber

        item["firstName"] = person.select('./relatedPersonName/firstName/text()').extract()[0]
        item["middleName"] = person.select('./relatedPersonName/middleName/text()')
        if item["middleName"]:
            item["middleName"] = item["middleName"].extract()[0]
        else:
            item["middleName"] = "NA"

        items.append(item)

    return items