I've tried everything, but I can't seem to figure out how to call the next page from parse_category.
I've tried LinkExtractor, as I do when I go directly to a category page, but this didn't work.
import scrapy.selector
import urlparse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request
from msh_final.items import CrawlerMshFinalItem


def complete_url(string):
    return "http://www.mediamarkt.be" + string


def get_base_url(url):
    if url != "":
        u = urlparse.urlparse(url)
        return "%s://%s" % (u.scheme, u.netloc)
    else:
        return ""


def encode(str):
    return str.encode('utf8', 'ignore')


class msh_finalSpider(CrawlSpider):
    name = 'msh_final'
    start_urls = ['http://www.mediamarkt.be/mcs/productlist/_Telefoon-Navigatie,98952,509451.html?langId=-17']

    def parse(self, response):
        items = response.xpath('//ul[@class="infield cf"]//div[@class="infield-wrapper"]/h2/a/@href')
        for item in items:
            link = item.extract()
            yield Request(complete_url(link), callback=self.parse_category)

    def parse_category(self, response):
        items = response.xpath("//ul[@class='products-list']/li/div")
        for item in items:
            msh = CrawlerMshFinalItem()
            msh['item_price'] = encode(item.xpath('normalize-space(.//aside/div/div/div/text())').extract()[0])
            msh['item_name'] = encode(item.xpath('normalize-space(.//div/h2/a/text())').extract()[0])
            yield msh
You should inherit your spider from Spider instead of CrawlSpider and use the following code:
from scrapy import Spider  # Spider also needs to be imported


class msh_finalSpider(Spider):
    name = 'msh_final'
    start_urls = ['http://www.mediamarkt.be/mcs/productlist/_Telefoon-Navigatie,98952,509451.html?langId=-17']

    def parse(self, response):
        items = response.xpath('//ul[@class="infield cf"]//div[@class="infield-wrapper"]/h2/a/@href')
        for item in items:
            link = item.extract()
            yield Request(complete_url(link), callback=self.parse_category)

    def parse_category(self, response):
        items = response.xpath("//ul[@class='products-list']/li/div")
        for item in items:
            msh = CrawlerMshFinalItem()
            msh['item_price'] = encode(item.xpath('normalize-space(.//aside/div/div/div/text())').extract()[0])
            msh['item_name'] = encode(item.xpath('normalize-space(.//div/h2/a/text())').extract()[0])
            yield msh

        new_link = response.xpath('//li[@class="pagination-next"]/a/@href').extract()[0]
        yield Request(
            complete_url(new_link),
            callback=self.parse_category
        )
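Note that extract()[0] will raise an IndexError on the last page if no pagination-next link is present there. If you are on Scrapy 1.0 or newer, a minimal sketch of the same pagination step that guards against this and lets Scrapy resolve the relative link (assuming the same pagination markup) could look like this:

    def parse_category(self, response):
        # ... yield the items exactly as above ...
        # extract_first() returns None instead of raising when the link is missing
        next_href = response.xpath('//li[@class="pagination-next"]/a/@href').extract_first()
        if next_href:
            # response.urljoin resolves the relative href against the current page URL
            yield Request(response.urljoin(next_href), callback=self.parse_category)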
I want to scrape a site with Scrapy that lists its products in categories. I'm new to Scrapy and only started getting my head around it today, but I thought I had the gist of it from simple scrapes, so I attempted to scrape URLs and return them for further scraping. It appears I'm missing something.
Someone answered with a fix to my code; here is the latest version, as I thought I'd have another go at learning Scrapy today. It's still not scanning recursively: it just seems to loop through all the pages but never gets into parsing the items.
It never seems to enter the else statement:
yield scrapy.Request(url=response.url, callback=self.parse_item)
I can debug it and check that the items are parsed correctly if I force it to output items without looping.
If I change the following:
if product_pages:
    for product_url in product_pages:
        product_url2 = str(self.base_url + product_url)
        self.log("Queued up: %s" % product_url2)
        yield scrapy.Request(url=product_url2, callback=self.parse_product_pages)
else:
    yield scrapy.Request(url=response.url, callback=self.parse_item)
to:
if product_pages:
    for product_url in product_pages:
        product_url2 = str(self.base_url + product_url)
        self.log("Queued up: %s" % product_url2)
        yield scrapy.Request(url=product_url2, callback=self.parse_item)
else:
    yield scrapy.Request(url=response.url, callback=self.parse_product_pages)
Here is my code. I'm working in Python 2.7.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from ybscrape.items import Product
from scrapy.linkextractors import LinkExtractor
from scrapy.linkextractors.sgml import SgmlLinkExtractor


class ybracingSpider(CrawlSpider):
    name = 'ybscrape2'
    download_delay = 0.75

    def __init__(self, *args, **kwargs):
        super(ybracingSpider, self).__init__(*args, **kwargs)
        self.allowed_domains = ['http://www.ybracing.com/', 'www.ybracing.com', 'www.esellepro.com']
        self.base_url = 'http://www.ybracing.com'
        self.start_urls = ['http://www.ybracing.com/karting/']

    def parse_start_url(self, response):
        category = response.xpath("//h2/a/@href").extract()
        # loop over category pages, take the product link and add the all-pages url
        for product in category:
            all_pages = '?itemsperpage=99999'
            category_url = str(self.base_url + product + all_pages)
            self.log("Queued up: %s" % category_url)
            yield scrapy.Request(url=category_url, callback=self.parse_product_pages)

    def parse_product_pages(self, response):
        product_pages = response.xpath("//li/div/div/h3/a/@href").extract()
        #print("debug pause")
        #print(product_pages)
        #wait = input("PRESS ENTER TO CONTINUE.")
        #print("continue")
        if product_pages:
            for product_url in product_pages:
                product_url2 = str(self.base_url + product_url)
                self.log("Queued up: %s" % product_url2)
                yield scrapy.Request(url=product_url2, callback=self.parse_product_pages)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_item)

    def parse_item(self, response):
        item = Product()
        item['description'] = response.xpath("//div[@id='Tabbed-Container-Details']/div[2]/div/text()").extract()
        item['product_title'] = response.xpath("//h3[@class='Product-Heading']/text()").extract()
        item['price'] = response.xpath("//div[@id='Product-Price']/text()").extract()
        table_rows = response.xpath("//table[@id='SpecificationTab']/tr[*]/td[1]//text()").extract()
        yield item
My items.py:

from scrapy.item import Item, Field


class Product(Item):
    product_title = Field()
    description = Field()
    price = Field()
What I'm expecting my code to do, in steps:
1. Grab all the links within the first export (categories). (This works.)
2. Look at all 9999 products inside each category and export the list. (This works.)
3. Take the product URL from the export and append it to the base URL to get to the product page for each. (This works.)
4. Then read data from the product page and add it to items. (It never gets here unless I skip the if statement, but that isn't recursive and won't handle sub-categories.)
Here, I have made some changes to your code and now it's working:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from demo.items import DemoItem
from scrapy.linkextractors import LinkExtractor
from scrapy.linkextractors.sgml import SgmlLinkExtractor


class DemoSpider(CrawlSpider):
    name = 'ybracing2'

    def __init__(self, *args, **kwargs):
        super(DemoSpider, self).__init__(*args, **kwargs)
        self.allowed_domains = ['http://www.ybracing.com/', 'www.ybracing.com', 'www.esellepro.com']
        self.base_url = 'http://www.ybracing.com'
        self.start_urls = ['http://www.ybracing.com/racewear/']

    def parse_start_url(self, response):
        category = response.xpath("//h2/a/@href").extract()
        # loop over category pages, take the product link and add the all-pages url
        for product in category:
            all_pages = '?itemsperpage=99999'
            category_url = str(self.base_url + product + all_pages)
            self.log("Queued up: %s" % category_url)
            yield scrapy.Request(url=category_url, callback=self.parse_product_pages)

    def parse_product_pages(self, response):
        product_pages = response.xpath("//div[@class='Product']/a/@href").extract()
        if product_pages:
            for product_url in product_pages:
                product_url2 = str(self.base_url + product_url)
                self.log("Queued up: %s" % product_url2)
                yield scrapy.Request(url=product_url2, callback=self.parse_item)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_product_pages)

    def parse_item(self, response):
        item = DemoItem()
        dirty_data = {}
        item['product_title'] = response.xpath("//h3[@class='Product-Heading']/text()").extract()
        item['price'] = response.xpath("//div[@id='Product-Price']/text()").extract()
        item['description'] = response.xpath("//div[@id='Tabbed-Container-Details']/div[2]/div/text()").extract()
        #image['product_image'] =
        # for variable in dirty_data.keys():
        #     if dirty_data[variable]:
        #         if variable == 'price':
        #             item[variable] = float(''.join(dirty_data[variable]).strip().replace('$', '').replace(',', ''))
        #         else:
        #             item[variable] = ''.join(dirty_data[variable]).strip()
        yield item
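One likely reason the original code's else branch never reached parse_item is Scrapy's duplicate filter: response.url has just been crawled, so a fresh Request for that same URL is silently dropped unless it is marked as non-filterable. A minimal sketch of that variant of the original else branch, keeping the original callback names, would be:

        else:
            # bypass the dupefilter so the repeat request for the same URL is not dropped
            yield scrapy.Request(url=response.url, callback=self.parse_item, dont_filter=True)

Alternatively, since the response is already in hand, the else branch could simply call self.parse_item(response) and yield the items it produces instead of scheduling a second request.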
I've tried my best to solve this error:
SyntaxError: Invalid syntax in this line
if__name__==__main':
main()
I'm using Tweepy with Python 2.7 and attempting to build an ArcGIS .mdb feature class with the collected tweets that contain geotags. Any ideas what is causing the bail? Thank you so much.
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import time
import sys
import arcpy

# global variables
consumer_key = 'xxx'
consumer_secret = 'xxxx'
token_key = 'xxx'
token_secret = 'xxx'


class StdOutListener(StreamListener):
    def __init__(self, start_time, featureClass, time_limit):
        super(StdOutListener, self).__init__()
        self.time = start_time
        self.limit = time_limit
        self.featureClass = featureClass

    def on_status(self, status):
        while (time.time() - self.time) < self.limit:
            if status.geo is not None:
                dictCoords = status.geo
                listCoords = dictCoords['coordinates']
                latitude = listCoords[0]
                longitude = listCo0ords[1]
                cursor = arcpy.da.InsertCursor(self.featureClass, ("SHAPE@XY"))
                cursor.insertRow([(longitude, latitude)])
                print(str(listCoords[0]) + "," + str(listCoords[1]))
                return True
            else:
                print "No coordinates found"
                return True

    def on_error(self, status):
        print('Error...')
        print status
        return True

    def on_timeout(self):
        print('Timeout...')
        return True


start_time = time.time()
arcpy.env.workspace = r'c:\ArcGIS_Blueprint_Python\data\Twitter\TweetInformation.gdb'


def main():
    try:  # new
        featureClass = sys.argv[1]
        monitorTime = sys.argv[2]
        monitorTime = monitorTime * 3600
        sr = arcpy.SpatialReference(4326)
        arcpy.env.overwriteOutput = True
        arcpy.CreateFeatureClass_management(arcpy.env.workspace,
                                            featureClass, "POINT", spatial_reference=sr)
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(token_key, token_secret)
        stream = Stream(auth, StdOutListener(start_time, featureClass,
                                             time_limit=monitorTime))  # 172800
        stream.filter(track=['car'])
    except Exception as e:
        print(e.message)


if__name__ == '__main__':
    main()
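For reference, the quoted line fails because Python requires whitespace after the if keyword and a properly quoted '__main__' string; the standard entry-point guard is:

if __name__ == '__main__':
    main()

Once that is fixed, note that listCo0ords in on_status also looks like a typo for listCoords, which would raise a NameError at runtime.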
How can I re-download and re-parse the same URL until the needed data is received, or until it has been tried more than 5 times?
For example, I need to get the h1 from example.com, but the server returns different pages: some with the h1 and some without it. I want the code to keep downloading and parsing the URL until the h1 is retrieved or the request has been retried more than 5 times.
Here is my current code:
import re
from kupito.items import KupitoItem
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector


class idpkzSpider(CrawlSpider):
    name = 'idp.kz'
    allowed_domains = ['idp.kz']
    start_urls = [
        'http://idp.kz/index.php/katalog-tovarov/monitors'
    ]

    rules = (
        Rule(LinkExtractor(allow=('.*start=\d+$')), callback='parse_start_url', follow=True),
    )

    def parse_start_url(self, response):
        items = []
        hxs = Selector(response)
        dirEl = hxs.xpath('//h1/text()').extract()
        goods = hxs.xpath('//div[@class=\'jshop list_product\']/div[@class=\'block_product\']')
        if dirEl and len(goods) > 0:
            dirName = dirEl[0].encode('utf-8').strip()
            for good in goods:
                name = good.xpath('div//div[@class=\'name\']/a/text()').extract()
                price = good.xpath('div//div[@class=\'jshop_price\']/span/text()').extract()
                url = good.xpath('div//div[@class=\'name\']/a/@href').extract()
                if name and price and url:
                    item = KupitoItem()
                    item['name'] = name[0].encode('utf-8').strip()
                    item['price'] = price[0].encode('utf-8').strip()
                    item['url'] = url[0].encode('utf-8').strip()
                    item['dirName'] = dirName
                    items.append(item)
        return items
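A minimal sketch of one way to cap the retries: keep an attempt counter in Request.meta and bypass the duplicate filter when re-requesting the same URL. This assumes from scrapy import Request at the top of the file and a hypothetical build_items helper that holds the original item-building loop:

    def parse_start_url(self, response):
        attempt = response.meta.get('attempt', 1)
        if not response.xpath('//h1/text()').extract() and attempt < 5:
            # the page came back without an h1: fetch it again, up to 5 attempts,
            # with dont_filter=True so the repeat request is not dropped as a duplicate
            yield Request(response.url, callback=self.parse_start_url,
                          meta={'attempt': attempt + 1}, dont_filter=True)
            return
        for item in self.build_items(response):  # hypothetical helper holding the original loop
            yield item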
I want to prepend a base URL string to the url that goes into the item, i.e. in item['urls'] = sel.xpath('a/@href').extract().
Example:
item['urls'] = "http://lakmeindia.com" + sel.xpath('a/@href').extract()
# Item class
import scrapy


class LakmeSampleItem(scrapy.Item):
    urls = scrapy.Field()
    catagory = scrapy.Field()
    sub_category = scrapy.Field()


# lakme Spider
import scrapy
from LakmeProject.items import LakmeSampleItem


class LakmeSpider(scrapy.Spider):
    name = "lakme"
    allowed_domains = ["lakmeindia.com"]
    start_urls = [
        "http://www.lakmeindia.com/sitemap"
    ]

    def parse(self, response):
        for sel in response.xpath("//div[@class='make-up']/ul[1]/li"):
            item = LakmeSampleItem()
            item['sub_category'] = sel.xpath('span/text()').extract()
            # here i want to append url (because url is coming like [/sitemap])
            item['urls'] = sel.xpath('a/@href').extract()
            item['catagory'] = "Lakme Absolute"
            yield item
You are on the right track. You just need to note that extract() returns a list. So what you actually need to do is:
item['urls'] = "http://lakmeindia.com" + sel.xpath('a/@href').extract()[0]
i.e., get the first item in the list by using [0] on the result returned by extract().
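If your Scrapy version is 1.0 or newer, another option (a sketch, assuming the same sitemap markup) is to let Scrapy join the relative href against the page URL, which also avoids an IndexError when the link is missing:

            href = sel.xpath('a/@href').extract_first()
            if href:
                # urljoin resolves "/sitemap"-style relative paths against response.url
                item['urls'] = response.urljoin(href)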
I am using Scrapy for a project; in this project I am extracting information from XML.
Here is the part of the XML document where I would like to implement the for loop:
<relatedPersonsList>
    <relatedPersonInfo>...</relatedPersonInfo>
    <relatedPersonInfo>
        <relatedPersonName>
            <firstName>Mark</firstName>
            <middleName>E.</middleName>
            <lastName>Lucas</lastName>
        </relatedPersonName>
        <relatedPersonAddress>
            <street1>1 IMATION WAY</street1>
            <city>OAKDALE</city>
            <stateOrCountry>MN</stateOrCountry>
            <stateOrCountryDescription>MINNESOTA</stateOrCountryDescription>
            <zipCode>55128</zipCode>
        </relatedPersonAddress>
        <relatedPersonRelationshipList>
            <relationship>Executive Officer</relationship>
            <relationship>Director</relationship>
        </relatedPersonRelationshipList>
        <relationshipClarification/>
    </relatedPersonInfo>
    <relatedPersonInfo>...</relatedPersonInfo>
    <relatedPersonInfo>...</relatedPersonInfo>
    <relatedPersonInfo>...</relatedPersonInfo>
    <relatedPersonInfo>...</relatedPersonInfo>
    <relatedPersonInfo>...</relatedPersonInfo>
</relatedPersonsList>
As you can see, <relatedPersonsList> can contain multiple <relatedPersonInfo> elements, yet when I try to make a for loop, I still only get the information for the first person.
This is my actual code:
for person in xxs.select('./relatedPersonsList/relatedPersonInfo'):
    item = Myform()  # even if get rid of it I get the same result
    item["firstName"] = person.select('./relatedPersonName/firstName/text()').extract()[0]
    item["middleName"] = person.select('./relatedPersonName/middleName/text()')
    if item["middleName"]:
        item["middleName"] = item["middleName"].extract()[0]
    else:
        item["middleName"] = "NA"
Here is the code that I use in my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import XmlXPathSelector
from scrapy.http import Request
import urlparse
from formds.items import SecformD


class SecDform(CrawlSpider):
    name = "DFORM"
    allowed_domain = ["http://www..gov"]
    start_urls = [
        ""
    ]

    rules = (
        Rule(
            SgmlLinkExtractor(restrict_xpaths=["/html/body/div/table/tr/td[3]/a[2]"]),
            callback='parse_formd',
            #follow= True no need of follow thing
        ),
        Rule(
            SgmlLinkExtractor(restrict_xpaths=('/html/body/div/center[1]/a[contains(., "[NEXT]")]')),
            follow=True
        ),
    )

    def parse_formd(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//*[@id="formDiv"]/div/table/tr[3]/td[3]/a/@href').extract()
        for site in sites:
            yield Request(url=urlparse.urljoin(response.url, site), callback=self.parse_xml_document)

    def parse_xml_document(self, response):
        xxs = XmlXPathSelector(response)
        item = SecformD()
        item["stateOrCountryDescription"] = xxs.select('./primaryIssuer/issuerAddress/stateOrCountryDescription/text()').extract()[0]
        item["zipCode"] = xxs.select('./primaryIssuer/issuerAddress/zipCode/text()').extract()[0]
        item["issuerPhoneNumber"] = xxs.select('./primaryIssuer/issuerPhoneNumber/text()').extract()[0]
        for person in xxs.select('./relatedPersonsList//relatedPersonInfo'):
            #item = SecDform()
            item["firstName"] = person.select('./relatedPersonName/firstName/text()').extract()[0]
            item["middleName"] = person.select('./relatedPersonName/middleName/text()')
            if item["middleName"]:
                item["middleName"] = item["middleName"].extract()[0]
            else:
                item["middleName"] = "NA"
            return item
I extract the information to a .json file using this command:
scrapy crawl DFORM -o tes4.json -t json
Try something like this:
def parse_xml_document(self, response):
    xxs = XmlXPathSelector(response)
    items = []
    # common field values
    stateOrCountryDescription = xxs.select('./primaryIssuer/issuerAddress/stateOrCountryDescription/text()').extract()[0]
    zipCode = xxs.select('./primaryIssuer/issuerAddress/zipCode/text()').extract()[0]
    issuerPhoneNumber = xxs.select('./primaryIssuer/issuerPhoneNumber/text()').extract()[0]
    for person in xxs.select('./relatedPersonsList//relatedPersonInfo'):
        # instantiate one item per loop iteration
        item = SecformD()
        # save common parameters
        item["stateOrCountryDescription"] = stateOrCountryDescription
        item["zipCode"] = zipCode
        item["issuerPhoneNumber"] = issuerPhoneNumber
        item["firstName"] = person.select('./relatedPersonName/firstName/text()').extract()[0]
        item["middleName"] = person.select('./relatedPersonName/middleName/text()')
        if item["middleName"]:
            item["middleName"] = item["middleName"].extract()[0]
        else:
            item["middleName"] = "NA"
        items.append(item)
    return items
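The key change is that a fresh SecformD is instantiated on every loop iteration, so each related person ends up in its own item instead of the loop returning after the first person, while the issuer-level fields are read once and copied into each item. With this structure you could also yield each item inside the loop rather than collecting them in the items list; either way, one item per related person is produced.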