I finally managed to get a working script.
Only one small issue: I can crawl all pages and get all the needed info, except for the first page.
Where is my error?
import scrapy.selector
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from Prijsvergelijking.items import PrijsvergelijkingItem
class MySpider(CrawlSpider):
    name = "coolblue"
    allowed_domains = ["tvstore.be"]
    start_urls = ["http://www.tvstore.be/category/192945/televisies.html"]

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="pagination next secondary"]',)), callback="parse_items", follow=True),)

    def parse_items(self, response):
        products = response.xpath("//li[@class='product-list-columns--item product-list-item']")
        for product in products:
            item = PrijsvergelijkingItem()
            item["Product_ref"] = product.xpath(".//h2/a/text()").extract_first().strip()
            item["Product_price"] = product.xpath(".//strong[1]/text()").extract_first().strip().replace(",", ".").replace("-", "")
            yield item
I didn't look hard enough.
I found the answer: all I had to do was rename parse_items to parse_start_url. CrawlSpider only runs rule callbacks on links extracted by the rules, never on the start URLs themselves, so the first page is only parsed if you override parse_start_url.
from scrapy.spiders import CrawlSpider, Rule
import scrapy.selector
from scrapy.linkextractors import LinkExtractor
from Prijsvergelijking.items import PrijsvergelijkingItem
class MySpider(CrawlSpider):
    name = "msh"
    allowed_domains = ["mediamarkt.be"]
    start_urls = ["http://www.mediamarkt.be/mcs/productlist/_TV,98952,452540.html?langId=-17&searchParams=&sort=&view=&page=1"]

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//li[@class="pagination-next"]',)), callback="parse_start_url", follow=True),)

    def parse_start_url(self, response):
        products = response.xpath("//ul[@class='products-list']/li/div")
        for product in products:
            item = PrijsvergelijkingItem()
            item["Product_price"] = product.xpath('.//aside/div/div/div/text()').extract_first().replace(",", ".").replace("-", "")
            item["Product_ref"] = product.xpath('.//div/h2/a/text()').extract_first().strip()
            yield item
I use Django, Celery, and Scrapy.
My settings for celery:
from kombu import Exchange, Queue  # needed for CELERY_QUEUES below

CELERY_BROKER_URL = 'amqp://****/myvhost'
CELERY_TIMEZONE = TIME_ZONE
CELERYD_CONCURRENCY = 1000
CELERYD_MAX_TASKS_PER_CHILD = 4
CELERY_IGNORE_RESULT = True

# django celery
CELERY_RESULT_BACKEND = 'django-db'

# celery queues setup
CELERY_DEFAULT_QUEUE = 'default'
CELERY_DEFAULT_ROUTING_KEY = 'default'
CELERY_QUEUES = (
    Queue('get_context', Exchange('get_context'), routing_key='get_context'),
    Queue('get_article', Exchange('get_article'), routing_key='get_article'),
)
CELERY_ROUTES = {
    'parse.tasks.get_context': {
        'queue': 'get_context',
        'routing_key': 'get_context',
    },
    'parse.tasks.get_article': {
        'queue': 'get_article',
        'routing_key': 'get_article',
    },
}
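For reference, the tasks below import celery_app from api_parser. A minimal sketch of what such a module typically contains, assuming standard Django + Celery boilerplate (the settings module name here is a guess):

# api_parser/celery.py -- assumed standard boilerplate, not shown above
import os

from celery import Celery

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'api_parser.settings')  # settings path is an assumption

celery_app = Celery('api_parser')
celery_app.config_from_object('django.conf:settings', namespace='CELERY')
celery_app.autodiscover_tasks()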
There are two Celery tasks:
from api_parser import celery_app
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy_parser.scrapy_parser.spiders.map_links import MapLinksSpider
from scrapy_parser.scrapy_parser.spiders.articles import ArticlesSpider
from threading import Thread


@celery_app.task
def get_context(rules_id, rules):
    process = CrawlerProcess(get_project_settings())
    process.crawl(MapLinksSpider, rules_id=rules_id, rules=rules)
    Thread(target=process.start).start()


@celery_app.task
def get_article(rules_id, link_id, rules, link):
    process = CrawlerProcess(get_project_settings())
    process.crawl(ArticlesSpider, rules_id=rules_id, link_id=link_id, rules=rules, link=link)
    Thread(target=process.start).start()
The first task is triggered by a signal and maps the links.
The second task is started when a new link is added to the database.
My signals in django:
from django.db.models.signals import post_save
from django.dispatch import receiver
from parse.models.rules import Scheduler, Rules, ParseLinks
from parse.tasks import get_context, get_article


@receiver(post_save, sender=Scheduler)
def create_task_get_context(sender, instance, created, **kwargs):
    if created:
        rules = Rules.objects.get(id=int(instance.rules.id))
        get_context.delay(int(rules.id), str(rules.rules))


@receiver(post_save, sender=ParseLinks)
def create_task_get_article(sender, instance, created, **kwargs):
    if created:
        parse_link = ParseLinks.objects.get(id=int(instance.id))
        get_article.delay(int(parse_link.rules.id), int(parse_link.id), str(parse_link.rules.rules), str(parse_link.link))
My spiders:
map_links.py
from parse.models.rules import ParseLinks
import scrapy
import json
class MapLinksSpider(scrapy.Spider):
    name = "map_links"
    start_urls = []

    def __init__(self, **kw):
        super(MapLinksSpider, self).__init__(**kw)
        self.rules_id = kw.get('rules_id')
        self.rules = json.loads(kw.get('rules'))
        self.start_urls = [self.rules['url']]
        self.templates = self.rules['item']['templates']
        self.pagination = self.rules['pagination']

    def parse(self, response):
        for item in self.templates:
            context = response.css(str(item['context']))
            for row in context:
                link = row.css('%s::attr(%s)' % (item['link']['cssSelector'], item['link']['attr'])).extract_first(),
                title = row.css('%s::text' % item['options']['title']['cssSelector']).extract_first(),
                date = row.css('%s::text' % item['options']['date']['cssSelector']).extract_first()
                ParseLinks.objects.get_or_create(rules_id=self.rules_id, link=self.rules['url'] + link[0], title=title, date=date)
        next_page = response.css('%s::attr(%s)' % (self.pagination['link']['cssSelector'], self.pagination['link']['attr'])).extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
articles.py
from parse.models.rules import ParseData
import scrapy
import json
class ArticlesSpider(scrapy.Spider):
    name = "articles"
    start_urls = []

    def __init__(self, **kw):
        super(ArticlesSpider, self).__init__(**kw)
        self.rules_id = kw.get('rules_id')
        self.link_id = kw.get('link_id')
        self.rules = json.loads(kw.get('rules'))
        self.link = kw.get('link')

    def parse(self, response):
        self.start_urls = [self.link]
        title = response.css('%s::text' % self.rules['article']['title']['cssSelector']).extract_first()
        text = response.css('%s::text' % self.rules['article']['text']['cssSelector']).extract_first()
        ParseData.objects.create(rules_id=self.rules_id, link_id=self.link_id, title=title, text=text)
        yield {
            "title": title,
            'text': text
        }
But I get the error: twisted.internet.error.ReactorNotRestartable
I understand that the error is caused by launching a new crawl process for each spider. But I'm using threads, and I don't understand why that doesn't solve my problem.
I think every scraping beginner runs into this question :)
Try this:
0) pip install crochet
1) put from crochet import setup and a setup() call at the top of the file
2) remove these two lines from the [Scrapy docs][2] example:
a) d.addBoth(lambda _: reactor.stop())
b) reactor.run()
This works because crochet runs the Twisted reactor in a dedicated thread that it manages and never stops, so repeated crawls don't try to restart it. The only meaningful lines left from the [Scrapy docs][2] are the last two in my code:
from importlib import import_module

from crochet import setup
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

setup()

def run_spider(spiderName):
    module_name = "first_scrapy.spiders.{}".format(spiderName)
    scrapy_var = import_module(module_name)          # dynamic import of the selected spider module
    spiderObj = scrapy_var.mySpider()                # get the mySpider object from the spider module
    crawler = CrawlerRunner(get_project_settings())  # from Scrapy docs
    crawler.crawl(spiderObj)                         # from Scrapy docs
This code lets you choose which spider to run simply by passing its name to run_spider, and once a crawl finishes you can select another spider and run it again.
After that, you simply call run_spider from a Celery task (see the sketch below).
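A minimal sketch of that task (the task name and the celery_app import path are my assumptions, not part of the answer):

from api_parser import celery_app  # assumption: reuse the Celery app from the question above

@celery_app.task
def crawl(spiderName):
    # run_spider() returns immediately; crochet keeps the reactor running in its own thread
    run_spider(spiderName)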
[1]: ReactorNotRestartable - Twisted and scrapy
[2]: https://doc.scrapy.org/en/latest/topics/practices.html
I have to create an application that imports data from a CSV into a Django model that already exists. I used the django-adaptors tool, which makes that easy.
For example, I have a States (= "Pays") model that contains 3 fields: name, code, and nationality.
My CSV:
nom;abrev;nationalite
AFGHANISTAN;AF;Afghane
AFRIQUE DU SUD;ZA;Sud-africaine
Here is my code.
The models.py that already exists:
class Pays(models.Model):
    pays = models.CharField(max_length=150)
    codeiso3166alpha2 = models.CharField(max_length=2)
    nationalite = models.CharField(max_length=50, null=True, blank=True)
    maj = models.DateTimeField(auto_now=True)

    class Meta:
        db_table = u'Pays'

    def __unicode__(self):
        return self.pays
The models.py from my application:
# coding: utf-8
from django.db import models
from polyc2n.models import Pays  # the model that already exists
from adaptor.model import CsvDbModel

class MyCSVDbModel(CsvDbModel):

    class Meta:
        exclude = ['maj']
        dbModel = Pays
        delimiter = ";"
        has_header = True
        update = {'keys': ['codeiso3166alpha2']}
My views.py:
# coding: utf-8
from django.shortcuts import render
from remplirPays.models import MyCSVDbModel
from django.http import HttpResponse

path = "pays.csv"

def DataPays(request):
    MyCSVDbModel.import_data(data=open(path))
    return HttpResponse("import is ok")
Here is my problem: when I do the import, the first line isn't ignored, even though I set the has_header attribute to True. Do you know why, or how I can resolve this?
Thank you for reading
Why not try using csv.reader and a standalone load.py file instead? Here is example code I use:
import csv, sys, os

import django

pathproject = "/home/yourfolder"
base_csv_filepath = "/home/yourfolder/yourcsvfile"

sys.path.append(pathproject)
os.environ['DJANGO_SETTINGS_MODULE'] = 'config.settings.local'
django.setup()

from yourapp.models import yourmodel

def load_your_data():
    print("Entering...")
    csv_file = base_csv_filepath + "/yourcsvfile.csv"
    dataReader = csv.reader(open(csv_file, encoding='utf-8'), delimiter=',', quotechar='"')
    for row in dataReader:
        if row[0] != 'ID':  # skip the header row
            yourmodel.objects.create(
                field1=row[0],
                field2=row[1]
            )
    print("Imported correctly")

if __name__ == "__main__":
    load_your_data()
EDIT: 'ID' here is the text of the first header column, and the example assumes the data has two columns; you can of course modify it to cover more fields.
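If you would rather keep the view-based trigger from the question, the same header-skipping idea fits there too. A minimal sketch, reusing the Pays model and the ";" delimiter from the question (the column-to-field mapping is my assumption):

# coding: utf-8
import csv

from django.http import HttpResponse

from polyc2n.models import Pays  # the existing model from the question

def DataPays(request):
    with open("pays.csv", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter=";")
        next(reader)  # skip the header line: nom;abrev;nationalite
        for nom, abrev, nationalite in reader:
            # assumed mapping: nom -> pays, abrev -> codeiso3166alpha2
            Pays.objects.get_or_create(
                codeiso3166alpha2=abrev,
                defaults={"pays": nom, "nationalite": nationalite},
            )
    return HttpResponse("import is ok")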
I want to write to a CSV with each data observation stored on its own line. However, all observations end up in one cell. I tried replacing items.append(item) and return items with yield item, but that didn't work either.
import scrapy
from selenium import webdriver
import time
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from gdp.items import gdpItem
import unicodecsv as csv
class gdp_spider2(scrapy.Spider):
    name = 'gdp_spider2'
    allowed_domains = ['statdb.dgbas.gov.tw/']
    start_urls = ['http://statdb.dgbas.gov.tw/pxweb/Dialog/varval.asp?ma=NA8101A1Q&ti=Principal%20Figures%282008SNA%29-Quarterly&path=../PXfileE/NationalIncome/&lang=1&strList=L']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        items = []
        driver = self.driver
        driver.get(response.url)
        driver.find_element_by_partial_link_text('Select all').click()
        driver.find_element_by_xpath('//option[contains(text(),"GDP (Million N.T.$,at Current Prices)")]').click()
        driver.find_element_by_xpath('//option[contains(text(),"Data")]').click()
        driver.find_element_by_xpath('//input[@type="SUBMIT"]').click()
        hxs = HtmlXPathSelector(text=self.driver.page_source)
        data = hxs.xpath("//table[@class='pxtable']//tbody//tr")
        for datum in data:
            item = gdpItem()
            item["date"] = datum.xpath('//td[1]/text()').extract()
            item["data"] = datum.xpath('//td[2]/text()').extract()
            items.append(item)
        return items
(screenshot of the resulting CSV omitted)
Try this:
def parse(self, response):
    driver = self.driver
    driver.get(response.url)
    driver.find_element_by_partial_link_text('Select all').click()
    driver.find_element_by_xpath('//option[contains(text(),"GDP (Million N.T.$,at Current Prices)")]').click()
    driver.find_element_by_xpath('//option[contains(text(),"Data")]').click()
    driver.find_element_by_xpath('//input[@type="SUBMIT"]').click()
    hxs = HtmlXPathSelector(text=self.driver.page_source)
    data = hxs.xpath("//table[@class='pxtable']//tbody//tr")
    for datum in data:
        item = gdpItem()  # create a fresh item per row
        item["date"] = datum.xpath('td[1]/text()').extract()  # relative XPath, scoped to this row
        item["data"] = datum.xpath('td[2]/text()').extract()
        yield item
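With yield, Scrapy's built-in CSV feed exporter writes one row per item, so you can produce the CSV straight from the command line (the output file name is just an example):

scrapy crawl gdp_spider2 -o output.csv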
I want to download the data from Yahoo Finance
http://finance.yahoo.com/q/hp?s=^TWII&a=00&b=15&c=2004&d=11&e=4&f=2015&g=m
I want the program to type "2004" in the start-year field and "2015" in the end-year field. How can I do that?
My code looks like this:
import scrapy
from selenium import webdriver
import time
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from taiex.items import taiexItem
import unicodecsv as csv
class taiex_spider(scrapy.Spider):
    name = 'taiex_spider'
    allowed_domains = ['finance.yahoo.com/']
    start_urls = ['http://finance.yahoo.com/q/hp?s=^TWII&a=00&b=15&c=2004&d=11&e=4&f=2015&g=m']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def parse(self, response):
        items = []
        item = taiexItem()
        driver = self.driver
        driver.get(response.url)
        driver.find_element_by_css_selector('select[id="selstart"]>option[value="00"]').click()
        driver.find_element_by_css_selector('select[id="selend"]>option[value="11"]').click()
        driver.find_element_by_xpath('//input[@id="monthly"]').click()
        driver.find_element_by_xpath('//input[@class="rapid-nf"]').click()
        driver.find_element_by_partial_link_text('Download to Spreadsheet').click()
Locate the desired input elements by id and send the keys to them:
start_year = driver.find_element_by_id("startyear")
start_year.clear()
start_year.send_keys("2004")
end_year = driver.find_element_by_id("endyear")
end_year.clear()
end_year.send_keys("2015")
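Once the years are filled in, the rest of your existing parse() can follow unchanged; for instance (these selectors are copied from the question code, not verified here):

driver.find_element_by_xpath('//input[@id="monthly"]').click()
driver.find_element_by_xpath('//input[@class="rapid-nf"]').click()
driver.find_element_by_partial_link_text('Download to Spreadsheet').click()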
There are several clickable elements on the page and I'm trying to scrape some pages behind them, but I get this error and the spider closes after the first click:
StaleElementReferenceException: Message: Element not found in the cache - perhaps the page has changed since it was looked up
For now I'm just trying to get the page opened so I can capture the new URL. Here is my code:
from scrapy import signals
from scrapy.http import TextResponse
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.xlib.pydispatch import dispatcher
from MySpider.items import MyItem
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import time
class MySpider(Spider):
    name = "myspider"
    allowed_domains = ["http://example.com"]
    base_url = 'http://example.com'
    start_urls = ["http://example.com/Page.aspx", ]

    def __init__(self):
        self.driver = webdriver.Firefox()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        self.driver.close()

    def parse(self, response):
        self.driver.get(response.url)
        item = MyItem()
        links = self.driver.find_elements_by_xpath("//input[@class='GetData']")
        for button in links:
            button.click()
            time.sleep(5)
            source = self.driver.page_source
            sel = Selector(text=source)  # create a Selector object
            item['url'] = self.driver.current_url
            print '\n\nURL\n', item['url'], '\n'
            yield item
That happens because the link elements belong to the first page. Once you open a new page, those elements become stale.
You can try these two solutions:
1. Store the URLs of the link elements and use driver.get(url) to open each link.
def parse(self, response):
    self.driver.get(response.url)
    item = MyItem()
    links = self.driver.find_elements_by_xpath("//input[@class='GetData']")
    link_urls = [link.get_attribute("href") for link in links]  # collect the URLs before navigating away
    for link_url in link_urls:
        self.driver.get(link_url)
        time.sleep(5)
        source = self.driver.page_source
        sel = Selector(text=source)  # create a Selector object
        item['url'] = self.driver.current_url
        print '\n\nURL\n', item['url'], '\n'
        yield item
2. After clicking a link and getting the URL, call driver.back() to return to the first page, then re-find the link elements.
def parse(self, response):
    self.driver.get(response.url)
    item = MyItem()
    links = self.driver.find_elements_by_xpath("//input[@class='GetData']")
    for i in range(len(links)):
        links[i].click()
        time.sleep(5)
        source = self.driver.page_source
        sel = Selector(text=source)  # create a Selector object
        item['url'] = self.driver.current_url
        print '\n\nURL\n', item['url'], '\n'
        yield item
        self.driver.back()
        links = self.driver.find_elements_by_xpath("//input[@class='GetData']")  # re-find after going back