Multiple spiders with Scrapy and Django

I use Django, Celery, and Scrapy.
My Celery settings:
CELERY_BROKER_URL = 'amqp://****/myvhost'
CELERY_TIMEZONE = TIME_ZONE
CELERYD_CONCURRENCY = 1000
CELERYD_MAX_TASKS_PER_CHILD = 4
CELERY_IGNORE_RESULT = True
# django celery
CELERY_RESULT_BACKEND = 'django-db'
# celery queues setup
CELERY_DEFAULT_QUEUE = 'default'
CELERY_DEFAULT_ROUTING_KEY = 'default'
CELERY_QUEUES = (
    Queue('get_context', Exchange('get_context'), routing_key='get_context'),
    Queue('get_article', Exchange('get_article'), routing_key='get_article'),
)
CELERY_ROUTES = {
    'parse.tasks.get_context': {
        'queue': 'get_context',
        'routing_key': 'get_context',
    },
    'parse.tasks.get_article': {
        'queue': 'get_article',
        'routing_key': 'get_article',
    },
}
There are two Celery tasks:
from api_parser import celery_app
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy_parser.scrapy_parser.spiders.map_links import MapLinksSpider
from scrapy_parser.scrapy_parser.spiders.articles import ArticlesSpider
from threading import Thread
@celery_app.task
def get_context(rules_id, rules):
    process = CrawlerProcess(get_project_settings())
    process.crawl(MapLinksSpider, rules_id=rules_id, rules=rules)
    Thread(target=process.start).start()

@celery_app.task
def get_article(rules_id, link_id, rules, link):
    process = CrawlerProcess(get_project_settings())
    process.crawl(ArticlesSpider, rules_id=rules_id, link_id=link_id, rules=rules, link=link)
    Thread(target=process.start).start()
The first task is triggered by a signal and maps the links.
The second task is started when a new link is added to the database.
My Django signals:
from django.db.models.signals import post_save
from django.dispatch import receiver
from parse.models.rules import Scheduler, Rules, ParseLinks
from parse.tasks import get_context, get_article
@receiver(post_save, sender=Scheduler)
def create_task_get_context(sender, instance, created, **kwargs):
    if created:
        rules = Rules.objects.get(id=int(instance.rules.id))
        get_context.delay(int(rules.id), str(rules.rules))

@receiver(post_save, sender=ParseLinks)
def create_task_get_article(sender, instance, created, **kwargs):
    if created:
        parse_link = ParseLinks.objects.get(id=int(instance.id))
        get_article.delay(int(parse_link.rules.id), int(parse_link.id), str(parse_link.rules.rules), str(parse_link.link))
My spiders:
map_links.py
from parse.models.rules import ParseLinks
import scrapy
import json
class MapLinksSpider(scrapy.Spider):
    name = "map_links"
    start_urls = []

    def __init__(self, **kw):
        super(MapLinksSpider, self).__init__(**kw)
        self.rules_id = kw.get('rules_id')
        self.rules = json.loads(kw.get('rules'))
        self.start_urls = [self.rules['url']]
        self.templates = self.rules['item']['templates']
        self.pagination = self.rules['pagination']

    def parse(self, response):
        for item in self.templates:
            context = response.css(str(item['context']))
            for row in context:
                link = row.css('%s::attr(%s)' % (item['link']['cssSelector'], item['link']['attr'])).extract_first(),
                title = row.css('%s::text' % item['options']['title']['cssSelector']).extract_first(),
                date = row.css('%s::text' % item['options']['date']['cssSelector']).extract_first()
                ParseLinks.objects.get_or_create(rules_id=self.rules_id, link=self.rules['url'] + link[0], title=title, date=date)
        next_page = response.css('%s::attr(%s)' % (self.pagination['link']['cssSelector'], self.pagination['link']['attr'])).extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
articles.py
from parse.models.rules import ParseData
import scrapy
import json
class ArticlesSpider(scrapy.Spider):
    name = "articles"
    start_urls = []

    def __init__(self, **kw):
        super(ArticlesSpider, self).__init__(**kw)
        self.rules_id = kw.get('rules_id')
        self.link_id = kw.get('link_id')
        self.rules = json.loads(kw.get('rules'))
        self.link = kw.get('link')

    def parse(self, response):
        self.start_urls = [self.link]
        title = response.css('%s::text' % self.rules['article']['title']['cssSelector']).extract_first()
        text = response.css('%s::text' % self.rules['article']['text']['cssSelector']).extract_first()
        ParseData.objects.create(rules_id=self.rules_id, link_id=self.link_id, title=title, text=text)
        yield {
            "title": title,
            "text": text
        }
But I get the error: twisted.internet.error.ReactorNotRestartable
I understand that the error is caused by starting the Twisted reactor again for each new spider. But I am using threads, and I do not understand why that does not solve my problem.

I think every beginning scraper meets this question :)
Try this:
0) pip install crochet
1) add from crochet import setup and call setup() at the top of the file
2) remove these 2 lines:
a) d.addBoth(lambda _: reactor.stop())
b) reactor.run()
The only meaningful lines left from the [Scrapy docs][2] are the last two lines of this code:
#some more imports
from crochet import setup
setup()
def run_spider(spiderName):
    module_name = "first_scrapy.spiders.{}".format(spiderName)
    scrapy_var = import_module(module_name)            # do some dynamic import of selected spider
    spiderObj = scrapy_var.mySpider()                   # get mySpider-object from spider module
    crawler = CrawlerRunner(get_project_settings())     # from Scrapy docs
    crawler.crawl(spiderObj)                            # from Scrapy docs
This code lets you select which spider to run just by passing its name to the run_spider function, and after scraping finishes you can select another spider and run it again.
Next you simply call run_spider from a Celery task.
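For completeness, here is a minimal sketch of such a task, with the imports the snippet above elides filled in. The first_scrapy package and the mySpider attribute are placeholders taken from the snippet, and the task name is mine; note that current Scrapy versions expect the spider class (not an instance) in CrawlerRunner.crawl, so the sketch passes the class.

# tasks.py -- a minimal sketch, assuming the layout from the snippet above
from importlib import import_module

from celery import shared_task
from crochet import setup
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

setup()  # start the Twisted reactor once, in a background thread


def run_spider(spider_name):
    module_name = "first_scrapy.spiders.{}".format(spider_name)  # placeholder package
    scrapy_var = import_module(module_name)        # dynamic import of the selected spider module
    spider_cls = scrapy_var.mySpider               # current Scrapy wants the class, not an instance
    crawler = CrawlerRunner(get_project_settings())
    crawler.crawl(spider_cls)                      # schedules the crawl and returns immediately


@shared_task
def run_spider_task(spider_name):
    run_spider(spider_name)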
[1]: ReactorNotRestartable - Twisted and scrapy
[2]: https://doc.scrapy.org/en/latest/topics/practices.html

Related

How to run a Scrapy spider with Celery

I'm trying to run a Scrapy spider which takes some arguments, launching it with os.system. But the Celery task (the scraper) doesn't get executed until it finishes.
Spider
class SpecificAuthorQuotesSpider(scrapy.Spider):
    """Extracts the quotes from specific author"""
    start_urls = ['https://quotes.toscrape.com/']
    name = "some-quotes"

    def __init__(self, author=None, **kwargs):
        self.author = author
        super().__init__(**kwargs)

    def parse(self, response, **kwargs):
        item = QuotesItem()
        all_div_quotes = response.css('div.quote')
        for quote in all_div_quotes:
            title = quote.css('span.text::text').extract_first().replace('”', '').replace("“", "")
            author = quote.css('.author::text').extract_first()

            # Check if author's name matches
            if author.strip().lower() == self.author.strip().lower():
                item['text'] = title
                item['author'] = author
                yield item

        # Crawl Next Page
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
Task
@shared_task
def task_scrape_from_author(author_name):
    """Scrape quotes from author"""
    django_path = Path(__file__).resolve().parent.parent
    os.chdir(str(django_path) + "/scraper")
    os.system(
        "scrapy crawl some-quotes -a author='{}'".format(author_name))
View
def scrape_quotes_from_author(request):
    if request.user.is_superuser:
        author_name = request.POST.get("athr_name")
        task_scrape_from_author.delay(author_name)
        messages.add_message(
            request, messages.INFO, 'Started crawling quotes from {}'.format(author_name))
        return HttpResponseRedirect(reverse("admin:index"))
    else:
        return HttpResponseRedirect("../")
GitHub Repo
I don't understand why the task is not getting completed and is interrupted without any messages. I also tried setting a max timeout, but that didn't work.
I made a table for spider_name and spider_class.
model.py
class Spiders(models.Model):
    spider_class = models.CharField(max_length=50, verbose_name="Spider Class", null=True)
    spider_name = models.CharField(max_length=50, verbose_name="Spider Name", null=True)
I collect all the spider names and classes here.
view.py
import importlib

from django.contrib import messages
from django.shortcuts import redirect

from .model import Spiders
from spider_dir.start import startallSpiders

def runAllspiders(request):
    all_class = []
    spiders = Spiders.objects.all()
    for spider in spiders:
        spider_name = spider.spider_name
        name = 'spider_dir.spider_dir.spiders.' + spider_name
        i = importlib.import_module(name)
        class_ = getattr(i, spider.spider_class)
        all_class.append(class_)
    try:
        startallSpiders(all_class)
        messages.success(request, "Spiders work fine")
    except:
        messages.warning(request, "An error occurred")
    return redirect(request.META['HTTP_REFERER'])
I made a start.py in the Scrapy dir.
I use crochet to start all of the spiders at once.
start.py
from scrapy.crawler import CrawlerRunner
from scrapy.settings import Settings
from crochet import setup

from .spider_dir import settings as st

setup()

def startallSpiders(all_Class):
    for class_ in all_Class:
        crawler_settings = Settings()
        setup()
        crawler_settings.setmodule(st)
        runner = CrawlerRunner(settings=crawler_settings)
        runner.crawl(class_)
settings.py: you have to hook the Django settings into the Scrapy settings.
import os,sys
sys.path.append(os.path.dirname(os.path.abspath('.')))
os.environ['DJANGO_SETTINGS_MODULE'] = 'django_project.settings'
import django
django.setup()
I figured this out with CrawlerRunner, and everything has worked fine for 6 months.
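With this setup the crawls run in the background on crochet's reactor thread and the view returns immediately. If you ever need a call that blocks until a single crawl finishes (for example in a management command), crochet's wait_for decorator can wrap it. A rough sketch under that assumption (run_one_spider is a name I made up):

from crochet import setup, wait_for
from scrapy.crawler import CrawlerRunner
from scrapy.settings import Settings

setup()

@wait_for(timeout=600.0)            # block the calling thread until the crawl finishes or times out
def run_one_spider(spider_class, settings_module):
    crawler_settings = Settings()
    crawler_settings.setmodule(settings_module)
    runner = CrawlerRunner(settings=crawler_settings)
    return runner.crawl(spider_class)   # wait_for waits on the Deferred returned here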

Python Tkinter: method for printing not working when called from another script

I have a class in app.py; within that class there's a method called print_raw_records_screen. Here's the relevant part of the class and the method:
app.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
from Tkinter import *
from ttk import *
import os
import mftsession
class Example(Frame):
    def __init__(self, parent):
        Frame.__init__(self, parent)
        self.parent = parent
        self.filename = ""
        self.initUI()

    # defining a function that receives the raw records from mftsession.py and prints them to the screen
    # this function will be called by mftsession.py in process_mftfile()
    def print_raw_records_screen(self, records):
        self.area.delete(1.0, "end")
        self.area.insert('1.0', records)

    def initUI(self):
        self.parent.title("Mtf Analyzer")

        # initializing and configuring menubar
        menubar = Menu(self.parent)
        self.parent.config(menu=menubar)
        fileMenu = Menu(menubar)
        fileMenu.add_command(label="Open file", command=self.fileOpen)
        fileMenu.add_command(label="Exit", command=self.onExit)
        menubar.add_cascade(label="File", menu=fileMenu)

        # specify grid row and column spaces
        self.pack(fill=BOTH, expand=True)
        self.columnconfigure(1, weight=1)
        self.columnconfigure(3, pad=7)
        self.rowconfigure(3, weight=1)
        self.rowconfigure(5, pad=7)

        lbl = Label(self, text="File Name")
        lbl.grid(row=1, column=0, sticky=W, pady=4, padx=5)
        self.filename_area = Entry(self)
        self.filename_area.grid(row=1, column=1, columnspan=5, padx=5, sticky=E+W+S+N)
        analize_button = Button(self, text="Analize", command=self.processFile)
        analize_button.grid(row=1, column=6, padx=5)

        # configure the raw output view
        self.area = Text(self)
        self.area.grid(row=2, column=1, columnspan=2, rowspan=4,
                       padx=5, sticky=E+W+S+N)

    def onExit(self):
        self.quit()

    # this function selects and opens the file to analize
    def fileOpen(self):
        from tkFileDialog import askopenfilename
        Tk().withdraw()
        self.filename = askopenfilename()
        # populate the filename field
        self.set(self.filename)

    # do the processing of the file obtained. Populate the file NAME entry or
    # send the filename to analyzeMFT.py
    def processFile(self):
        arguments = "analyzeMFT.py -f " + self.filename + " -d --bodyfull -l -o " + self.filename + ".csv"
        os.system(arguments)
        mftsession.MftSession.process_mft_file(self)

    # get and set methods for the entry field
    def get(self):
        return self.filename_area.get()

    def set(self, value):
        self.filename_area.delete(0, END)
        self.filename_area.insert(0, value)


def main():
    root = Tk()
    root.geometry("450x350+500+500")
    app = Example(root)
    root.mainloop()


if __name__ == '__main__':
    main()
This method is called by another, external script like this:
mftsession.py
import csv
import json
import os
import sys
from optparse import OptionParser
import mft
import app
from Tkinter import *
from ttk import *
class MftSession:
    """Class to describe an entire MFT processing session"""

    @staticmethod
    def fmt_excel(date_str):
        return '="{}"'.format(date_str)

    @staticmethod
    def fmt_norm(date_str):
        return date_str

    def __init__(self):
        self.mft = {}
        self.fullmft = {}
        self.folders = {}
        self.debug = False
        self.mftsize = 0

    def process_mft_file(self):
        root = Tk()
        appi = app.Example(root)
        self.sizecheck()
        self.build_filepaths()

        # reset the file reading
        self.num_records = 0
        self.file_mft.seek(0)
        raw_record = self.file_mft.read(1024)

        if self.options.output is not None:
            self.file_csv.writerow(mft.mft_to_csv(None, True, self.options))

        while raw_record != "":
            record = mft.parse_record(raw_record, self.options)
            if self.options.debug:
                print record
            appi.print_raw_records_screen(raw_record)  # THIS FUNCTION, WHEN INVOKED THIS WAY, IS NOT WORKING
            ..........
The script analyzeMFT.py, called by app.py through the os.system call above:
#!/usr/bin/python
import sys
from os import path
def main():
    session = mftsession.MftSession()
    session.mft_options()
    session.open_files()
    session.process_mft_file()

if __name__ == '__main__':
    if __package__ is None:
        sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
        import mftsession
        main()
    else:
        import mftsession
        main()
When called this way it prints nothing to the window, but when I invoke it for testing within its own class it prints the test string:
def print_raw_records_screen(self, records):
    self.area.delete(1.0, "end")
    self.area.insert('1.0', records)
and invoke it like print_raw_records_screen("any test") in app.py.
An image can tell a lot and summarize.
What am I doing wrong? I sense it's the instantiation I am doing wrong. I need directions, please.
From what I can see here you have a few issues causing problems.
You are importing app.py in mftsession.py instead of importing mftsession.py in app.py.
You are trying to use appi.print_raw_records_screen(raw_record) on a completely different instance of the Example() class created with appi = app.Example(root); remove that part altogether.
It is bad practice to import inside a function. Import at the start of each .py file.
There are so many things going on in your code that I had to create a Minimal, Complete, and Verifiable example of my own to illustrate the relation between the files.
Here is a simple example of how the 2 files can interact and the way I think you are trying to do things.
Here I have created a main file called app.py:
from Tkinter import *
# You might need to import the py file with the package name as well.
# Change the package name to the package your python files are located in.
import PACKAGE_NAME.file_b

class Example(Frame):
    def __init__(self, parent):
        Frame.__init__(self, parent)
        self.parent = parent
        self.filename = ""
        analize_button = Button(self.parent, text="Analize", command=self.processFile)
        analize_button.pack()
        self.area = Text(self.parent, height=2, width=40)
        self.area.pack()

    def print_raw_records_screen(self, records):
        self.area.delete(1.0, "end")
        self.area.insert('1.0', records)

    def processFile(self):
        PACKAGE_NAME.file_b.process_mft_file(self)

if __name__ == "__main__":
    root = Tk()
    app = Example(root)
    root.mainloop()
Here I have created a second file called file_b.py:
from Tkinter import *

def process_mft_file(self):
    record = "Some values assigned to a variable"
    self.print_raw_records_screen(record)
The results look something like this:

Second request doesn't call the callback

My method parse_ads_info is never called and I don't know why. No error occurs. I want to get the links for the ads (parse), go to the ads one by one (parse_ads_urls), and scrape their data (parse_ads_info), but this last method is never called.
Here is my code:
# -*- coding: utf-8 -*-
from scrapy import Request, Spider
#from zapimoveis.items import ads_info
from scrapy.selector import Selector
#from scrapy.loader import ItemLoader

proxy_list = ["###", "###"]
PROXY = "###"

class AdsSpider(Spider):
    name = "zapimoveis"
    allowed_domains = ["https://www.zapimoveis.com.br/", "https://www.zapimoveis.com.br/oferta/"]

    def __init__(self, start_url='', *args, **kwargs):
        super(AdsSpider, self).__init__(*args, **kwargs)
        self.start_urls = []
        self.start_urls.append(start_url)
        self.json = '#{"precomaximo":"2147483647","parametrosautosuggest":[{"B\
airro":"JD CAMBURI","Zona":"","Cidade":"VITORIA","Agrupame\
nto":"","Estado":"ES"}],"pagina":"%d","ordem":"DataAtualiz\
acao","paginaOrigem":"ResultadoBusca","semente":"213739135\
0","formato":"Lista"}'

    def start_requests(self):
        rq = Request(url=self.start_urls[0], callback=self.parse)
        rq.meta['proxy'] = PROXY
        yield rq

    def parse(self, response):
        n_pages = response.css('span[class="pull-right num-of"]::text') \
            .extract_first()
        n_pages = int(n_pages.replace("de ", ""))
        for i in range(1, n_pages + 1):
            rq = Request(url=self.start_urls[0] + (self.json % i),
                         callback=self.parse_ads_urls, dont_filter=True)
            rq.meta['proxy'] = PROXY
            yield rq

    def parse_ads_urls(self, response):
        for article in response.css('article[class=minificha]'):
            url_to_ads = article.css('a[class=btn-ver-detalhes]::attr(href)') \
                .extract_first()
            rq2 = Request(url=url_to_ads, callback=self.parse_ads_info,
                          dont_filter=True)
            rq2.meta['proxy'] = proxy_list[0]
            yield rq2

    def parse_ads_info(self, response):
        print "#--------->"
        print response.css('span[class=value-ficha]::text').extract_first()
I removed my personal proxies.
(2017-06-06) EDIT 1:
Output log : https://pastebin.com/4jv2r9um

How to perform the function, after all crawling is done in scrapy?

The spider_closed() function is not running as expected. If I put just a print statement in it, it prints, but if I perform a function call and return the value, it does not work.
import scrapy
import re
from pydispatch import dispatcher
from scrapy import signals
from SouthShore.items import Product
from SouthShore.internalData import internalApi
from scrapy.http import Request
class bestbuycaspider(scrapy.Spider):
    name = "bestbuy_dca"
    allowed_domains = ["bestbuy.ca"]
    start_urls = ["http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+beds",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+night+stand",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+headboard",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+desk",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+bookcase",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+dresser",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+tv+stand",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+armoire",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+kids",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+changing+table",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+baby"]

    def __init__(self, jsondetails="", serverdetails="", *args, **kwargs):
        super(bestbuycaspider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        self.jsondetails = jsondetails
        self.serverdetails = serverdetails
        self.data = []

    def parse(self, response):
        # my stuff here

    def spider_closed(self, spider):
        print "returning values"
        self.results['extractedData'] = self.data
        print self.results=internalApi(self.jsondetails,self.serverdetails)
        yield self.results
1) I want to call some function and return the scraped values
You can create an Item Pipeline with a close_spider() method:
class MyPipeline(object):
    def close_spider(self, spider):
        do_something_here()
Just don't forget to activate it in settings.py as described in the documentation linked above.
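For reference, activation is a single entry in the project's settings.py; a minimal sketch, where myproject.pipelines.MyPipeline is a placeholder for wherever you actually put the class:

# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.MyPipeline': 300,   # the number is the pipeline order (lower runs earlier)
}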

Link Extractor in Scrapy

I finally managed to get a working script.
Only one small issue: I can crawl all pages and get all the needed info, except for the first page.
Where is my error?
import scrapy.selector
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from Prijsvergelijking.items import PrijsvergelijkingItem
class MySpider(CrawlSpider):
    name = "coolblue"
    allowed_domains = ["tvstore.be"]
    start_urls = ["http://www.tvstore.be/category/192945/televisies.html"]

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="pagination next secondary"]',)), callback="parse_items", follow=True),)

    def parse_items(self, response):
        products = response.xpath("//li[@class='product-list-columns--item product-list-item']")
        for product in products:
            item = PrijsvergelijkingItem()
            item["Product_ref"] = product.xpath(".//h2/a/text()").extract_first().strip()
            item["Product_price"] = product.xpath(".//strong[1]/text()").extract_first().strip().replace(",", ".").replace("-", "")
            yield item
I didn't look hard enough.
I found the answer: all I had to do was change parse_items to parse_start_url.
from scrapy.spiders import CrawlSpider, Rule
import scrapy.selector
from scrapy.linkextractors import LinkExtractor
from Prijsvergelijking.items import PrijsvergelijkingItem
class MySpider(CrawlSpider):
    name = "msh"
    allowed_domains = ["mediamarkt.be"]
    start_urls = ["http://www.mediamarkt.be/mcs/productlist/_TV,98952,452540.html?langId=-17&searchParams=&sort=&view=&page=1"]

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//li[@class="pagination-next"]',)), callback="parse_start_url", follow=True),)

    def parse_start_url(self, response):
        products = response.xpath("//ul[@class='products-list']/li/div")
        for product in products:
            item = PrijsvergelijkingItem()
            item["Product_price"] = product.xpath('.//aside/div/div/div/text()').extract_first().replace(",", ".").replace("-", "")
            item["Product_ref"] = product.xpath('.//div/h2/a/text()').extract_first().strip()
            yield item