How to run scrapy spider with celery - django

I'm trying to run a Scrapy spider that takes some arguments, launching it with os.system, but the Celery task (the scraper) never gets executed to completion.
Spider
class SpecificAuthorQuotesSpider(scrapy.Spider):
    """Extracts the quotes from specific author"""
    start_urls = ['https://quotes.toscrape.com/']
    name = "some-quotes"

    def __init__(self, author=None, **kwargs):
        self.author = author
        super().__init__(**kwargs)

    def parse(self, response, **kwargs):
        item = QuotesItem()
        all_div_quotes = response.css('div.quote')
        for quote in all_div_quotes:
            title = quote.css('span.text::text').extract_first().replace('”', '').replace("“", "")
            author = quote.css('.author::text').extract_first()
            # Check if author's name matches
            if author.strip().lower() == self.author.strip().lower():
                item['text'] = title
                item['author'] = author
                yield item
        # Crawl Next Page
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
Task
import os
from pathlib import Path
from celery import shared_task

@shared_task
def task_scrape_from_author(author_name):
    """Scrape quotes from author"""
    django_path = Path(__file__).resolve().parent.parent
    os.chdir(str(django_path) + "/scraper")
    os.system(
        "scrapy crawl some-quotes -a author='{}'".format(author_name))
View
def scrape_quotes_from_author(request):
    if request.user.is_superuser:
        author_name = request.POST.get("athr_name")
        task_scrape_from_author.delay(author_name)
        messages.add_message(
            request, messages.INFO, 'Started crawling quotes from {}'.format(author_name))
        return HttpResponseRedirect(reverse("admin:index"))
    else:
        return HttpResponseRedirect("../")
Github Repo
I don't understand why the task doesn't get completed and is interrupted without any messages. I tried setting a max timeout as well, but that didn't work either.
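For what it's worth, os.system hides the crawler's output and exit status from Celery, which makes failures like this hard to see. Here is a minimal sketch of the same task using subprocess instead; the task and spider names come from the code above, everything else is an assumption:

import subprocess
from pathlib import Path
from celery import shared_task

@shared_task
def task_scrape_from_author(author_name):
    """Run the spider in a child process and fail loudly if scrapy exits non-zero."""
    scraper_dir = Path(__file__).resolve().parent.parent / "scraper"
    subprocess.run(
        ["scrapy", "crawl", "some-quotes", "-a", "author={}".format(author_name)],
        cwd=str(scraper_dir),  # avoids os.chdir, which changes state for the whole worker process
        check=True,            # raises CalledProcessError so Celery records the task as failed
    )

With check=True any Scrapy error is raised inside the task and shows up in the Celery worker log instead of disappearing silently.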

I made a table for the spider name and spider class.
model.py
class Spiders(models.Model):
    spider_class = models.CharField(max_length=50, verbose_name="Spider Class", null=True)
    spider_name = models.CharField(max_length=50, verbose_name="Spider Name", null=True)
I collect all the spider names and classes here.
view.py
import importlib

from django.contrib import messages
from django.shortcuts import redirect

from .model import Spiders
from spider_dir.start import startallSpiders

def runAllspiders(request):
    all_class = []
    spiders = Spiders.objects.all()
    for spider in spiders:
        spider_name = spider.spider_name
        name = 'spider_dir.spider_dir.spiders.' + spider_name
        i = importlib.import_module(name)
        class_ = getattr(i, spider.spider_class)
        all_class.append(class_)
    try:
        startallSpiders(all_class)
        messages.success(request, "Spiders work fine")
    except Exception:
        messages.warning(request, "An error occurred")
    return redirect(request.META['HTTP_REFERER'])
I made a start.py in the Scrapy directory.
I use crochet to start all of the spiders at once.
start.py
from .spider_dir import settings as st
from scrapy.settings import Settings
from scrapy.crawler import CrawlerRunner
from crochet import setup

setup()

def startallSpiders(all_Class):
    for class_ in all_Class:
        crawler_settings = Settings()
        crawler_settings.setmodule(st)
        runner = CrawlerRunner(settings=crawler_settings)
        runner.crawl(class_)
settings.py
You have to wire the Django settings into the Scrapy settings module:
import os,sys
sys.path.append(os.path.dirname(os.path.abspath('.')))
os.environ['DJANGO_SETTINGS_MODULE'] = 'django_project.settings'
import django
django.setup()
I figured this out with CrawlerRunner and everything has worked fine for 6 months.
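If you want to kick this off from Celery rather than a view (as in the original question), the same pieces can be reused. This is only a sketch, assuming the startallSpiders helper and the Spiders model shown above; the task name and file layout are assumptions:

# tasks.py
import importlib
from celery import shared_task
from .model import Spiders
from spider_dir.start import startallSpiders

@shared_task
def task_run_all_spiders():
    # Load every registered spider class and hand them to crochet/CrawlerRunner
    all_class = []
    for spider in Spiders.objects.all():
        module = importlib.import_module('spider_dir.spider_dir.spiders.' + spider.spider_name)
        all_class.append(getattr(module, spider.spider_class))
    startallSpiders(all_class)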

Related

Multi spiders on scrapy

I use Django, Celery and Scrapy.
My settings for Celery:
from kombu import Exchange, Queue

CELERY_BROKER_URL = 'amqp://****/myvhost'
CELERY_TIMEZONE = TIME_ZONE
CELERYD_CONCURRENCY = 1000
CELERYD_MAX_TASKS_PER_CHILD = 4
CELERY_IGNORE_RESULT = True

# django celery
CELERY_RESULT_BACKEND = 'django-db'

# celery queues setup
CELERY_DEFAULT_QUEUE = 'default'
CELERY_DEFAULT_ROUTING_KEY = 'default'
CELERY_QUEUES = (
    Queue('get_context', Exchange('get_context'), routing_key='get_context'),
    Queue('get_article', Exchange('get_article'), routing_key='get_article'),
)
CELERY_ROUTES = {
    'parse.tasks.get_context': {
        'queue': 'get_context',
        'routing_key': 'get_context',
    },
    'parse.tasks.get_article': {
        'queue': 'get_article',
        'routing_key': 'get_article',
    },
}
There are two Celery tasks:
from api_parser import celery_app
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy_parser.scrapy_parser.spiders.map_links import MapLinksSpider
from scrapy_parser.scrapy_parser.spiders.articles import ArticlesSpider
from threading import Thread

@celery_app.task
def get_context(rules_id, rules):
    process = CrawlerProcess(get_project_settings())
    process.crawl(MapLinksSpider, rules_id=rules_id, rules=rules)
    Thread(target=process.start).start()

@celery_app.task
def get_article(rules_id, link_id, rules, link):
    process = CrawlerProcess(get_project_settings())
    process.crawl(ArticlesSpider, rules_id=rules_id, link_id=link_id, rules=rules, link=link)
    Thread(target=process.start).start()
The first task is triggered by a signal and maps the links.
The second task is started when a new link is added to the database.
My signals in django:
from django.db.models.signals import post_save
from django.dispatch import receiver
from parse.models.rules import Scheduler, Rules, ParseLinks
from parse.tasks import get_context, get_article

@receiver(post_save, sender=Scheduler)
def create_task_get_context(sender, instance, created, **kwargs):
    if created:
        rules = Rules.objects.get(id=int(instance.rules.id))
        get_context.delay(int(rules.id), str(rules.rules))

@receiver(post_save, sender=ParseLinks)
def create_task_get_article(sender, instance, created, **kwargs):
    if created:
        parse_link = ParseLinks.objects.get(id=int(instance.id))
        get_article.delay(int(parse_link.rules.id), int(parse_link.id), str(parse_link.rules.rules), str(parse_link.link))
My spiders:
map_links.py
from parse.models.rules import ParseLinks
import scrapy
import json

class MapLinksSpider(scrapy.Spider):
    name = "map_links"
    start_urls = []

    def __init__(self, **kw):
        super(MapLinksSpider, self).__init__(**kw)
        self.rules_id = kw.get('rules_id')
        self.rules = json.loads(kw.get('rules'))
        self.start_urls = [self.rules['url']]
        self.templates = self.rules['item']['templates']
        self.pagination = self.rules['pagination']

    def parse(self, response):
        for item in self.templates:
            context = response.css(str(item['context']))
            for row in context:
                link = row.css('%s::attr(%s)' % (item['link']['cssSelector'], item['link']['attr'])).extract_first(),
                title = row.css('%s::text' % item['options']['title']['cssSelector']).extract_first(),
                date = row.css('%s::text' % item['options']['date']['cssSelector']).extract_first()
                ParseLinks.objects.get_or_create(rules_id=self.rules_id, link=self.rules['url'] + link[0], title=title, date=date)
        next_page = response.css('%s::attr(%s)' % (self.pagination['link']['cssSelector'], self.pagination['link']['attr'])).extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
articles.py
from parse.models.rules import ParseData
import scrapy
import json

class ArticlesSpider(scrapy.Spider):
    name = "articles"
    start_urls = []

    def __init__(self, **kw):
        super(ArticlesSpider, self).__init__(**kw)
        self.rules_id = kw.get('rules_id')
        self.link_id = kw.get('link_id')
        self.rules = json.loads(kw.get('rules'))
        self.link = kw.get('link')

    def parse(self, response):
        self.start_urls = [self.link]
        title = response.css('%s::text' % self.rules['article']['title']['cssSelector']).extract_first()
        text = response.css('%s::text' % self.rules['article']['text']['cssSelector']).extract_first()
        ParseData.objects.create(rules_id=self.rules_id, link_id=self.link_id, title=title, text=text)
        yield {
            "title": title,
            'text': text
        }
But I get the error: twisted.internet.error.ReactorNotRestartable
I understand that the error is caused by the launch of a new process for the spider. But I'm using threads. And I do not understand why this does not solve my problem.
I think every beginning scraper meets this question :)
Try this:
0) pip install crochet
1) Add from crochet import setup and call setup() at the top of the file
2) Remove these 2 lines:
a) d.addBoth(lambda _: reactor.stop())
b) reactor.run()
The only meaningful lines left from the [Scrapy docs][2] are the last 2 lines of my code:
from importlib import import_module

from crochet import setup
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

setup()

def run_spider(spiderName):
    module_name = "first_scrapy.spiders.{}".format(spiderName)
    scrapy_var = import_module(module_name)          # do some dynamic import of the selected spider
    spiderObj = scrapy_var.mySpider()                # get the mySpider object from the spider module
    crawler = CrawlerRunner(get_project_settings())  # from Scrapy docs
    crawler.crawl(spiderObj)                         # from Scrapy docs
This code lets you select which spider to run just by passing its name to the run_spider function; after scraping finishes, you can select another spider and run it again.
Then you simply call run_spider from a Celery task.
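Such a Celery task can stay trivial. A minimal sketch, assuming the run_spider function above (the module path holding it is an assumption, adjust it to your project):

from celery import shared_task
from first_scrapy.run import run_spider  # hypothetical module holding run_spider

@shared_task
def crawl(spider_name):
    # crochet's setup() keeps the Twisted reactor alive between calls,
    # so this task can be invoked repeatedly without ReactorNotRestartable
    run_spider(spider_name)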
[1]: ReactorNotRestartable - Twisted and scrapy
[2]: https://doc.scrapy.org/en/latest/topics/practices.html

Google App Engine: Kind Error

Following is my program
import os
import jinja2
import re
from string import letters
import webapp2
from google.appengine.ext import db
template_dir = os.path.join(os.path.dirname(__file__), 'templates')
jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader(template_dir), autoescape=True)

class Handler(webapp2.RequestHandler):
    def write(self, *a, **kw):
        self.response.write(*a, **kw)

    def render_str(self, template, **params):
        t = jinja_env.get_template(template)
        return t.render(params)

    def render(self, template, **kw):
        self.write(self.render_str(template, **kw))

def blog_key(name="default"):
    return db.Key.from_path('blogs', name)

class Post(db.Model):
    title = db.StringProperty(required=True)
    content = db.TextProperty(required=True)
    created = db.DateTimeProperty(auto_now_add=True)
    edited = db.DateTimeProperty(auto_now=True)

    def render(self):
        self._render_text = self.content.replace('\n', '<br>')
        return render_str("blogs.html", p=self)

class BlogFront(Handler):
    def get(self):
        posts = db.GqlQuery("select * from posts order by desc limit 10")
        self.render("front.html", posts=posts)

# for a link to the new posts created
class PostPage(Handler):
    def get(self, post_id):
        key = db.Key.from_path("Post", int(post_id), parent=blog_key())
        post = db.get(key)
        if not post:
            self.error(404)
            return
        self.render("permalink.html", post=post)

# for new blog entries
class NewPost(Handler):
    def get(self):
        self.render('newpost.html')

    def post(self):
        title = self.request.get("title")
        content = self.request.get("content")
        if title and content:
            p = Post(parent=blog_key(), title=title, content=content)
            p.put()
            self.redirect('/blogs/%s' % str(p.key().id()))
        else:
            error = "Please write both title and content!!"
            self.render("newpost.html", title=title, content=content, error=error)

app = webapp2.WSGIApplication([
    ('/blog/newpost', NewPost),
    ('/blogs/?', BlogFront),
    ('/blogs/([0-9]+)', PostPage),  # anything in the bracket will be passed as the parameter
], debug=True)
But when I'm trying to implement this program, I'm getting the following error:
File "C:\Users\tan31102\AppData\Local\Google\Cloud
SDK\google-cloud-sdk\platfo
rm\google_appengine\google\appengine\ext\db__init__.py", line 299, in
class_for
_kind
raise KindError('No implementation for kind \'%s\'' % kind) KindError: No implementation for kind 'posts'
Can someone please help me with this.
Your GQL uses the kind in plural form, "posts":
select * from posts order by desc limit 10
while the db.Model class you have declared uses the singular form, Post:
class Post(db.Model):
You need to stick with one form. Also, you should consider using ndb.Model instead of db.Model, as stated in the docs.
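For example, once the query names the declared kind it should load fine. A sketch; created is the DateTimeProperty from the Post model above:

# query the Post kind, newest first
posts = db.GqlQuery("SELECT * FROM Post ORDER BY created DESC LIMIT 10")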
This KindError also happens when you haven't imported the model definition in the .py file that uses it. It sounds like you are trying to load a posts entity from a session without importing the posts model first. To ensure that posts is available when the session middleware runs, you must import the posts model in your script:
from posts import posts
or something similar.

Second request doesn't call the callback

My method parse_ads_info is never called and I don't know why. No error occurs. I want to get the links for each ad (parse), go to the ads one by one (parse_ads_urls) and scrape the data (parse_ads_info), but this last method is never called.
Here is my code:
# -*- coding: utf-8 -*-
from scrapy import Request, Spider
#from zapimoveis.items import ads_info
from scrapy.selector import Selector
#from scrapy.loader import ItemLoader

proxy_list = ["###", "###"]
PROXY = "###"

class AdsSpider(Spider):
    name = "zapimoveis"
    allowed_domains = ["https://www.zapimoveis.com.br/", "https://www.zapimoveis.com.br/oferta/"]

    def __init__(self, start_url='', *args, **kwargs):
        super(AdsSpider, self).__init__(*args, **kwargs)
        self.start_urls = []
        self.start_urls.append(start_url)
        self.json = ('#{"precomaximo":"2147483647","parametrosautosuggest":'
                     '[{"Bairro":"JD CAMBURI","Zona":"","Cidade":"VITORIA",'
                     '"Agrupamento":"","Estado":"ES"}],"pagina":"%d",'
                     '"ordem":"DataAtualizacao","paginaOrigem":"ResultadoBusca",'
                     '"semente":"2137391350","formato":"Lista"}')

    def start_requests(self):
        rq = Request(url=self.start_urls[0], callback=self.parse)
        rq.meta['proxy'] = PROXY
        yield rq

    def parse(self, response):
        n_pages = response.css('span[class="pull-right num-of"]::text') \
            .extract_first()
        n_pages = int(n_pages.replace("de ", ""))
        for i in range(1, n_pages + 1):
            rq = Request(url=self.start_urls[0] + (self.json % i),
                         callback=self.parse_ads_urls, dont_filter=True)
            rq.meta['proxy'] = PROXY
            yield rq

    def parse_ads_urls(self, response):
        for article in response.css('article[class=minificha]'):
            url_to_ads = article.css('a[class=btn-ver-detalhes]::attr(href)') \
                .extract_first()
            rq2 = Request(url=url_to_ads, callback=self.parse_ads_info,
                          dont_filter=True)
            rq2.meta['proxy'] = proxy_list[0]
            yield rq2

    def parse_ads_info(self, response):
        print "#--------->"
        print response.css('span[class=value-ficha]::text').extract_first()
I removed my personal proxies.
(2017-06-06) EDIT 1:
Output log: https://pastebin.com/4jv2r9um

How to perform a function after all crawling is done in Scrapy?

The spider_closed() function is not executing. If I just put a print statement in it, it prints, but if I perform any function call and return the value, it doesn't work.
import scrapy
import re
from pydispatch import dispatcher
from scrapy import signals
from SouthShore.items import Product
from SouthShore.internalData import internalApi
from scrapy.http import Request

class bestbuycaspider(scrapy.Spider):
    name = "bestbuy_dca"
    allowed_domains = ["bestbuy.ca"]
    start_urls = ["http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+beds",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+night+stand",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+headboard",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+desk",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+bookcase",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+dresser",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+tv+stand",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+armoire",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+kids",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+changing+table",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+baby"]

    def __init__(self, jsondetails="", serverdetails="", *args, **kwargs):
        super(bestbuycaspider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        self.jsondetails = jsondetails
        self.serverdetails = serverdetails
        self.data = []

    def parse(self, response):
        # my stuff here
        pass

    def spider_closed(self, spider):
        print "returning values"
        self.results = internalApi(self.jsondetails, self.serverdetails)
        self.results['extractedData'] = self.data
        print self.results
        yield self.results
1) I want to call some function and return the scraped values
You can create an Item Pipeline with a close_spider() method:
class MyPipeline(object):
    def close_spider(self, spider):
        do_something_here()
Just don't forget to activate it in settings.py as described in the documentation link above.
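For reference, activating the pipeline is just an ITEM_PIPELINES entry in settings.py; the module path below is an assumption based on the SouthShore project name in the imports above:

# settings.py
ITEM_PIPELINES = {
    'SouthShore.pipelines.MyPipeline': 300,  # lower numbers run earlier in the pipeline chain
}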

Django Form use of user id

The following code is working nicely:
class SelectTwoTeams(forms.Form):
    campaignnoquery = UserSelection.objects.filter(user=349).order_by('-campaignno')[:1]
    currentCampaignNo = campaignnoquery[0].campaignno
    cantSelectTeams = UserSelection.objects.filter(campaignno=currentCampaignNo)
    currentTeams = StraightredTeam.objects.filter(currentteam=1).exclude(teamid__in=cantSelectTeams.values_list('teamselectionid', flat=True))
    team_one = forms.ModelChoiceField(queryset=currentTeams)
    team_two = forms.ModelChoiceField(queryset=currentTeams)
However, you can see that the user id is currently hardcoded into the filter as 349. I would like this to be the id of the user logged in. I know in the view I can use:
currentUser = request.user
currentUserID = currentUser.id
But this code does not work within the forms section. If anyone could point me in the correct direction that would be ideal.
When I follow the suggestion below using the following form I get an error saying: NameError: name 'currentUserID' is not defined
# coding=utf-8
from dwad.threadlocals import get_current_user
from django.db.models import Max
from django import forms
from straightred.models import StraightredTeam
from straightred.models import UserSelection

class SelectTwoTeams(forms.Form):
    def save(self):
        currentUser = get_current_user()
        currentUserID = currentUser.id
        campaignnoquery = UserSelection.objects.filter(user=currentUserID).order_by('-campaignno')[:1]
        currentCampaignNo = campaignnoquery[0].campaignno
        cantSelectTeams = UserSelection.objects.filter(campaignno=currentCampaignNo)
        currentTeams = StraightredTeam.objects.filter(currentteam=1).exclude(teamid__in=cantSelectTeams.values_list('teamselectionid', flat=True))
        team_one = forms.ModelChoiceField(queryset=currentTeams)
        team_two = forms.ModelChoiceField(queryset=currentTeams)
Many thanks, Alan.
One method is to use threading.local. I have used this solution on a number of Django installations to good effect.
I know there are a number of different opinions on whether this is a good or bad solution. I tend to fall into the category that it can be extremely good in the right circumstances.
To set it up, create a file called threadlocals.py:
try:
    from threading import local
except ImportError:
    from django.utils._threading_local import local

_thread_locals = local()

def get_current_user():
    return getattr(_thread_locals, 'user', None)

class ThreadLocalsMiddleware(object):
    def process_request(self, request):
        _thread_locals.user = getattr(request, 'user', None)
Then, add this ThreadLocalsMiddleware class to your project's middleware in settings.py:
MIDDLEWARE_CLASSES = [
    ...
    'myproject.threadlocals.ThreadLocalsMiddleware',
    ...
]
Now, all you need to do is call the method get_current_user() from anywhere in your project.
from myproject.threadlocals import get_current_user

class SelectTwoTeams(forms.Form):
    def save(self):
        # for example:
        currentUser = get_current_user()
        currentUserID = currentUser.id
Found this answer at Reddit.
It is very simple and it works well in my case.
In your view you have to include some code like this:
if form.is_valid():
    obj = form.save(commit=False)
    obj.user = request.user  # the logged-in user is available on the view's `request` instance
    obj.save()  # safe to save with the user in tow
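If you need the logged-in user inside the form itself (for example to build the querysets in SelectTwoTeams), a common alternative to the thread-locals approach above is to pass the user in from the view. This is only a sketch under that assumption, reusing the models from the question; the `user` kwarg name is made up:

# forms.py
from django import forms
from straightred.models import StraightredTeam, UserSelection

class SelectTwoTeams(forms.Form):
    def __init__(self, *args, **kwargs):
        user = kwargs.pop('user', None)  # supplied by the view
        super(SelectTwoTeams, self).__init__(*args, **kwargs)
        campaignnoquery = UserSelection.objects.filter(user=user.id).order_by('-campaignno')[:1]
        currentCampaignNo = campaignnoquery[0].campaignno
        cantSelectTeams = UserSelection.objects.filter(campaignno=currentCampaignNo)
        currentTeams = StraightredTeam.objects.filter(currentteam=1).exclude(
            teamid__in=cantSelectTeams.values_list('teamselectionid', flat=True))
        # define the fields per instance so the queryset reflects the current user
        self.fields['team_one'] = forms.ModelChoiceField(queryset=currentTeams)
        self.fields['team_two'] = forms.ModelChoiceField(queryset=currentTeams)

# views.py
# form = SelectTwoTeams(request.POST or None, user=request.user)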