My method parse_ads_info is never called and I don't know why. No error occurs. I want to get the links for the ads (parse), go to the ads one by one (parse_ads_urls) and scrape their data (parse_ads_info), but this last method is never called.
Here is my code:
# -*- coding: utf-8 -*-
from scrapy import Request, Spider
#from zapimoveis.items import ads_info
from scrapy.selector import Selector
#from scrapy.loader import ItemLoader

proxy_list = ["###", "###"]
PROXY = "###"


class AdsSpider(Spider):
    name = "zapimoveis"
    allowed_domains = ["https://www.zapimoveis.com.br/",
                       "https://www.zapimoveis.com.br/oferta/"]

    def __init__(self, start_url='', *args, **kwargs):
        super(AdsSpider, self).__init__(*args, **kwargs)
        self.start_urls = []
        self.start_urls.append(start_url)
        self.json = '#{"precomaximo":"2147483647","parametrosautosuggest":[{"Bairro":"JD CAMBURI","Zona":"","Cidade":"VITORIA","Agrupamento":"","Estado":"ES"}],"pagina":"%d","ordem":"DataAtualizacao","paginaOrigem":"ResultadoBusca","semente":"2137391350","formato":"Lista"}'

    def start_requests(self):
        rq = Request(url=self.start_urls[0], callback=self.parse)
        rq.meta['proxy'] = PROXY
        yield rq

    def parse(self, response):
        n_pages = response.css('span[class="pull-right num-of"]::text') \
            .extract_first()
        n_pages = int(n_pages.replace("de ", ""))
        for i in range(1, n_pages + 1):
            rq = Request(url=self.start_urls[0] + (self.json % i),
                         callback=self.parse_ads_urls, dont_filter=True)
            rq.meta['proxy'] = PROXY
            yield rq

    def parse_ads_urls(self, response):
        for article in response.css('article[class=minificha]'):
            url_to_ads = article.css('a[class=btn-ver-detalhes]::attr(href)') \
                .extract_first()
            rq2 = Request(url=url_to_ads, callback=self.parse_ads_info,
                          dont_filter=True)
            rq2.meta['proxy'] = proxy_list[0]
            yield rq2

    def parse_ads_info(self, response):
        print "#--------->"
        print response.css('span[class=value-ficha]::text').extract_first()
I removed my personal proxies.
(2017-06-06) EDIT 1:
Output log: https://pastebin.com/4jv2r9um
I'm trying to run a Scrapy spider which takes some arguments, running it with os.system. But the Celery task (the scraper) doesn't run to completion.
Spider
class SpecificAuthorQuotesSpider(scrapy.Spider):
    """Extracts the quotes from a specific author"""
    start_urls = ['https://quotes.toscrape.com/']
    name = "some-quotes"

    def __init__(self, author=None, **kwargs):
        self.author = author
        super().__init__(**kwargs)

    def parse(self, response, **kwargs):
        item = QuotesItem()
        all_div_quotes = response.css('div.quote')

        for quote in all_div_quotes:
            title = quote.css('span.text::text').extract_first().replace('”', '').replace("“", "")
            author = quote.css('.author::text').extract_first()

            # Check if the author's name matches
            if author.strip().lower() == self.author.strip().lower():
                item['text'] = title
                item['author'] = author
                yield item

        # Crawl the next page
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
Task
import os
from pathlib import Path

from celery import shared_task


@shared_task
def task_scrape_from_author(author_name):
    """Scrape quotes from an author"""
    django_path = Path(__file__).resolve().parent.parent
    os.chdir(str(django_path) + "/scraper")
    os.system(
        "scrapy crawl some-quotes -a author='{}'".format(author_name))
View
def scrape_quotes_from_author(request):
    if request.user.is_superuser:
        author_name = request.POST.get("athr_name")
        task_scrape_from_author.delay(author_name)
        messages.add_message(
            request, messages.INFO, 'Started crawling quotes from {}'.format(author_name))
        return HttpResponseRedirect(reverse("admin:index"))
    else:
        return HttpResponseRedirect("../")
Github Repo
I don't understand why the task is not getting completed and is interrupted without any messages. I tried setting a max timeout as well, but that didn't work.
I made a table for spider_name and spider_class.
model.py
class Spiders(models.Model):
    spider_class = models.CharField(max_length=50, verbose_name="Spider Class", null=True)
    spider_name = models.CharField(max_length=50, verbose_name="Spider Name", null=True)
I collect all the spider names and classes here.
view.py
import importlib

from django.contrib import messages
from django.shortcuts import redirect

from .model import Spiders
from spider_dir.start import startallSpiders


def runAllspiders(request):
    all_class = []
    spiders = Spiders.objects.all()
    for spider in spiders:
        spider_name = spider.spider_name
        name = 'spider_dir.spider_dir.spiders.' + spider_name
        i = importlib.import_module(name)
        class_ = getattr(i, spider.spider_class)
        all_class.append(class_)
    try:
        startallSpiders(all_class)
        messages.success(request, "Spiders work fine")
    except Exception:
        messages.warning(request, "An error occurred")
    return redirect(request.META['HTTP_REFERER'])
I made a start.py in the Scrapy dir.
I use crochet to start all of the spiders at once.
start.py
from scrapy.crawler import CrawlerRunner
from scrapy.settings import Settings
from crochet import setup

from .spider_dir import settings as st

setup()


def startallSpiders(all_Class):
    for class_ in all_Class:
        crawler_settings = Settings()
        crawler_settings.setmodule(st)
        runner = CrawlerRunner(settings=crawler_settings)
        runner.crawl(class_)
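If you need the calling code to block until the crawls actually finish (the snippet above just schedules them and returns), crochet's wait_for decorator can wrap the same logic. This is a minimal sketch of mine, not part of the original setup; the function name and the timeout value are arbitrary:

from crochet import setup, wait_for
from scrapy.crawler import CrawlerRunner
from scrapy.settings import Settings

setup()


@wait_for(timeout=600)
def run_one_spider(spider_class, settings_module):
    crawler_settings = Settings()
    crawler_settings.setmodule(settings_module)
    runner = CrawlerRunner(settings=crawler_settings)
    # crawl() returns a Deferred; wait_for blocks the calling thread until it fires
    return runner.crawl(spider_class)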
settings.py: you have to append the Django settings to the Scrapy settings.
import os
import sys

sys.path.append(os.path.dirname(os.path.abspath('.')))
os.environ['DJANGO_SETTINGS_MODULE'] = 'django_project.settings'

import django
django.setup()
I figured this out with CrawlerRunner and everything has worked fine for 6 months.
I use Django, Celery and Scrapy.
My settings for Celery:
from kombu import Exchange, Queue

CELERY_BROKER_URL = 'amqp://****/myvhost'
CELERY_TIMEZONE = TIME_ZONE
CELERYD_CONCURRENCY = 1000
CELERYD_MAX_TASKS_PER_CHILD = 4
CELERY_IGNORE_RESULT = True

# django celery
CELERY_RESULT_BACKEND = 'django-db'

# celery queues setup
CELERY_DEFAULT_QUEUE = 'default'
CELERY_DEFAULT_ROUTING_KEY = 'default'
CELERY_QUEUES = (
    Queue('get_context', Exchange('get_context'), routing_key='get_context'),
    Queue('get_article', Exchange('get_article'), routing_key='get_article'),
)
CELERY_ROUTES = {
    'parse.tasks.get_context': {
        'queue': 'get_context',
        'routing_key': 'get_context',
    },
    'parse.tasks.get_article': {
        'queue': 'get_article',
        'routing_key': 'get_article',
    },
}
There are two tasks in Celery:
from threading import Thread

from api_parser import celery_app
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy_parser.scrapy_parser.spiders.map_links import MapLinksSpider
from scrapy_parser.scrapy_parser.spiders.articles import ArticlesSpider


@celery_app.task
def get_context(rules_id, rules):
    process = CrawlerProcess(get_project_settings())
    process.crawl(MapLinksSpider, rules_id=rules_id, rules=rules)
    Thread(target=process.start).start()


@celery_app.task
def get_article(rules_id, link_id, rules, link):
    process = CrawlerProcess(get_project_settings())
    process.crawl(ArticlesSpider, rules_id=rules_id, link_id=link_id, rules=rules, link=link)
    Thread(target=process.start).start()
The first task is triggered by a signal and maps the links.
The second task is started when a new link is added to the database.
My signals in Django:
from django.db.models.signals import post_save
from django.dispatch import receiver

from parse.models.rules import Scheduler, Rules, ParseLinks
from parse.tasks import get_context, get_article


@receiver(post_save, sender=Scheduler)
def create_task_get_context(sender, instance, created, **kwargs):
    if created:
        rules = Rules.objects.get(id=int(instance.rules.id))
        get_context.delay(int(rules.id), str(rules.rules))


@receiver(post_save, sender=ParseLinks)
def create_task_get_article(sender, instance, created, **kwargs):
    if created:
        parse_link = ParseLinks.objects.get(id=int(instance.id))
        get_article.delay(int(parse_link.rules.id), int(parse_link.id), str(parse_link.rules.rules), str(parse_link.link))
My spiders:
map_links.py
from parse.models.rules import ParseLinks
import scrapy
import json


class MapLinksSpider(scrapy.Spider):
    name = "map_links"
    start_urls = []

    def __init__(self, **kw):
        super(MapLinksSpider, self).__init__(**kw)
        self.rules_id = kw.get('rules_id')
        self.rules = json.loads(kw.get('rules'))
        self.start_urls = [self.rules['url']]
        self.templates = self.rules['item']['templates']
        self.pagination = self.rules['pagination']

    def parse(self, response):
        for item in self.templates:
            context = response.css(str(item['context']))
            for row in context:
                link = row.css('%s::attr(%s)' % (item['link']['cssSelector'], item['link']['attr'])).extract_first(),
                title = row.css('%s::text' % item['options']['title']['cssSelector']).extract_first(),
                date = row.css('%s::text' % item['options']['date']['cssSelector']).extract_first()
                ParseLinks.objects.get_or_create(rules_id=self.rules_id, link=self.rules['url'] + link[0], title=title, date=date)
        next_page = response.css('%s::attr(%s)' % (self.pagination['link']['cssSelector'], self.pagination['link']['attr'])).extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)
articles.py
from parse.models.rules import ParseData
import scrapy
import json


class ArticlesSpider(scrapy.Spider):
    name = "articles"
    start_urls = []

    def __init__(self, **kw):
        super(ArticlesSpider, self).__init__(**kw)
        self.rules_id = kw.get('rules_id')
        self.link_id = kw.get('link_id')
        self.rules = json.loads(kw.get('rules'))
        self.link = kw.get('link')

    def parse(self, response):
        self.start_urls = [self.link]
        title = response.css('%s::text' % self.rules['article']['title']['cssSelector']).extract_first()
        text = response.css('%s::text' % self.rules['article']['text']['cssSelector']).extract_first()
        ParseData.objects.create(rules_id=self.rules_id, link_id=self.link_id, title=title, text=text)
        yield {
            "title": title,
            'text': text
        }
But I get the error: twisted.internet.error.ReactorNotRestartable
I understand that the error is caused by the launch of a new process for the spider. But I'm using threads. And I do not understand why this does not solve my problem.
I think everyone starting out with scraping runs into this question :)
Try this:
1) pip install crochet
2) add from crochet import setup to your imports
3) call setup() at the top of the file
4) remove these 2 lines:
   a) d.addBoth(lambda _: reactor.stop())
   b) reactor.run()
The only meaningful lines left from the [Scrapy docs][2] are the last 2 lines of this code of mine:
# some more imports
from importlib import import_module
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from crochet import setup

setup()


def run_spider(spiderName):
    module_name = "first_scrapy.spiders.{}".format(spiderName)
    scrapy_var = import_module(module_name)    # do some dynamic import of the selected spider
    spiderObj = scrapy_var.mySpider()          # get the mySpider object from the spider module
    crawler = CrawlerRunner(get_project_settings())   # from the Scrapy docs
    crawler.crawl(spiderObj)                           # from the Scrapy docs
This code lets you select which spider to run just by passing its name to the run_spider function, and after scraping finishes you can select another spider and run it again.
Then you simply call run_spider from a Celery task (see the sketch after the links below).
[1]: ReactorNotRestartable - Twisted and scrapy
[2]: https://doc.scrapy.org/en/latest/topics/practices.html
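For completeness, calling run_spider from a Celery task could look like the sketch below; the task name is hypothetical and it assumes run_spider is importable and your Celery app is already wired up:

from celery import shared_task


@shared_task
def crawl_task(spider_name):
    # run_spider() only schedules the crawl via CrawlerRunner and returns quickly;
    # crochet keeps the Twisted reactor alive in the worker between calls
    run_spider(spider_name)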
I am attempting to build a server that takes user requests for long(ish)-running jobs, updates the user as the job progresses, and returns some data for the client to use. I am attempting to use tornado's WebSocketHandler to do this. Is there a reason I can't call a WebSocketHandler's write_message method from another object?
import tornado.ioloop
import tornado.websocket
import json, sys, os
from uuid import uuid4


class MainHandler(tornado.web.RequestHandler):
    def get(self):
        self.write('Welcome to the site. Requests cannot be made to the main page.')


class WSInvalidRequest(Exception):
    """Called when user sends invalid request to the server."""
    pass


class WSRequestQueue:
    def __init__(self):
        self._items = []

    def put(self, item):
        self._items.append(item)
        return self._items.length()

    def get(self):
        return self._items.pop(0)

    def get_position(self, item):
        return self._items.index(item)


QUEUE = WSRequestQueue()


class WSRequest:
    def __init__(self, message, websocket):
        self.websocket = websocket
        self.ran = False
        self.valid = False
        self.write(u'Request received.')
        try:
            self.request = WSRequest.parse_message(message)
            self.valid = True
            self.write(u'Request validated.')
            position = QUEUE.put(self)
            self.write(u'Added request to queue behind %i other requests.' % position)
        except WSInvalidRequest as e: self.write(e.message)

    @staticmethod
    def validate_request_dict(request):
        if not isinstance(messageDict, dict):
            raise WSInvalidRequest(u'Invalid request. Should be JSON dict string.')
        if 'arg' not in request:
            raise WSInvalidRequest(u'Invalid request. No arg found')

    @staticmethod
    def parse_message(message):
        messageDict = json.loads(message)
        validate_request_dict(messageDict)
        argument = messsageDict['arg']
        return {'argumet': argument}

    def write(self, message):
        self.websocket.write_messsage(unicode(message))

    def run(self):
        self.ran = True

    def destroy(self):
        if self.valid:
            if not self.ran: QUEUE.pop(QUEUE.get_position(self))
            self.websocket.requests.remove(self)
            self.write(u'Removed request from queue.')


class RequestWebSocket(tornado.websocket.WebSocketHandler):
    def open(self):
        self.id = uuid4()
        self.requests = set()
        print("WebSocket opened")

    def on_message(self, message):
        self.write_message(u'You sent: %s' % message)
        self.write_message(u'Attempting to add your request to the queue.')
        newRequest = WSRequest(message, self)
        if newRequest.valid: self.requests.add(newRequest)
        else: newRequest.destroy

    def on_close(self):
        print("WebSocket closed. Removing all requests from the queue.")
        for request in self.requests: request.destroy()

    def check_origin(self, origin):
        return True


if __name__ == "__main__":
    # Create the web server
    application = tornado.web.Application([
        (r'/', MainHandler),
        (r'/websocket', RequestWebSocket)
    ], debug=True)
    application.listen(80)
    tornado.ioloop.IOLoop.instance().start()
There's a spelling mistake in write_messsage. There's an extra s.
It's at:
class WSRequest:
    def write(self, message):
        self.websocket.write_messsage(unicode(message))
        #                       ^ extra 's'
The spider_closed() function is not executing. If I just put a print statement in it, it prints, but if I perform any function call and return the value, it does not work.
import scrapy
import re
from pydispatch import dispatcher
from scrapy import signals
from SouthShore.items import Product
from SouthShore.internalData import internalApi
from scrapy.http import Request


class bestbuycaspider(scrapy.Spider):
    name = "bestbuy_dca"
    allowed_domains = ["bestbuy.ca"]
    start_urls = ["http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+beds",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+night+stand",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+headboard",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+desk",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+bookcase",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+dresser",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+tv+stand",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+armoire",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+kids",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+changing+table",
                  "http://www.bestbuy.ca/Search/SearchResults.aspx?type=product&page=1&sortBy=relevance&sortDir=desc&pageSize=96&query=south+shore+furniture+baby"]

    def __init__(self, jsondetails="", serverdetails="", *args, **kwargs):
        super(bestbuycaspider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
        self.jsondetails = jsondetails
        self.serverdetails = serverdetails
        self.data = []

    def parse(self, response):
        # my stuff here
        pass

    def spider_closed(self, spider):
        print "returning values"
        self.results['extractedData'] = self.data
        print self.results=internalApi(self.jsondetails,self.serverdetails)
        yield self.results
1) I want to call some function and return the scraped values
You can create an Item Pipeline with a close_spider() method:
class MyPipeline(object):
    def close_spider(self, spider):
        do_something_here()
Just don't forget to activate it in settings.py as described in the documentation linked above.
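As a slightly fuller sketch (my addition; the pipeline module path and do_something_here are placeholders), the pipeline could collect the scraped items and hand them over when the spider closes, and then be activated in settings.py:

class MyPipeline(object):
    def __init__(self):
        self.data = []

    def process_item(self, item, spider):
        # collect every item the spider yields
        self.data.append(item)
        return item

    def close_spider(self, spider):
        # runs once when the spider finishes; this is where the question's
        # internalApi(...) call and the collected data could be handled
        do_something_here(self.data)


# settings.py (the module path is hypothetical; adjust to your project)
ITEM_PIPELINES = {
    'SouthShore.pipelines.MyPipeline': 300,
}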
I am new to testing and need some help here.
Assume we have this method:
import json
from urllib.request import urlopen


def get_posts():
    with urlopen('some url here') as data:
        return json.loads(data.read().decode('utf-8'))
The question is how to test this method (using the mock.patch decorator if possible)?
What I have now:
@mock.patch('mymodule.urlopen')
def test_get_post(self, mocked_urlopen):
    mocked_urlopen.__enter__ = Mock(return_value=self.test_data)
    mocked_urlopen.__exit__ = Mock(return_value=False)
    ...
But it does not seem to be working.
P.S. Is there any convenient way to work with the data variable (whose type is HTTPResponse) in the test, so that it could just be a simple string?
I was fighting with this as well and finally figured it out (Python 3 syntax):
import urllib.request
import unittest
from unittest.mock import patch, MagicMock


class TestUrlopen(unittest.TestCase):
    @patch('urllib.request.urlopen')
    def test_cm(self, mock_urlopen):
        cm = MagicMock()
        cm.getcode.return_value = 200
        cm.read.return_value = 'contents'
        cm.__enter__.return_value = cm
        mock_urlopen.return_value = cm
        with urllib.request.urlopen('http://foo') as response:
            self.assertEqual(response.getcode(), 200)
            self.assertEqual(response.read(), 'contents')

    @patch('urllib.request.urlopen')
    def test_no_cm(self, mock_urlopen):
        cm = MagicMock()
        cm.getcode.return_value = 200
        cm.read.return_value = 'contents'
        mock_urlopen.return_value = cm
        response = urllib.request.urlopen('http://foo')
        self.assertEqual(response.getcode(), 200)
        self.assertEqual(response.read(), 'contents')
        response.close()
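Applied to the get_posts function from the question, the same pattern might look like the following sketch; it assumes the imports shown above, that get_posts is importable from mymodule, and the JSON payload is just an example:

@patch('mymodule.urlopen')
def test_get_posts(self, mock_urlopen):
    cm = MagicMock()
    cm.read.return_value = b'{"id": 123}'   # bytes, since get_posts calls .decode('utf-8')
    cm.__enter__.return_value = cm
    mock_urlopen.return_value = cm
    self.assertEqual(get_posts(), {"id": 123})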
Here is my take on this:
from urllib.request import urlopen
from unittest.mock import patch


class Mock():
    def __init__(self, request, context):
        return None

    def read(self):
        return self

    def decode(self, arg):
        return ''

    def __iter__(self):
        return self

    def __next__(self):
        raise StopIteration


with patch('urllib.request.urlopen', Mock):
    # do whatever over here
    pass
with urlopen('some url here') as data is a context manager.
Also, a file can be used as a context manager, so a better approach here is to use io.StringIO:
import io
import json
import urllib.request
from unittest.mock import patch


def get_posts():
    with urllib.request.urlopen('some url here') as data:
        return json.load(data)


def test_get_posts():
    data = io.StringIO('{"id": 123}')
    with patch.object(urllib.request, 'urlopen', return_value=data):
        assert get_posts() == {"id": 123}
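If the function under test is the question's original version, which calls data.read().decode('utf-8') itself, io.BytesIO can stand in the same way (a small variation of mine, not part of the answer above):

def test_get_posts_bytes():
    # io.BytesIO also supports the context-manager protocol, and .read() returns
    # bytes, which matches the read().decode('utf-8') version of get_posts()
    data = io.BytesIO(b'{"id": 123}')
    with patch.object(urllib.request, 'urlopen', return_value=data):
        assert get_posts() == {"id": 123}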
OK, so I have written a simple class to simulate a context manager.
from unittest.mock import patch, MagicMock


class PatchContextManager:
    def __init__(self, method, enter_return, exit_return=False):
        self._patched = patch(method)
        self._enter_return = enter_return
        self._exit_return = exit_return

    def __enter__(self):
        res = self._patched.__enter__()
        res.context = MagicMock()
        res.context.__enter__.return_value = self._enter_return
        res.context.__exit__.return_value = self._exit_return
        res.return_value = res.context
        return res

    def __exit__(self, type, value, tb):
        return self._patched.__exit__()
Usage:
with PatchContextManager('mymodule.method', 'return_string') as mocked:
    a = mymodule.method(47)  # a == 'return_string'
    mocked.assert_called_with(47)
    ...