Run a Scrapy spider in a Celery task (Django project)

I'm trying to run a Scrapy spider (crawl) from a Django project, as a Celery task triggered from the admin interface. Below is my code; I get an error when I try to call the task from a Python shell.
djangoproject/
    monapp/
        tasks.py
        spider.py
        myspider.py
        models.py
        .....
tasks.py:
from djcelery import celery
from demoapp.spider import *
from demoapp.myspider import *

@celery.task
def add(x, y):
    return x + y

@celery.task
def scra():
    result_queue = Queue()
    crawler = CrawlerWorker(MySpider(), result_queue)
    crawler.start()
    return "success"
spider.py:
from scrapy import project, signals
from scrapy.settings import Settings
from scrapy.crawler import Crawler
from scrapy.xlib.pydispatch import dispatcher
from multiprocessing.queues import Queue
import multiprocessing

class CrawlerWorker(multiprocessing.Process):
    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
        self.crawler = Crawler(Settings())
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
myspider.py:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.item import Item, Field

class TorentItem(Item):
    title = Field()
    desc = Field()

class MySpider(CrawlSpider):
    name = 'job'
    allowed_domains = ['tanitjobs.com']
    start_urls = ['http://tanitjobs.com/browse-by-category/Nurse/',]

    rules = (
        Rule(SgmlLinkExtractor(allow=('page=*',),
                               restrict_xpaths=('//div[@class="pageNavigation"]',),
                               unique=True),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = hxs.select('//div[@class="offre"]/div[@class="detail"]')
        scraped_items = []
        for item in items:
            scraped_item = TorentItem()
            scraped_item['title'] = item.select('a/strong/text()').extract()
            scraped_item['desc'] = item.select('./div[@class="descriptionjob"]/text()').extract()
            scraped_items.append(scraped_item)
        return scraped_items

I got mine to work from the shell using a Django management command. Below is my code snippet; feel free to modify it to fit your needs.
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import signals
from scrapy.utils.project import get_project_settings
from django.core.management.base import BaseCommand

from myspiderproject.spiders.myspider import MySpider

class ReactorControl:
    def __init__(self):
        self.crawlers_running = 0

    def add_crawler(self):
        self.crawlers_running += 1

    def remove_crawler(self):
        self.crawlers_running -= 1
        if self.crawlers_running == 0:
            reactor.stop()

def setup_crawler(domain):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(reactor_control.remove_crawler, signal=signals.spider_closed)
    spider = MySpider(domain=domain)
    crawler.crawl(spider)
    reactor_control.add_crawler()
    crawler.start()

reactor_control = ReactorControl()

class Command(BaseCommand):
    help = 'Crawls the site'

    def handle(self, *args, **options):
        setup_crawler('somedomain.com')
        reactor.run()  # the script will block here until the spider_closed signal was sent
Hope this helps.
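If you still want to trigger the crawl from Celery rather than the shell, one option (my own sketch, not part of the original answer) is to shell out to the management command from the task, so the Twisted reactor runs in a fresh process each time. The command name crawl_site is an assumption:

# tasks.py -- hedged sketch; assumes the management command above is saved
# as management/commands/crawl_site.py inside an installed app
import subprocess
from celery import shared_task

@shared_task
def crawl():
    # Run the crawl in a separate process: the Twisted reactor cannot be
    # restarted inside a long-lived Celery worker.
    subprocess.check_call(['python', 'manage.py', 'crawl_site'])
    return 'success'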

Related

How to test passing an Excel file to an import API with unittest

I want to test my code using django.test's TestCase and rest_framework.test's APIClient, posting an Excel file. How can I test this import API?
test_import_file.py
import xlrd
from django.test import TestCase
from rest_framework.test import APIClient

from account.models import Account
from account.tests import create_super_user
from instructor.models import Instructor
from utils.excel import get_value

class TestClassImport(TestCase):
    def setUp(self):
        self.account = create_super_user()
        self.client = APIClient()
        self.client.force_authenticate(self.account)
        self.url = 'static/example/example_class_import.xlsx'
        self.file = open('static/example/example_class_import.xlsx', 'rb')
        self.wb = xlrd.open_workbook(file_contents=self.file.read())
        self.sh = self.wb.sheet_by_index(0)

    def test_real_import(self):
        file = open(self.url, encoding="utf8", errors='ignore')
        url = '/api/dashboard/content-migration/import/instructor/'
        self.response = self.client.post(url, file)
        self.failUnlessEqual(self.response.status_code, 201)
I expect the output to be: "test_real_import (class.unittest.test_import_file.TestInstructorImport) ... ok"
Put everything into a Celery task and call it from a Django unit test:
from your_app import task_execute  # import your actual Celery task here

class TestInstructorImport(TestCase):
    fixtures = ['instructor/fixtures/instructor.json', 'account/fixtures/account.json']

    def setUp(self):
        self.account = create_super_user()
        self.file_path = '/yourpath/name.xlsx'

    def test_task_import(self):
        response = task_execute.delay(self.file_path)
        self.assertEqual(response.result, 'Done.')
code "self.assertEqual(response.result, 'Done.')"
in task just
return('Done.')
when your code done without any error.
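For reference, a minimal sketch of what task_execute could look like (this is my assumption; the asker's actual parsing and model code is not shown in the post):

# tasks.py -- hypothetical import task
from celery import shared_task
import xlrd

@shared_task
def task_execute(file_path):
    # Open the workbook from the given path and walk the first sheet.
    wb = xlrd.open_workbook(file_path)
    sh = wb.sheet_by_index(0)
    for row_idx in range(1, sh.nrows):
        row = sh.row_values(row_idx)
        # ... create Instructor/Account objects from `row` here ...
    return 'Done.'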

how to use db instance in flask-apscheduler's jobs function

When I use flask-apscheduler (not plain apscheduler) in my Flask web project, I run into problems, especially when the scheduled job uses db (flask-sqlalchemy) objects. My setup:
JOBS = [
    {
        'id': 'job1',
        'func': 'app.monitor.views:test',
        'args': (),
        'trigger': 'interval',
        'seconds': 2
    }
]
./app/__init__.py:
from flask import Flask
from flask.ext.bootstrap import Bootstrap
from flask.ext.mail import Mail
from flask.ext.moment import Moment
from flask.ext.sqlalchemy import SQLAlchemy
from flask.ext.login import LoginManager
from flask.ext.pagedown import PageDown
from flask_apscheduler import APScheduler
from celery import Celery
# from apscheduler.schedulers.blocking import BlockingScheduler
from config import config, Config

bootstrap = Bootstrap()
mail = Mail()
moment = Moment()
db = SQLAlchemy()
pagedown = PageDown()
celery = Celery(__name__, broker=Config.CELERY_BROKER_URL)
# https://pypi.python.org/pypi/Flask-APScheduler
scheduler = APScheduler()

login_manager = LoginManager()
login_manager.session_protection = 'strong'
login_manager.login_view = 'auth.login'

def create_app(config_name):
    app = Flask(__name__)
    app.config.from_object(config[config_name])
    config[config_name].init_app(app)

    bootstrap.init_app(app)
    mail.init_app(app)
    moment.init_app(app)
    db.init_app(app)
    login_manager.init_app(app)
    pagedown.init_app(app)
    scheduler.init_app(app)
    celery.conf.update(app.config)

    if not app.debug and not app.testing and not app.config['SSL_DISABLE']:
        from flask.ext.sslify import SSLify
        sslify = SSLify(app)

    from .monitor import monitor as monitor_1_0_blueprint
    from .laser import laser as laser_1_0_blueprint
    app.register_blueprint(monitor_1_0_blueprint, url_prefix='/monitor/api')
    app.register_blueprint(laser_1_0_blueprint, url_prefix='/laser/api/v1.0')

    return app
Error 1: db is : ...
Error 2: db is : No handlers could be found for logger "apscheduler.executors.default"
Error 3: db is : raise RuntimeError('working outside of application context')
         RuntimeError: working outside of application context
The key to the problem is getting the db and app objects inside the flask-apscheduler job function (views.py):
from app import scheduler

def test():
    # solve the logging error by giving the apscheduler logger a handler
    import logging
    log = logging.getLogger('apscheduler.executors.default')
    log.setLevel(logging.INFO)  # DEBUG
    fmt = logging.Formatter('%(levelname)s:%(name)s:%(message)s')
    h = logging.StreamHandler()
    h.setFormatter(fmt)
    log.addHandler(h)

    # get the app object
    app = scheduler.app

    # get the db object and use it
    with app.app_context():
        print '........................', db  # the right db object
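For completeness, a short sketch of actually querying through db inside the job, assuming a hypothetical User model (this is not from the original answer, adjust names to your project):

from app import scheduler, db
from app.models import User  # hypothetical model

def count_users_job():
    # flask-sqlalchemy needs the application context to resolve the engine/session
    with scheduler.app.app_context():
        print 'user count:', User.query.count()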

Flask and cx_Freeze: 'module' object has no attribute 'rfind'

I have an issue with my code; maybe one of you will be able to help me with this. Here is my script.py:
#! /usr/bin/python
# -*- coding:utf-8 -*-
import jinja2.ext
import webbrowser

def main():
    from flask import Flask, flash, redirect, render_template, \
        request, url_for, escape, session
    from flask_bootstrap import Bootstrap

    app = Flask(__name__)
    webbrowser.open_new_tab('http://127.0.0.1:5000')
    Bootstrap(app)

    @app.route('/')
    def home():
        return render_template('home.html')

    @app.route('/choice')
    def choice():
        return render_template('Choice.html')

    @app.route('/src&dest')
    def GenerationPage():
        return render_template('generate.html')

    @app.route('/success')
    def successfull():
        return render_template('success.html')

    return app.run(debug=True)

if __name__ == "__main__":
    main()
and my setup.py:
import sys
import os
from cx_Freeze import setup, Executable

path = sys.path + ["src", "src/templates"]
includes = [sys, os]
excludes = []
packages = []
include_files = ['templates']

options = {"path": path,
           "includes": includes,
           "excludes": excludes,
           "include_files": include_files
           }

if sys.platform == "win32":
    options["include_msvcr"] = True

base = None
if sys.platform == "win32":
    base = "Win32GUI"

FirstTarget = Executable(
    script="Other.py",
    base=base,
    compress=False,
    copyDependentFiles=True,
    appendScriptToExe=True,
    appendScriptToLibrary=False
)

setup(
    name="FlaskTrapeze",
    version="0.1",
    description="Ouvre une interface web",
    author="Axel M",
    options={"build_exe": options},
    executables=[FirstTarget]
)
I'm facing this issue when I try to build it under Windows:
"AttributeError: 'module' object has no attribute 'rfind'"
Thanks for the help!
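One likely cause, though the post does not confirm it: cx_Freeze expects module names as strings in includes, not imported module objects, so includes = [sys, os] makes its path handling call rfind on a module object. A hedged fix would be:

# setup.py (excerpt) -- pass module *names* as strings, not module objects
includes = ["sys", "os", "jinja2.ext", "flask", "flask_bootstrap"]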

Issues while integrating tornado app with django site

I have a simple chat application in Tornado backed by RethinkDB. I am trying to integrate this Tornado chat application into a Django site. For that, I made the changes below in rechat.py so it can work with Django:
- Imported tornado.wsgi and django.core.wsgi (get_wsgi_application)
- Set the environment variable for Django's settings.py: os.environ['DJANGO_SETTINGS_MODULE'] = 'djangoapp.settings'
When I run it after these changes, it connects to the database server but doesn't do anything else. What am I missing? How can I make this Tornado app work with a Django 1.8 site?
Below is my rechat.py (https://github.com/v3ss0n/rechat):
import logging
import tornado.escape
from tornado.ioloop import IOLoop
import tornado.web
import os.path
import rethinkdb as r
from tornado import httpserver
from time import time
# from tornado.concurrent import Future
from tornado import gen
from tornado.options import define, options, parse_command_line
import tornado.wsgi
from django.core.wsgi import get_wsgi_application

define("port", default=8888, help="run on the given port", type=int)
define("debug", default=True, help="run in debug mode")

def setup_db(db_name="rechat", tables=['events']):
    connection = r.connect(host="localhost")
    try:
        r.db_create(db_name).run(connection)
        for tbl in tables:
            r.db(db_name).table_create(tbl, durability="hard").run(connection)
        logging.info('Database setup completed.')
    except r.RqlRuntimeError:
        logging.warn('Database/Table already exists.')
    finally:
        connection.close()

class RechatApp(tornado.web.Application):
    def __init__(self, db):
        handlers = [
            (r"/", MainHandler),
            (r"/a/message/new", MessageNewHandler),
            (r"/a/message/updates", MessageUpdatesHandler),
        ]
        settings = dict(cookie_secret="_asdfasdaasdfasfas",
                        template_path=os.path.join(
                            os.path.dirname(__file__), "templates"),
                        static_path=os.path.join(
                            os.path.dirname(__file__), "static"),
                        xsrf_cookies=True,
                        debug=options.debug)
        self.db = db
        logging.info(db)
        tornado.web.Application.__init__(self, handlers, **settings)

class BaseHandler(tornado.web.RequestHandler):
    def initialize(self):
        self.db = self.application.db
        self.evt = r.table("events")

class MainHandler(BaseHandler):
    @gen.coroutine
    def get(self):
        curs = yield self.evt.run(self.db)
        messages = []
        while (yield curs.fetch_next()):
            item = yield curs.next()
            messages.append(item)
        self.render("index.html", messages=messages)

class MessageNewHandler(BaseHandler):
    @gen.coroutine
    def post(self):
        message = {
            "body": self.get_argument("body")
        }
        # to_basestring is necessary for Python 3's json encoder,
        # which doesn't accept byte strings.
        start = time()
        messages = (yield self.evt.insert(message).run(self.db))
        time_taken = time() - start
        logging.warn("DBINSERT: %s seconds" % time_taken)
        message['id'] = messages['generated_keys'][0]
        message["html"] = tornado.escape.to_basestring(
            self.render_string("message.html", message=message))
        if self.get_argument("next", None):
            self.redirect(self.get_argument("next"))
        else:
            self.write(message)

class MessageUpdatesHandler(BaseHandler):
    @gen.coroutine
    def post(self):
        curs = yield self.evt.changes().run(self.db)
        while (yield curs.fetch_next()):
            feed = yield curs.next()
            message = {
                'id': feed['new_val']['id'],
                'html': tornado.escape.to_basestring(
                    self.render_string("message.html",
                                       message=feed['new_val']))}
            break
        self.finish(dict(messages=[message]))

@gen.coroutine
def main():
    """Async main method. It needs to be async because r.connect is async."""
    parse_command_line()
    os.environ['DJANGO_SETTINGS_MODULE'] = 'djangoapp.settings'
    db_name = "rechat"
    setup_db(db_name)
    r.set_loop_type("tornado")
    db = yield r.connect("localhost", db=db_name)
    # Single db connection for everything, thanks a lot Ben and Jeese
    http_server = httpserver.HTTPServer(RechatApp(db))
    http_server.listen(options.port)

if __name__ == "__main__":
    IOLoop.current().run_sync(main)
    IOLoop.current().start()
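Note that tornado.wsgi and get_wsgi_application are imported but never used above, which may be why nothing Django-related happens. Below is a minimal, untested sketch (my assumption about the intended integration) of mounting the Django 1.8 WSGI app behind Tornado so that unmatched URLs fall through to Django:

import tornado.web
import tornado.wsgi
from django.core.wsgi import get_wsgi_application

# DJANGO_SETTINGS_MODULE must already be set (as in main() above).
django_app = tornado.wsgi.WSGIContainer(get_wsgi_application())

# In RechatApp.__init__, append a catch-all route after the chat handlers
# so Django answers everything the chat handlers do not:
# handlers.append((r".*", tornado.web.FallbackHandler, dict(fallback=django_app)))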

how to output multiple webpages crawled data into csv file using python with scrapy

I have the following code, which crawls all the available pages from a website. It crawls the valid pages correctly: when I use print I can see the data in the `items` list, but I get no output when I try to dump the results to a `.csv` file (using this command at the prompt: `scrapy crawl craig -o test.csv -t csv`).
Please help me output the data into a `csv` file.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from test.items import CraigslistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

URL = "http://example.com/subpage/%d"

class MySpider(BaseSpider):
    name = "craig"
    allowed_domains = ["xyz.com"]
    # for u in URL:
    start_urls = [URL % 1]

    def __init__(self):
        self.page_number = 1

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//div[@class='thumb']")
        if not titles:
            raise CloseSpider('No more pages')
        items = []
        for titles in titles:
            item = CraigslistSampleItem()
            item["title"] = titles.select("a/@title").extract()
            item["url"] = titles.select("a/@href").extract()
            items.append(item)
        yield items
        self.page_number += 1
        yield Request(URL % self.page_number)
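The version below yields each item individually (Scrapy's feed exporter serializes Item objects, not lists of them, which is presumably why test.csv stayed empty) and generates the page requests up front in start_requests: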
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from test.items import CraigslistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

URL = "http://example.com/subpage/%d"

class MySpider(BaseSpider):
    name = "craig"
    allowed_domains = ["xyz.com"]

    def start_requests(self):
        for i in range(10):
            yield Request(URL % i, callback=self.parse)

    def parse(self, response):
        titles = response.xpath("//div[@class='thumb']")
        if not titles:
            raise CloseSpider('No more pages')
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.xpath("./a/@title").extract()
            item["url"] = title.xpath("./a/@href").extract()
            yield item