I'm new to celery and an overall python noob. I must have stumbled upon the right solution during my research but I just don't seem to understand what I need to do for what seems to be a simple case scenario.
I followed the following guide to learn about flask+celery.
What I understand:
There seems to be something obvious I'm missing about how to trigger a task after the first one is finished. I tried using callbacks, using loops, and even tried Celery Flower and Celery beat, only to realise they have nothing to do with what I'm doing...
Goal:
After filling in the form, I want to send an email with attachments (the result of the task), or a failure email otherwise, without having to depend on what my user is doing on the app (no HTTP requests).
My code:
class ClassWithTheTask:
    """Stand-in for the long-running job; its outcome is exposed as ``results``."""

    def __init__(self, filename, proxies):
        # do stuff until a variable results is created
        self.results = 'this contains my result'
@app.route('/', methods=['GET', 'POST'])
@app.route('/index', methods=['GET', 'POST'])
def index():
    """Render the form; on a valid submission queue ``task1`` and redirect to its status page."""
    form = MyForm()
    if form.validate_on_submit():
        # ...
        # the task
        my_task = task1.delay(file_path, proxies)
        return redirect(url_for('taskstatus', task_id=my_task.id, filename=filename, email=form.email.data))
    return render_template('index.html',
                           form=form)
@celery.task(bind=True)
def task1(self, filepath, proxies):
    """Celery task that builds a ``ClassWithTheTask`` for ``filepath``."""
    task = ClassWithTheTask(filepath, proxies)
    # NOTE(review): `results` is never bound in this scope, so this line raises
    # NameError as written; `task.results` is presumably what was intended.
    return results
@celery.task
def send_async_email(msg):
    """Background task to send an email with Flask-Mail."""
    # The send happens inside an explicit application context because the
    # task body runs outside of any Flask request.
    with app.app_context():
        mail.send(msg)
@app.route('/status/<task_id>/<filename>/<email>')
def taskstatus(task_id, filename, email):
    """Report the state of ``task1`` run ``task_id`` as JSON.

    When the task result contains a ``results`` entry, an email with the two
    result files attached is queued and the user is redirected to the index.
    """
    task = task1.AsyncResult(task_id)
    if task.state == 'PENDING':
        # job did not start yet
        response = {
            'state': task.state,
            'status': 'Pending...'
        }
    elif task.state != 'FAILURE':
        response = {
            'state': task.state,
            'status': task.info.get('status', '')
        }
        if 'results' in task.info:
            response['results'] = task.info['results']
            response['untranslated'] = task.info['untranslated']
            msg = Message('Task Complete for %s !' % filename,
                          recipients=[email])
            msg.body = 'blabla'
            with app.open_resource(response['results']) as fp:
                msg.attach(response['results'], "text/csv", fp.read())
            with app.open_resource(response['untranslated']) as fp:
                msg.attach(response['untranslated'], "text/csv", fp.read())
            # the big problem here is that it will send the email only if the user refreshes the page and get the 'SUCCESS' status.
            send_async_email.delay(msg)
            flash('task finished. sent an email.')
            return redirect(url_for('index'))
    else:
        # something went wrong in the background job
        response = {
            'state': task.state,
            'status': str(task.info),  # this is the exception raised
        }
    return jsonify(response)
I don't get the goal of your method for status check. Anyway what you are describing can be accomplished this way.
if form.validate_on_submit():
# ...
# the task
my_task = (
task1.s(file_path, proxies).set(link_error=send_error_email.s(filename, error))
| send_async_email.s()
).delay()
return redirect(url_for('taskstatus', task_id=my_task.id, filename=filename, email=form.email.data))
Then your error task will look like this. The normal task can stay the way it is.
@celery.task
def send_error_email(task_id, filename, email):
    """Errback sketch: look up the failed task by ``task_id`` and notify ``email``."""
    task = AsyncResult(task_id)
    ...  # build the failure message from `task` / `filename` and send it to `email`
What happens here is that you are using a chain. You are telling Celery to run your task1, if that completes successfully then run send_async_email, if it fails run send_error_email. This should work, but you might need to adapt the parameters, consider it as pseudocode.
This does not seem right at all:
def task1(self, filepath, proxies):
    task = ClassWithTheTask(filepath, proxies)
    return results  # `results` is not defined in this scope
The line my_task = task1.delay(file_path, proxies) earlier in your code suggests you want to return task but you return results which is not defined anywhere. (ClassWithTheTask is also undefined). This code would crash, and your task would never execute.
Related
Here I want to run my crawler with celery every 1 minute. I write the tasks as below and called the task in the view with delay but I am not getting the result.
I run celery -A mysite worker -l info celery , rabbitmq broker , scrapy and django server in different terminals.
The CrawlerHomeView redirects to the task list successfully by creating the task object.But the celery is not working
It is throwing this error in the celery console
ValueError: not enough values to unpack (expected 3, got 0) [2020-06-08 15:36:06,732: INFO/MainProcess] Received task: crawler.tasks.schedule_task[3b537143-caa8-4445-b3d6-c0bc8d301b89] [2020-06-08 15:36:06,735: ERROR/MainProcess] Task handler raised error: ValueError('not enough values to unpack (expected 3, got 0)') Traceback (most recent call last): File "....\venv\lib\site-packages\billiard\pool.py", line 362, in workloop result = (True, prepare_result(fun(*args, **kwargs))) File "....\venv\lib\site-packages\celery\app\trace.py", line 600, in _fast_trace_task tasks, accept, hostname = _loc ValueError: not enough values to unpack (expected 3, got 0)
views
class CrawlerHomeView(LoginRequiredMixin, View):
    """Crawler home page: shows the task form (GET) and creates and schedules a task (POST)."""

    login_url = 'users:login'

    def get(self, request, *args, **kwargs):
        """Render ``index.html`` with an empty form and the selectable objects."""
        frequency = Task()
        categories = Category.objects.all()
        targets = TargetSite.objects.all()
        keywords = Keyword.objects.all()
        form = CreateTaskForm()
        context = {
            'targets': targets,
            'keywords': keywords,
            'frequency': frequency,
            'form': form,
            'categories': categories,
        }
        return render(request, 'index.html', context)

    def post(self, request, *args, **kwargs):
        """Save a valid form as a new task, queue ``schedule_task`` for it, then re-render the page."""
        form = CreateTaskForm(request.POST)
        if form.is_valid():
            unique_id = str(uuid4())  # create a unique ID.
            obj = form.save(commit=False)
            obj.created_by = request.user
            obj.unique_id = unique_id
            obj.status = 0
            obj.save()
            # many-to-many data can only be saved once the object itself exists
            form.save_m2m()
            schedule_task.delay(obj.pk)
        return render(request, 'index.html', {'form': form, 'errors': form.errors})
tasks.py
scrapyd = ScrapydAPI('http://localhost:6800')


# crontab(minute=1) fires once per hour (at minute 1); minute='*' fires every minute.
# NOTE(review): a beat-triggered run passes no arguments, but this task requires `pk` - confirm how it is scheduled.
@periodic_task(run_every=crontab(minute='*'))  # how to do with task search_frequency value ?
def schedule_task(pk):
    """Schedule one scrapyd job per target site of Task ``pk``.

    When the task is not eligible for scheduling and its end date is today,
    it is marked as completed (status 2) instead.
    """
    task = Task.objects.get(pk=pk)
    today = datetime.date.today()
    # `and` binds tighter than `or`: status 0 always schedules, status 1 only
    # while the end date has not been reached.
    # NOTE(review): if status 0 should respect the end date too, use
    # `task.status in (0, 1) and today < task.scraping_end_date`.
    if task.status == 0 or (task.status == 1 and not today >= task.scraping_end_date):
        unique_id = str(uuid4())  # create a unique ID.
        keywords = ', '.join(key.title for key in task.keywords.all())
        # evaluate the targets once; they are both counted and iterated
        targets = list(task.targets.all())
        settings = {
            'spider_count': len(targets),
            'keywords': keywords,
            'unique_id': unique_id,  # unique ID for each record for DB
            'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
        }
        for site_url in targets:
            domain = urlparse(site_url.address).netloc  # parse the url and extract the domain
            spider_name = domain.replace('.com', '')
            scrapyd.schedule('default', spider_name, settings=settings, url=site_url.address, domain=domain,
                             keywords=keywords)
    elif task.scraping_end_date == today:
        task.status = 2
        task.save()  # change the task status as completed.
settings
CELERY_BROKER_URL = 'amqp://localhost'
EDIT
This answer helped me to find the solution Celery raises ValueError: not enough values to unpack.
Now this error has gone.
Now in the celery console I am seeing this
[2020-06-08 16:33:23,123: INFO/MainProcess] Task crawler.tasks.schedule_task[0578558d-0dc6-4db7-b69f-e912b604ff3d] succeeded in 0.016000000000531145s: None and getting no scraped results in my frontend .
Now my question is how can I check that my task is running periodically every 1 minute ?
It is the very first time I am using celery so here might be some problems.
Celery is no longer supported on Windows as a platform (version 4 dropped official support).
I highly suggest that you dockerize your app instead (or use wsl2),if you don't want to go this route
You would probably need to use gevent ( notice there could be some additional problems if you go this route)
pip install gevent
celery -A <module> worker -l info -P gevent
found similar detailed answer here
Here I am crawling some websites with different keywords. Before It was only scraping and it worked but I implemented celery for this. After using celery I am not being able to get the scraping result but no error is showing. I am using rabbitmq as the message broker here.
tasks.py
@shared_task()
def schedule_task(pk):
    """Schedule one scrapyd job per target site of Task ``pk``.

    Returns the list of values returned by ``scrapyd.schedule`` (one per
    target), so the scheduled jobs can be looked up later.
    """
    task = Task.objects.get(pk=pk)
    keywords = ', '.join(key.title for key in task.keywords.all())
    # evaluate the targets once; they are both counted and iterated
    targets = list(task.targets.all())
    settings = {
        'spider_count': len(targets),
        'keywords': keywords,
        'unique_id': str(uuid4()),  # unique ID for each record for DB
        'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
    }
    task_ids = []  # one Task/Project contains one or multiple scrapy task
    for site_url in targets:
        domain = urlparse(site_url.address).netloc  # parse the url and extract the domain
        spider_name = domain.replace('.com', '')
        # keep the result under its own name instead of rebinding `task`
        job_id = scrapyd.schedule('default', spider_name, settings=settings, url=site_url.address, domain=domain,
                                  keywords=keywords)
        task_ids.append(job_id)
    return task_ids
views
def post(self, request, *args, **kwargs):
    """Save a valid form as a new task, queue ``schedule_task`` for it, then go to the task list."""
    form = CreateTaskForm(request.POST)
    if form.is_valid():
        unique_id = str(uuid4())  # create a unique ID.
        obj = form.save(commit=False)
        obj.created_by = request.user
        obj.unique_id = unique_id
        obj.status = 0
        obj.save()
        form.save_m2m()
        print(obj.pk)
        schedule_task.delay(pk=obj.pk)
    # NOTE(review): the original nesting of this line was lost; placed at
    # method level so an invalid form also gets a response - confirm.
    return redirect('crawler:task-list')
views before using celery ( which returns the scraped results worked fine) I just split the scraping part into tasks.py and call it from view with .delay but didn't returned the result(before it returned).
form = CreateTaskForm(request.POST)
if form.is_valid():
unique_id = str(uuid4()) # create a unique ID.
obj = form.save(commit=False)
obj.created_by = request.user
obj.unique_id = unique_id
obj.status = 0
obj.save()
form.save_m2m()
keywords = ''
# for keys in ast.literal_eval(obj.keywords.all()): #keywords change to csv
for keys in obj.keywords.all():
if keywords:
keywords += ', ' + keys.title
else:
keywords += keys.title
task_ids = [] #one Task/Project contains one or multiple scrapy task
settings = {
'spider_count' : len(obj.targets.all()),
'keywords' : keywords,
'unique_id': unique_id, # unique ID for each record for DB
'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}
# res = ast.literal_eval(ini_list)
for site_url in obj.targets.all():
domain = urlparse(site_url.address).netloc # parse the url and extract the domain
spider_name = domain.replace('.com', '')
task = scrapyd.schedule('default', spider_name, settings=settings, url=site_url.address, domain=domain, keywords=keywords)
return redirect('crawler:task-list')
celery console
[2020-06-10 20:42:55,885: INFO/MainProcess] celery@DESKTOP-ENPLHOS ready.
[2020-06-10 20:42:55,900: INFO/MainProcess] pidbox: Connected to amqp://guest:**@127.0.0.1:5672//.
[2020-06-10 20:43:13,730: INFO/MainProcess] Received task: crawler.tasks.schedule_task[10e7bf06-5e4e-413c-85a3-79d61b9835cf]
[2020-06-10 20:43:17,077: INFO/MainProcess] Task crawler.tasks.schedule_task[10e7bf06-5e4e-413c-85a3-79d61b9835cf] succeeded in 3.3590000000040163s: None
http://localhost:6800/jobs here I can see the spiders are running but the results are not appearing in my view.
views before using celery ( which returns the scraped results worked fine)
that is because your code runs synchronous....one after the other.
Celery, on the other hand, runs asynchronously, and you will always get None as the returned value from it.
If you chain 2 or more Celery tasks (of which all of them run async) then you can make use of their returned value, but not chaining a synchronous view with an async celery task.
Celery tasks are meant to be dispatched and run in background...while your view is suppose to return something else...(without waiting for your spiders to finish)
To be able to make use of the Celery results:
Collected data needs to be stored somewhere (a file like csv, json, etc, .. OR inside a database) and handle the Django View in 2 steps:
first you trigger the Celery task
second collect the stored results and display them
i need a little bit of help understanding a problem that i have with user defined exceptions in python 2.7.11.
I have two files, main.py and myErrors.py. In main I post data and receive a response, and in myErrors I handle the errors.
What I'm trying to do is execute the version error in the try:except statement, but it doesn't get executed even though it should be. What I'm doing is passing the response to myErrors and updating a dictionary in the errors file with that data.
my question was badly phrased. What I want to do is, is pass the response to the error handler, but i don't want to execute it, until we get to the Try:except clause in on_response method. So when we get the response and if it's not successful, then check the error code and raise the exception. Now what i'm doing is checking first for errors and then executing the check for success (error code)
Here is the main
def send_messages(self):
    """Post ``json_data`` to ``simulateSasServer`` and pass the reply to ``on_response``."""
    response = cm.postData(url=simulateSasServer, jsondata=json_data)
    self.on_response(response)
def on_response(self, response):
    """Record ``response`` with the error module, then act on each successful registration entry."""
    # NOTE(review): this only constructs the exception object (which stores
    # the response); nothing is raised by this line.
    myERRORS.myERRORS(response)
    # if registration is succesful change state to REGISTERED.
    if 'registrationResponse' in response:
        try:
            responseObjects = response['registrationResponse']
            for responseObject in responseObjects:
                if responseObject['error']['errorCode'] == 0:
                    do_action
        # NOTE(review): `Version()` is called here rather than naming the
        # class, and the handler raises a plain string - both look unintended.
        except myErrors.Version():
            raise ("version_message")
Here is the myErrors
class myERRORS(Exception):
    """Base error that records the server response for its subclasses.

    ``response_data`` is deliberately a class-level dict: ``Version()`` takes
    no arguments and reads whatever an earlier ``myERRORS(response)`` stored.
    """

    error_code = {'SUCCESS': 0,
                  'VERSION': 100,
                  }
    response_data = {}

    def __init__(self, response):
        # updates the dict shared by every instance and subclass
        self.response_data.update(response)


class Version(myERRORS):
    """Version-mismatch check; constructing it raises when the stored response reports error 100."""

    def __init__(self):
        self.name = "VERSION"
        self.err_code = self.error_code['VERSION']
        self.msg = "SAS protocol version used by CBSD is not supported by SAS"
        # evaluate the check once instead of calling it twice
        if self.version_error():
            print(self.name, self.err_code, self.msg)
            raise Exception(self.name, self.err_code, self.msg)

    def version_error(self):
        """Return True when any ``registrationResponse`` entry carries the VERSION error code."""
        # Look the entries up by key: `values()[0]` only works on Python 2,
        # fails on an empty response and may pick a different key's value.
        response_objects = self.response_data.get('registrationResponse', [])
        return any(
            r_object['error']['errorCode'] == self.error_code['VERSION']
            for r_object in response_objects
        )
Any help is much appreciated.
There isn't really anything special about exceptions. They are classes. What you did is create an instance of a class. You did not raise it. Change:
myERRORS.myERRORS(response)
to:
raise myERRORS.myERRORS(response)
Scrapy application, but the question is really about the Python language - experts can probably answer this immediately without knowing the framework at all.
I've got a class called CrawlWorker that knows how to talk to so-called "spiders" - schedule their crawls, and manage their lifecycle.
There's a TwistedRabbitClient that has-one CrawlWorker. The client only knows how to talk to the queue and hand off messages to the worker - it gets completed work back from the worker asynchronously by using the worker method connect_to_scrape below to connect to a signal emitted by a running spider:
def connect_to_scrape(self, callback):
    """Register ``callback`` for the ``item_scraped`` signal."""
    self._connect_to_signal(callback, signals.item_scraped)
def _connect_to_signal(self, callback, signal):
    """Connect ``callback`` to ``signal``.

    For ``item_scraped`` the callback is wrapped so that it receives only the
    item as a dict and the delivery tag taken from the response meta.
    """
    if signal is signals.item_scraped:
        def _callback(item, response, sender, signal, spider):
            scrape_config = response.meta['scrape_config']
            delivery_tag = scrape_config.delivery_tag
            callback(item.to_dict(), delivery_tag)
    else:
        _callback = callback
    # NOTE(review): the wrapper above is referenced only by this local name;
    # if the dispatcher keeps just a weak reference it can be collected after
    # this method returns - see the `weak` argument of `connect`.
    dispatcher.connect(_callback, signal=signal)
So the worker provides a layer of "work deserialization" for the Rabbit client, who doesn't know about spiders, responses, senders, signals, items (anything about the nature of the work itself) - only dicts that'll be published as JSON with their delivery tags.
So the callback below isn't registering properly (no errors either):
def publish(self, item, delivery_tag):
    """Log ``item``, publish it as JSON to the configured exchange, then ack ``delivery_tag``."""
    self.log('item_scraped={0} {1}'.format(item, delivery_tag))
    publish_message = json.dumps(item)
    self._channel.basic_publish(exchange=self.publish_exchange,
                                routing_key=self.publish_key,
                                body=publish_message)
    # acknowledge only after the publish call has returned
    self._channel.basic_ack(delivery_tag=delivery_tag)
But if I remove the if branch in _connect_to_signal and connect the callback directly (and modify publish to soak up all the unnecessary arguments), it works.
Anyone have any ideas why?
So, I figured out why this wasn't working, by re-stating it in a more general context:
import functools
from scrapy.signalmanager import SignalManager
SIGNAL = object()
class Sender(object):
    """Owns a ``SignalManager`` and delivers ``SIGNAL`` to receivers bound via ``bind``."""

    def __init__(self):
        self.signals = SignalManager(self)

    def wrap_receive(self, receive):
        """Return a wrapper that adapts the raw signal arguments for ``receive``."""
        @functools.wraps(receive)
        def wrapped_receive(message, data):
            message = message.replace('World', 'Victor')
            value = data['key']
            receive(message, value)
        return wrapped_receive

    def bind(self, receive):
        """Connect the wrapped ``receive`` to ``SIGNAL``."""
        _receive = self.wrap_receive(receive)
        # weak=False: the wrapper is referenced only by this local variable,
        # so the connection itself has to keep it alive.
        self.signals.connect(_receive, signal=SIGNAL,
                             sender=self, weak=False)

    def send(self):
        """Emit ``SIGNAL`` with a sample message and data dict."""
        message = 'Hello, World!'
        data = {'key': 'value'}
        self.signals.send_catch_log(SIGNAL, message=message, data=data)
class Receiver(object):
    """Binds its ``receive`` method to a sender on construction."""

    def __init__(self, sender):
        self.sender = sender
        self.sender.bind(self.receive)

    def receive(self, message, value):
        """Receive data from a Sender."""
        # single-argument call form: valid on both Python 2 and Python 3
        print('Receiver received: {0} {1}.'.format(message, value))
if __name__ == '__main__':
    # Demo: bind a receiver to a sender, then emit the signal once.
    sender = Sender()
    receiver = Receiver(sender)
    sender.send()
This works if and only if weak=False.
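The basic problem is that when connecting to the signal, weak=False needs to be specified. The likely reason: by default the dispatcher keeps only a weak reference to the receiver, and the wrapper function created inside the method has no other strong reference, so it is garbage-collected as soon as the method returns and the connection silently disappears. Hopefully someone smarter than me can expound further on why that's needed.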
I am attempting to asynchronously consume a web service because it takes up to 45 seconds to return. Unfortunately, this web service is also somewhat unreliable and can throw errors. I have set up django-celery and have my tasks executing, which works fine until the task fails beyond max_retries.
Here is what I have so far:
@task(default_retry_delay=5, max_retries=10)
def request(xml):
    """Call the real-time web service with ``xml`` and record the outcome in a ``Result`` row.

    Status codes written: "i" after a failed attempt, "f" when retries are
    exhausted, "s" on success.
    """
    try:
        server = Client('https://www.whatever.net/RealTimeService.asmx?wsdl')
        xml = server.service.RunRealTimeXML(
            username=settings.WS_USERNAME,
            password=settings.WS_PASSWORD,
            xml=xml
        )
    except Exception as e:  # `as` form parses on Python 2.6+ and Python 3
        result = Result(celery_id=request.request.id, details=e.reason, status="i")
        result.save()
        try:
            # NOTE(review): with exc=e, Celery re-raises `e` at the retry limit
            # instead of MaxRetriesExceededError, so the handler below is not reached.
            return request.retry(exc=e)
        except MaxRetriesExceededError as e:
            result = Result(celery_id=request.request.id, details="Max Retries Exceeded", status="f")
            result.save()
            raise
    result = Result(celery_id=request.request.id, details=xml, status="s")
    result.save()
    return result
Unfortunately, MaxRetriesExceededError is not being thrown by retry(), so I'm not sure how to handle the failure of this task. Django has already returned HTML to the client, and I am checking the contents of Result via AJAX, which is never getting to a full fail f status.
So the question is: How can I update my database when the Celery task has exceeded max_retries?
The issue is that celery is trying to re-raise the exception you passed in when it hits the retry limit. The code for doing this re-raising is here: https://github.com/celery/celery/blob/v3.1.20/celery/app/task.py#L673-L681
The simplest way around this is to just not have celery manage your exceptions at all:
@task(max_retries=10)
def mytask():
    """Run ``do_the_thing`` with retries; handle the failure once retries are exhausted."""
    try:
        do_the_thing()
    except Exception as e:
        try:
            # No exc= argument: at the retry limit Celery then raises
            # MaxRetriesExceededError rather than re-raising the original error.
            mytask.retry()
        except MaxRetriesExceededError:
            do_something_to_handle_the_error()
            logger.exception(e)
You can override the after_return method of the celery task class, this method is called after the execution of the task whatever is the ret status (SUCCESS,FAILED,RETRY)
class MyTask(celery.task.Task):
    """Task base class that hooks ``after_return`` to react to exhausted retries or failure."""

    def run(self, xml, **kwargs):
        #Your stuffs here
        pass

    def after_return(self, status, retval, task_id, args, kwargs, einfo=None):
        # NOTE(review): assumes the task kwargs carry a 'task_retries' entry;
        # a missing key raises KeyError - confirm for your Celery version.
        if self.max_retries == int(kwargs['task_retries']):
            #If max retries are equals to task retries do something
            pass
        if status == "FAILURE":
            #You can do also something if the tasks fail instead of check the retries
            pass
http://readthedocs.org/docs/celery/en/latest/reference/celery.task.base.html#celery.task.base.BaseTask.after_return
http://celery.readthedocs.org/en/latest/reference/celery.app.task.html?highlight=after_return#celery.app.task.Task.after_return
With Celery version 2.3.2 this approach has worked well for me:
class MyTask(celery.task.Task):
    """Abstract task base whose ``after_return`` detects that the retry limit was reached."""

    abstract = True

    def after_return(self, status, retval, task_id, args, kwargs, einfo):
        if self.max_retries == self.request.retries:
            #If max retries is equal to task retries do something
            pass
@task(base=MyTask, default_retry_delay=5, max_retries=10)
def request(xml):
    #Your stuff here
    pass
I'm just going with this for now, spares me the work of subclassing Task and is easily understood.
# auto-retry with delay as defined below. After that, hook is disabled.
@celery.shared_task(bind=True, max_retries=5, default_retry_delay=300)
def post_data(self, hook_object_id, url, event, payload):
    """POST ``payload`` to the webhook ``url``.

    Returns True on success. Once the retry limit is reached, the webhook is
    deactivated and False is returned; before that, a retry is requested.
    """
    headers = {'Content-type': 'application/json'}
    try:
        r = requests.post(url, data=payload, headers=headers)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Check the counter ourselves so the last failure can be handled here
        # instead of surfacing from retry().
        if self.request.retries >= self.max_retries:
            log.warning("Auto-deactivating webhook %s for event %s", hook_object_id, event)
            Webhook.objects.filter(object_id=hook_object_id).update(active=False)
            return False
        raise self.retry(exc=e)
    return True