#shared_task()
def run_scraper_task():
ImportDataTaskInfo = apps.get_model("data", "TaskInfo")
logger = get_task_logger("A")
logger.info("XXXXXXX")
task = TaskInfo.objects.create(status=DataTaskStatuses.IN_PROGRESS)
try:
logger.info("XXXX")
crawler_settings = Settings()
crawler_settings.setmodule(setting1)
crawler = CrawlerProcess(settings=crawler_settings)
logger.info("XXXX")
crawler.crawl(Spider)
reactor.run()
logger.info("XXXX")
task.status = TaskStatuses.SUCCESS
logger.info("XXXX")
except Exception as error:
logger.error("XXXXX")
task.info += str(error) + "\n"
task.status = TaskStatuses.ERROR
finally:
import_task.save()
So i have scrapy connected to celery task, and scrapy execute spider.py and pipeline.py where i have for instance
def process_item(self, item, spider):
logger = get_task_logger("A")
logger.info("Pipeline activated.")
# i want this message to go into task.info.
I want to inject all my logs into object task.info attribute. How i can catch it in spider and in pipeline?
I have tried to catch proper task object for every step, but this seems not to be optimal option.
Related
I am attempting to show the stage of the tasks on the frontend by displaying it to the user but I cannot change the task status until my celery task is complete when it returns the result.
#upload_blueprint.route("/task/<task_id>", methods=["GET"])
def get_status(task_id):
print('getting task id', task_id)
task_result = celery.AsyncResult(task_id)
print("task_result",task_result)
result = {
"task_id": task_id,
"task_status": task_result.status,
"task_result": task_result.result
}
return jsonify(result), 200
How do i updated task_result = celery.AsyncResult(task_id) without even returning my task function?
And here is the task itself.
#celery.task(name="create_task")
def create_task(info):
###############################################################################
# Create celery web tasks #
###############################################################################
result = {"files": ["file_url_placeholder"], "images": ["image_url_placeholder"], "errors": str( errors ) }
return result, 200
Hello I have a celery task that is suppose to run every 1 hour to fetch a key and it runs and even acts like it has updates the database but it does not update in reality
#app.task
def refresh_token():
r = requests.get(AUTH_URL, auth=HTTPBasicAuth(CONSUMER_KEY, CONSUMER_SECRET))
obj = json.loads(r.text)
obj['expires_in'] = int(obj['expires_in'])
try:
mpesa_token = MpesaAccessToken.objects.get(id=1)
mpesa_token.access_token = obj['access_token']
mpesa_token.save()
print(obj)
print(mpesa_token.access_token)
print("saved")
except:
print(obj)
mpesa_token = MpesaAccessToken.objects.create(**obj)
return 1
the last thee prints all shows in the logs but checking the admin panel, the values are not updated however when I use a view and make a request then call the function, the database get updated, could anyone know what is going on
You need to add transaction.atomic() to your code. Like this:
from django.db import transaction
#app.task
def refresh_token():
with transaction.atomic():
r = requests.get(AUTH_URL, auth=HTTPBasicAuth(CONSUMER_KEY, CONSUMER_SECRET))
obj = json.loads(r.text)
obj['expires_in'] = int(obj['expires_in'])
try:
mpesa_token = MpesaAccessToken.objects.get(id=1)
mpesa_token.access_token = obj['access_token']
mpesa_token.save()
print(obj)
print(mpesa_token.access_token)
print("saved")
except:
print(obj)
mpesa_token = MpesaAccessToken.objects.create(**obj)
return 1
Here I am crawling some websites with different keywords. Before It was only scraping and it worked but I implemented celery for this. After using celery I am not being able to get the scraping result but no error is showing. I am using rabbitmq as the message broker here.
tasks.py
#shared_task()
def schedule_task(pk):
task = Task.objects.get(pk=pk)
keywords = ''
# for keys in ast.literal_eval(obj.keywords.all()): #keywords change to csv
for keys in task.keywords.all():
if keywords:
keywords += ', ' + keys.title
else:
keywords += keys.title
task_ids = [] # one Task/Project contains one or multiple scrapy task
settings = {
'spider_count': len(task.targets.all()),
'keywords': keywords,
'unique_id': str(uuid4()), # unique ID for each record for DB
'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}
# res = ast.literal_eval(ini_list)
for site_url in task.targets.all():
domain = urlparse(site_url.address).netloc # parse the url and extract the domain
spider_name = domain.replace('.com', '')
task = scrapyd.schedule('default', spider_name, settings=settings, url=site_url.address, domain=domain,
keywords=keywords)
views
def post(self, request, *args, **kwargs):
form = CreateTaskForm(request.POST)
if form.is_valid():
unique_id = str(uuid4()) # create a unique ID.
obj = form.save(commit=False)
obj.created_by = request.user
obj.unique_id = unique_id
obj.status = 0
obj.save()
form.save_m2m()
print(obj.pk)
schedule_task.delay(pk=obj.pk)
return redirect('crawler:task-list')
views before using celery ( which returns the scraped results worked fine) I just split the scraping part into tasks.py and call it from view with .delay but didn't returned the result(before it returned).
form = CreateTaskForm(request.POST)
if form.is_valid():
unique_id = str(uuid4()) # create a unique ID.
obj = form.save(commit=False)
obj.created_by = request.user
obj.unique_id = unique_id
obj.status = 0
obj.save()
form.save_m2m()
keywords = ''
# for keys in ast.literal_eval(obj.keywords.all()): #keywords change to csv
for keys in obj.keywords.all():
if keywords:
keywords += ', ' + keys.title
else:
keywords += keys.title
task_ids = [] #one Task/Project contains one or multiple scrapy task
settings = {
'spider_count' : len(obj.targets.all()),
'keywords' : keywords,
'unique_id': unique_id, # unique ID for each record for DB
'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}
# res = ast.literal_eval(ini_list)
for site_url in obj.targets.all():
domain = urlparse(site_url.address).netloc # parse the url and extract the domain
spider_name = domain.replace('.com', '')
task = scrapyd.schedule('default', spider_name, settings=settings, url=site_url.address, domain=domain, keywords=keywords)
return redirect('crawler:task-list')
celery console
[2020-06-10 20:42:55,885: INFO/MainProcess] celery#DESKTOP-ENPLHOS ready.
[2020-06-10 20:42:55,900: INFO/MainProcess] pidbox: Connected to amqp://guest:**#127.0.0.1:5672//.
[2020-06-10 20:43:13,730: INFO/MainProcess] Received task: crawler.tasks.schedule_task[10e7bf06-5e4e-413c-85a3-79d61b9835cf]
[2020-06-10 20:43:17,077: INFO/MainProcess] Task crawler.tasks.schedule_task[10e7bf06-5e4e-413c-85a3-79d61b9835cf] succeeded in 3.3590000000040163s: None
http://localhost:6800/jobs here I can see the spiders are running but the results are not appearing in my view.
views before using celery ( which returns the scraped results worked fine)
that is because your code runs synchronous....one after the other.
Celery on the other hand runs asynchronous and alway you will get a None as the returned value from it.
If you chain 2 or more Celery tasks (of which all of them run async) then you can make use of their returned value, but not chaining a synchronous view with an async celery task.
Celery tasks are meant to be dispatched and run in background...while your view is suppose to return something else...(without waiting for your spiders to finish)
To be able to make use of the Celery results:
Collected data needs to be stored somewhere (a file like csv, json, etc, .. OR inside a database) and handle the Django View in 2 steps:
first you trigger the Celery task
second collect the stored results and display them
Here I have a view CrawlerHomeView which is used to create the task object from a form now I want to schedule this task periodically with celery.
I want to schedule this CrawlerHomeView process with the task object search_frequency and by checking some task object fields.
Task Model
class Task(models.Model):
INITIAL = 0
STARTED = 1
COMPLETED = 2
task_status = (
(INITIAL, 'running'),
(STARTED, 'running'),
(COMPLETED, 'completed'),
(ERROR, 'error')
)
FREQUENCY = (
('1', '1 hrs'),
('2', '2 hrs'),
('6', '6 hrs'),
('8', '8 hrs'),
('10', '10 hrs'),
)
name = models.CharField(max_length=255)
scraping_end_date = models.DateField(null=True, blank=True)
search_frequency = models.CharField(max_length=5, null=True, blank=True, choices=FREQUENCY)
status = models.IntegerField(choices=task_status)
tasks.py
I want to run the view below posted periodically [period=(task's search_frequency time] if the task status is 0 or 1 and not crossed the task scraping end date. But I got stuck here. How can I do this?
#periodic_task(run_every=crontab(hour="task.search_frequency")) # how to do with task search_frequency value
def schedule_task(pk):
task = Task.objects.get(pk=pk)
if task.status == 0 or task.status == 1 and not datetime.date.today() > task.scraping_end_date:
# perform the crawl function ---> def crawl() how ??
if task.scraping_end_date == datetime.date.today():
task.status = 2
task.save() # change the task status as complete.
views.py
I want to run this view periodically.How can I do it?
class CrawlerHomeView(LoginRequiredMixin, View):
login_url = 'users:login'
def get(self, request, *args, **kwargs):
# all_task = Task.objects.all().order_by('-id')
frequency = Task()
categories = Category.objects.all()
targets = TargetSite.objects.all()
keywords = Keyword.objects.all()
form = CreateTaskForm()
context = {
'targets': targets,
'keywords': keywords,
'frequency': frequency,
'form':form,
'categories': categories,
}
return render(request, 'index.html', context)
def post(self, request, *args, **kwargs):
form = CreateTaskForm(request.POST)
if form.is_valid():
# try:
unique_id = str(uuid4()) # create a unique ID.
obj = form.save(commit=False)
# obj.keywords = keywords
obj.created_by = request.user
obj.unique_id = unique_id
obj.status = 0
obj.save()
form.save_m2m()
keywords = ''
# for keys in ast.literal_eval(obj.keywords.all()): #keywords change to csv
for keys in obj.keywords.all():
if keywords:
keywords += ', ' + keys.title
else:
keywords += keys.title
# tasks = request.POST.get('targets')
# targets = ['thehimalayantimes', 'kathmandupost']
# print('$$$$$$$$$$$$$$$ keywords', keywords)
task_ids = [] #one Task/Project contains one or multiple scrapy task
settings = {
'spider_count' : len(obj.targets.all()),
'keywords' : keywords,
'unique_id': unique_id, # unique ID for each record for DB
'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}
# res = ast.literal_eval(ini_list)
for site_url in obj.targets.all():
domain = urlparse(site_url.address).netloc # parse the url and extract the domain
spider_name = domain.replace('.com', '')
task = scrapyd.schedule('default', spider_name, settings=settings, url=site_url.address, domain=domain, keywords=keywords)
# task = scrapyd.schedule('default', spider_name , settings=settings, url=obj.targets, domain=domain, keywords=obj.keywords)
return redirect('crawler:task-list')
# except:
# return render(request, 'index.html', {'form':form})
return render(request, 'index.html', {'form':form, 'errors':form.errors})
Any Suggestions or answer is there for this problem ?
After fighting Celery for 5 years in a 15k tasks/second setup I highly recommend you to switch to Dramatiq, which has a sane, reliable, performant code base that isn't split across multiple convoluted packages and works perfectly in two of my newer projects so far.
From the author's motivation
I’ve used Celery professionally for years and my growing frustration with it is one of the reasons why I developed dramatiq. Here are some of the main differences between Dramatiq, Celery and RQ:
There's also a a Django helper package: https://github.com/Bogdanp/django_dramatiq
Granted, you won't have a builtin celerybeat, but a cron calling python tasks is more robust anyway, we lost a good amount of data because celerybeat decided to stall regularly :)
There are two projects that aim to add periodic task creation: https://gitlab.com/bersace/periodiq and https://apscheduler.readthedocs.io/en/stable/
I haven't used those packages yet, what you could try with periodiq is selecting your database entries, loop through those and define a periodic-task for each (but this requires regular restarts of the periodiq worker to pick up changes):
# tasks.py
from dramatiq import get_broker
from periodiq import PeriodiqMiddleware, cron
broker = get_broker()
broker.add_middleware(PeriodiqMiddleware(skip_delay=30))
for obj in Task.objects.all():
#dramatiq.actor(periodic=cron(obj.frequency))
def hourly(obj=obj):
# import logic based on obj.name
# Do something each hour…
For the error,
Exception Type: EncodeError
Exception Value:
Object of type timedelta is not JSON serializable
Instead of defining following variable in django settings,
CELERY_BEAT_SCHEDULE = {
'task-first': {
'task': 'scheduler.tasks.create_task',
'schedule': timedelta(minutes=1)
},
can you try following in your celery file:
app.conf.beat_schedule = {
'task-first': {
'task': 'scheduler.tasks.create_task',
'schedule': crontab(minute='*/1')
}
}
this works for me given, celery server is up and running.
Apart from this why are you redirecting to 'list_tasks' after each task, what does it exactly do? Also, you have called the celery task from the view add_task_celery.delay(name,date,freq), is it just another way to add task apart from periodic task defined using celery-beat?
Edit 1:
My structure looks like as follow:
settings.py
CELERY_TIMEZONE = 'Asia/Kolkata'
CELERY_BROKER_URL = 'amqp://localhost'
celery.py
app.conf.beat_schedule = {
'task1': {
'task': '<app_name>.tasks.random_task',
'schedule': crontab(minute=0, hour=0)
},
}
Here you should note that I have a file named tasks in my app folder and there I have written a shared task as follow:
#shared_task
def random_task(total):
...
Also, apart from this you should start both celery beat as well as a celery worker process as follow:
celery -A <project_name>.celery worker -l error
celery -A <project_name>.celery beat -l error --scheduler django_celery_beat.schedulers:DatabaseScheduler
You can any scheduler you want, on production I use DatabaseScheduler. For testing you can try with following command:
celery -A <project_name> beat -l info -S django
You should run all these commands from the project folder of the Django project
I believe the problem is with 2nd and 3rd parameter in the task definition, which is freq and date. Although from the error, you posted, Object of type timedelta is not JSON serializable, it looks like it's talking about freq field which is of type DurationField that returns timedelta object.
Ideally, both fields must be serialized before passing to the task.
one simple way would be -
1) You can explicitly serialize these fields and pass to the task and in the task again convert it to datetime / timedelta object.
alternatively, you can dump whole data dict if there are too many items.
add_task_celery.delay(json.dumps(form.cleaned_data)),
and then in the task do -> json.loads(...)
2) Another thing you can try is to pass the serializer in the parameters explicitly.(using apply_async instead of delay)
add_task_celery.apply_async((name, date, freq), serializer='json')
3) You can also set value, if you haven't already, for setting CELERY_TASK_SERIALIZER = 'json' (default value is 'pickle').
I'm new to celery and an overall python noob. I must have stumbled upon the right solution during my research but I just don't seem to understand what I need to do for what seems to be a simple case scenario.
I followed the following guide to learn about flask+celery.
What I understand:
There seems there is something obvious I'm missing about how to trigger a task after the first one is finished. I tried using callbacks, using loops, even tried using Celery Flower and Celery beat to realise this has nothing with what I'm doing...
Goal:
After filling the form, I want to send an email with attachements (result of the task) or a failure email otherwise. Without having to wonder what my user is doing on the app (no HTTP requests)
My code:
class ClassWithTheTask:
def __init__(self, filename, proxies):
# do stuff until a variable results is created
self.results = 'this contains my result'
#app.route('/', methods=['GET', 'POST'])
#app.route('/index', methods=['GET', 'POST'])
def index():
form = MyForm()
if form.validate_on_submit():
# ...
# the task
my_task = task1.delay(file_path, proxies)
return redirect(url_for('taskstatus', task_id=my_task.id, filename=filename, email=form.email.data))
return render_template('index.html',
form=form)
#celery.task(bind=True)
def task1(self, filepath, proxies):
task = ClassWithTheTask(filepath, proxies)
return results
#celery.task
def send_async_email(msg):
"""Background task to send an email with Flask-Mail."""
with app.app_context():
mail.send(msg)
#app.route('/status/<task_id>/<filename>/<email>')
def taskstatus(task_id, filename, email):
task = task1.AsyncResult(task_id)
if task.state == 'PENDING':
# job did not start yet
response = {
'state': task.state,
'status': 'Pending...'
}
elif task.state != 'FAILURE':
response = {
'state': task.state,
'status': task.info.get('status', '')
}
if 'results' in task.info:
response['results'] = task.info['results']
response['untranslated'] = task.info['untranslated']
msg = Message('Task Complete for %s !' % filename,
recipients=[email])
msg.body = 'blabla'
with app.open_resource(response['results']) as fp:
msg.attach(response['results'], "text/csv", fp.read())
with app.open_resource(response['untranslated']) as fp:
msg.attach(response['untranslated'], "text/csv", fp.read())
# the big problem here is that it will send the email only if the user refreshes the page and get the 'SUCCESS' status.
send_async_email.delay(msg)
flash('task finished. sent an email.')
return redirect(url_for('index'))
else:
# something went wrong in the background job
response = {
'state': task.state,
'status': str(task.info), # this is the exception raised
}
return jsonify(response)
I don't get the goal of your method for status check. Anyway what you are describing can be accomplished this way.
if form.validate_on_submit():
# ...
# the task
my_task = (
task1.s(file_path, proxies).set(link_error=send_error_email.s(filename, error))
| send_async_email.s()
).delay()
return redirect(url_for('taskstatus', task_id=my_task.id, filename=filename, email=form.email.data))
Then your error task will look like this. The normal task can stay the way it is.
#celery.task
def send_error_email(task_id, filename, email):
task = AsyncResult(task_id)
.....
What happens here is that you are using a chain. You are telling Celery to run your task1, if that completes successfully then run send_async_email, if it fails run send_error_email. This should work, but you might need to adapt the parameters, consider it as pseudocode.
This does not seem right at all:
def task1(self, filepath, proxies):
task = ClassWithTheTask(filepath, proxies)
return results
The line my_task = task1.delay(file_path, proxies) earlier in your code suggests you want to return task but you return results which is not defined anywhere. (ClassWithTheTask is also undefined). This code would crash, and your task would never execute.