Django APScheduler: prevent multiple workers from running a scheduled task

I use APScheduler in Django on Windows IIS to run my background script. The problem is that the task gets run multiple times. If I run the same program on my PC it only runs once, but when I upload it to the Windows server (which hosts my Django app) it runs more times. I guess it has some connection with the number of workers? The job is scheduled, but each time the job runs it fires a seemingly random number of instances: first 1, then 2, then 10, then again 2. Even though I have 'replace_existing=True, coalesce=True, misfire_grace_time=1, max_instances=1'.
planer_zad.py
from apscheduler.schedulers.background import BackgroundScheduler
from blog.views import cron_mail_overdue

scheduler = BackgroundScheduler()

def start():
    scheduler.add_job(cron_mail_overdue, "cron", hour=7, minute=14, day_of_week='mon-sun',
                      id="task002", replace_existing=True, coalesce=True,
                      misfire_grace_time=10, max_instances=1)
    scheduler.start()
apps.py
from django.apps import AppConfig

class BlogConfig(AppConfig):
    name = 'blog'

    def ready(self):
        # print('Starting Scheduler...')
        from .planer import planer_zad
        planer_zad.start()
For testing I also tried an 'interval' trigger:
scheduler.add_job(cron_mail_overdue, "interval", minutes=1, id="task002", replace_existing=True, coalesce= True, misfire_grace_time = 10, max_instances = 1)
I also tried:
scheduler = BackgroundScheduler({
    'apscheduler.executors.default': {
        'class': 'apscheduler.executors.pool:ThreadPoolExecutor',
        'max_workers': '1'
    },
    'apscheduler.executors.processpool': {
        'type': 'processpool',
        'max_workers': '1'
    },
    'apscheduler.job_defaults.coalesce': 'True',
    'apscheduler.job_defaults.max_instances': '1',
    'apscheduler.timezone': 'UTC',
})
scheduler.add_job(cron_mail_overdue, "cron", hour=9, minute=3, second=0, day_of_week='mon-sun',
                  id="task002", replace_existing=True, coalesce=True,
                  misfire_grace_time=10, max_instances=1)
scheduler.start()
It does not work: sometimes it runs only once, then 12 times.

Just test whether the scheduler object already exists in ready():
# django/myapp/apps.py
import os

from django.apps import AppConfig
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger

class BlogConfig(AppConfig):
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'blog'

    def __init__(self, app_name, app_module):
        super(BlogConfig, self).__init__(app_name, app_module)
        self.background_scheduler = None

    def ready(self):
        # With the development server, ready() runs twice; only start in the main process.
        if os.environ.get('RUN_MAIN', None) != 'true':
            return
        if self.background_scheduler is None:
            self.background_scheduler = BackgroundScheduler()
            self.background_scheduler.add_job(self.task1, CronTrigger.from_crontab('* * * * *'))  # Every minute (debug).
            self.background_scheduler.start()

    def task1(self):
        print("cron task is working")
You can then retrieve it later:
# api.py
from django.apps import apps

@router.get("/background-task")
def background_task(request):
    """
    Run a background task.
    """
    user = request.user
    blog_config = apps.get_app_config('blog')
    background_scheduler = blog_config.background_scheduler
    return {"status": "Success", "True": str(background_scheduler)}

Related

telebot register_next_step_handler_by_chat_id does not work in a celery shared task

I have a small Telegram bot Django project. The bot needs to send a message to users that have been inactive for over an hour, then wait for the user's input via tg_nick_or_phone_input_handler, write it to the TelegramBotClientModel instance, and send more messages to the chat.
def tg_nick_or_phone_input_handler(message):
    chat_id = message.chat.id
    bot_client = TelegramBotClientModel.objects.get(chat_id=chat_id)
    bot_client.phone_or_nickname = message.text
    bot_client.request_sent = True
    bot_client.save()
    bot.send_message(
        chat_id=chat_id,
        text='REQUEST SENT'
    )
    bot.send_message(
        chat_id=chat_id,
        text='some message'
    )
import django.utils.timezone as tz
from celery import shared_task
from tg_funnel_bot.bot import bot
from .views import tg_nick_or_phone_input_handler
from .models import TelegramBotClientModel, BotMessagesSettingsModel

# celery task where the next step handler should be registered
@shared_task(name='send_message_for_interrupted_dialog_after_one_hour')
def send_message_for_interrupted_dialog_after_one_hour():
    bot_messages_settings = BotMessagesSettingsModel.objects.all().first()
    inactive_for_hour_clients = TelegramBotClientModel.objects.filter(
        updated_at__lte=tz.now() - tz.timedelta(minutes=5),
        request_sent=False,
        message_for_one_hour_inactive_sent=False
    )
    for inactive_client in inactive_for_hour_clients:
        chat_id = inactive_client.chat_id
        bot.send_message(
            chat_id=chat_id,
            text=bot_messages_settings.user_inactive_for_hour_message_text_first_part
        )
        bot.send_message(
            chat_id=chat_id,
            text=bot_messages_settings.user_inactive_for_hour_message_text_second_part,
            reply_markup=bot_messages_settings.get_inactive_message_second_part_markup()
        )
        bot.register_next_step_handler_by_chat_id(
            chat_id=chat_id,
            callback=tg_nick_or_phone_input_handler
        )
        inactive_client = TelegramBotClientModel.objects.get(pk=inactive_client.pk)
        inactive_client.message_for_one_hour_inactive_sent = True
        inactive_client.save()
    return tz.now()
bot.register_next_step_handler_by_chat_id does not register the handler, and the follow-up input in the bot chat is not processed.
Maybe I cannot use the bot instance in another process, but I can send messages, and register_next_step_handler_by_chat_id works if I use it in the file with the webhook update APIView, like this:
from telebot import types
from rest_framework.views import APIView
from rest_framework.response import Response
from tg_funnel_bot.bot import bot
from .models import BotMessagesSettingsModel, TelegramBotClientModel

class UpdatesHandlerBotAPIView(APIView):
    def post(self, request):
        json_data = request.body.decode('UTF-8')
        update_data = types.Update.de_json(json_data)
        bot.process_new_updates([update_data])
        return Response({'code': 200})

def after_loading_questions_data_handler(message):
    chat_id = message.chat.id
    bot_messages_settings = BotMessagesSettingsModel.objects.all().first()
    after_loading_questions_data_text = bot_messages_settings.after_data_loading_text
    bot.send_message(
        chat_id=chat_id,
        text=after_loading_questions_data_text
    )
    bot.register_next_step_handler_by_chat_id(
        chat_id=chat_id,
        callback=tg_nick_or_phone_input_handler
    )

def tg_nick_or_phone_input_handler(message):
    chat_id = message.chat.id
    bot_client = TelegramBotClientModel.objects.get(chat_id=chat_id)
    bot_client.phone_or_nickname = message.text
    bot_client.request_sent = True
    bot_client.save()
    bot.send_message(
        chat_id=chat_id,
        text='REQUEST SENT'
    )
    bot.send_message(
        chat_id=chat_id,
        text='some text'
    )
Celery configs
from celery.schedules import crontab

CELERY_BROKER_URL = 'redis://localhost:6379/0'
CELERY_RESULT_BACKEND = 'redis://localhost:6379/0'
CELERY_BEAT_SCHEDULE = {
    'add-every-10-minutes': {
        'task': 'send_message_for_interrupted_dialog_after_one_hour',
        'schedule': crontab(minute='*/1'),
    }
}
This is so strange. Give me a hand, please.
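One thing worth noting, as an assumption rather than a confirmed diagnosis: pyTelegramBotAPI keeps next-step handlers in the bot's in-memory backend by default, so a handler registered inside a Celery worker process would not be visible to the separate process that handles webhook updates. A sketch of sharing the handlers through Redis, assuming the installed telebot version ships telebot.handler_backends.RedisHandlerBackend and that the constructor arguments shown match your Redis setup:
# tg_funnel_bot/bot.py -- sketch; constructor arguments are assumptions
import telebot
from telebot.handler_backends import RedisHandlerBackend

# Both the webhook process and the Celery worker import this same bot object,
# but next-step handlers now live in Redis instead of process memory.
bot = telebot.TeleBot(
    '***BOT_TOKEN***',
    next_step_backend=RedisHandlerBackend(host='localhost', port=6379, db=1),
)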

How to schedule my crawler function in django periodically using celery?

Here I have a view, CrawlerHomeView, which is used to create the task object from a form. Now I want to schedule this task periodically with Celery.
I want to schedule this CrawlerHomeView process using the task object's search_frequency and by checking some of the task object's fields.
Task Model
class Task(models.Model):
    INITIAL = 0
    STARTED = 1
    COMPLETED = 2
    ERROR = 3  # assumed value; the constant is referenced below but not shown in the question

    task_status = (
        (INITIAL, 'running'),
        (STARTED, 'running'),
        (COMPLETED, 'completed'),
        (ERROR, 'error')
    )
    FREQUENCY = (
        ('1', '1 hrs'),
        ('2', '2 hrs'),
        ('6', '6 hrs'),
        ('8', '8 hrs'),
        ('10', '10 hrs'),
    )
    name = models.CharField(max_length=255)
    scraping_end_date = models.DateField(null=True, blank=True)
    search_frequency = models.CharField(max_length=5, null=True, blank=True, choices=FREQUENCY)
    status = models.IntegerField(choices=task_status)
tasks.py
I want to run the view posted below periodically (period = the task's search_frequency) if the task status is 0 or 1 and the task's scraping end date has not passed. But I got stuck here. How can I do this?
@periodic_task(run_every=crontab(hour="task.search_frequency"))  # how to do this with the task's search_frequency value?
def schedule_task(pk):
    task = Task.objects.get(pk=pk)
    if task.status == 0 or task.status == 1 and not datetime.date.today() > task.scraping_end_date:
        # perform the crawl function ---> def crawl() how ??
        if task.scraping_end_date == datetime.date.today():
            task.status = 2
            task.save()  # change the task status to completed.
views.py
I want to run this view periodically. How can I do it?
class CrawlerHomeView(LoginRequiredMixin, View):
    login_url = 'users:login'

    def get(self, request, *args, **kwargs):
        # all_task = Task.objects.all().order_by('-id')
        frequency = Task()
        categories = Category.objects.all()
        targets = TargetSite.objects.all()
        keywords = Keyword.objects.all()
        form = CreateTaskForm()
        context = {
            'targets': targets,
            'keywords': keywords,
            'frequency': frequency,
            'form': form,
            'categories': categories,
        }
        return render(request, 'index.html', context)

    def post(self, request, *args, **kwargs):
        form = CreateTaskForm(request.POST)
        if form.is_valid():
            # try:
            unique_id = str(uuid4())  # create a unique ID.
            obj = form.save(commit=False)
            # obj.keywords = keywords
            obj.created_by = request.user
            obj.unique_id = unique_id
            obj.status = 0
            obj.save()
            form.save_m2m()
            keywords = ''
            # for keys in ast.literal_eval(obj.keywords.all()):  # keywords change to csv
            for keys in obj.keywords.all():
                if keywords:
                    keywords += ', ' + keys.title
                else:
                    keywords += keys.title
            # tasks = request.POST.get('targets')
            # targets = ['thehimalayantimes', 'kathmandupost']
            # print('$$$$$$$$$$$$$$$ keywords', keywords)
            task_ids = []  # one Task/Project contains one or multiple scrapy tasks
            settings = {
                'spider_count': len(obj.targets.all()),
                'keywords': keywords,
                'unique_id': unique_id,  # unique ID for each record for DB
                'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
            }
            # res = ast.literal_eval(ini_list)
            for site_url in obj.targets.all():
                domain = urlparse(site_url.address).netloc  # parse the url and extract the domain
                spider_name = domain.replace('.com', '')
                task = scrapyd.schedule('default', spider_name, settings=settings, url=site_url.address, domain=domain, keywords=keywords)
                # task = scrapyd.schedule('default', spider_name, settings=settings, url=obj.targets, domain=domain, keywords=obj.keywords)
            return redirect('crawler:task-list')
        # except:
        #     return render(request, 'index.html', {'form': form})
        return render(request, 'index.html', {'form': form, 'errors': form.errors})
Any suggestions or answers for this problem?
After fighting Celery for 5 years in a 15k tasks/second setup, I highly recommend switching to Dramatiq, which has a sane, reliable, performant code base that isn't split across multiple convoluted packages and has worked perfectly in two of my newer projects so far.
From the author's motivation
I’ve used Celery professionally for years and my growing frustration with it is one of the reasons why I developed dramatiq. Here are some of the main differences between Dramatiq, Celery and RQ:
There's also a Django helper package: https://github.com/Bogdanp/django_dramatiq
Granted, you won't have a builtin celerybeat, but a cron job calling Python tasks is more robust anyway; we lost a good amount of data because celerybeat decided to stall regularly :)
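As an illustration of the cron approach (the paths and command name here are assumptions, not from the answer):
# crontab entry: every 10 minutes, run a Django management command that
# enqueues the actual Dramatiq messages
*/10 * * * * /srv/app/venv/bin/python /srv/app/manage.py enqueue_periodic_tasks >> /var/log/app/cron.log 2>&1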
There are two projects that aim to add periodic task creation: https://gitlab.com/bersace/periodiq and https://apscheduler.readthedocs.io/en/stable/
I haven't used those packages yet. What you could try with periodiq is selecting your database entries, looping through them, and defining a periodic task for each (but this requires regular restarts of the periodiq worker to pick up changes):
# tasks.py
import dramatiq
from dramatiq import get_broker
from periodiq import PeriodiqMiddleware, cron

from .models import Task

broker = get_broker()
broker.add_middleware(PeriodiqMiddleware(skip_delay=30))

for obj in Task.objects.all():
    # each actor needs a unique name, otherwise the definitions would collide
    @dramatiq.actor(actor_name='periodic_task_{}'.format(obj.pk), periodic=cron(obj.frequency))
    def hourly(obj=obj):
        # import logic based on obj.name
        # Do something each hour…
        pass
For the error:
Exception Type: EncodeError
Exception Value: Object of type timedelta is not JSON serializable
Instead of defining the following variable in the Django settings,
CELERY_BEAT_SCHEDULE = {
    'task-first': {
        'task': 'scheduler.tasks.create_task',
        'schedule': timedelta(minutes=1)
    },
}
can you try the following in your celery file:
from celery.schedules import crontab

app.conf.beat_schedule = {
    'task-first': {
        'task': 'scheduler.tasks.create_task',
        'schedule': crontab(minute='*/1')
    }
}
This works for me, given that the Celery server is up and running.
Apart from this, why are you redirecting to 'list_tasks' after each task; what exactly does it do? Also, you have called the Celery task from the view with add_task_celery.delay(name, date, freq); is that just another way to add a task, apart from the periodic task defined using celery-beat?
Edit 1:
My structure looks as follows:
settings.py
CELERY_TIMEZONE = 'Asia/Kolkata'
CELERY_BROKER_URL = 'amqp://localhost'
celery.py
app.conf.beat_schedule = {
    'task1': {
        'task': '<app_name>.tasks.random_task',
        'schedule': crontab(minute=0, hour=0)
    },
}
Here you should note that I have a file named tasks in my app folder, and there I have written a shared task as follows:
@shared_task
def random_task(total):
    ...
Also, apart from this you should start both a celery beat and a celery worker process, as follows:
celery -A <project_name>.celery worker -l error
celery -A <project_name>.celery beat -l error --scheduler django_celery_beat.schedulers:DatabaseScheduler
You can use any scheduler you want; in production I use the DatabaseScheduler. For testing you can try the following command:
celery -A <project_name> beat -l info -S django
You should run all these commands from the Django project's root folder.
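Since the DatabaseScheduler reads its schedule from the database, one way to feed it the per-Task frequency from the original question is to create a PeriodicTask row whenever a Task object is saved. A minimal sketch using django-celery-beat's models (the dotted task path and names are assumptions):
# somewhere after the Task object is saved (sketch)
import json

from django_celery_beat.models import IntervalSchedule, PeriodicTask

schedule, _ = IntervalSchedule.objects.get_or_create(
    every=int(task.search_frequency),      # '1', '2', '6', ... hours from the FREQUENCY choices
    period=IntervalSchedule.HOURS,
)
PeriodicTask.objects.create(
    interval=schedule,
    name='crawl-task-{}'.format(task.pk),  # must be unique
    task='crawler.tasks.schedule_task',    # assumed dotted path to the task
    args=json.dumps([task.pk]),
)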
I believe the problem is with the 2nd and 3rd parameters in the task definition, which are freq and date. From the error you posted, Object of type timedelta is not JSON serializable, it looks like it's talking about the freq field, which is of type DurationField and returns a timedelta object.
Ideally, both fields must be serialized before passing to the task.
One simple way would be:
1) You can explicitly serialize these fields, pass them to the task, and in the task convert them back to datetime / timedelta objects.
Alternatively, you can dump the whole data dict if there are too many items:
add_task_celery.delay(json.dumps(form.cleaned_data))
and then in the task do json.loads(...); a sketch of this approach follows.
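A minimal sketch of option 1, assuming the form provides name, a date, and a timedelta freq (the field names are illustrative, not taken from the question):
# views.py (sketch) -- serialize the values explicitly before sending
import json

payload = json.dumps({
    'name': form.cleaned_data['name'],
    'date': form.cleaned_data['date'].isoformat(),
    'freq': form.cleaned_data['freq'].total_seconds(),
})
add_task_celery.delay(payload)

# tasks.py (sketch) -- rebuild native objects inside the task
import json
from datetime import date, timedelta

from celery import shared_task

@shared_task
def add_task_celery(payload):
    data = json.loads(payload)
    task_date = date.fromisoformat(data['date'])   # Python 3.7+
    freq = timedelta(seconds=data['freq'])
    # ... create the periodic work from name / task_date / freq ...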
2) Another thing you can try is to pass the serializer explicitly in the parameters (using apply_async instead of delay):
add_task_celery.apply_async((name, date, freq), serializer='json')
3) You can also set, if you haven't already, the setting CELERY_TASK_SERIALIZER = 'json' (the default value is 'pickle').

celery 4.3.0 : get variable from inside current task

I have this task which is designed to bulk insert or delete objects in the database:
views.py
from .tasks import run_task_with

def index():
    # some code to retrieve obj_list
    run_task_with(insert_obj, obj_list).delay()
    return HttpResponseRedirect('/app_root/')
tasks.py
from time import sleep

from celery import shared_task, current_task
from django.core.cache import cache

@shared_task
def run_task_with(func, queryset):
    cache.add('current_task_id', run_task_with.request.id)
    obj_numb = len(queryset)
    r = map(func, queryset)
    for i, obj in enumerate(r):
        sleep(0.1)
        progress_percent = int(round(float(i) / float(obj_numb) * 100))
        current_task.update_state(
            state='PROGRESS',
            meta={'progress_percent': progress_percent}
        )
But run_task_with.request.id keeps returning None even though the object insertions run smoothly. Could anyone explain to me why?
Thanks
Answer found here: Celery does not registering tasks
from django.apps import apps
app.autodiscover_tasks(lambda: [n.name for n in apps.get_app_configs()])
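For context, a minimal sketch of where that call usually lives in the project's celery.py (module and project names here are placeholders):
# celery.py (sketch)
import os

from celery import Celery
from django.apps import apps

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'project.settings')

app = Celery('project')
app.config_from_object('django.conf:settings')
# register tasks from every installed app config
app.autodiscover_tasks(lambda: [n.name for n in apps.get_app_configs()])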
And for the subsidiary issue (being able to pass the function func to the task):
CELERY_ACCEPT_CONTENT = ['json', 'pickle']
CELERY_TASK_SERIALIZER = 'pickle'

python - telegram bot sendMessage in specific date

I am terribly new to Python and my progress is like a snail's :(
I want to make a Telegram bot that sends a message at a specific date and time. I used the apscheduler and telepot libraries for that, and this is my code:
import telepot
import sys
import time
from time import sleep
from datetime import datetime
from apscheduler.scheduler import Scheduler
import logging

bot = telepot.Bot("***")
logging.basicConfig()
sched = Scheduler()
sched.start()
exec_date = datetime(2017, 9, 12, 1, 51, 0)

def handle(msg):
    content_type, chat_type, chat_id = telepot.glance(msg)
    print(content_type, chat_type, chat_id)
    if content_type == 'text':
        bot.sendMessage(chat_id, msg['text'])

def sendSimpleText():
    # content_type, chat_type, chat_id = telepot.glance(msg)
    # print(content_type, chat_type, chat_id)
    #
    # if content_type == 'text':
    chat_id = telepot.
    bot.sendMessage(chat_id, 'faez')

def main():
    job = sched.add_date_job(sendSimpleText, exec_date)
    while True:
        sleep(1)
        sys.stdout.write('.'); sys.stdout.flush()
    # bot.message_loop(handle)
    # # job = sched.add_date_job(sendSimpleText, '2017-09-11 21:35:00', ['testFuckBot'])
    # while True:
    #     time.sleep(10)

if __name__ == '__main__':
    main()
My question is: what do I pass to sendSimpleText as an argument in add_date_job, in this line:
job = sched.add_date_job(sendSimpleText, exec_date)
I know that msg is the message that the user typed, so for add_date_job I have nothing to pass?
You are using an old (2.1.2) version of APScheduler.
The new version has a new syntax.
The function add_date_job is no longer available.
Here is a working solution for you:
import telepot
import sys
import time
from datetime import datetime
from apscheduler.schedulers.background import BackgroundScheduler
from telepot.loop import MessageLoop
import logging

bot = telepot.Bot("***YOUR_BOT_TOKEN***")
logging.basicConfig()
sched = BackgroundScheduler()
exec_date = datetime(2017, 9, 12, 3, 5, 0)

def handle(msg):
    content_type, chat_type, chat_id = telepot.glance(msg)
    print(content_type, chat_type, chat_id)
    if content_type == 'text':
        bot.sendMessage(chat_id, msg['text'])

def sendSimpleText(chat_id):
    bot.sendMessage(chat_id, 'faez')

def main():
    MessageLoop(bot, handle).run_as_thread()
    job = sched.add_job(sendSimpleText, run_date=exec_date, args=['**YOUR_TELEGRAM_ID**'])
    while True:
        time.sleep(1)
        sys.stdout.write('.'); sys.stdout.flush()

if __name__ == '__main__':
    sched.start()
    main()
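A note on the add_job call: because no trigger is named, APScheduler 3 falls back to a one-off date trigger, so the equivalent explicit form would be:
job = sched.add_job(sendSimpleText, 'date', run_date=exec_date, args=['**YOUR_TELEGRAM_ID**'])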

How to get the "full" async result in Celery link_error callback

I have Celery 3.1.18 running with Django 1.6.11 and RabbitMQ 3.5.4, and I am trying to test my async task in a failure state (CELERY_ALWAYS_EAGER=True). However, I cannot get the proper "result" in the error callback. The example in the Celery docs shows:
@app.task(bind=True)
def error_handler(self, uuid):
    result = self.app.AsyncResult(uuid)
    print('Task {0} raised exception: {1!r}\n{2!r}'.format(
          uuid, result.result, result.traceback))
When I do this, my result is still "PENDING", result.result = '', and result.traceback = ''. But the actual result returned by my .apply_async call has the right "FAILURE" state and traceback.
My code (basically a Django Rest Framework RESTful endpoint that parses a .tar.gz file, and then sends a notification back to the user when the file is done parsing):
views.py:
from django.conf import settings
from django.core.files.storage import default_storage
from rest_framework.views import APIView
from rest_framework.response import Response

from producer_main.celery import app as celery_app
from .tasks import import_file  # assumed location of the task

@celery_app.task()
def _upload_error_simple(uuid):
    print uuid
    result = celery_app.AsyncResult(uuid)
    print result.backend
    print result.state
    print result.result
    print result.traceback
    msg = 'Task {0} raised exception: {1!r}\n{2!r}'.format(uuid,
                                                           result.result,
                                                           result.traceback)

class UploadNewFile(APIView):
    def post(self, request, repository_id, format=None):
        try:
            uploaded_file = self.data['files'][self.data['files'].keys()[0]]
            self.path = default_storage.save('{0}/{1}'.format(settings.MEDIA_ROOT,
                                                              uploaded_file.name),
                                             uploaded_file)
            print type(import_file)
            self.async_result = import_file.apply_async((self.path, request.user),
                                                        link_error=_upload_error_simple.s())
            print 'results from self.async_result:'
            print self.async_result.id
            print self.async_result.backend
            print self.async_result.state
            print self.async_result.result
            print self.async_result.traceback
            return Response()
        except (PermissionDenied, InvalidArgument, NotFound, KeyError) as ex:
            gutils.handle_exceptions(ex)
tasks.py:
from producer_main.celery import app
from utilities.general import upload_class

@app.task
def import_file(path, user):
    """Asynchronously import a course."""
    upload_class(path, user)
celery.py:
"""
As described in
http://celery.readthedocs.org/en/latest/django/first-steps-with-django.html
"""
from __future__ import absolute_import
import os
import logging
from celery import Celery
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'producer_main.settings')
from django.conf import settings
log = logging.getLogger(__name__)
app = Celery('producer') # pylint: disable=invalid-name
# Using a string here means the worker will not have to
# pickle the object when using Windows.
app.config_from_object('django.conf:settings')
app.autodiscover_tasks(lambda: settings.INSTALLED_APPS) # pragma: no cover
#app.task(bind=True)
def debug_task(self):
print('Request: {0!r}'.format(self.request))
My backend is configured as such:
CELERY_ALWAYS_EAGER = True
CELERY_EAGER_PROPAGATES_EXCEPTIONS = False
BROKER_URL = 'amqp://'
CELERY_RESULT_BACKEND = 'redis://localhost'
CELERY_RESULT_PERSISTENT = True
CELERY_IGNORE_RESULT = False
When I run my unittest for the link_error state, I get:
Creating test database for alias 'default'...
<class 'celery.local.PromiseProxy'>
130ccf13-c2a0-4bde-8d49-e17eeb1b0115
<celery.backends.redis.RedisBackend object at 0x10aa2e110>
PENDING
None
None
results from self.async_result:
130ccf13-c2a0-4bde-8d49-e17eeb1b0115
None
FAILURE
Non .zip / .tar.gz file passed in.
Traceback (most recent call last):
So the task results are not available in my _upload_error_simple() method, but they are available from the returned self.async_result variable...
I could not get the link and link_error callbacks to work, so I finally had to use the on_failure and on_success task methods described in the docs and this SO question. My tasks.py then looks like:
from celery import Task

class ErrorHandlingTask(Task):
    abstract = True

    def on_failure(self, exc, task_id, targs, tkwargs, einfo):
        msg = 'Import of {0} raised exception: {1!r}'.format(targs[0].split('/')[-1],
                                                             str(exc))

    def on_success(self, retval, task_id, targs, tkwargs):
        msg = "Upload successful. You may now view your course."

@app.task(base=ErrorHandlingTask)
def import_file(path, user):
    """Asynchronously import a course."""
    upload_class(path, user)
You appear to have _upload_error() as a bound method of your class - this is probably not what you want. Try making it a stand-alone task:
@celery_app.task(bind=True)
def _upload_error(self, uuid):
    result = celery_app.AsyncResult(uuid)
    msg = 'Task {0} raised exception: {1!r}\n{2!r}'.format(uuid,
                                                           result.result,
                                                           result.traceback)

class Whatever(object):
    ....
    self.async_result = import_file.apply_async((self.path, request.user),
                                                link=self._upload_success.s(
                                                    "Upload finished."),
                                                link_error=_upload_error.s())
In fact there's no need for the self parameter since it's not used, so you could just do this:
@celery_app.task()
def _upload_error(uuid):
    result = celery_app.AsyncResult(uuid)
    msg = 'Task {0} raised exception: {1!r}\n{2!r}'.format(uuid,
                                                           result.result,
                                                           result.traceback)
Note the absence of bind=True and self.
Be careful with UUID instances!
If you try to get the status of a task with an id that is a UUID rather than a string, you will only get a PENDING status.
from uuid import UUID
from celery.result import AsyncResult
task_id = UUID('d4337c01-4402-48e9-9e9c-6e9919d5e282')
print(AsyncResult(task_id).state)
# PENDING
print(AsyncResult(str(task_id)).state)
# SUCCESS