I want to fetch categories from a Magento API and display them in a template. In the same time, I want to save them in DB for an ulterior use.
Categories are too many and the render of the template takes more than 30 sec.
I start to learn using asyncio but couldn't get my way with it. I surely missed something.
First, my URL leads to the function that retrieves the categories
#login_required
def get_categories(request):
Category.objects.all().delete()
try:
cats = fetch_categories()
tree = cats['children_data']
except:
print('erreur : impossible de récupérer les catégories (fetch_categories)')
asyncio.run(parse_categories(tree))
return render(request, 'categories/categories_list.html', {'tree': tree})
When I get the "categories tree", I send it to
async def parse_categories(tree):
for lvl1 in tree:
all_tasks = []
asyncio.create_task(save_cat(lvl1))
# main products categories (turbo, injectors ...)
for lvl2 in lvl1['children_data']:
asyncio.create_task(save_cat(lvl2))
# sub categories like RENAULT, DACIA
for lvl3 in lvl2['children_data']:
asyncio.create_task(save_cat(lvl3))
for lvl4 in lvl3['children_data']:
asyncio.create_task(save_cat(lvl4))
for lvl5 in lvl4['children_data']:
asyncio.create_task(save_cat(lvl5))
My save() function is async. I'm not sure it should be. Before I started using async, it was working.
async def save_cat(cat):
cat_id = cat['id']
new_cat = Category()
new_cat.id = cat_id
new_cat.name = cat.get('name', None)
new_cat.parent = cat.get('parent_id', None)
new_cat.url = cat.get('path', None)
new_cat.is_active = cat.get('is_active', None)
new_cat.position = cat.get('position', None)
new_cat.level = cat.get('level', None)
new_cat.save()
When I run, no error. The context is well sent to the template and displays well. But no category is saved.
I also tried to make a task list with asyncio.create_task in each level and execute the loop at the end of parse_categories() like said in this thread, without success.
all_tasks.append(asyncio.create_task(save_cat(lvl1)))
[...]
responses = asyncio.gather(*all_tasks, return_exceptions=True)
loop = asyncio.get_event_loop()
loop.run_until_complete(responses)
loop.close()
Any clue to solve my case will be welcome
I'm experiencing issues trying to update a queryset on setUp:
class MyTestCase(BaseTestCase):
OPERATOR_USERNAME = "test_operator"
OPERATOR_PASSWORD = "secret"
OPERATOR_EMAIL = "test#example.org"
#classmethod
def setUpClass(cls):
super().setUpClass()
cls.operator = Operator.objects.create_superuser(
username=cls.OPERATOR_USERNAME, password=cls.OPERATOR_PASSWORD, email=cls.OPERATOR_EMAIL
)
def setUp(self) -> None:
self.client.login(username=self.OPERATOR_USERNAME, password=self.OPERATOR_PASSWORD)
utd_ids = MyModel.objects.filter(
ref_year=2021).values_list("id", flat=True
)[:10]
utd_qs = MyModel.objects.filter(id__in=utd_ids) # just added another step for debugging purposes
# update initial utd status
_updates = utd_qs.update(status="INITIAL_STATE_VALUE")
print(_updates) # it prints 10
self.ssn_list = list(utd_qs.values_list("user__ssn", flat=True))
self.client.login(username=self.OPERATOR_USERNAME, password=self.OPERATOR_PASSWORD)
print(MyModel.objects.filter(id__in=utd_ids).values("status").distinct())
# this should retrieve 1 value but instead it retrieve multiple values different from INITIAL_STATE_VALUE
am I doing something wrong? I tried the same update through python manage.py shell on a similar queryset and it works as expected
In trying to understand django's select_for_update, I created a test to see. In my understanding, if there are two processes, one selects for update inside transaction, the other tries to write to the locked rows before transaction ends in other process, the write would fail. However, the following test passes ok. What could be wrong?
#pytest.mark.django_db(transaction=True)
def test_select_for_update():
ExampleModel.objects.create(type="test1")
ExampleModel.objects.create(type="test2")
import os
child_pid = os.fork()
if child_pid == 0:
sleep(5)
qs = ExampleModel.objects.all()
ExampleModel.objects.bulk_update(qs, ["type"])
else:
with transaction.atomic():
qs = ExampleModel.objects.all().select_for_update()
for obj in qs:
obj.type = "test3"
ExampleModel.objects.bulk_update(qs, ["type"])
sleep(10)
Here I have a view CrawlerHomeView which is used to create the task object from a form now I want to schedule this task periodically with celery.
I want to schedule this CrawlerHomeView process with the task object search_frequency and by checking some task object fields.
Task Model
class Task(models.Model):
INITIAL = 0
STARTED = 1
COMPLETED = 2
task_status = (
(INITIAL, 'running'),
(STARTED, 'running'),
(COMPLETED, 'completed'),
(ERROR, 'error')
)
FREQUENCY = (
('1', '1 hrs'),
('2', '2 hrs'),
('6', '6 hrs'),
('8', '8 hrs'),
('10', '10 hrs'),
)
name = models.CharField(max_length=255)
scraping_end_date = models.DateField(null=True, blank=True)
search_frequency = models.CharField(max_length=5, null=True, blank=True, choices=FREQUENCY)
status = models.IntegerField(choices=task_status)
tasks.py
I want to run the view below posted periodically [period=(task's search_frequency time] if the task status is 0 or 1 and not crossed the task scraping end date. But I got stuck here. How can I do this?
#periodic_task(run_every=crontab(hour="task.search_frequency")) # how to do with task search_frequency value
def schedule_task(pk):
task = Task.objects.get(pk=pk)
if task.status == 0 or task.status == 1 and not datetime.date.today() > task.scraping_end_date:
# perform the crawl function ---> def crawl() how ??
if task.scraping_end_date == datetime.date.today():
task.status = 2
task.save() # change the task status as complete.
views.py
I want to run this view periodically.How can I do it?
class CrawlerHomeView(LoginRequiredMixin, View):
login_url = 'users:login'
def get(self, request, *args, **kwargs):
# all_task = Task.objects.all().order_by('-id')
frequency = Task()
categories = Category.objects.all()
targets = TargetSite.objects.all()
keywords = Keyword.objects.all()
form = CreateTaskForm()
context = {
'targets': targets,
'keywords': keywords,
'frequency': frequency,
'form':form,
'categories': categories,
}
return render(request, 'index.html', context)
def post(self, request, *args, **kwargs):
form = CreateTaskForm(request.POST)
if form.is_valid():
# try:
unique_id = str(uuid4()) # create a unique ID.
obj = form.save(commit=False)
# obj.keywords = keywords
obj.created_by = request.user
obj.unique_id = unique_id
obj.status = 0
obj.save()
form.save_m2m()
keywords = ''
# for keys in ast.literal_eval(obj.keywords.all()): #keywords change to csv
for keys in obj.keywords.all():
if keywords:
keywords += ', ' + keys.title
else:
keywords += keys.title
# tasks = request.POST.get('targets')
# targets = ['thehimalayantimes', 'kathmandupost']
# print('$$$$$$$$$$$$$$$ keywords', keywords)
task_ids = [] #one Task/Project contains one or multiple scrapy task
settings = {
'spider_count' : len(obj.targets.all()),
'keywords' : keywords,
'unique_id': unique_id, # unique ID for each record for DB
'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
}
# res = ast.literal_eval(ini_list)
for site_url in obj.targets.all():
domain = urlparse(site_url.address).netloc # parse the url and extract the domain
spider_name = domain.replace('.com', '')
task = scrapyd.schedule('default', spider_name, settings=settings, url=site_url.address, domain=domain, keywords=keywords)
# task = scrapyd.schedule('default', spider_name , settings=settings, url=obj.targets, domain=domain, keywords=obj.keywords)
return redirect('crawler:task-list')
# except:
# return render(request, 'index.html', {'form':form})
return render(request, 'index.html', {'form':form, 'errors':form.errors})
Any Suggestions or answer is there for this problem ?
After fighting Celery for 5 years in a 15k tasks/second setup I highly recommend you to switch to Dramatiq, which has a sane, reliable, performant code base that isn't split across multiple convoluted packages and works perfectly in two of my newer projects so far.
From the author's motivation
I’ve used Celery professionally for years and my growing frustration with it is one of the reasons why I developed dramatiq. Here are some of the main differences between Dramatiq, Celery and RQ:
There's also a a Django helper package: https://github.com/Bogdanp/django_dramatiq
Granted, you won't have a builtin celerybeat, but a cron calling python tasks is more robust anyway, we lost a good amount of data because celerybeat decided to stall regularly :)
There are two projects that aim to add periodic task creation: https://gitlab.com/bersace/periodiq and https://apscheduler.readthedocs.io/en/stable/
I haven't used those packages yet, what you could try with periodiq is selecting your database entries, loop through those and define a periodic-task for each (but this requires regular restarts of the periodiq worker to pick up changes):
# tasks.py
from dramatiq import get_broker
from periodiq import PeriodiqMiddleware, cron
broker = get_broker()
broker.add_middleware(PeriodiqMiddleware(skip_delay=30))
for obj in Task.objects.all():
#dramatiq.actor(periodic=cron(obj.frequency))
def hourly(obj=obj):
# import logic based on obj.name
# Do something each hour…
For the error,
Exception Type: EncodeError
Exception Value:
Object of type timedelta is not JSON serializable
Instead of defining following variable in django settings,
CELERY_BEAT_SCHEDULE = {
'task-first': {
'task': 'scheduler.tasks.create_task',
'schedule': timedelta(minutes=1)
},
can you try following in your celery file:
app.conf.beat_schedule = {
'task-first': {
'task': 'scheduler.tasks.create_task',
'schedule': crontab(minute='*/1')
}
}
this works for me given, celery server is up and running.
Apart from this why are you redirecting to 'list_tasks' after each task, what does it exactly do? Also, you have called the celery task from the view add_task_celery.delay(name,date,freq), is it just another way to add task apart from periodic task defined using celery-beat?
Edit 1:
My structure looks like as follow:
settings.py
CELERY_TIMEZONE = 'Asia/Kolkata'
CELERY_BROKER_URL = 'amqp://localhost'
celery.py
app.conf.beat_schedule = {
'task1': {
'task': '<app_name>.tasks.random_task',
'schedule': crontab(minute=0, hour=0)
},
}
Here you should note that I have a file named tasks in my app folder and there I have written a shared task as follow:
#shared_task
def random_task(total):
...
Also, apart from this you should start both celery beat as well as a celery worker process as follow:
celery -A <project_name>.celery worker -l error
celery -A <project_name>.celery beat -l error --scheduler django_celery_beat.schedulers:DatabaseScheduler
You can any scheduler you want, on production I use DatabaseScheduler. For testing you can try with following command:
celery -A <project_name> beat -l info -S django
You should run all these commands from the project folder of the Django project
I believe the problem is with 2nd and 3rd parameter in the task definition, which is freq and date. Although from the error, you posted, Object of type timedelta is not JSON serializable, it looks like it's talking about freq field which is of type DurationField that returns timedelta object.
Ideally, both fields must be serialized before passing to the task.
one simple way would be -
1) You can explicitly serialize these fields and pass to the task and in the task again convert it to datetime / timedelta object.
alternatively, you can dump whole data dict if there are too many items.
add_task_celery.delay(json.dumps(form.cleaned_data)),
and then in the task do -> json.loads(...)
2) Another thing you can try is to pass the serializer in the parameters explicitly.(using apply_async instead of delay)
add_task_celery.apply_async((name, date, freq), serializer='json')
3) You can also set value, if you haven't already, for setting CELERY_TASK_SERIALIZER = 'json' (default value is 'pickle').
I have a problem where I insert a database item using a SQLAlchemy / Tastypie REST interface, but the item is missing when subsequently get the list of items. It shows up only after I get the list of items a second time.
I am using SQLAlchemy with Tastypie/Django running on Apache via mod_wsgi. I use a singleton Database Manager class to hold my engine and declarative_base, and with Tastypie, a separate class to get the session and make sure I roll-back if there is a problem with the commit. As in the update below, the problem occurs when I don't close my session after inserting. Why is this necessary?
My original code was like this:
Session = scoped_session(sessionmaker(autoflush=True))
# Singleton Database Manager class for managing session
class DatabaseManager():
engine = None
base = None
def ready(self):
host='mysql+mysqldb://etc...'
if self.engine and self.base:
return True
else:
try:
self.engine = create_engine(host, pool_recycle=3600)
self.base = declarative_base(bind=self.engine)
return True
except:
return False
def getSession(self):
if self.ready():
session = Session()
session.configure(bind=self.engine)
return session
else:
return None
DM = DatabaseManager()
# A session class I use with Tastypie to ensure the session is destroyed at the
# end of the transaction, because Tastypie creates singleton Resources used for
# all threads
class MySession:
def __init__(self):
self.s = DM.getSession()
def safeCommit(self):
try:
self.s.commit()
except:
self.s.rollback()
raise
def __del__(self):
try:
self.s.commit()
except:
self.s.rollback()
raise
# ... Then ... when I get requests through Apache/mod_wsgi/Django/Tastypie
# First Request
obj_create():
db = MySession()
print db.s.query(DBClass).count() # returns 4
newItem = DBClass()
db.s.add(newItem)
db.s.safeCommit()
print db.s.query(DBClass).count() # returns 5
# Second Request after First Request returns
obj_get_list():
db = MySession()
print db.s.query(DBClass).count() # returns 4 ... should be 5
# Third Request is okay
obj_get_list():
db = MySession()
print db.s.query(DBClass).count() # returns 5
UPDATE
After further digging, it appears that the problem is my session needed to be closed after creating. Perhaps because Tastypie's object_create() adds the SQLAlchemy object to it's bundle, and I don't know what happens after it leaves the function's scope:
obj_create():
db = MySession()
newItem = DBClass()
db.s.add(newItem)
db.s.safeCommit()
copiedObj = copyObj(newItem) # copy SQLAlchemy record into non-sa object (see below)
db.s.close()
return copiedObj
If someone cares to explain this in an answer, I can close the question. Also, for those who are curious, I copy my object out of SQLAlchemy like this:
class Struct:
def __init__(self, **entries):
self.__dict__.update(entries)
class MyTastypieResource(Resource):
...
def copyObject(self, object):
base = {}
# self._meta is part of my tastypie resource
for p in class_mapper(self._meta.object_class).iterate_properties:
if p.key not in base and p.key not in self._meta.excludes:
base[p.key] = getattr(object,p.key)
return Struct(**base)
The problem was resolved by closing my session. The update in the answer didn't solve the problem fully - I ended up adding a middleware class to close the session at the end of a transaction. This ensured everything was written to the database. The middleware looks a bit like this:
class SQLAlchemySessionMiddleWare(object):
def process_response(self, request, response):
try:
session = MyDatabaseManger.getSession()
session.commit()
session.close()
except Exception, err:
pass
return response
def process_exception(self, request, exception):
try:
session = MyDatabaseManger.getSession()
session.rollback()
session.close()
except Exception, err:
pass