Optimizing Django Aggregation Over Subsets - django

I'm currently trying to do some summary statistics calculations for date-based subsets of a large "competition" table (~3M rows) stored in SQLite. Specifically, I'm trying to calculate statistics for:
this year
last year
the lifetime of the competitor
Here's a model breakdown:
class Competitor(models.Model):
    # ... ID is the only important part here

class Venue(models.Model):
    # ... ID is the only important part here

class Division(models.Model):
    venue = models.ForeignKey(Venue)
    # ...

class Level(models.Model):
    division = models.ForeignKey(Division)
    # ...

class Class(models.TextChoices):
    STANDARD = "Standard", _("Standard")
    JWW = "JWW", _("JWW")

class Run(models.Model):
    competitor_id = models.ForeignKey(Competitor, related_name="runs", db_index=True)
    date = models.DateField(verbose_name="Date created", db_index=True)
    MACH = models.IntegerField(..., db_index=True)
    PACH = models.IntegerField(..., db_index=True)
    yps = models.FloatField(..., db_index=True)
    score = models.IntegerField(..., db_index=True)
    qualified = models.BooleanField(..., db_index=True)
    division = models.ForeignKey(Division, db_index=True)
    level = models.ForeignKey(Level, db_index=True)
    cls = models.CharField(max_length=..., choices=Class.choices)
    # ... Other fields that aren't relevant
For each Competitor, I want to generate summary statistics that describe their performance this year, last year, and over all time, and store that in a Report model:
class CurrPrevLifetime(models.Model):
    curr_yr = models.FloatField(default=0)
    prev_yr = models.FloatField(default=0)
    lifetime = models.FloatField(default=0)

class Report(models.Model):
    ... = models.ForeignKey(CurrPrevLifetime, related_name=...)
    # repeat as needed for as many fields need this
My current aggregation setup looks like this:
import datetime

from django.db.models import Avg, Max, Q, Sum

curr_yr = Q(date__year=datetime.date.today().year)
prev_yr = Q(date__year=datetime.date.today().year - 1)
JWW = Q(cls=Class.JWW)
standard = Q(cls=Class.STANDARD)

aggregates = {
    "curr_yr_max_standard_MACH": Max("MACH", filter=curr_yr & standard),
    "curr_yr_max_standard_PACH": Max("PACH", filter=curr_yr & standard),
    "curr_yr_average_yps_standard": Avg("yps", filter=curr_yr & standard),
    "curr_yr_max_yps_standard": Max("yps", filter=curr_yr & standard),
    "curr_yr_max_JWW_MACH": Max("MACH", filter=curr_yr & JWW),
    "curr_yr_max_JWW_PACH": Max("PACH", filter=curr_yr & JWW),
    "curr_yr_average_yps_JWW": Avg("yps", filter=curr_yr & JWW),
    "curr_yr_max_yps_JWW": Max("yps", filter=curr_yr & JWW),
    "curr_yr_MACH_points": Sum("MACH", filter=curr_yr),
    "curr_yr_PACH_points": Sum("PACH", filter=curr_yr),
    "prev_yr_max_standard_MACH": Max("MACH", filter=prev_yr & standard),
    "prev_yr_max_standard_PACH": Max("PACH", filter=prev_yr & standard),
    "prev_yr_average_yps_standard": Avg("yps", filter=prev_yr & standard),
    "prev_yr_max_yps_standard": Max("yps", filter=prev_yr & standard),
    "prev_yr_max_JWW_MACH": Max("MACH", filter=prev_yr & JWW),
    "prev_yr_max_JWW_PACH": Max("PACH", filter=prev_yr & JWW),
    "prev_yr_average_yps_JWW": Avg("yps", filter=prev_yr & JWW),
    "prev_yr_max_yps_JWW": Max("yps", filter=prev_yr & JWW),
    "prev_yr_MACH_points": Sum("MACH", filter=prev_yr),
    "prev_yr_PACH_points": Sum("PACH", filter=prev_yr),
    "lifetime_max_standard_MACH": Max("MACH", filter=standard),
    "lifetime_max_standard_PACH": Max("PACH", filter=standard),
    "lifetime_average_yps_standard": Avg("yps", filter=standard),
    "lifetime_max_yps_standard": Max("yps", filter=standard),
    "lifetime_max_JWW_MACH": Max("MACH", filter=JWW),
    "lifetime_max_JWW_PACH": Max("PACH", filter=JWW),
    "lifetime_average_yps_JWW": Avg("yps", filter=JWW),
    "lifetime_max_yps_JWW": Max("yps", filter=JWW),
    "lifetime_MACH_points": Sum("MACH"),
    "lifetime_PACH_points": Sum("PACH"),
}

competitor.runs.aggregate(**aggregates)
I then take the results, break them into triples of (curr, prev, lifetime), and store those as CurrPrevLifetime using CurrPrevLifetime.objects.bulk_create.
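For illustration, here is a minimal sketch of that post-processing step. It assumes the metric suffixes mirror the aggregate keys above and that None results (no matching runs) should fall back to 0, matching the model defaults; the exact helper names are not from the original code.
results = competitor.runs.aggregate(**aggregates)

# Each metric appears three times in results, prefixed with
# curr_yr_ / prev_yr_ / lifetime_, so the suffixes can be reused as keys.
metric_suffixes = [
    "max_standard_MACH", "max_standard_PACH",
    "average_yps_standard", "max_yps_standard",
    "max_JWW_MACH", "max_JWW_PACH",
    "average_yps_JWW", "max_yps_JWW",
    "MACH_points", "PACH_points",
]

triples = [
    CurrPrevLifetime(
        curr_yr=results[f"curr_yr_{suffix}"] or 0,
        prev_yr=results[f"prev_yr_{suffix}"] or 0,
        lifetime=results[f"lifetime_{suffix}"] or 0,
    )
    for suffix in metric_suffixes
]
CurrPrevLifetime.objects.bulk_create(triples)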
This aggregation takes a surprisingly long time to complete (~5 s), given that the operations are just max, average, and sum. For reference, a competitor has somewhere in the vicinity of ~500 runs to their name over their lifetime.
What is to blame for the performance hit? Indexes are applied to the computed fields and to the relations between the models. I've also tried the "filter then aggregate" route, with no noticeable difference and a significant increase in lines of code.
With 40,000 competitors, 5 seconds per report is going to take too long. How do I speed up this process? I'm happy to restructure if necessary.
I've tried tackling this a number of ways:
Breaking the competitor.runs into curr_yr_runs, prev_yr_runs, and lifetime_runs, and performing the aggregates on each subset individually (poor performance; a rough sketch of this attempt follows this list).
Breaking them up by cls into JWW_runs and standard_runs, and performing the aggregates by date that way (poor performance).
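For reference, a minimal sketch of that first attempt, reconstructed from the description above (the metric list is elided, not the exact original code):
year = datetime.date.today().year

curr_yr_runs = competitor.runs.filter(date__year=year)
prev_yr_runs = competitor.runs.filter(date__year=year - 1)
lifetime_runs = competitor.runs.all()

# The same aggregate() call is then repeated once per subset, which triples
# the number of queries issued per competitor.
curr_yr_stats = curr_yr_runs.aggregate(
    max_standard_MACH=Max("MACH", filter=Q(cls=Class.STANDARD)),
    average_yps_standard=Avg("yps", filter=Q(cls=Class.STANDARD)),
    MACH_points=Sum("MACH"),
    # ... remaining metrics
)
prev_yr_stats = prev_yr_runs.aggregate(...)      # same metrics
lifetime_stats = lifetime_runs.aggregate(...)    # same metrics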

Related

django - improve performance of __in queryset in M2M filtering

I have a model that has an M2M relationship to another model.
These are my models:
class Catalogue(models.Model):
    city = models.CharField(db_index=True, max_length=100, null=True)
    district = models.CharField(db_index=True, max_length=100, null=True)
    type = models.ManyToManyField(Type, db_index=True)
    datetime = models.CharField(db_index=True, max_length=100, null=True)

class Type(models.Model):
    name = models.CharField(max_length=100)

    def __str__(self):
        return self.name
And this is views.py:
class all_ads(generic.ListView):
    paginate_by = 12
    template_name = 'new_list_view_grid-card.html'

    def get_queryset(self):
        city_district = self.request.GET.getlist('city_district')
        usage = self.request.GET.get('usage')
        status = self.request.GET.get('status')
        last2week = datetime.datetime.now() - datetime.timedelta(days=14)
        status = status.split(',')
        if usage:
            usage = usage.split(',')
        else:
            usage = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31']
        intersections = list(set(status).intersection(usage))
        type_q = (Q(type__in=intersections) & Q(type__isnull=False))
        result = models.Catalogue.objects.filter(
            Q(datetime__gte=last2week) &
            type_q &
            ((reduce(operator.or_, (Q(city__contains=x) for x in city_district)) & Q(city__isnull=False)) |
             (reduce(operator.or_, (Q(district__contains=x) for x in city_district)) & Q(district__isnull=False)))
        ).distinct().order_by('-datetime').prefetch_related('type')
        return result
I want to filter a MySQL database with these queries and return the results in a ListView.
It works fine on a small database, but on a large database it takes over 10 seconds to return results. If I remove the type_q filter, it takes about 2 seconds (an 8-second difference!).
How can I improve the performance of the __in queryset?
It looks like type_q itself is not really the culprit; rather, it acts as a multiplier. It forces a LEFT OUTER JOIN, so the __contains filters run over all combinations of rows. The slowdown is thus more a peculiarity of the two filters working together.
We can avoid this by first collecting the matching primary keys:
cat_ids = list(Catalogue.objects.filter(
    Q(*[Q(city__contains=x) for x in city_district], _connector=Q.OR) |
    Q(*[Q(district__contains=x) for x in city_district], _connector=Q.OR)
).values_list('pk', flat=True))

result = models.Catalogue.objects.filter(
    Q(datetime__gte=last2week),
    type_q,
    pk__in=cat_ids
).distinct().order_by('-datetime').prefetch_related('type')
Some databases can even do this with a subquery instead (although MySQL is known to not optimize subqueries very well). In that case we do not materialize the list, but let Django work with a subquery:
cat_ids = Catalogue.objects.filter(
    Q(*[Q(city__contains=x) for x in city_district], _connector=Q.OR) |
    Q(*[Q(district__contains=x) for x in city_district], _connector=Q.OR)
).values_list('pk', flat=True)

result = models.Catalogue.objects.filter(
    Q(datetime__gte=last2week),
    type_q,
    pk__in=cat_ids
).distinct().order_by('-datetime').prefetch_related('type')
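A further hedged sketch (untested, not from the original answer): if type_q only needs to express "has at least one of these types", an EXISTS subquery over the M2M through table avoids the join multiplication entirely, so distinct() is no longer needed for that part. The variables intersections and cat_ids are the ones from above.
from django.db.models import Exists, OuterRef, Q

# Catalogue.type.through is the auto-generated M2M join table
# between Catalogue and Type.
has_wanted_type = Catalogue.type.through.objects.filter(
    catalogue_id=OuterRef('pk'),
    type_id__in=intersections,
)
result = models.Catalogue.objects.filter(
    Q(datetime__gte=last2week),
    Exists(has_wanted_type),
    pk__in=cat_ids,
).order_by('-datetime').prefetch_related('type')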

Django compute score of matches (workaround to annotate a query after a union)

I need to compute the ranking of athletes during boxing tournaments. In my models, I track the result of each match and the points to be attributed for each outcome to each athlete.
class Member(models.Model):
    surname = models.CharField(max_length=200)
    last_name = models.CharField(max_length=200)

class Tournament(models.Model):
    name = models.CharField(max_length=200)

class TrophyRule(models.Model):
    win = models.IntegerField()
    loose = models.IntegerField()
    draw = models.IntegerField()

class Ring(models.Model):
    code = models.CharField(max_length=1)
    tournament = models.ForeignKey(Tournament, on_delete=models.CASCADE)

class Match(models.Model):
    ring = models.ForeignKey(Ring, null=True, on_delete=models.SET_NULL)
    winner = models.CharField(max_length=20, null=True, blank=True)
    trophy_rule = models.ForeignKey(TrophyRule, on_delete=models.SET_NULL, null=True)
    red_member = models.ForeignKey(Member, related_name='reds', on_delete=models.SET_NULL, null=True)
    red_count_ranking = models.BooleanField(default=True)
    blue_member = models.ForeignKey(Member, related_name='blues', on_delete=models.SET_NULL, null=True)
    blue_count_ranking = models.BooleanField(default=True)
Based on this model, I need to sum the points acquired when the athlete was in the red corner with the points acquired when the athlete was in the blue corner. The result should be a queryset with all members and their total number of points.
In order to achieve this, I started with the computation of the points acquired by the athletes in the red corner:
from apps.members.models import Member
from django.db.models import Case, Sum, When, Q

red = Member.objects.filter(reds__ring__tournament_id=11402).annotate(
    points=Case(
        When(Q(reds__winner='red') & Q(reds__red_count_ranking=True), then='reds__trophy_rule__win'),
        When(Q(reds__winner='draw') & Q(reds__red_count_ranking=True), then='reds__trophy_rule__draw'),
        When(Q(reds__winner='blue') & Q(reds__red_count_ranking=True), then='reds__trophy_rule__loose'),
    ),
)
I also did the same for the points acquired by the athletes in the blue corner:
blue = Member.objects.filter(blues__ring__tournament_id=11402).annotate(
    points=Case(
        When(Q(blues__winner='blue') & Q(blues__blue_count_ranking=True), then='blues__trophy_rule__win'),
        When(Q(blues__winner='draw') & Q(blues__blue_count_ranking=True), then='blues__trophy_rule__draw'),
        When(Q(blues__winner='red') & Q(blues__blue_count_ranking=True), then='blues__trophy_rule__loose'),
    ),
)
Now, I need to combine the two queries and sum the points for each athlete. This is the part where I am stuck at the moment.
I tried to use union() which translates to a SQL UNION:
red.union(blue)
If I have 4 matches, with union() I get a queryset with 8 members (4 red and 4 blue), which is exactly what I am looking for. Unfortunately, when I try to compute the final number of points (points when the athlete was red + points when the athlete was blue), I trigger the error: Calling QuerySet.annotate() after union() is not supported (as per documentation).
red.union(blue).annotate(Sum('points'))
Is there another way to achieve this with the Django ORM? I'd prefer not to fall back to raw SQL if it isn't necessary.
This might be possible in a single query (untested code):
from django.db.models import Case, When, Q, F

members = Member.objects.filter(
    Q(reds__ring__tournament_id=11402) | Q(blues__ring__tournament_id=11402)
).annotate(
    red_points=Case(
        When(Q(reds__winner='red') & Q(reds__red_count_ranking=True), then='reds__trophy_rule__win'),
        When(Q(reds__winner='draw') & Q(reds__red_count_ranking=True), then='reds__trophy_rule__draw'),
        When(Q(reds__winner='blue') & Q(reds__red_count_ranking=True), then='reds__trophy_rule__loose'),
    ),
    blue_points=Case(
        When(Q(blues__winner='blue') & Q(blues__blue_count_ranking=True), then='blues__trophy_rule__win'),
        When(Q(blues__winner='draw') & Q(blues__blue_count_ranking=True), then='blues__trophy_rule__draw'),
        When(Q(blues__winner='red') & Q(blues__blue_count_ranking=True), then='blues__trophy_rule__loose'),
    ),
    points=F('red_points') + F('blue_points'),
)
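An alternative hedged sketch (untested, assuming the same models): because a single queryset that joins both reds and blues can multiply rows when a member has several matches in each corner, the per-corner totals can also be computed in two correlated subqueries and then added.
from django.db.models import Case, IntegerField, OuterRef, Subquery, Sum, When
from django.db.models.functions import Coalesce

# Total points earned in the red corner for the member in the outer query.
red_total = Match.objects.filter(
    red_member=OuterRef('pk'),
    ring__tournament_id=11402,
    red_count_ranking=True,
).values('red_member').annotate(
    total=Sum(Case(
        When(winner='red', then='trophy_rule__win'),
        When(winner='draw', then='trophy_rule__draw'),
        When(winner='blue', then='trophy_rule__loose'),
        output_field=IntegerField(),
    ))
).values('total')

# Same thing for the blue corner.
blue_total = Match.objects.filter(
    blue_member=OuterRef('pk'),
    ring__tournament_id=11402,
    blue_count_ranking=True,
).values('blue_member').annotate(
    total=Sum(Case(
        When(winner='blue', then='trophy_rule__win'),
        When(winner='draw', then='trophy_rule__draw'),
        When(winner='red', then='trophy_rule__loose'),
        output_field=IntegerField(),
    ))
).values('total')

members = Member.objects.annotate(
    points=Coalesce(Subquery(red_total), 0) + Coalesce(Subquery(blue_total), 0),
)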

Django ORM, Doubts regarding normalization

I have a table to which data gets written at a very fast rate (around 40,000 writes per minute).
class dummyclass(models.Model):
    field1 = models.CharField()
    field2 = models.IntegerField()
    ...
    field6 = models.DecimalField()
There are about 6 to 8 fields in it which are constantly changing.
So I decided to split this class into six different classes like this:
class dummyclass(models.Model):
    field1 = models.CharField()
    field2 = models.CharField()

class subdummyclass(models.Model):
    dummy = models.ForeignKey(dummyclass)
    field3 = models.CharField()

class subdummyclass1(models.Model):
    dummy = models.ForeignKey(dummyclass)
    field4 = models.CharField()

class subdummyclass2(models.Model):
    dummy = models.ForeignKey(dummyclass)
    field5 = models.CharField()

class subdummyclass3(models.Model):
    dummy = models.ForeignKey(dummyclass)
    field6 = models.CharField()
Is there any advantage in splitting the data into different tables like this, or am I overdoing it (over-normalization)?
Any help is appreciated.
Thanks and regards.
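A hedged modelling note, not from the original post: if each sub-table is meant to hold exactly one row per dummyclass row, a strict vertical split is usually expressed with a OneToOneField rather than a ForeignKey, for example:
class subdummyclass(models.Model):
    # OneToOneField enforces at most one child row per parent row; a plain
    # ForeignKey would allow many, which changes the meaning of the split.
    dummy = models.OneToOneField(dummyclass, on_delete=models.CASCADE)
    field3 = models.CharField(max_length=100)  # max_length is an example value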

Building up subqueries of derived django fields

I have a few transformations I need to perform on my table before I aggregate.
I need to multiply transaction_type (which is either 1 or -1) by amount to yield a signed_amount. Then I need to sum all signed_amounts by primary_category (Transaction has a foreign key to Secondary_Category, which in turn has a foreign key to Primary_Category).
DEBIT = -1
CREDIT = 1
TRANSACTION_TYPE_CHOICES = (
    (DEBIT, 'debit'),
    (CREDIT, 'credit'),
)

class Transaction(models.Model):
    amount = models.DecimalField(max_digits=7, decimal_places=2)
    transaction_type = models.IntegerField(choices=TRANSACTION_TYPE_CHOICES)
    secondary_category = models.ForeignKey(Secondary_Category)

class Primary_Category(models.Model):
    name = models.CharField("Category Name", max_length=30)
    category = models.ForeignKey(Primary_Category_Bucket)

class Secondary_Category(models.Model):
    name = models.CharField("Category Name", max_length=30)
    primary_category = models.ForeignKey(Primary_Category)
I'm stuck on the first bit though.
from django.db.models import Sum, Count, F

original_transactions = Transaction.objects.all()
original_transactions.signed_amount = F('transaction_type') * F('amount')

for transaction in original_transactions:
    print transaction.signed_amount
When I try to sanity check that signed_amount is being calculated, I get an error that 'Transaction' object has no attribute 'signed_amount'. I don't want to save signed_amount to the database. I just want to generate it as derived field so I can calculate my totals.
How do I calculate this derived field and subsequently aggregate by primary_category.name?
Use the Python property decorator on a method of the Transaction class:
class Transaction(models.Model):
    amount = models.DecimalField(max_digits=7, decimal_places=2)
    transaction_type = models.IntegerField(choices=TRANSACTION_TYPE_CHOICES)
    secondary_category = models.ForeignKey(Secondary_Category)

    @property
    def signed_amount(self):
        return self.amount * self.transaction_type
Then for each Transaction object you can do transaction.signed_amount.
I'm not sure whether the aggregation part can be done with queries, but if you don't have that many Primary_Category rows, then Python would be good enough to achieve it.
Or you can do this.
all_transactions = Transaction.objects.all().order_by('secondary_category__primary_category_id')
total = 0
if all_transactions:
    primary_category_id = all_transactions[0].secondary_category.primary_category_id
    for transaction in all_transactions:
        if primary_category_id == transaction.secondary_category.primary_category_id:
            total += transaction.amount * transaction.transaction_type
        else:
            # reached the next primary category: print the finished total
            print total
            primary_category_id = transaction.secondary_category.primary_category_id
            total = transaction.amount * transaction.transaction_type
    print total  # total for the last primary category
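For completeness, a hedged sketch (untested, reusing the question's field names) of how the derived field and the grouping by primary_category.name could be expressed directly in the ORM with annotate() and Sum():
from django.db.models import DecimalField, ExpressionWrapper, F, Sum

# Wrap the multiplication so Django knows the combined output type.
signed = ExpressionWrapper(
    F('transaction_type') * F('amount'),
    output_field=DecimalField(max_digits=9, decimal_places=2),
)

totals = (
    Transaction.objects
    .annotate(signed_amount=signed)
    .values('secondary_category__primary_category__name')
    .annotate(total=Sum('signed_amount'))
    .order_by('secondary_category__primary_category__name')
)
# Each row in totals is a dict like
# {'secondary_category__primary_category__name': ..., 'total': ...}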

Django queryset search on multiple models, return the same object

I'm trying to create an advanced search for my website. It looks at various related models and should always return a list of profiles that meet some parameters.
Here are my Models:
class Profile(models.Model):
    first_name = models.CharField(max_length=60, blank=False)
    last_name = models.CharField(max_length=60, blank=False)
    residence = models.CharField(max_length=60, null=True, blank=True)
    birthdate = models.DateField(null=True, blank=True)
    telephone = models.CharField(max_length=60, null=True, blank=True)
    email = models.EmailField(null=True, blank=True)
    linkedin = models.URLField(null=True, blank=True)
    starred = models.BooleanField(default=False)
    created_from = models.ForeignKey(EmployeeUser, related_name='profile_author')
    created_on = models.DateField(default=tznow)
    internal_id = models.CharField(max_length=5, blank=True)

class Education(models.Model):
    almalaurea_id = models.CharField(max_length=60, null=True, blank=True)
    profile = models.ForeignKey(Profile, related_name='education_profile')
    education_type = models.ForeignKey(Education_type, related_name='education_type')

class Education_type(models.Model):
    VALUES = (
        (0, 'Altro'),
        (1, 'Licenza media'),
        (2, 'Diploma'),
        (3, 'Laurea Triennale'),
        (4, 'Laurea Magistrale'),
    )
    title = models.CharField(max_length=60)
    value = models.IntegerField(choices=VALUES)
I want to search for profiles that match various criteria, such as birthdate, residence, starred, and education (based on education_type).
This is an example scenario; my real search includes other models.
This is the search code in my view. My idea was that, having found the results of the two queries, I could extract the profile IDs and compare them, then run another query selecting the profiles that match. But I don't think that's a great idea, since the real scenario includes various other models.
filters_profile = []
filters_education = []

year = form.cleaned_data["year"]
residence = form.cleaned_data["residence"]
starred = form.cleaned_data["starred"]
education_type = form.cleaned_data["education_type"]

if year:
    filters_profile.append(Q(birthdate__year=year))
if residence:
    filters_profile.append(Q(residence__icontains=residence))
if starred:
    filters_profile.append(Q(starred=starred))

result_profile = Profile.objects.filter(reduce(lambda q1, q2: q1 & q2, filters_profile)).order_by('first_name')

result_education = None
if education_type:
    e = Education_type.objects.filter(title=education_type)
    result_education = Education.objects.filter(education_type=e).prefetch_related('profile', 'education_type')
Any idea?
Many thanks in advance :)
EDIT:
About the solution of @Geo Jacob:
Here is the code for the third model:
if valutation:
    result_valutation = Status.objects.filter(valutation=valutation).values_list('profile_id', flat=True)
    key['id__in'] = result_valutation
Adding this code for my scenario, that solution doesn't work, as I wrote in the comments: "in practice, the content of key['id__in'] is overwritten when the query for this other model is executed".
Try this:
key = {}

year = form.cleaned_data["year"]
residence = form.cleaned_data["residence"]
starred = form.cleaned_data["starred"]
education_type = form.cleaned_data["education_type"]

if year:
    key['birthdate__year'] = year
if residence:
    key['residence__icontains'] = residence
if starred:
    key['starred'] = starred
if education_type:
    e = Education_type.objects.filter(title=education_type)
    result_education = Education.objects.filter(education_type=e).values_list('profile_id', flat=True)
    key['id__in'] = result_education

result_profile = Profile.objects.filter(**key).order_by('first_name')
My solution, which works on more than two models, is based on @Geo Jacob's solution, thank you.
I add a check and put into key['id__in'] only the IDs matched by the previous query, so as to intersect the results:
key = {}
statokey = 0

year = form.cleaned_data["year"]
residence = form.cleaned_data["residence"]
starred = form.cleaned_data["starred"]
education_type = form.cleaned_data["education_type"]
valutation = form.cleaned_data["valutation"]

if year:
    key['birthdate__year'] = year
if residence:
    key['residence__icontains'] = residence
if starred:
    key['starred'] = starred
if education_type:
    e = Education_type.objects.filter(title=education_type)
    result_education = Education.objects.filter(education_type=e).values_list('profile_id', flat=True)
    if statokey > 0:
        # keep only the IDs that also matched the previous query
        key['id__in'] = set(key['id__in']) & set(result_education)
    else:
        key['id__in'] = result_education
    statokey += 1
if valutation:
    result_valutation = Status.objects.filter(valutation=valutation).values_list('profile_id', flat=True)
    if statokey > 0:
        # keep only the IDs that also matched the previous query
        key['id__in'] = set(key['id__in']) & set(result_valutation)
    else:
        key['id__in'] = result_valutation
    statokey += 1

result_profile = Profile.objects.filter(**key).order_by('first_name')
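A hedged alternative sketch (not from the original answers): each ID list can also be applied as its own chained filter(), which lets the database intersect the sets and removes the need for statokey altogether.
# Assumes result_education and result_valutation are the
# values_list('profile_id', flat=True) querysets built above, and that
# key no longer carries an 'id__in' entry of its own.
result_profile = Profile.objects.filter(**key).order_by('first_name')
if education_type:
    result_profile = result_profile.filter(id__in=result_education)
if valutation:
    result_profile = result_profile.filter(id__in=result_valutation)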