Django workaround to use window function call in an aggregate function? - django

I'm trying to calculate customer order frequency.
First use a window function to get the previous order date then annotate the days since the last order.
from django.db.models import Avg, F, Window
from django.db.models.functions import ExtractDay, Lag, TruncDate
orders = (
Order.objects
.annotate(
prev_order_date=Window(
expression=Lag('paid_at', 1),
partition_by=[F('customer_email')],
order_by=F('paid_at').asc(),
),
days_since_last=ExtractDay(
TruncDate('paid_at') - TruncDate('prev_order_date')
),
)
)
Then group by customer_email before calculating the average frequency.
# Group the annotated orders per customer and average the day gaps.
# The grouping key matches the window's partition key (customer_email).
# NOTE: Avg() here wraps the Window-derived `days_since_last`, which is what
# PostgreSQL rejects with the GroupingError shown below.
customer_data = (
    orders.values('customer_email')
    .annotate(avg_frequency=Avg('days_since_last'))
)
Unfortunately this throws an error. Does anyone know of a workaround or know of an alternate way to calculate the average frequency?
psycopg2.errors.GroupingError: aggregate function calls cannot contain window function calls

I found the django-cte package through this answer.
Join on the order id then make sure to annotate the result of the window function before grouping.
from django_cte import CTEManager, With
class OrderCTE(Order):
objects = CTEManager()
class Meta:
proxy = True
orders = With(
Order.objects
.annotate(
prev_order_date=Window(
expression=Lag('paid_at', 1),
partition_by=[F('customer_email')],
order_by=F('paid_at').asc(),
),
days_since_last=ExtractDay(
TruncDate('paid_at') - TruncDate('prev_order_date')
),
)
)
customer_data = list(
orders.join(OrderCTE, id=orders.col.id)
.with_cte(orders)
.annotate(days_since_last=orders.col.days_since_last)
.values('customer_email')
.order_by('customer_email')
.annotate(
avg_frequency=Avg('days_since_last'),
)
.values_list(
'customer_email',
'avg_frequency',
)
)

Related

Django annotate exclude with Case & When (Conditional Expression)

I'm using Django 2.2
While making queryset, I want count of related model, based on few conditions like
queryset = self.model.objects.filter(user=self.request.user).annotate(
count_videos=Count('video'),
count_completed=Count(
Case(
When(video__status__in=Video.STATUS_LIST_COMPLETED)
)
),
count_failed=Count(
Case(
When(video__status__in=Video.STATUS_LIST_FAILED)
)
),
count_pending=Count(
Case(
When(
video__status__not_in=Video.STATUS_LIST_PENDING_EXCLUDE
)
)
)
)
Here 3 counts are working, but in the last count, count_pending, I have to count against exclude(), i.e., count the number of records excluding those in the passed list.
How can I use exclude with the above statement?
We can negate the value we pass to the filter= parameter [Django-doc]:
from django.db.models import Count, Q
# Conditional aggregation: each Count only counts the related videos that
# match its `filter` Q object; `~Q(...)` negates the condition.
queryset = self.model.objects.filter(user=self.request.user).annotate(
    count_videos=Count('video'),
    count_completed=Count(
        'video',
        filter=Q(video__status__in=Video.STATUS_LIST_COMPLETED)
    ),
    count_failed=Count(
        'video',
        filter=Q(video__status__in=Video.STATUS_LIST_FAILED)
    ),
    # Count everything whose status is NOT in the exclusion list.
    count_pending=Count(
        'video',
        filter=~Q(video__status__in=Video.STATUS_LIST_PENDING_EXCLUDE)
    )
)
This will result in a query like:
SELECT model.*,
COUNT(
CASE WHEN NOT video.status IN STATUS_LIST_PENDING_EXCLUDE
AND video.status IS NOT NULL
THEN video.id
ELSE NULL END
) AS count_pending
FROM model
LEFT OUTER JOIN video ON model.id = video.model_id
GROUP BY model.id
Apologies for the reply to a super old question, but this one hits high on searches for this topic. I needed a very similar thing and wanted a count but had some odd conditions I couldn't work out with ~Q and landed on an annotate that looked like the following. Posting here just in case someone happens to need something similar.
I required a count of Reviews completed, and those in progress, but if the review.status was UNTOUCHED it wasn't to get counted in the 'in progress' or 'completed' bin. I used Case with the default value set to 1 for the "not" condition (not completed) then wrapped the Case in a Sum as shown. There were about 9 different statuses that indicated 'in progress' and I didn't want to name them all.
.values(___bunch_of_group_by_fields_here___)\
.annotate(
completed=Sum(Case(
When(status__in=[Review.REVIEW_COMPLETE,
], then=Value(1)),
default=Value(0),
output_field=IntegerField(),
)),
# essentially: ( not (review complete or untouched) )
# gets all the status between untouched (default first step) and
# complete (final status in the workflow for a review) without having
# to specify all the in between statuses
inprogress=Sum(Case(
When(status__in=[Review.REVIEW_COMPLETE,
Review.UNTOUCHED
], then=Value(0)),
default=Value(1),
output_field=IntegerField(),
))

Using Annotate & Arithmetic in a Django subquery

I am trying to improve my understanding of the Django queryset syntax and am hoping that someone could help me check my understanding.
Could this:
total_packed = (
PackingRecord.objects.filter(
product=OuterRef('pk'), fifolink__sold_out=False
).values('product') # Group by product
.annotate(total=Sum('qty')) # Sum qty for 'each' product
.values('total')
)
total_sold = (
FifoLink.objects.filter(
packing_record__product=OuterRef('pk'), sold_out=False
).values('packing_record__product')
.annotate(total=Sum('sale__qty'))
.values('total')
)
output = obj_set.annotate(
sold=Subquery(total_sold[:1]),
packed=Subquery(total_packed[:1]),
).annotate(
in_stock=F('packed') - F('sold')
)
be safely reduced to this:
# Single correlated subquery: for the outer product, sum the per-link
# difference between the sale quantity and the packing record quantity.
# NOTE(review): the two-subquery version computes packed - sold, whereas this
# sums (sale__qty - packing_record__qty) per joined row -- confirm the sign and
# that the join does not repeat packing_record__qty for multiple links.
in_stock = (
    FifoLink.objects.filter(
        packing_record__product=OuterRef('pk'), sold_out=False
    ).values('packing_record__product')
    .annotate(total=Sum(F('sale__qty') - F('packing_record__qty')))
    .values('total')
)
output = obj_set.annotate(
    # Use the subquery defined just above (limited to one row).
    in_stock=Subquery(in_stock[:1]),
)
Basically, I am trying to move the math being completed in the outer .annotate() into the queryset itself by using the fk relationship instead of running two separate querysets. I think this is allowed, but I am not sure if I am understanding it correctly.

Django conditional Subquery aggregate

A simplified example of my model structure would be
class Corporation(models.Model):
...
class Division(models.Model):
corporation = models.ForeignKey(Corporation)
class Department(models.Model):
division = models.ForeignKey(Division)
type = models.IntegerField()
Now I want to display a table that displays corporations where a column will contain the number of departments of a certain type, e.g. type=10. Currently, this is implemented with a helper on the Corporation model that retrieves those, e.g.
class Corporation(models.Model):
...
def get_departments_type_10(self):
return (
Department.objects
.filter(division__corporation=self, type=10)
.count()
)
The problem here is that this absolutely murders performance due to the N+1 problem.
I have tried to approach this problem with select_related, prefetch_related, annotate, and subquery, but I haven't been able to get the results I need.
Ideally, each Corporation in the queryset should be annotated with an integer type_10_count which reflects the number of departments of that type.
I'm sure I could do something with raw sql in .extra(), but the docs announce that it is going to be deprecated (I'm on Django 1.11)
EDIT: Example of raw sql solution
# Raw SQL version: a correlated scalar subquery counts, for each corporation
# row `c`, the type-10 departments reachable through its divisions.
# The correlation to `c` belongs in WHERE; an ON clause is only valid as part
# of a JOIN.
corps = Corporation.objects.raw("""
    SELECT
        *,
        (
            SELECT COUNT(*)
            FROM foo_division div
            JOIN foo_department dept ON dept.division_id = div.id
            WHERE div.corporation_id = c.id
              AND dept.type = 10
        ) as type_10_count
    FROM foo_corporation c
""")
I think with Subquery we can get SQL similar to one you have provided, with this code
# Get amount of departments with GROUP BY division__corporation [1]
# .order_by() will remove any ordering so we won't get additional GROUP BY columns [2]
departments = Department.objects.filter(type=10).values(
'division__corporation'
).annotate(count=Count('id')).order_by()
# Attach departments as Subquery to Corporation by Corporation.id.
# Departments are already grouped by division__corporation
# so .values('count') will always return single row with single column - count [3]
departments_subquery = departments.filter(division__corporation=OuterRef('id'))
corporations = Corporation.objects.annotate(
departments_of_type_10=Subquery(
departments_subquery.values('count'), output_field=IntegerField()
)
)
The generated SQL is
SELECT "corporation"."id", ... (other fields) ...,
(
SELECT COUNT("division"."id") AS "count"
FROM "department"
INNER JOIN "division" ON ("department"."division_id" = "division"."id")
WHERE (
"department"."type" = 10 AND
"division"."corporation_id" = ("corporation"."id")
) GROUP BY "division"."corporation_id"
) AS "departments_of_type_10"
FROM "corporation"
One concern here is that a subquery can be slow with large tables. However, database query optimizers can be smart enough to promote the subquery to an OUTER JOIN; at least I've heard PostgreSQL does this.
1. GROUP BY using .values and .annotate
2. order_by() problems
3. Subquery
You should be able to do this with a Case() expression to query the count of departments that have the type you are looking for:
from django.db.models import Case, IntegerField, Sum, When, Value
Corporation.objects.annotate(
type_10_count=Sum(
Case(
When(division__department__type=10, then=Value(1)),
default=Value(0),
output_field=IntegerField()
)
)
)
I like the following way of doing it:
# Correlated subquery: type-10 departments belonging to the outer corporation.
# Func(..., function='Count') renders COUNT(id) as a plain SQL function call;
# .order_by() clears any default ordering.
departments = Department.objects.filter(
    type=10,
    division__corporation=OuterRef('id')
).annotate(
    count=Func('id', function='Count')
).values('count').order_by()
corporations = Corporation.objects.annotate(
    departments_of_type_10=Subquery(departments)
)
You can find more details on this method in this answer: https://stackoverflow.com/a/69020732/10567223

Django 1.8 conditional annotation results in INNER JOIN instead of LEFT OUTER JOIN

The models:
class Bar(GenericModel):
...
class Foo(GenericModel):
bar = models.ForeignKey(Bar, related_name='foo_bar')
The query:
# Sum foo.amount per bar, counting only non-deleted foos (0 otherwise).
# The chain is wrapped in parentheses so it can span several lines.
bars = (
    Bar.objects
    .prefetch_related('foo_bar')
    .annotate(sum_foo=Sum(
        Case(
            When(foo_bar__is_deleted=False, then='foo_bar__amount'),
            default=Value(0),
            output_field=IntegerField()
        )
    ))
)
The former results in an inner join: SELECT ... FROM "bar" INNER JOIN "foo" ON ( "bar"."id" = "foo"."bar_id" ) ...
What I intend to obtain is a LEFT OUTER JOIN (a full list of "bar" objects annotated with "foo.amount" sums, or 0s if "foo" related to "bar" doesn't exist) instead of the INNER JOIN. Is it possible to do this without falling back to raw SQL?
This is a known bug, corrected in Django 1.8.3 (release notes).
As you noted, the issue is that an INNER JOIN is being created, filtering out Bar objects when there's no corresponding relation to Foo objects.
Using a Django version higher than 1.8.3 will solve the issue.
This way seems to work correctly:
# Same aggregation, but the condition also accepts is_deleted IS NULL, i.e.
# rows where no related foo exists.
# The chain is wrapped in parentheses so it can span several lines.
bars = (
    Bar.objects
    .prefetch_related('foo_bar')
    .annotate(sum_foo=Sum(
        Case(
            When(Q(foo_bar__is_deleted=False) | Q(foo_bar__is_deleted=None),
                 then='foo_bar__amount'),
            default=Value(0),
            output_field=IntegerField()
        )
    ))
)

How to add distance from point as an annotation in GeoDjango

I have a Geographic Model with a single PointField, I'm looking to add an annotation for the distance of each model from a given point, which I can later filter on and do additional jiggery pokery.
There's the obvious queryset.distance(to_point) function, but this doesn't actually annotate the queryset, it just adds a distance attribute to each model in the queryset, meaning I can't then apply .filter(distance__lte=some_distance) to it later on.
I'm also aware of filtering by the field and distance itself like so:
queryset.filter(point__distance_lte=(to_point, D(mi=radius)))
but since I will want to do multiple filters (to get counts of models within different distance ranges), I don't really want to make the DB calculate the distance from the given point every time, since that could be expensive.
Any ideas? Specifically, is there a way to add this as a regular annotation rather than an inserted attribute of each model?
I couldn't find any baked in way of doing this, so in the end I just created my own Aggregation class:
This only works with PostGIS, but making one for another geo DB shouldn't be too tricky.
from django.db.models import Aggregate, FloatField
from django.db.models.sql.aggregates import Aggregate as SQLAggregate
class Dist(Aggregate):
def add_to_query(self, query, alias, col, source, is_summary):
source = FloatField()
aggregate = SQLDist(
col, source=source, is_summary=is_summary, **self.extra)
query.aggregates[alias] = aggregate
class SQLDist(SQLAggregate):
sql_function = 'ST_Distance_Sphere'
sql_template = "%(function)s(ST_GeomFromText('%(point)s'), %(field)s)"
This can be used as follows:
queryset.annotate(distance=Dist('longlat', point="POINT(1.022 -42.029)"))
Anyone knows a better way of doing this, please let me know (or tell me why mine is stupid)
One of the modern approaches is to set the "output_field" arg to avoid «Improper geometry input type: ». Without output_field, Django tries to convert the ST_Distance_Sphere float result to a GEOField and cannot.
queryset = self.objects.annotate(
distance=Func(
Func(
F('addresses__location'),
Func(
Value('POINT(1.022 -42.029)'),
function='ST_GeomFromText'
),
function='ST_Distance_Sphere',
output_field=models.FloatField()
),
function='round'
)
)
Doing it like this works for me, i.e. I can apply a filter on an annotation.
Broken up for readability.
from models import Address
from django.contrib.gis.measure import D
from django.contrib.gis.db.models.functions import Distance
intMiles = 200
destPoint = Point(5, 23)
queryset0 = Address.objects.all().order_by('-postcode')
queryset1 = queryset0.annotate(distance=Distance('myPointField' , destPoint ))
queryset2 = queryset1.filter(distance__lte=D(mi=intMiles))
Hope it helps somebody :)
You can use GeoQuerySet.distance
# .distance() attaches a `distance` attribute to every returned object;
# it is read as an attribute, not called.
cities = City.objects.distance(reference_pnt)
for city in cities:
    print(city.distance)
Link: GeoDjango distance documentation
Edit: Adding distance attribute along with distance filter queries
usr_pnt = fromstr('POINT(-92.69 19.20)', srid=4326)
City.objects.filter(point__distance_lte=(usr_pnt, D(km=700))).distance(usr_pnt).order_by('distance')
Supported distance lookups
distance_lt
distance_lte
distance_gt
distance_gte
dwithin
A way to annotate & sort w/out GeoDjango. This model contains a foreignkey to a Coordinates record which contains lat and lng properties.
def get_nearby_coords(lat, lng, max_distance=10):
    """
    Return Precinct objects annotated with, and sorted by, their distance
    to the specified coordinates.

    lat, lng: reference point, in degrees (converted to radians below).
    max_distance: keep only rows whose distance is strictly less than this
        value, given in kilometers; pass None to skip the filter.
    """
    # Great circle distance formula (spherical law of cosines):
    #   d = R * acos(sin(a1)*sin(a2) + cos(a1)*cos(a2)*cos(l2 - l1))
    R = 6371  # sphere radius; results are in kilometers per the contract above
    deg_to_rad = math.pi / 180
    qs = Precinct.objects.all().annotate(
        distance=Value(R) * Func(
            Func(
                # Degrees -> radians before taking the sine: the factor is
                # pi/180 itself, not sin(pi/180).
                F("coordinates__lat") * Value(deg_to_rad),
                function="sin",
                output_field=models.FloatField()
            ) * Value(
                math.sin(lat * deg_to_rad)
            ) + Func(
                F("coordinates__lat") * Value(deg_to_rad),
                function="cos",
                output_field=models.FloatField()
            ) * Value(
                math.cos(lat * deg_to_rad)
            ) * Func(
                Value(lng * deg_to_rad) - F("coordinates__lng") * Value(deg_to_rad),
                function="cos",
                output_field=models.FloatField()
            ),
            function="acos"
        )
    ).order_by("distance")
    if max_distance is not None:
        qs = qs.filter(distance__lt=max_distance)
    return qs