class Order(models.Model):
    """A customer order for a single product.

    `placed` is stamped once at creation; `shipped` / `delivered` stay NULL
    until the corresponding event happens, so (delivered - shipped) is the
    per-order processing time aggregated below.
    """

    product = models.ForeignKey(Product, on_delete=models.CASCADE)
    # Keep the order row alive if its category is deleted.
    category = models.ForeignKey(
        Category, null=True, on_delete=models.SET_NULL
    )
    user = models.ForeignKey(User, null=True, on_delete=models.SET_NULL)
    # FIX: auto_now=True rewrites the timestamp on *every* save; an order's
    # placement time must be set exactly once, at creation.
    placed = models.DateTimeField(auto_now_add=True)
    shipped = models.DateTimeField(null=True)
    delivered = models.DateTimeField(null=True)
I want to calculate statistics on how fast the order has been processed for each category
where process time is delivered - shipped
In result I want to achieve something like this:
[
{
"category": <category 1>
"processed_time": <average processed time in seconds>
},
{
"category": <category 2>
"processed_time": <average processed time in seconds>
},
{
"category": <category 3>
"processed_time": <average processed time in seconds>
},
]
I can calculate this outside of the ORM but I'd like to achieve this somehow with annotation/aggregation
# Broken attempt: .aggregate() collapses the whole queryset into ONE dict,
# discarding any per-category grouping — which is why a single overall time
# comes back.  It also measures the spread of `delivered` timestamps
# (max - min), not the delivered - shipped duration the question defines.
delivered = delivered_qs.annotate(first_processed=Min("delivered"), last_processed=Max("delivered")) \
.aggregate(processed_time=F("last_processed")-F("first_processed"))
This queryset returns a single time across all categories, and I don't know how to retrieve the time for each individual category.
You want to do a GROUP BY, which works somewhat unusually in Django. For more information see the documentation.
By first calling .values() you tell the queryset to group by category. Then you determine the min, the max and the difference.
# Django's GROUP BY idiom: .values('category') *before* .annotate() turns
# each annotation into a per-category aggregate.
# Needs: from django.db.models import Avg, F, Min, Max
delivered = (
    delivered_qs
    .values('category')
    .annotate(
        first_processed=Min("delivered"),
        last_processed=Max("delivered"),
        # FIX: the question defines processing time as delivered - shipped;
        # Max(delivered) - Min(delivered) only measured the spread of
        # delivery timestamps.  Avg over a datetime difference yields a
        # timedelta (average seconds come from .total_seconds()).
        processed_time=Avg(F("delivered") - F("shipped")),
    )
)
Which, in my expectation, would return:
[{
    "category": 1,
    "first_processed": datetime(...),
    "last_processed": datetime(...),
    "processed_time": timedelta(...)
}, ...]
(Min/Max of a DateTimeField yield datetimes; only their difference is a timedelta.)
Related
I maintain a Django service that allows online community moderators to review/approve/reject user posts. Right now we measure the average "time to approval" but we need to start measuring the 90th percentile "time to approval" instead. So where we used to say "on average content gets approved in 3.3 hours", we might now say something like "90% of content is approved in 4.2 hours or less".
# Models.py
class Moderation(models.Model):
    """A user-submitted piece of content awaiting moderator review."""

    content = models.TextField()
    created_at = models.DateTimeField(auto_now_add=True)
    message_id = models.TextField(blank=True, null=True)


class ModerationAction(models.Model):
    """One moderator decision (approve/reject/...) on a Moderation."""

    # FIX: on_delete is mandatory since Django 2.0; CASCADE matches the
    # pre-2.0 implicit default, so behaviour is unchanged.
    moderation = models.ForeignKey(Moderation, on_delete=models.CASCADE)
    action = models.CharField(max_length=50)
    created_at = models.DateTimeField(auto_now_add=True)
# stats.py
# Mean approval latency over the trailing 7 days, as a datetime.timedelta
# (e.g. timedelta(0, 4008, 798824)).
# FIX: the intermediate .values('action', 'time_to_approve') was removed —
# .aggregate() collapses the queryset regardless, so it had no effect.
average_time_to_approve_7days = ModerationAction.objects.filter(
    action__in=moderation_actions,
    created_at__gte=timezone.now() - timedelta(days=7)
).annotate(
    # Per-row latency between content creation and the moderator's action.
    time_to_approve=F('created_at') - F('moderation__created_at')
).aggregate(
    Avg('time_to_approve')
)['time_to_approve__avg']
My goal: I'm seeking a way to get the 90th percentile time rather than the average time.
I have a model that allows me to log errors with a hash that I generate, so the same errors have the same hash so I am able to group and count them. The model looks something like this.
class ErrorLog(models.Model):
    """An application error, deduplicated by a caller-generated hash."""

    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    date = models.DateTimeField(null=True, blank=True, db_index=True)
    log = models.TextField(blank=True, null=True)
    # IMPROVEMENT: the stats view groups and counts on log_hash, so indexing
    # it speeds up the GROUP BY on large tables (requires a migration).
    log_hash = models.CharField(max_length=255, blank=True, null=True,
                                db_index=True)
In my view, I perform the following query to count the errors by hash.
def get(self, request):
    """Group errors by hash and annotate each group with its count and its
    share of all errors as a percentage string (e.g. "50.0%").

    NOTE(review): the snippet as quoted never returns a Response — the real
    view presumably serializes `qs` afterwards; confirm against the caller.
    """
    qs = self.filter_queryset(self.get_queryset())
    total_errors = qs.count()
    # Guard: an empty table would make the SQL expression divide by zero.
    denominator = total_errors or 1
    qs = qs.values(
        'log_hash', 'log'
    ).annotate(
        error_count=Count('log_hash')
    ).annotate(
        percentage_of_occurrence=Concat(
            Cast(
                funcs.Round(
                    # FIX: multiply by 100.0 *before* dividing — an
                    # int / int is integer division on some backends
                    # (e.g. MySQL), which truncated every share to 0.
                    (F('error_count') * 100.0) / denominator, 1
                ), CharField()
            ), Value('%')
        )
    )
This works like a charm, because I can get my results back just as I want them.
"results": [
{
"error_count": 2,
"percentage_of_occurrence": "50.0%",
"log_hash": "8f7744ba51869f93ce990c67bd8d3544",
"log": "Error 1"
},
{
"error_count": 1,
"percentage_of_occurrence": "25.0%",
"log_hash": "de54a1e3be2cab4d04d8c61f538a71df",
"log": "Error 2"
},
{
"error_count": 1,
"percentage_of_occurrence": "25.0%",
"log_hash": "05988dc15543ef06e803a930923d11d4",
"log": "Error 3"
}
]
Here comes the problem, this is REALLY slow on a large table, so after inspecting the SQL generated I saw one problem. I am counting 2 times, one to get the error_count, and another one to calculate the percentage_of_occurrence.
-- Generated SQL as captured; note the blank spot in `/ )` — the Python-side
-- qs.count() result is interpolated there as a literal, which is why the
-- table is scanned/counted twice per request.
SELECT `errorlog`.`log_hash`, `errorlog`.`log`, COUNT(`errorlog`.`log_hash`) AS `error_count`,
((COUNT(`errorlog`.`log_hash`) / ) * 100) AS `percentage_of_occurrence`
FROM `errorlog`
GROUP BY `errorlog`.`log_hash`, `errorlog`.`log`
ORDER BY `error_count` DESC
Is there any way I can reuse the first count to calculate the percentage_of_occurrence without having to count again? Also, I am not very savvy on SQL, but would it be better if the log_hash column was indexed?
I have an app with lots of investors that invest in the same rounds, which belong to companies, as seen below. However when a user(investor) is logged in, i only want him to be able to see HIS investments.
{
"id": 1,
"name": "Technology Company",
"rounds": [
{
"id": 1,
"kind": "priced round",
"company": 1,
"investments": [
{
"id": 1,
"investor": 1,
"round": 1,
"size": 118000,
},
{
"id": 2,
"investor": 2,
"round": 1,
"size": 183000,
},
]
}
]
},
Currently, my viewsets extend get_queryset as so:
class CompanyViewSet(viewsets.ModelViewSet):
    def get_queryset(self):
        """Restrict companies to those the logged-in investor invested in."""
        user = self.request.user
        investor = Investor.objects.get(user=user)
        # FIX: filtering across multi-valued relations (rounds__investments)
        # returns the same company once per matching investment — dedupe.
        companies = Company.objects.filter(
            rounds__investments__investor=investor
        ).distinct()
        return companies
It retrieves the investments that belong to the investor, but when it follows those investments back to their rounds, it serializes each round with ALL investors' investments.
How can I write this so that it only displays the investments that belong to the logged-in investor?
Here are my models:
class Company(models.Model):
    name = models.CharField(max_length=100)


class Round(PolymorphicModel):
    # FIX: on_delete is mandatory since Django 2.0; CASCADE matches the
    # pre-2.0 implicit default.
    company = models.ForeignKey(Company, related_name='rounds',
                                blank=True, null=True,
                                on_delete=models.CASCADE)


class Investment(PolymorphicModel):
    # NOTE(review): related_name='investor' names the *reverse* accessor on
    # Investor and reads oddly (investor.investor.all()); the queries above
    # also rely on a Round FK with related_name='investments' that is
    # missing from this quoted model — confirm against the real code.
    investor = models.ForeignKey(Investor, related_name='investor',
                                 on_delete=models.CASCADE)
    size = models.BigIntegerField(default=0)
Your description of what happens is pretty unclear. What does "when it takes those investments again" mean? Anyway, I'm guessing what you need to do is to use .prefetch_related and a Prefetch object.
from django.db.models import Prefetch


class CompanyViewSet(viewsets.ModelViewSet):
    def get_queryset(self):
        """Companies the current investor invested in, with each round's
        `investments` pre-filtered down to this investor only."""
        user = self.request.user
        investor = Investor.objects.get(user=user)
        # FIX: use .pk consistently (the original mixed .id and .pk) and add
        # .distinct() — the multi-valued filter duplicates companies.
        companies = Company.objects.filter(
            rounds__investments__investor_id=investor.pk
        ).distinct().prefetch_related(Prefetch(
            # The Prefetch replaces each round's default related manager
            # contents, so serializers see only this investor's investments.
            'rounds__investments',
            queryset=Investment.objects.filter(
                investor_id=investor.pk,
            ),
        ))
        return companies
I haven't tested this snippet but it should give you a pointer in the right direction. I also optimized the investor lookup to check the id only, this will save you an unnecessary indirection.
I am trying to build a small api with django rest framework but I don't want to map directly the tables with calls (as in the examples).
I have the following database schema:
In models.py:
class ProductType(models.Model):
    name = models.CharField(max_length=255, blank=False, null=False, unique=True)


class Product(models.Model):
    # FIX: the original marked this with a '#staticmethod' *comment* (not
    # the @staticmethod decorator) while still taking `self` — kept as a
    # plain instance method so `product.get_accepted_fields()` keeps working.
    def get_accepted_fields(self):
        """Whitelist of config keys with their default values."""
        return {'color': 'pink', 'size': 34, 'speed': 0, 'another_prop': ''}

    name = models.CharField(max_length=255, blank=False, null=False, unique=True)


class ProductConfig(models.Model):
    # FIX: on_delete is mandatory since Django 2.0; CASCADE matches the
    # pre-2.0 implicit default.
    product_type = models.ForeignKey(ProductType, on_delete=models.CASCADE)
    product = models.ForeignKey(Product, on_delete=models.CASCADE)
    # One-level JSON blob, e.g. {"price": 123, "color": "red"}.
    value = models.TextField(blank=True)
As you can see, every product can have multiple configurations, and the value field is a JSON blob with various parameters. The JSON is one level deep only. Each configuration has a flag indicating whether it is active (so one product has exactly one active configuration at a time).
So, the data will look for example like this:
store_producttype
=================
1 type1
2 type2
store_product
=============
id name
1 car
store_productconfig
===================
id product_type_id product_id value active
1 2 1 { "color": "red", "size": 34, "speed": 342} 0
2 1 1 { "color": "blue", "size": 36, "speed": 123, "another_prop": "xxx"} 1
What I want to know is how can I get /product/1/ like this:
{
"id": 1,
"name": "car",
"type": "type1",
"color": "blue",
"size": 36,
"speed": 123,
"another_prop": "xxx",
}
and to create a new product posting a json similar with the one above.
The JSON fields are defined, but some of them can be missing (e.g. "another_prop" in productconfig.id=1).
On update it will create a new row in productconfig and set active=0 on the previous one.
So every product can have a different configuration, and I want to be able to go back to a specific configuration from the past in certain cases. I am not really bound to this data model, so if you have suggestions for improvement I am open to them, but I don't want to have those properties as columns in the table.
The question is, what will be the best way to write the serializers for this model? There is any good example somewhere for a such use case?
Thank you.
Let's take this step by step:
In order to get a JSON like the one you posted, you must first transform your string (productConfig value field) to a dictionary. This can be done by using ast.literal_eval ( see more here).
Then, in your product serializer, you must specify the source for each field, like this:
class ProductSerializer(serializers.ModelSerializer):
    """Flattens the active ProductConfig's parsed JSON onto the product."""

    # FIX: the module is referenced as `serializers` on the base class, so
    # `serializer.Field` was a NameError.  `source` walks attributes: the
    # instance must expose `value_dict` (the parsed JSON) and `type`.
    color = serializers.Field(source='value_dict.color')
    size = serializers.Field(source='value_dict.size')
    type = serializers.Field(source='type.name')

    class Meta:
        model = Product
        fields = (
            'id',
            'color',
            'size',
            'type',
        )
This should work just fine for creating the representation that you want. However, this will not create automatically the product config, because DRF doesn't yet allow nested object creation.
This leads us to the next step:
For creating a product with a configuration from JSON, you must override the post method in your view, and create it yourself. This part shouldn't be so hard, but if you need an example, just ask.
This is more of a suggestion: if the json fields are already defined, wouldn't it be easier to define them as separate fields in your productConfig model?
Here's what my model structure looks like:
class Visitor(models.Model):
    id = models.AutoField(primary_key=True)


class Session(models.Model):
    id = models.AutoField(primary_key=True)
    # FIX: on_delete is mandatory since Django 2.0; CASCADE matches the
    # pre-2.0 implicit default.
    visit = models.ForeignKey(Visitor, on_delete=models.CASCADE)
    sequence_no = models.IntegerField(null=False)


class Track(models.Model):
    id = models.AutoField(primary_key=True)
    session = models.ForeignKey(Session, on_delete=models.CASCADE)
    # FIX: Action is declared *after* Track, so referencing the class object
    # directly raised NameError at import — use Django's lazy string ref.
    action = models.ForeignKey('Action', on_delete=models.CASCADE)
    when = models.DateTimeField(null=False, auto_now_add=True)
    sequence_no = models.IntegerField(null=False)


class Action(models.Model):
    id = models.AutoField(primary_key=True)
    # NOTE(review): 65535 exceeds the VARCHAR limit on most backends —
    # consider TextField; left unchanged here to preserve the schema.
    url = models.CharField(max_length=65535, null=False)
    host = models.IntegerField(null=False)
As you can see, each Visitor has multiple Sessions; each Session has multiple Tracks and each Track has one Action. Tracks are always ordered ascendingly by the session and the sequence_no. A Visitors average time on an site (i.e. a particular Action.host) is the difference in Track.when (time) between the highest and lowest Track.sequence_no divided by the number of Sessions of that Visitor.
I need to calculate the average time of visitors on the site which be the sum of the time for each visitor on the Action.site divided by the number of visitors.
I could query this using SQL but I'd like to keep my query as Djangonic as possible and I'm still very lost with complex queries.
For a specific Action object you can gather interesting data about Sessions:
# Per-session first/last track timestamps for every session that touched
# the given host, evaluated eagerly into a list of dicts:
#   [{'visit_id': ..., 'start': datetime, 'end': datetime}, ...]
from django.db.models import Min, Max
from yourapp.models import *
host = 1 # I suppose you want to calculate for each site
# .values() after .annotate() keeps only the three keys needed;
# list() forces the query to run immediately.
sessions = list(Session.objects.filter(
track__action__host=host,
).annotate(
start=Min('track__when'),
end=Max('track__when'),
).values('visit_id', 'start', 'end'))
You will get something in the line of:
[
{ 'visit_id': 1, 'start': datetime(...), 'end': datetime(...) },
{ 'visit_id': 1, 'start': datetime(...), 'end': datetime(...) },
{ 'visit_id': 2, 'start': datetime(...), 'end': datetime(...) },
....
]
Now it's only a matter of getting the desired result from the data:
# Distinct visitors across all sessions (a visitor may own many sessions).
number_of_visitors = len(set(s['visit_id'] for s in sessions))
# Total seconds on site, summed over each session's (end - start) span.
total_time = sum((s['end'] - s['start']).total_seconds() for s in sessions)
# FIX: guard the empty case — with no sessions the original raised
# ZeroDivisionError; 0.0 is the sensible "no data" answer.
average_time_spent = total_time / number_of_visitors if number_of_visitors else 0.0
Another way is to use two queries instead of one, and avoid the len(set(...)) snippet:
# Lazy variant: keep the queryset and issue two queries instead of
# materialising every row into Python first.
sessions = Session.objects.filter(
    track__action__host=host,
).annotate(
    start=Min('track__when'),
    end=Max('track__when'),
)
# FIX: `.distict()` was a typo — AttributeError at runtime.
# .distinct() after .values('visit_id') counts unique visitors directly.
number_of_visitors = sessions.values('visit_id').distinct().count()
total_time = sum((s['end'] - s['start']).total_seconds()
                 for s in sessions.values('start', 'end'))
There is NO WAY to do actual calculated fields barring the provided aggregations, so either you do it in raw SQL or you do in code like this.
At least the proposed solution uses Django's ORM as far as possible.