Django queryset not returning distinct values - django

I have a query that, for some reason, is not returning distinct values even though I have specified distinct(). I thought it might be because of the only(), so I removed that, but the list is still the same.
circuit_providers = CircuitInfoData.objects.only('provider').values('provider').distinct()
I just want a list of unique providers.
model.py
from __future__ import unicode_literals
from django.db import models
import string
import random
import time
import os
# Create your models here.
from service.models import ServiceContacts
def _timestamped_upload_path(directory, filename):
    """Build ``<directory>/<RANDOM>-<dd-mm-HH-MM-SS><ext>`` for an upload.

    Only the extension of ``filename`` is kept. The stem is replaced by six
    random uppercase letters/digits followed by the current local time
    formatted as day-month-hour-minute-second.

    :param directory: leading path component, e.g. ``'site_photos'``.
    :param filename: original name of the uploaded file.
    :returns: the generated relative path as a string.
    """
    _, extension = os.path.splitext(filename)
    chars = string.ascii_uppercase + string.digits
    random_string = ''.join(random.choice(chars) for _ in range(6))
    stamp = time.strftime("%d-%m-%H-%M-%S")
    return '%s/%s-%s%s' % (directory, random_string, stamp, extension)


def site_photos_path(instance, filename):
    """Return a generated ``site_photos/...`` path for ``filename``.

    ``instance`` is accepted but not used (presumably this is passed as a
    Django ``upload_to`` callable - confirm against the field definitions).
    """
    return _timestamped_upload_path('site_photos', filename)


def service_upload_path(instance, filename):
    """Return a generated ``service_files/...`` path for ``filename``.

    ``instance`` is accepted but not used.
    """
    return _timestamped_upload_path('service_files', filename)


def site_files_path(instance, filename):
    """Return a generated ``site_files/...`` path for ``filename``.

    ``instance`` is accepted but not used.
    """
    return _timestamped_upload_path('site_files', filename)
provider_choices = (
('KCOM','KCOM'),
('BT','BT'),
('EE','EE'),
('THREE','THREE'),
)
circuit_choices = (
('DSL','DSL'),
('VDSL','VDSL'),
('MPLS','MPLS'),
('4G','4G'),
('Internet Leased Line','Internet Leased Line'),
)
subnet_mask_choices = (
('/16','/16'),
('/24','/24'),
('/25','/25'),
('/26','/26'),
('/27','/27'),
('/28','/28'),
('/29','/29'),
('/30','/30'),
('/31','/31'),
)
class ShowroomConfigData(models.Model):
location = models.CharField(max_length=50)
subnet = models.GenericIPAddressField(protocol='IPv4')
r1_loopback_ip = models.GenericIPAddressField(protocol='IPv4',verbose_name="R1 Loopback IP")
r2_loopback_ip = models.GenericIPAddressField(protocol='IPv4',verbose_name="R2 Loopback IP")
opening_date = models.DateField(verbose_name="Showroom opening date")
last_hw_refresh_date = models.DateField(verbose_name="Date of latest hardware refresh")
is_showroom = models.BooleanField(default=True,verbose_name="Is this site a showroom?")
class Meta:
verbose_name = "Showroom Data"
verbose_name_plural = "Showroom Data"
ordering = ('location',)
def __unicode__(self):
return self.location
class MajorSiteInfoData(models.Model):
location = models.CharField(max_length=200)
major_subnet = models.GenericIPAddressField(protocol='IPv4',verbose_name="Major Site Subnet")
routed_subnet = models.GenericIPAddressField(protocol='IPv4',verbose_name="Routed Link Subnet")
bgp_as = models.CharField(max_length=6,verbose_name="BGP AS Number")
class Meta:
verbose_name = "Major Site Data"
verbose_name_plural = "Major Site Data"
def __unicode__(self):
return self.location
class CircuitInfoData(models.Model):
    """Circuit record linked to a showroom and a major site.

    Stores the circuit type, speed, bearer, provider and reference number.
    """
    showroom_config_data = models.ForeignKey(ShowroomConfigData, verbose_name="Install Showroom")
    major_site_info = models.ForeignKey(MajorSiteInfoData, verbose_name="Install Site")
    circuit_type = models.CharField(max_length=100, choices=circuit_choices)
    circuit_speed = models.IntegerField(blank=True)
    circuit_bearer = models.IntegerField(blank=True)
    provider = models.CharField(max_length=200, choices=provider_choices)
    ref_no = models.CharField(max_length=200, verbose_name="Reference No")

    class Meta:
        verbose_name = "Circuit Data"
        verbose_name_plural = "Circuit Data"
        # NOTE: this default ordering is applied to every queryset. Django adds
        # the ordering columns to a SELECT DISTINCT, so
        # .values('provider').distinct() still returns repeated providers
        # unless the ordering is cleared first with .order_by().
        ordering = ('showroom_config_data__location', 'circuit_speed')

    def __unicode__(self):
        # The model has no ``service_type`` field; ``circuit_type`` is the
        # declared field holding the DSL/MPLS/... value.
        # Both related lookups hit the database unless select_related() is used.
        return '%s | %s | %s | %s | %s' % (
            self.showroom_config_data.location,
            self.major_site_info.location,
            self.provider,
            self.circuit_type,
            self.ref_no,
        )
results from shell below
[root#network-tools infternal]# python manage.py shell
Python 2.7.5 (default, Nov 20 2015, 02:00:19)
[GCC 4.8.5 20150623 (Red Hat 4.8.5-4)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
(InteractiveConsole)
>>> from networks.models import CircuitInfoData
>>> d = CircuitInfoData.objects.values('provider').distinct()
>>> for item in d:
... print item
...
{'provider': u'BT'}
{'provider': u'BT'}
{'provider': u'KCOM'}
{'provider': u'BT'}
{'provider': u'BT'}
{'provider': u'KCOM'}
.....
>>> print d.query
SELECT DISTINCT "networks_circuitinfodata"."provider", "networks_showroomconfigdata"."location", "networks_circuitinfodata"."circuit_speed" FROM "networks_circuitinfodata" INNER JOIN "networks_showroomconfigdata" ON ("networks_circuitinfodata"."showroom_config_data_id" = "networks_showroomconfigdata"."id") ORDER BY "networks_showroomconfigdata"."location" ASC, "networks_circuitinfodata"."circuit_speed" ASC
>>>
One thing I've noticed is what happens when I print the items in the shell, as above:
#### with def __unicode__(self): #####
>>> from networks.models import CircuitInfoData
>>> d = CircuitInfoData.objects.only('provider').distinct()
>>> for i in d:
... print i
...
Location1 | Showroom | BT | DSL | N/A
Location2 | Showroom | BT | MPLS | XXXX
Location2 | Showroom | KCOM | MPLS | XXXX
Location3 | Showroom | BT | MPLS | XXXX
Location3 | Showroom | BT | DSL | N/A
Location4 | Showroom | KCOM | MPLS | XXXXX
...
#### with out def __unicode__(self): #####
>>> from networks.models import CircuitInfoData
>>> d = CircuitInfoData.objects.only('provider').distinct()
>>> for i in d:
... print i
...
CircuitInfoData_Deferred_circuit_bearer_circuit_cfb3d62ef325a6acfc8ddcb43c8ae1c6 object
CircuitInfoData_Deferred_circuit_bearer_circuit_cfb3d62ef325a6acfc8ddcb43c8ae1c6 object
CircuitInfoData_Deferred_circuit_bearer_circuit_cfb3d62ef325a6acfc8ddcb43c8ae1c6 object
CircuitInfoData_Deferred_circuit_bearer_circuit_cfb3d62ef325a6acfc8ddcb43c8ae1c6 object
CircuitInfoData_Deferred_circuit_bearer_circuit_cfb3d62ef325a6acfc8ddcb43c8ae1c6 object
CircuitInfoData_Deferred_circuit_bearer_circuit_cfb3d62ef325a6acfc8ddcb43c8ae1c6 object
...
#### with either ####
>>> for i in d:
... print i.provider
...
BT
BT
KCOM
BT
BT
KCOM
...

The documentation for distinct says
Returns a new QuerySet that uses SELECT DISTINCT in its SQL query.
This eliminates duplicate rows from the query results.
By default, a QuerySet will not eliminate duplicate rows. In practice,
this is rarely a problem, because simple queries such as
Blog.objects.all() don’t introduce the possibility of duplicate result
rows.
Distinct gives you distinct rows, but you are looking at only one of the fields in the record, and in that field values can be duplicated unless it has a unique constraint on it. And in this case it doesn't.
If you happen to be using postgresql you can do
CircuitInfoData.objects.distinct('provider')
to achieve your objective.
UPDATE:
Since you mentioned in the comments that you use sqlite, use this solution.
CircuitInfoData.objects.values('provider').distinct()
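This works when the query selects only that one column, because DISTINCT then compares rows on provider alone. Note, however, that a default ordering from Meta.ordering is added to the SELECT DISTINCT column list (as in the query printed in the question, which also selects location and circuit_speed), so rows that differ in those columns still count as distinct. Clear the ordering first with CircuitInfoData.objects.order_by().values('provider').distinct(). The resulting query will then be similar to
SELECT DISTINCT "someapp_circuitinfodata"."provider" FROM "someapp_circuitinfodata"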
UPDATE 2:
Notice that you have overridden the __unicode__ function.
def __unicode__(self):
return '%s | %s | %s | %s | %s' %
(self.showroom_config_data.location,self.major_site_info.location,
self.provider, self.service_type, self.ref_no)
You are referring to the fields in the related model. This is going to be very costly (unless you use select_related). Also note that if you iterate through a queryset and use print for debug purposes it will give you misleading results (since what you are seeing is the output of __unicode__, a rather complex function)

Related

django - improve performance of __in queryset in M2M filtering

I have a models that has a M2M relationship to another model.
These are my models:
class Catalogue(models.Model):
city = models.CharField(db_index=True,max_length=100, null=True)
district = models.CharField(db_index=True,max_length=100, null=True)
type = models.ManyToManyField(Type, db_index=True)
datetime = models.CharField(db_index=True, max_length=100, null=True)
class Type(models.Model):
name = models.CharField(max_length=100)
def __str__(self):
return self.name
And this is views.py:
class all_ads(generic.ListView):
paginate_by = 12
template_name = 'new_list_view_grid-card.html'
def get_queryset(self):
city_district = self.request.GET.getlist('city_district')
usage = self.request.GET.get('usage')
status = self.request.GET.get('status')
last2week = datetime.datetime.now() - datetime.timedelta(days=14)
status = status.split(',')
if usage:
usage = usage.split(',')
else:
usage = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31']
intersections = list(set(status).intersection(usage))
type_q = (Q(type__in=intersections) & Q(type__isnull=False))
result = models.Catalogue.objects.filter(
Q(datetime__gte=last2week) &
type_q &
((reduce(operator.or_, (Q(city__contains=x) for x in city_district)) & Q(city__isnull=False)) |
(reduce(operator.or_, (Q(district__contains=x) for x in city_district)) & Q(district__isnull=False)))
).distinct().order_by('-datetime').prefetch_related('type')
return result
I want to filter MySQL db with some queries and return result in a listview.
It works well on a small database, but with a large database it takes over 10 seconds to return results. If I remove the type_q filter, it takes about 2 seconds (down from over 10 seconds!).
How can I improve performance of __in queryset?
It looks like type_q itself is not really the culprit, but acts as a multiplier: since we now make a LEFT OUTER JOIN, the __contains lookups run over all combinations of rows. This is thus more a peculiarity of two filters that work together.
We can avoid this with:
cat_ids = list(Catalogue.objects.filter(
Q(*[Q(city__contains=x) for x in city_district], _connector=Q.OR) |
Q(*[Q(district__contains=x) for x in city_district], _connector=Q.OR)
).values_list('pk', flat=True))
result = models.Catalogue.objects.filter(
Q(datetime__gte=last2week),
type_q,
pk__in=cat_ids
).distinct().order_by('-datetime').prefetch_related('type')
Some databases can do this efficiently with a subquery as well (MySQL is known not to optimize subqueries very well). In that case we do not materialize the list, but let Django work with a subquery:
cat_ids = Catalogue.objects.filter(
Q(*[Q(city__contains=x) for x in city_district], _connector=Q.OR) |
Q(*[Q(district__contains=x) for x in city_district], _connector=Q.OR)
).values_list('pk', flat=True)
result = models.Catalogue.objects.filter(
Q(datetime__gte=last2week),
type_q,
pk__in=cat_ids
).distinct().order_by('-datetime').prefetch_related('type')

how to merge two annotated querysets into one result

Model:
class Foo(models.Model):
    """Parent model that ``Bar1`` and ``Bar2`` each reference by foreign key."""
    name = models.CharField(max_length=50, blank=True, unique=True)
class Bar1(models.Model):
foo = models.ForeignKey('Foo')
value = models.DecimalField(max_digits=10,decimal_places=2)
class Bar2(models.Model):
foo = models.ForeignKey('Foo')
value = models.DecimalField(max_digits=10,decimal_places=2)
Classes Bar1 and Bar2 are unrelated, so I can't merge them into one class, which would otherwise solve the problem. This is only an example, meant to show the problem as plainly as possible.
first = Foo.objects.all().annotate(Sum("bar1__value"))
second = Foo.objects.all().annotate(Sum("bar2__value"))
Each of these querysets contains correct values.
I can't merge it into:
both = Foo.objects.all().annotate(Sum("bar1__value")).annotate(Sum("bar2__value"))
Because the sum values get multiplied - this is unfortunately expected behaviour - because of the JOINs.
And now the problem - how to merge/join first and second to get the both?
Example:
Bar 1:
foo | value
--------------
A | 10
B | 20
B | 20
Bar 2:
foo | value
--------------
A | -0.10
A | -0.10
B | -0.25
both (the values differ depending on the order in which bar1 and bar2 are annotated)
foo | bar1__value__sum | bar2__value__sum
---------------------------------
A | 20 | -0.20
B | 40 | -0.50
expected result:
foo | bar1__value__sum | bar2__value__sum
---------------------------------
A | 10 | -0.20
B | 40 | -0.25
I couldn't use itertools.chain because the result is:
foo | bar1__value__sum | bar2__value__sum
---------------------------------
A | null | -0.20
B | null | -0.25
A | 10 | null
B | 40 | null
Your problem is a known limitation of Django's ORM: https://code.djangoproject.com/ticket/10060.
If you're ok with doing two queries, here's one option:
result = Foo.objects.annotate(b1_sum=Sum("bar1__value"))
bar2_sums = Foo.objects.annotate(b2_sum=Sum("bar2__value")).in_bulk()
for foo in result:
foo.b2_sum = bar2_sums.get(foo.pk).b2_sum
Following @emulbreh's answer, I read the ticket and found a solution. I went this way and came up with this:
models.py:
from django.db.models.expressions import RawSQL
from django.db.models.query import QuerySet
(...)
class NewManager(models.Manager):
"""A re-usable Manager to access a custom QuerySet"""
def __getattr__(self, attr, *args):
try:
return getattr(self.__class__, attr, *args)
except AttributeError:
# don't delegate internal methods to the queryset
if attr.startswith('__') and attr.endswith('__'):
raise
return getattr(self.get_query_set(), attr, *args)
def get_query_set(self):
return self.model.QuerySet(self.model, using=self._db)
class Foo(models.Model):
name = models.CharField(max_length = 50, blank = True, unique = True)
objects =NewManager()
def __str__(self):
return self.name
class QuerySet(QuerySet):
def annotate_sum(self, modelClass, field_name):
annotation_name="%s__%s__%s" % (modelClass._meta.model_name,field_name,'sum')
raw_query = "SELECT SUM({field}) FROM {model2} WHERE {model2}.{model3}_id = {model1}.id".format(
field = field_name,
model3 = self.model._meta.model_name,
model2 = modelClass._meta.db_table,
model1 = self.model._meta.db_table
)
debug.debug("%s" % raw_query)
annotation = {annotation_name: RawSQL(raw_query, [])}
return self.annotate(**annotation)
And views.py:
both = Foo.objects.annotate_sum(Bar1, 'value').annotate_sum( Bar2, 'value')
The SQL result is exactly what I want:
SELECT "app_foo"."id", "app_foo"."name", (SELECT SUM(value) FROM app_bar1 WHERE app_bar1.foo_id = app_foo.id) AS "bar1__value__sum", (SELECT SUM(value) FROM app_bar2 WHERE app_bar2.foo_id = app_foo.id) AS "bar2__value__sum" FROM "app_foo"
Of course it isn't perfect - it needs some error checking (e.g. double quotes) or aliases, but i think this is the right direction
I landed on this page after having a similar problem, but with Count instead of Sum.
The simplest solution is to use Count(<field>, distinct=True) on the 2nd Count, i.e.
both = Foo.objects.all().annotate(Count("bar1__value")
).annotate(Count("bar2__value", distinct=True))
References:
ticket 10060/comment:60, linked from @emulbreh's answer
django 2.0 docs / Aggregation # Combining multiple aggregations

Django QuerySet union operator are not commutative after annotated filter

... and return unexpected results (in Django 1.6.5)
My models.py
class Member(models.Model):
...
class Donation(models.Model):
year = models.PositiveSmallIntegerField()
cheque_amount = models.DecimalField(default=0, max_digits=8, decimal_places=2)
donor = models.ForeignKey(Member)
...
class SpecialTitle(models.Model):
chair_title = models.CharField(max_length=128, blank=True)
member = models.OneToOneField(Member)
...
I'd like the union of the two querysets in one of my admin filters
donors = queryset.filter(
donation__year__exact=2014
).annotate(sum_donation=Sum('donation__cheque_amount')).filter(sum_donation__gte=1000)
chairs = queryset.filter(specialtitle__chair_title__iendswith='Chair')
Here is the puzzling part (in Django manager shell)
>>> donors | chairs == chairs | donors
False
>>> donors.count(); chairs.count()
189
17
>>> (donors | chairs).count(); (chairs | donors).count()
193
291
>>> (donors | chairs).distinct().count(); (chairs | donors).distinct().count()
193
207
And none of them are the correct results. I'd expect a set operation to be
>>> set(donors) | set(chairs) == set(chairs) | set(donors)
True
>>> set(donors) & set(chairs) == set(chairs) & set(donors)
True
>>>
And they return the correct results. However, Django admin filter demands a QuerySet, not a python set (or list)
Why is this? How do I get a proper union of Django QuerySet (of the same type) after annotated filter?
Thank you.
It appears I had no other choice but to use the python set union operator and hit the database again for the desired result.
donors = queryset.filter(
donation__year__exact=2014
).annotate(sum_donation=Sum('donation__cheque_amount')).filter(sum_donation__gte=1000)
chairs = queryset.filter(specialtitle__chair_title__iendswith='Chair')
result = queryset.filter(pk__in=[person.id for person in set(donors) | set(chairs)])

complex sums divided by month

models:
class Category(models.Model):
name = models.CharField(max_length=100)
class Operation(models.Model):
date = models.DateField()
value = models.DecimalField(max_digits = 9, decimal_places = 2)
category = models.ForeignKey(Category, null = True)
comments = models.TextField(null = True)
Now I want to create a view, with 13 columns:
name of category | -11 | -10 | -9 | ... | -1 | 0
eg.
...food.. | $123.00 | $100.14 | ... | $120.13| $54.12
.clothes.| $555.23 | $232.23 | ... | $200.12| $84.44
where $123.00 for example is a sum of values of operations with category food, made 11 months ago, $100.14 - 10 months ago and so on - $54.12 is sum of current month, 555.23 => the same but category clothes...
I googled a lot, but most of the examples are simple - without a related class (category).
The correct answer after suggestion of Answer 1:
def get_month_sum_series(self):
import qsstats, datetime
from django.db.models import Sum
qss = qsstats.QuerySetStats(self.operation_set.all(), date_field='date', aggregate_field='value',aggregate_class=Sum)
today = datetime.date.today()
year_ago = today - datetime.timedelta(days=365)
return qss.time_series( start_date=year_ago, end_date=today, interval='months')
Take a look at django-qsstats. It has a time_series feature which will allow you to get a whole series of data for the entire period in one request. In your case I'd create a method in Category, something like:
def price_series(self):
return qsstats.time_series(queryset=self.operation_set.all(), start_date=year_ago, end_date=now, interval='months')
Of course, you'll need to set up year_ago and now variables (for example, using datetime module functions).

Django ORM equivalent for this SQL..calculated field derived from related table

I have the following model structure below:
class Master(models.Model):
name = models.CharField(max_length=50)
mounting_height = models.DecimalField(max_digits=10,decimal_places=2)
class MLog(models.Model):
date = models.DateField(db_index=True)
time = models.TimeField(db_index=True)
sensor_reading = models.IntegerField()
m_master = models.ForeignKey(Master)
The goal is to produce a queryset that returns all the fields from MLog plus a calculated field (item_height) based on the related data in Master
using Django's raw sql:
querySet = MLog.objects.raw('''
SELECT a.id,
date,
time,
sensor_reading,
mounting_height,
(sensor_reading - mounting_height) as item_height
FROM db_mlog a JOIN db_master b
ON a.m_master_id = b.id
''')
How do I code this using Django's ORM?
I can think of two ways to go about this without relying on raw(). The first is pretty much the same as what @tylerl suggested. Something like this:
class Master(models.Model):
name = models.CharField(max_length=50)
mounting_height = models.DecimalField(max_digits=10,decimal_places=2)
class MLog(models.Model):
date = models.DateField(db_index=True)
time = models.TimeField(db_index=True)
sensor_reading = models.IntegerField()
m_master = models.ForeignKey(Master)
def _get_item_height(self):
return self.sensor_reading - self.m_master.mounting_height
item_height = property(_get_item_height)
In this case I am defining a custom (derived) property for MLog called item_height. This property is calculated as the difference of the sensor_reading of an instance and the mounting_height of its related master instance. More on property here.
You can then do something like this:
In [4]: q = MLog.objects.all()
In [5]: q[0]
Out[5]: <MLog: 2010-09-11 8>
In [6]: q[0].item_height
Out[6]: Decimal('-2.00')
The second way to do this is to use the extra() method and have the database do the calculation for you.
In [14]: q = MLog.objects.select_related().extra(select =
{'item_height': 'sensor_reading - mounting_height'})
In [16]: q[0]
Out[16]: <MLog: 2010-09-11 8>
In [17]: q[0].item_height
Out[17]: Decimal('-2.00')
You'll note the use of select_related(). Without this the Master table will not be joined with the query and you will get an error.
I always do the calculations in the app rather than in the DB.
class Thing(models.Model):
    """Model exposing ``diff`` (``foo - bar``) as a read/write property."""
    foo = models.IntegerField()
    bar = models.IntegerField()

    # ``Property`` is the custom decorator described in the accompanying text;
    # it must be defined or imported before this class body runs. It calls
    # ``diff()`` once and builds a property from the nested fget/fset.
    @Property
    def diff():
        def fget(self):
            return self.foo - self.bar

        def fset(self, value):
            # Assigning to ``diff`` adjusts ``bar`` so that foo - bar == value.
            self.bar = self.foo - value
Then you can manipulate it just as you would any other field, and it does whatever you defined with the underlying data. For example:
obj = Thing.objects.all()[0]
print(obj.diff) # prints .foo - .bar
obj.diff = 4 # sets .bar to .foo - 4
Property, by the way, is just a standard property decorator, in this case coded as follows (I don't remember where it came from):
def Property(function):
    """Build a ``property`` from accessors defined inside ``function``.

    ``function`` takes no arguments and defines any of the local functions
    ``fget``, ``fset`` and ``fdel`` in its body. It is called once under a
    temporary trace function that reads those locals when its frame returns.
    The docstring of ``function`` becomes the property's ``doc``.

    :param function: zero-argument callable whose locals hold the accessors.
    :returns: ``property(fget=..., fset=..., fdel=..., doc=...)``; accessors
        that were not defined are passed as ``None``.
    """
    keys = ('fget', 'fset', 'fdel')
    func_locals = {'doc': function.__doc__}
    target_code = function.__code__

    def probe_func(frame, event, arg):
        # Only capture locals from the decorated function's own frame, so
        # helper calls made inside it cannot overwrite the result.
        if event == 'return' and frame.f_code is target_code:
            frame_locals = frame.f_locals
            func_locals.update((k, frame_locals.get(k)) for k in keys)
        return probe_func

    # Restore whatever tracer (debugger, coverage tool) was active before,
    # even if ``function`` raises.
    previous_trace = sys.gettrace()
    sys.settrace(probe_func)
    try:
        function()
    finally:
        sys.settrace(previous_trace)
    return property(**func_locals)