How to prefetch descendants with django-treebeard's MP_Node? - django

I'm developing an application with a hierarchical data structure in django-rest-framework using django-treebeard. My (simplified) main model looks like this
class Task(MP_Node):
name = models.CharField(_('name'), max_length=64)
started = models.BooleanField(default=True)
What I'm currently trying to achieve is a list view of all root nodes which shows extra fields (such as whether all children have started). To do this I specified a view:
class TaskViewSet(viewsets.ViewSet):
def retrieve(self, request, pk=None):
queryset = Task.get_tree().filter(depth=1, job__isnull=True)
operation = get_object_or_404(queryset, pk=pk)
serializer = TaskSerializer(operation)
return Response(serializer.data)
and serializer
class TaskSerializer(serializers.ModelSerializer):
are_children_started = serializers.SerializerMethodField()
def get_are_children_started(self, obj):
return all(task.started for task in Task.get_tree(obj))
This all works and I get the expected results. However, I run into a N+1 query problem where for each root task I need to fetch all children separately. Normally this would be solvable using prefetch_related but as I use the Materialized Path structure from django-treebeard there are no Django relationships between the task models, so prefetch_related doesn't know what to do out of the box. I've tried to use custom Prefetch objects but as this still requires a Django relation path I could not get it to work.
My current idea is to extend the Task model with a foreign key pointing to its root node like such:
root_node = models.ForeignKey('self', null=True,
related_name='descendant_tasks',
verbose_name=_('root task')
)
in order to make the MP relationship explicit so it can be queried. However, this does feel like a bit of a non-dry method of doing it so I wonder whether anyone has another suggestion on how to tackle it.

In the end I did end up with adding a foreign key to each task pointing to its root node like such:
root_node = models.ForeignKey('self', null=True,
related_name='descendant_tasks',
verbose_name=_('root task')
)
I updated my save method on my Task model to make sure I always point to the correct root node
def save(self, force_insert=False, force_update=False, using=None, update_fields=None):
try:
self.root_task = self.get_root()
except ObjectDoesNotExist:
self.root_task = None
return super(Task, self).save(force_insert=False, force_update=False, using=None,
update_fields=None
)
and this allows me to simply prefetch all descendants using prefetch_related('descendants').
Whenever I need to have the descendants in a nested fashion I use the following function to nest the flattened list of descendants again
def build_nested(tasks):
def get_basepath(path, depth):
return path[0:depth * Task.steplen]
container, link = [], {}
for task in sorted(tasks, key=attrgetter('depth')):
depth = int(len(task.path) / Task.steplen)
try:
parent_path = get_basepath(task.path, depth - 1)
parent_obj = link[parent_path]
if not hasattr(parent_obj, 'sub_tasks'):
parent_obj.sub_tasks = []
parent_obj.sub_tasks.append(task)
except KeyError: # Append it as root task if no parent exists
container.append(task)
link[task.path] = task
return container

If you want to avoid using a Foreign Key you can iterate over the queryset and re-create the tree structure in memory.
In my case I wanted to have a template tag (much like django-mptt's recursetree templatetag) to show multiple levels of nested pages with only one database query. Basically copying mptt.utils.get_cached_trees I ended up with this:
def get_cached_trees(queryset: QuerySet) -> list:
"""Return top-most pages / roots.
Each page will have its children stored in `_cached_children` attribute
and its parent in `_cached_parent`. This avoids having to query the database.
"""
top_nodes: list = []
path: list = []
for obj in queryset:
obj._cached_children = []
if obj.depth == queryset[0].depth:
add_top_node(obj, top_nodes, path)
else:
while not is_child_of(obj, parent := path[-1]):
path.pop()
add_child(parent, obj)
if obj.numchild:
path.append(obj)
return top_nodes
def add_top_node(obj: MP_Node, top_nodes: list, path: list) -> None:
top_nodes.append(obj)
path.clear()
def add_child(parent: MP_Node, obj: MP_Node) -> None:
obj._cached_parent = parent
parent._cached_children.append(obj)
def is_child_of(child: MP_Node, parent: MP_Node) -> bool:
"""Return whether `child` is a sub page of `parent` without database query.
`_get_children_path_interval` is an internal method of MP_Node.
"""
start, end = parent._get_children_path_interval(parent.path)
return start < child.path < end
It can be used like this to avoid the dreaded N+1 query problem:
for page in get_cached_trees(queryset):
for child in page._cached_children:
...

Related

Django Rest Framework Filtering an object in get_queryset method

Basically, I have a catalog viewset. In the list view I want to make a few filtering and return accordingly.
Relevant Catalog model fields are:
class Catalog(models.Model):
name = models.CharField(max_length=191, null=True, blank=False)
...
team = models.ForeignKey(Team, on_delete=models.CASCADE, editable=False, related_name='catalogs')
whitelist_users = models.JSONField(null=True, blank=True, default=list) # If white list is null, it is open to whole team
Views.py
class CatalogViewSet(viewsets.ModelViewSet):
permission_classes = (IsOwnerAdminOrRestricted,)
def get_queryset(self):
result = []
user = self.request.user
catalogs = Catalog.objects.filter(team__in=self.request.user.team_set.all())
for catalog in catalogs:
if catalog.whitelist_users == [] or catalog.whitelist_users == None:
# catalog is open to whole team
result.append(catalog)
else:
# catalog is private
if user in catalog.whitelist_users:
result.append(catalog)
return result
So this is my logic;
1 - Get the catalog object if catalog's team is one of the current user' team.
2 - Check if the catalog.whitelist_users contains the current user. (There is also an exception that if it is none means it s open to whole team so I can show it in the list view.)
Now this worked but since I am returning an array, it doesn't find the detail objects correctly. I mean /catalog/ID doesn't work correctly.
I am new to DRF so I am guessing there is something wrong here. How would you implement this filtering better?
As the name of the method suggests, you need to return a queryset. Also, avoid iterating over a queryset if that's not necessary. It's better to do it in a single database hit. For complex queries, you can use the Q object.
from django.db.models import Q
# ...
def get_queryset(self):
user = self.request.user
catalogs = Catalog.objects.filter(
Q(whitelist_users__in=[None, []]) | Q(whitelist_users__contains=user),
team__in=user.team_set.all())
return catalogs
Now I am not 100% sure the whitelist_users__contains=user will work since it depends on how you construct your JSON, but the idea is there, you will just need to adapt what it contains.
This will be much more effective than looping in python and will respect what get_queryset is meant for.
A simple solution that comes to mind is just creating a list of PKs and filtering again, that way you return a Queryset. Not the most efficient solution, but should work:
def get_queryset(self):
pks = []
user = self.request.user
catalogs = Catalog.objects.filter(team__in=user.team_set.all())
for catalog in catalogs:
if catalog.whitelist_users == [] or catalog.whitelist_users == None:
# catalog is open to whole team
pks.append(catalog.pk)
else:
# catalog is private
if user in catalog.whitelist_users:
pks.append(catalog.pk)
return Catalog.objects.filter(id__in=pks)

Django: How to load related model objects when instantiating a model

I have two related models (1-n). From the parent model, I am doing a lot of operations on the child model. For each operation I am calling:
ItensOrder.objects.filter(order=self.pk)
Inside the Order class, which is the parent, I am using the children objects several times, like this:
def total(self):
itens = ItensOrder.objects.filter(order=self.pk)
valor = sum(Counter(item.price * item.quantity for item in itens))
return str(valor)
def details(self):
itens = ItensOrder.objects.filter(order=self.pk)
return format_html_join('\n', "{} ({} x {} = {})<br/>",
((item.item.name,str(item.quantity),item.price,str(item.price * item.quantity)) for item in itens))
What is the best way to load the related objects ONLY ONCE, so I can avoid reaching the database every time I need the related objects.
I've been trying this on the parent model:
def __init__(self, *args, **kwargs):
if self.pk is not None:
self.itens = ItensOrder.objects.filter(order=self.pk)
else:
self.itens = None
But this is wrong....
Anybody can help please!?
You can access related child objects by using the related_name of a ForeignKey field
order = Order.objects.get(id=1)
itens = order.itensorder_set.all()
This reverse relationship attribute will by default be the model name lowercase followed by "_set", you can change this by setting related_name on the foreign key
You can pre-populate this property with a cache of all the related objects by using prefetch_related
order = Order.objects.prefetch_related('itensorder_set').get(id=1)
order.itensorder_set.all() # This can be called multiple times but will not hit the database
In your case
class Order(models.Model):
def total(self):
valor = sum(Counter(item.price * item.quantity for item in self.itensorder_set.all()))
return str(valor)
def details(self):
return format_html_join('\n', "{} ({} x {} = {})<br/>",
((item.item.name,str(item.quantity),item.price,str(item.price * item.quantity)) for item in self.itensorder_set.all()))
and in your model admin override get_queryset
def get_queryset(self, request):
return super().get_queryset(request).prefetch_related('itensorder_set')
you can user select_related inside your functions
def total(self):
itens = ItensOrder.objects.select_related('order').filter(order=self)
valor = sum(Counter(item.price * item.quantity for item in itens))
return str(valor)
def details(self):
itens = ItensOrder.objects.select_related('order').filter(order=self)
return format_html_join('\n', "{} ({} x {} = {})<br/>",
((item.item.name,str(item.quantity),item.price,str(item.price * item.quantity)) for item in itens))

How to sort after dehydrate creates all data in Tastypie

I want to BookResource to perform join (book table with author table) with dedydrate(...) function. Final result should be sorted by table Author.
dehydrate(...) is called for each item in Book table.
class Author(Model)
author_name = models.CharField(max_length=64)
class Book(Model)
author = models.ForeignKey('Author')
book_name = models.CharField(max_length=64)
class BookResource(ModelResource):
class Meta(object):
# The point here is Book table can be sorted. But, final result
# should be sorted by author_name
queryset = Book.objects.all().order_by('book_name')
resource_name = 'api_test'
serializer = Serializer(formats=['xml', 'json'])
allowed_methods = ('get')
always_return_data = True
def dehydrate(self, bundle):
author_id = bundle.obj.author.id # author is foreign key of book
author_obj = Author.objects.get(id=bundle.obj.author.id)
# Construct queryset with author_name. Same as join 2 tables.
# But, I want to sort by author.
bundle.data['author_name'] = author_obj.author_name
return bundle
# This is called before dehydrate(...) is called. Not sure how to use it.
def apply_sorting(self, obj_list, options=None):
return obj_list
Questions:
1) How to sort result by author if using above code?
2) Could not figure out how to do join. Can you provide alternative?
Thank you.
Very late answer, but I ran into this lately. If you look at the Tastypie resource on the dehydrate method
there is a series of methods called. Shortly, this is the order of calls:
build_bundle
obj_get_list
apply_sorting
paginators
full_dehydrate (field by field)
alter_list_data_to_serialize
return self.create_response
You want to focus on the second last method self.alter_list_data_to_serialize(request, to_be_serialized)
the base method returns the second parameter
def alter_list_data_to_serialize(self, request, data):
"""
A hook to alter list data just before it gets serialized & sent to the user.
Useful for restructuring/renaming aspects of the what's going to be
sent.
Should accommodate for a list of objects, generally also including
meta data.
"""
return data
Whatever you want to do you can do it in here. Consider that to be serialized format will be a dict with 2 fields: meta and objects. the object field has a list of bundle objects.
An additional sorting layer could be:
def alter_list_data_to_serialize(self, request, data):
try:
sord = True if request.GET.get("sord") == "asc" else False
data["objects"].sort(
key=lambda x: x.data[request.GET.get("sidx", default_value)],
reverse=sord)
except:
pass
return data
You can optimize it as is a bit dirt. But that's the main idea.
Again, it's a workaround and all the sorting should belong to apply_sorting

AND search with reverse relations

I'm working on a django project with the following models.
class User(models.Model):
pass
class Item(models.Model):
user = models.ForeignKey(User)
item_id = models.IntegerField()
There are about 10 million items and 100 thousand users.
My goal is to override the default admin search that takes forever and
return all the matching users that own "all" of the specified item ids within a reasonable timeframe.
These are a couple of the tests I use to better illustrate my criteria.
class TestSearch(TestCase):
def search(self, searchterm):
"""A tuple is returned with the first element as the queryset"""
return do_admin_search(User.objects.all())
def test_return_matching_users(self):
user = User.objects.create()
Item.objects.create(item_id=12345, user=user)
Item.objects.create(item_id=67890, user=user)
result = self.search('12345 67890')
assert_equal(1, result[0].count())
assert_equal(user, result[0][0])
def test_exclude_users_that_do_not_match_1(self):
user = User.objects.create()
Item.objects.create(item_id=12345, user=user)
result = self.search('12345 67890')
assert_false(result[0].exists())
def test_exclude_users_that_do_not_match_2(self):
user = User.objects.create()
result = self.search('12345 67890')
assert_false(result[0].exists())
The following snippet is my best attempt using annotate that takes over 50 seconds.
def search_by_item_ids(queryset, item_ids):
params = {}
for i in item_ids:
cond = Case(When(item__item_id=i, then=True), output_field=BooleanField())
params['has_' + str(i)] = cond
queryset = queryset.annotate(**params)
params = {}
for i in item_ids:
params['has_' + str(i)] = True
queryset = queryset.filter(**params)
return queryset
Is there anything I can do to speed it up?
Here's some quick suggestions that should improve performance drastically.
Use prefetch_related` on the initial queryset to get related items
queryset = User.objects.filter(...).prefetch_related('user_set')
Filter with the __in operator instead of looping through a list of IDs
def search_by_item_ids(queryset, item_ids):
return queryset.filter(item__item_id__in=item_ids)
Don't annotate if it's already a condition of the query
Since you know that this queryset only consists of records with ids in the item_ids list, no need to write that per object.
Putting it all together
You can speed up what you are doing drastically just by calling -
queryset = User.objects.filter(
item__item_id__in=item_ids
).prefetch_related('user_set')
with only 2 db hits for the full query.

DRF - How to get WritableField to not load entire database into memory?

I have a very large database (6 GB) that I would like to use Django-REST-Framework with. In particular, I have a model that has a ForeignKey relationship to the django.contrib.auth.models.User table (not so big) and a Foreign Key to a BIG table (lets call it Products). The model can be seen below:
class ShoppingBag(models.Model):
user = models.ForeignKey('auth.User', related_name='+')
product = models.ForeignKey('myapp.Product', related_name='+')
quantity = models.SmallIntegerField(default=1)
Again, there are 6GB of Products.
The serializer is as follows:
class ShoppingBagSerializer(serializers.ModelSerializer):
product = serializers.RelatedField(many=False)
user = serializers.RelatedField(many=False)
class Meta:
model = ShoppingBag
fields = ('product', 'user', 'quantity')
So far this is great- I can do a GET on the list and individual shopping bags, and everything is fine. For reference the queries (using a query logger) look something like this:
SELECT * FROM myapp_product WHERE product_id=1254
SELECT * FROM auth_user WHERE user_id=12
SELECT * FROM myapp_product WHERE product_id=1404
SELECT * FROM auth_user WHERE user_id=12
...
For as many shopping bags are getting returned.
But I would like to be able to POST to create new shopping bags, but serializers.RelatedField is read-only. Let's make it read-write:
class ShoppingBagSerializer(serializers.ModelSerializer):
product = serializers.PrimaryKeyRelatedField(many=False)
user = serializers.PrimaryKeyRelatedField(many=False)
...
Now things get bad... GET requests to the list action take > 5 minutes and I noticed that my server's memory jumps up to ~6GB; why?! Well, back to the SQL queries and now I see:
SELECT * FROM myapp_products;
SELECT * FROM auth_user;
Ok, so that's not good. Clearly we're doing "prefetch related" or "select_related" or something like that in order to get access to all the products; but this table is HUGE.
Further inspection reveals where this happens on Line 68 of relations.py in DRF:
def initialize(self, parent, field_name):
super(RelatedField, self).initialize(parent, field_name)
if self.queryset is None and not self.read_only:
manager = getattr(self.parent.opts.model, self.source or field_name)
if hasattr(manager, 'related'): # Forward
self.queryset = manager.related.model._default_manager.all()
else: # Reverse
self.queryset = manager.field.rel.to._default_manager.all()
If not readonly, self.queryset = ALL!!
So, I'm pretty sure that this is where my problem is; and I need to say, don't select_related here, but I'm not 100% if this is the issue or where to deal with this. It seems like all should be memory safe with pagination, but this is simply not the case. I'd appreciate any advice.
In the end, we had to simply create our own PrimaryKeyRelatedField class to override the default behavior in Django-Rest-Framework. Basically we ensured that the queryset was None until we wanted to lookup the object, then we performed the lookup. This was extremely annoying, and I hope the Django-Rest-Framework guys take note of this!
Our final solution:
class ProductField(serializers.PrimaryKeyRelatedField):
many = False
def __init__(self, *args, **kwargs):
kwarsgs['queryset'] = Product.objects.none() # Hack to ensure ALL products are not loaded
super(ProductField, self).__init__(*args, **kwargs)
def field_to_native(self, obj, field_name):
return unicode(obj)
def from_native(self, data):
"""
Perform query lookup here.
"""
try:
return Product.objects.get(pk=data)
except Product.ObjectDoesNotExist:
msg = self.error_messages['does_not_exist'] % smart_text(data)
raise ValidationError(msg)
except (TypeError, ValueError):
msg = self.error_messages['incorrect_type'] % type(data)
raise ValidationError(msg)
And then our serializer is as follows:
class ShoppingBagSerializer(serializers.ModelSerializer):
product = ProductField()
...
This hack ensures the entire database isn't loaded into memory, but rather performs one-off selects based on the data. It's not as efficient computationally, but it also doesn't blast our server with 5 second database queries loaded into memory!