How to test multi-output pyspark transforms in Foundry - foundry-code-repositories

We want to write unit tests for Python transforms that have multiple outputs (i.e. those using the @transform decorator), but we have not been able to build the TransformOutput objects we need to pass to the function we're testing.
What's the best way to do this?

You can create fake inputs and outputs as follows, then pass them into your @transform function:
class FakeTransformInput:
    """Test double for a transform input that serves a pre-built DataFrame."""

    def __init__(self, df):
        # The DataFrame handed back to the transform under test.
        self.df = df

    def dataframe(self):
        """Return the DataFrame supplied at construction."""
        return self.df

    def set_mode(self, mode):
        """Accept a mode and ignore it; always returns None."""
        return None
class FakeTransformOutput:
    """Test double for a transform output that keeps its data in memory.

    The DataFrame most recently passed to ``write_dataframe`` is available via
    ``dataframe()`` (or the ``df`` attribute), and the mode most recently
    passed to ``set_mode`` is kept on ``mode`` so tests can assert on both.
    """

    def __init__(self, df):
        # Current contents of the fake output; replaced by write_dataframe().
        self.df = df
        # Last mode requested through set_mode(); None until it is called.
        self.mode = None

    def dataframe(self):
        """Return the DataFrame currently held by this fake output."""
        return self.df

    def write_dataframe(
            self, df, partition_cols=None, bucket_cols=None, bucket_count=None,
            sort_by=None, output_format=None, options=None,
            column_descriptions=None, column_typeclasses=None):
        """Store ``df`` as the new contents of the output.

        The remaining keyword arguments are accepted so that calls written
        against this signature succeed; their values are not used.
        """
        self.df = df

    def set_mode(self, mode):
        """Remember the requested write mode so a test can inspect it."""
        self.mode = mode
And to use them:
# Schema of the (initially empty) DataFrame behind the fake output: three
# nullable string columns.
output_columns = ("col_1", "col_2", "col_n")
output_schema = StructType(
    [StructField(column, StringType(), True) for column in output_columns]
)

# Wrap an empty output frame and the test's input data in the fakes.
empty_output_df = spark_session.createDataFrame([], output_schema)
output_transform = FakeTransformOutput(empty_output_df)
input_transform = FakeTransformInput(spark_session.createDataFrame(input_df))

# Run the transform under test against the fakes.
YOUR_MODULE.compute(input_transform, output_transform)
# Perform assertions on output_transform

Related

Possible to get queryset from list of queryset -Django

I want to build a single result set from queries over multiple models. I am trying to implement a search over multiple terms combined with an AND condition.
views.py,
# List handler from the question: for each whitespace-separated search term it
# builds QuerySets and collects them in two lists, then serializes the
# combined list.
# NOTE(review): the indentation of this snippet was lost, so which statements
# sit inside the for loop / the if block cannot be confirmed from here.
def list(self, request, *args, **kwargs):
# Raw value of the ?search= query parameter.
# NOTE(review): presumably None when the parameter is absent, in which case
# the .split() below would raise AttributeError - confirm.
search_query = self.request.query_params.get('search')
split_query = search_query.split()
employment = None
# Accumulators: a whole QuerySet is appended per term, so these end up as
# lists of QuerySets, not lists of model instances.
employee1 = []
employment1 = []
for query in split_query:
print("hi", query)
# query = self.request.query_params.get('search')
# Match the term against the employee's code, name or surname.
employee = PositionFulfillment.objects.filter(
Q(employment__employee__code__icontains=query) |
Q(employment__employee__person__name__icontains=query) |
Q(employment__employee__person__surname__icontains=query)
)
# emp = list(chain(employee))
employee1.append(employee)
print("employee", employee1)
# Restrict to rows with primary_flag=True and no thru_date.
active_employee = PositionFulfillment.objects.filter(primary_flag=True, thru_date=None)
# Truth-testing a QuerySet evaluates it.
if active_employee:
# Match the term against position type, organization name or employment status.
employment = active_employee.filter(
Q(position__position_type__name__icontains=query) |
Q(employment__organization__name__icontains=query) |
Q(employment__status__status__employment_status__icontains=query)
)
employment1.append(employment)
# chain() iterates the two lists, whose items are QuerySets, so the result is
# a list of QuerySets (including empty ones) rather than a flat list of rows.
all_results = list(chain(map(lambda x: x, employee1), map(lambda y: y, employment1)))
# all_results = list(chain(employee, employment))
print("all_results", all_results)
serializer = EmployeeSearchSerializer(all_results)
return Response(serializer.data, status=status.HTTP_200_OK)
I have got output like below,
all_results,
[<QuerySet [<PositionFulfillment: 27>, <PositionFulfillment: 29>, <PositionFulfillment: 30>]>, <QuerySet []>, <QuerySet []>, <QuerySet [<PositionFulfillment: 28>]>]
Expected output,
[<QuerySet [<PositionFulfillment: 27>, <PositionFulfillment: 29>, <PositionFulfillment: 30>]>, <QuerySet [<PositionFulfillment: 28>]>]
How can I achieve this?
You can add conditions to the query instead of creating new queries:
def list(self, request, *args, **kwargs):
    """Return the PositionFulfillment rows matching any of the search terms.

    The ``search`` query parameter is split on whitespace. A row matches when
    any term is contained (case-insensitively) in the employee's code, name
    or surname. All conditions are OR-ed into one ``Q`` object so that a
    single database query is issued instead of one query per term.

    With no search terms the filter is empty and every row is returned.
    """
    # A missing ``search`` parameter yields None; treat it as "no terms"
    # instead of failing on ``None.split()``.
    search_query = self.request.query_params.get('search') or ''

    # Start from an empty Q (which filters nothing) so that filter(q) below is
    # valid even when the loop body never runs.
    q = Q()  # this is filter
    for query in search_query.split():
        # Every term contributes all three conditions, including the first.
        q |= (
            Q(employment__employee__code__icontains=query)
            | Q(employment__employee__person__name__icontains=query)
            | Q(employment__employee__person__surname__icontains=query)
        )
        # same for active_employee

    # make only one query based on all conditions in q
    all_results = PositionFulfillment.objects.filter(q)
    # NOTE(review): a queryset is passed without many=True; confirm that
    # EmployeeSearchSerializer is designed to take a collection.
    serializer = EmployeeSearchSerializer(all_results)
    return Response(serializer.data, status=status.HTTP_200_OK)

Sorting the django objects based on SerializerMethodField

I am trying to order the user profiles by the timestamp of the last message exchanged between the two users.
I am using a SerializerMethodField to get the timestamp of the last message.
Is there any way I can sort the data?
class UserProfileSerializer(serializers.ModelSerializer):
    """Serialize a user's id plus the latest message exchanged with ``sid``."""

    lastmessage = serializers.SerializerMethodField()

    class Meta:
        model = User
        fields = ['id', 'lastmessage']

    def get_lastmessage(self, obj):
        """Return the newest message between ``obj`` and the ``sid`` user.

        The result is ``str()`` of a dict with the keys ``message`` and
        ``timestamp`` (timestamp already converted to a string), or ``""``
        when the two users have exchanged no messages.
        """
        # ``sid`` comes from the URL kwargs of the view that is serializing.
        k = self.context.get('view').kwargs['sid']
        # Messages in either direction, newest first. ``first()`` fetches only
        # the top row and returns None on an empty result, so there is no need
        # to load every message with ``len()`` just to test for emptiness.
        latest = (
            Message.objects.filter(receiver=obj.id, sender=k)
            | Message.objects.filter(sender=obj.id, receiver=k)
        ).order_by('-timestamp').values('message', 'timestamp').first()
        if latest is None:
            return ""
        latest["timestamp"] = str(latest["timestamp"])
        return str(latest)
My view:
class UserChatViewSet(viewsets.ModelViewSet):
    """Model view set over all users, serialized with UserProfileSerializer."""

    serializer_class = UserProfileSerializer
    queryset = User.objects.all()
Now my views return:
[{
"id": 4,
"lastmessage": "{'message': 'random', 'timestamp': '2020-06-14 23:49:33.077749+00:00'}"
},
{
"id": 5,
"lastmessage": ""
},
{
"id": 6,
"lastmessage": "{'message': 'sample', 'timestamp': '2020-06-14 11:53:03.880833+00:00'}"
},
{
"id": 7,
"lastmessage": ""
}]
But I want the results sorted by the timestamp of the last message.
You can override list in order to achieve this:
def list(self, request, *args, **kwargs):
    """Build the parent's list response, leaving a hook to reorder it."""
    # Unpack the extra arguments: passing ``args`` and ``kwargs`` as plain
    # positionals would hand the parent a tuple and a dict instead of the
    # original positional and keyword arguments.
    response = super().list(request, *args, **kwargs)
    # sort response.data['results']
    # NOTE(review): the 'results' key presumably exists only when pagination
    # is enabled; otherwise response.data is the list itself - confirm.
    return response
Also, lastmessage can be a dict instead of a str, so it's easier to work with.
The ordering of your response should be handled in the view.
from django.db.models import F, OuterRef, Q, Subquery

# Messages exchanged in either direction between the user of the outer query
# and the ``sid`` user, newest first.
sid = self.kwargs['sid']
lm = Message.objects.filter(
    Q(sender=OuterRef("id"), receiver=sid)
    | Q(receiver=OuterRef("id"), sender=sid)
).order_by('-timestamp')

# ``lastmessage`` holds the timestamp of the newest such message (NULL when
# there is none). It is a plain value, so it is ordered on directly; there is
# no ``lastmessage__timestamp`` lookup. Users without messages are put last.
data = User.objects.all().annotate(
    lastmessage=Subquery(
        lm.values('timestamp')[:1]
    )
).order_by(F('lastmessage').desc(nulls_last=True))

Django filter testing

class BusinessPartnerFilter(SilBaseFilter):
    """Filter set for business partners, adding three custom filters."""

    # Boolean filter applied as an ``isnull`` lookup on ``date_deactivated``.
    active = django_filters.BooleanFilter(
        lookup_expr='isnull',
        name='date_deactivated',
    )
    # Text filter on the related parent's ``name``.
    parent_name = django_filters.CharFilter(name='parent__name')
    # Boolean filter whose logic lives in the ``check_if_unmapped`` method.
    unmapped = django_filters.BooleanFilter(method='check_if_unmapped')
I have added the field 'unmapped' above and created the filter method below. Can someone please help me write tests for the filter? I'm stuck.
# Remaining members of the BusinessPartnerFilter class body.
class Meta(object):
    """Bind the filter set to BusinessPartner and list its filter fields."""

    model = models.BusinessPartner
    fields = [
        'name',
        'bp_type',
        'slade_code',
        'parent',
        'national_identifier',
        'active',
        'parent_name',
        'unmapped',
    ]


def check_if_unmapped(self, queryset, field, value):
    """Filter method behind the ``unmapped`` filter.

    When ``value`` is falsy the queryset is returned untouched. Otherwise
    every record whose ``mapped`` and ``unmapped`` are both 0 is excluded.
    """
    if not value:
        return queryset

    # Collect the ids to drop by iterating the queryset in Python.
    excluded_ids = []
    for record in queryset:
        if record.mapped == 0 and record.unmapped == 0:
            excluded_ids.append(record.id)
    return queryset.exclude(id__in=excluded_ids)
You can either test the filter method in isolation, or test the evaluation of FilterSet.qs.
To test the filter method, you don't necessarily need a fully initialized FilterSet.
# Call the filter method directly on a filter set created without arguments.
all_partners = BusinessPartner.objects.all()
filterset = BusinessPartnerFilter()
result = filterset.check_if_unmapped(all_partners, 'unmapped', True)
# assert something about the result
That said, it's not much more difficult to fully initialize the FilterSet and check the .qs.
# Initialise the filter set with request-style data and read the filtered
# queryset from ``.qs``.
all_partners = BusinessPartner.objects.all()
filterset = BusinessPartnerFilter(
    queryset=all_partners,
    data={'unmapped': 'True'},
)
result = filterset.qs
# assert something about the result

Django Rest- filtering results to show first variation

I'd like to filter my results to only show the first "Variation" class pointing to "Product" class.
I almost want it to work like this:
'product__variation__image__image'[0],
OR
'product_set__variation__image__image'[0],
I tried using:
.distinct('product_pk')
But this would not work if I was using .order_by()
The below code works, but each item is repeated several times as it has variations relating to it.
Here's a snippet of my Django rest views:
def _user_lookup(x):
    """Return the (lookup, value) pair that filters on the user's primary key."""
    return ('user__pk', x)


# Maps a supported query-string key to the function that builds its filter.
wardrobematch = {'user': _user_lookup}
class WardrobeListView(APIView):
    """List liked products for the filters given in the query string."""

    renderer_classes = (JSONRenderer, )

    def get(self, request, *args, **kwargs):
        """Return the liked Analytic rows as JSON, sorted by ``updated``.

        Query-string keys found in ``wardrobematch`` (compared in lower case)
        are turned into ORM filters; all other keys are ignored.
        """
        filters = {}
        for key, value in request.GET.items():
            key = key.lower()
            if key in wardrobematch:
                lookup, val = wardrobematch[key](value.lower())
                filters[lookup] = val

        qset = (
            Analytic.objects
            .filter(like=True, **filters)
            # .distinct('product_id',)
            .values(
                'product_id',
                'product__name',
                'product__brand',
                'product__store__store_name',
                'product__variation__image__image',
                'product__variation__price__price',
                'updated',
            )
        )
        # values() yields dicts, so each row is read by key; attribute access
        # (``row.updated``) would raise AttributeError.
        result = sorted(qset, key=lambda row: row['updated'])
        # The rows are plain dicts, which the renderer can output directly;
        # the model serializer expects model instances and cannot take them.
        return Response(result)
When you specify field names, you must provide an order_by() in the QuerySet, and the fields in order_by() must start with the fields in distinct(), in the same order.
Taken from: Django's documentation.
You could sort it afterwards anyway.
qs = Analytic.objects.filter(like=True,**filters)
.distinct('product_pk',)
.values('product_pk', 'product__name', 'product__brand', 'product__store__store_name', 'product__variation__image__image', 'product__variation__price__price', 'updated')
result = sorted(qs, key=lambda obj: obj['updated'])
return Response(json.dumps({'result': result}))

how can I accelerate data migration from a large database in django

I am trying to remove model inheritance from my Django 1.7 app.
models.py
class Element(models.Model):
    """Base element with a number, a type chosen from ``TYPES`` and a quantity."""

    # Allowed values for ``type`` and their display labels.
    ELEMENT_A = 'ELEMENT_A'
    ELEMENT_B = 'ELEMENT_B'
    TYPES = (
        (ELEMENT_A, 'Element A'),
        (ELEMENT_B, 'Element B'),
    )

    number = models.CharField(max_length=255)
    type = models.CharField(choices=TYPES, default=ELEMENT_A, max_length=10)
    quantity = models.IntegerField(default=1)
class ChildElement(Element):
    """Element subclass that adds the ``old_number`` and ``old_quantity`` fields."""

    old_number = models.CharField(unique=True, max_length=30)
    old_quantity = models.IntegerField()
my migration file
def forwards_remove_heritance(apps, schema_editor):
    """Copy each child's old values into ``number``/``quantity`` and set its type.

    Every ChildElement row is loaded, modified and saved one at a time.
    """
    child_model = apps.get_model("core", "ChildElement")
    element_type = Element.ELEMENT_A
    for child in child_model.objects.all():
        child.number = child.old_number
        child.quantity = child.old_quantity
        child.type = element_type
        child.save()
def backward_remove_heritance(apps, schema_editor):
    """Reverse step of the data migration; does nothing and returns None."""
    return None
class Migration(migrations.Migration):
    """Data migration that runs the forward/backward functions via RunPython."""

    dependencies = []

    operations = [
        migrations.RunPython(forwards_remove_heritance, backward_remove_heritance),
    ]
My migration takes hours; I have more than 750k rows in the app_ChildElement table.
Any ideas?
Use queryset update with F expressions, example:
# One bulk update: F('old_number') refers to the column value of each row, so
# no model instance is loaded or saved in Python.
# NOTE(review): ``number`` lives on the parent model while ``old_number`` lives
# on the child; confirm this cross-table F() update is accepted.
ChildElement.objects.update(number=F('old_number'))
The solution I found was to update my database directly with raw SQL inside a custom migration operation:
class RemoveChild(Operation):
    """Migration operation that copies ChildElement values onto Element via raw SQL."""

    reversible = True

    def __init__(self):
        """Take no arguments and store no state."""

    def state_forwards(self, app_label, state):
        """Leave the project state unchanged."""

    def database_forwards(self, app_label, schema_editor, from_state, to_state):
        """Run one UPDATE ... FROM ... JOIN statement through the schema editor."""
        # Kept verbatim: sets number and quantity from the child row joined on
        # element_ptr_id and sets type to 'ELEMENT_B'.
        copy_child_values_sql = """update Element
SET Element.number = ChildElement.old_number, Element.quantity=ChildElement.old_quantity, Element.type='ELEMENT_B'
FROM Element
INNER JOIN ChildElement
ON ChildElement.element_ptr_id = Element.id;
"""
        schema_editor.execute(copy_child_values_sql)

    def database_backwards(self, app_label, schema_editor, from_state, to_state):
        """Do nothing when the migration is reversed."""

    def describe(self):
        """Return the human-readable description of this operation."""
        description = "Remove child heritance model"
        return description