How to upload and process large excel files using Celery in Django? - django

I am trying to upload and process excel file using Django and DRF with Celery.
There is an issue when I try to pass the file to my Celery task to be processed in the background; I get the following error:
kombu.exceptions.EncodeError: Object of type InMemoryUploadedFile is not JSON serializable
Here is my view post request handler:
class FileUploadView(generics.CreateAPIView):
    """
    POST: upload file to save data in the database
    """
    parser_classes = [MultiPartParser]
    serializer_class = FileSerializerXLSX

    def post(self, request, format=None):
        """
        Allows to upload file and lets it be handled by pandas
        """
        serialized = FileSerializerXLSX(data=request.data)
        if serialized.is_valid():
            file_obj = request.data['file']
            # file_bytes = file_obj.read()
            print(file_obj)
            # NOTE: the uploaded-file object itself is passed to the task here;
            # this is the call that produces the EncodeError quoted above.
            import_excel_task.delay(file_obj)
            print("its working")
            return Response(status=204)
        return Response(serialized._errors, status=status.HTTP_400_BAD_REQUEST)
And my celery task:
def import_excel_helper(file_obj):
    """Run the import pipeline for one uploaded Excel file.

    The result of ``extract_excel_to_dataframe(file_obj)`` is passed, in
    order, to the client, product and sales transform functions.
    """
    df = extract_excel_to_dataframe(file_obj)
    transform_df_to_clientmodel(df)
    transform_df_to_productmodel(df)
    transform_df_to_salesmodel(df)
@shared_task(name="import_excel_task")
def import_excel_task(file_obj):
    """Save excel file in the background"""
    logger.info("Importing excel file")
    import_excel_helper(file_obj)
Any idea what is the way to handle importing Excel files into celery task so that it can be processed by other functions in the background?

As in the error, the body of the request to call a celery task must be JSON serializable since it is the default configuration. Then as documented in kombu:
The primary disadvantage to JSON is that it limits you to the following data types: strings, Unicode, floats, boolean, dictionaries, and lists. Decimals and dates are notably missing.
Let's say this is my excel file.
file.xlsx
Some
Value
Here
:)
Solution 1
Convert the raw bytes of the excel into Base64 string before calling the task so that it can be JSON serialized (since strings are valid data types in a JSON document, raw bytes are not). Then, everything else in the Celery configurations are the same default values.
tasks.py
import base64
import pandas
from celery import Celery
app = Celery('tasks')
@app.task
def add(excel_file_base64):
    """Decode a Base64-encoded Excel file and print it as a DataFrame.

    Args:
        excel_file_base64: Base64 text of the raw Excel bytes, i.e. what the
            caller produces with ``base64.b64encode(raw).decode()``.
    """
    import io  # local import: io is not in this snippet's import header

    excel_file = base64.b64decode(excel_file_base64)
    # Passing a raw byte string to read_excel is deprecated in recent pandas;
    # a file-like wrapper works on old and new versions alike.
    df = pandas.read_excel(io.BytesIO(excel_file))
    print("Contents of excel file:", df)
views.py
import base64
from tasks import add
with open("file.xlsx", 'rb') as file:  # Change this to be your <request.data['file']>
    excel_raw_bytes = file.read()
    # Base64 text is a plain string, so the default JSON serializer accepts it.
    excel_base64 = base64.b64encode(excel_raw_bytes).decode()
    add.apply_async((excel_base64,))
Output
[2021-08-19 20:40:28,904: INFO/MainProcess] Task tasks.add[d5373444-485d-4c50-8695-be2e68ef1c67] received
[2021-08-19 20:40:29,094: WARNING/ForkPoolWorker-4] Contents of excel file:
[2021-08-19 20:40:29,094: WARNING/ForkPoolWorker-4]
[2021-08-19 20:40:29,099: WARNING/ForkPoolWorker-4] Some Value
0 Here :)
[2021-08-19 20:40:29,099: WARNING/ForkPoolWorker-4]
[2021-08-19 20:40:29,099: INFO/ForkPoolWorker-4] Task tasks.add[d5373444-485d-4c50-8695-be2e68ef1c67] succeeded in 0.19386404199940444s: None
Solution 2:
This is the harder way. Implement a custom serializer that will handle excel files.
tasks.py
import ast
import base64
import pandas
from celery import Celery
from kombu.serialization import register
def my_custom_excel_encoder(obj):
    """Serialize the message body by returning its ``str()`` representation.

    Args:
        obj: The message body handed over by the serializer registry.

    Returns:
        The text form of ``obj``, which the matching decoder parses back.
    """
    # Uncomment this block if you intend to pass it as a Base64 string:
    # file_base64 = base64.b64encode(obj[0][0]).decode()
    # obj = list(obj)
    # obj[0] = [file_base64]
    return str(obj)
def my_custom_excel_decoder(obj):
    """Rebuild the message body from the text written by the encoder.

    ``ast.literal_eval`` only evaluates Python literals, so the payload is
    parsed without executing arbitrary code.
    """
    obj = ast.literal_eval(obj)
    # Uncomment this block if you passed it as a Base64 string
    # (as commented above in the encoder):
    # obj[0][0] = base64.b64decode(obj[0][0])
    return obj
# Register the encoder/decoder pair with kombu under the name
# 'my_custom_excel' and give it its own content type.
register(
'my_custom_excel',
my_custom_excel_encoder,
my_custom_excel_decoder,
content_type='application/x-my-custom-excel',
content_encoding='utf-8',
)
app = Celery('tasks')
# Accept messages written by either the JSON serializer or the custom one.
app.conf.update(
accept_content=['json', 'my_custom_excel'],
)
@app.task
def add(excel_file):
    """Print the contents of an Excel file received as raw bytes.

    Args:
        excel_file: Raw bytes of the Excel file, as sent by the caller
            through the 'my_custom_excel' serializer.
    """
    import io  # local import: io is not in this snippet's import header

    # Passing a raw byte string to read_excel is deprecated in recent pandas;
    # a file-like wrapper works on old and new versions alike.
    df = pandas.read_excel(io.BytesIO(excel_file))
    print("Contents of excel file:", df)
views.py
from tasks import add
with open("file.xlsx", 'rb') as excel_file:  # Change this to be your <request.data['file']>
    excel_raw_bytes = excel_file.read()
    # The serializer is chosen per call; the raw bytes go through the custom one.
    add.apply_async((excel_raw_bytes,), serializer='my_custom_excel')
Output
Same as Solution 1
Solution 3
You might also be interested in the documentation on sending raw data without serialization.

Related

Passing Audio Files To Celery Task

I have a music uploading app and believe that it would be smart to pass the files to a celery task to handle uploading. However, when attempting to pass the files, as I will show in my code below, I get a message stating that they are not JSON serializable. What would be the correct way to handle this operation?
Everything below uploaded_songs in .views.py is my current code that successfully uploads the audio tracks. It doesn't, however, utilize celery yet.
.task.py
from django.contrib.auth import get_user_model
from Beyond_April_Base_Backend.celery import app
from django.contrib.auth.models import User
@app.task
def upload_songs(songs, user_id):
    """Look up the user with primary key *user_id* and print the user and *songs*.

    A missing user is logged as a warning instead of being raised.
    """
    import logging  # not among the module-level imports shown with this task

    try:
        user = User.objects.get(pk=user_id)
        print('user and songs')
        print(user)
        print(songs)
    except User.DoesNotExist:
        logging.warning("Tried to find non-exisiting user '%s'", user_id)
.views.py
class ConcertUploadView(APIView):
    """Create a Concert from the posted form data and one Song per uploaded file."""
    permission_classes = [permissions.IsAuthenticated]

    def post(self, request):
        """Build the Concert, then save every file under ``files`` as a Song.

        Returns a 201 response once all rows are saved.
        """
        track_files = request.FILES.getlist('files')
        current_user = self.request.user
        # NOTE: the uploaded file objects are passed to the Celery task here;
        # this is the call that reports them as not JSON serializable.
        upload_songs.delay(track_files, current_user.pk)
        try:
            selected_band = Band.objects.get(name=request.data['band'])
        except ObjectDoesNotExist:
            print('band not received from form')
            selected_band = Band.objects.get(name='Band')
        venue_name = request.data['venue']
        concert_date_str = request.data['concertDate']
        # Keep only the text before the first "(".
        concert_date_split = concert_date_str.split('(')[0]
        concert_date = datetime.strptime(concert_date_split, '%a %b %d %Y %H:%M:%S %Z%z ')
        concert_city = request.data['city']
        concert_state = request.data['state']
        concert_country = request.data['country']
        new_concert = Concert(
            venue=venue_name,
            date=concert_date,
            city=concert_city,
            state=concert_state,
            country=concert_country,
            band=selected_band,
            user=current_user,
        )
        new_concert.save()
        # concert_order is 1-based and follows the upload order.
        for order, song in enumerate(track_files, start=1):
            audio_metadata = music_tag.load_file(song.temporary_file_path())
            song_title = str(audio_metadata['title'])
            audio_file_instance = Song(
                title=song_title,
                concert=new_concert,
                user=current_user,
                concert_order=order,
                audio_file=song,
            )
            audio_file_instance.save()
        return Response(status=status.HTTP_201_CREATED)
When you create a celery task, it serializes the arguments so that it can store the message in the queue backend (RabbitMQ, Redis, etc). The default serializer is JSON, and a binary file is not JSON-serializable. See celery's serialization docs for more info.
You could base64 encode the binary file to text, but you shouldn't: it will increase the size of the data, and you'll be passing around potentially very large messages. With lots of large messages, you could run out of memory/space in your backend, and it will make it hard to inspect or log messages.
Instead, you should store the binary file somewhere, and pass a reference (filename, S3 URL, database key, etc) to the task. The task can then load the file, do what it needs to, and delete the original (if appropriate).

how to load csv file data into pandas using request.FILES(django 1.11) without saving file on server

i just want to upload .csv file via form, directly in to pandas dataframe in django without saving physically file on to server.
def post(self, request, format=None):
    """Try to load the uploaded CSV under ``file`` into a pandas DataFrame.

    NOTE(review): the placement of the final ``return`` inside the ``except``
    branch is reconstructed from a snippet that lost its indentation; confirm.
    """
    try:
        from io import StringIO, BytesIO
        import io
        # NOTE: this read() consumes the upload, and the read() below is called
        # on the same file object without rewinding it.
        print("data===",request.FILES['file'].read().decode("utf-8"))
        # print("file upload FILES data=====",pd.read_csv(request.FILES['file'].read(), sep=','))
        #print(request.FILES)
        print("file upload data df=====11")
        # NOTE: BytesIO is given a decoded str here, not bytes.
        mm = pd.read_csv( BytesIO(request.FILES['file'].read().decode("utf-8")))
        print("dataframe data=====",mm)
        # import io, csv
        # urlData = request.FILES['file']
        # data = [row for row in (csv.reader(urlData))]
        # print("file upload data df=====222",data)
        # mm = pd.read_csv()
        #excel_file = request.FILES['file']
        # movies = pd.read_excel(request.FILES['file'])
    except Exception as e:
        print(e)
        log.debug("Error in CheckThreadStatus api key required "+str(e))
        return Response(responsejson('api key required', status=404))
The answer is straightforward:
pd.read_csv(request.FILES['file'])
works perfectly fine. The mistake I was making was that my CSV file was not in the correct format.
Check With
pd.read_csv('data.csv') # doctest: +SKIP
If using post method you can try
# Uploaded files live on request.FILES (plural); request.FILE does not exist.
getFile = request.FILES['file_name']
pd.read_csv(getFile) # doctest: +SKIP
You can use StringIO for reading and decoding your csv :
import csv
from io import StringIO
csv_file = request.FILES["csv_file"]
content = StringIO(csv_file.read().decode('utf-8'))
reader = csv.reader(content)
After reading you can populate your database like this :
csv_rows = list(reader)
field_names = csv_rows[0]  # Get the header row
del csv_rows[0]  # Deleting header after storing its values in field_names
for row in csv_rows:
    # Pair each header name with the value in the same column of this row.
    data_dict = dict(zip(field_names, row))
    # The first column is used as the primary key of the row to update/create.
    Model.objects.update_or_create(id=row[0], defaults=data_dict)
Make sure to validate data before inserting, if the data is critical.
HINT: use django forms to validate for you.
from django import forms

Start and Stop a periodically background Task with Django

I would like to build a bitcoin notification with Django. I have managed to get a working Telegram bot that sends the bitcoin stats when I ask it to. Now I would like it to send me a message if bitcoin reaches a specific value. There are some tutorials on running a Python script on a server, but not with Django. I read some answers and descriptions about django channels but couldn't adapt them to my project.
I would like to send, by telegram, a command about the amount and duration. Django would then start a process with these values and values of the channel I'm sending from in the background. If now, within the duration, the amount is reached, Django sends a message back to my channel. This should also be possible for more than one person.
Is this possible to do with Django out of the box, maybe with decorators, or do I need django-channels or something else?
Edit 2018-08-10:
Maybe my code explains a little bit better what I want to do.
import requests
import json
from datetime import datetime
from django.shortcuts import render
from django.http import HttpResponse
from django.conf import settings
from django.views.generic import TemplateView
from django.views.decorators.csrf import csrf_exempt
class AboutView(TemplateView):
    """Template view that renders ``telapi/about.html``."""
    template_name = 'telapi/about.html'
bot_token = settings.BOT_TOKEN
def get_url(method):
    """Return the Telegram Bot API URL for *method*, built with the module-level ``bot_token``."""
    return 'https://api.telegram.org/bot{}/{}'.format(bot_token, method)
def process_message(update):
    """Send the fixed reply "I can hear you!" to the sender of *update*."""
    data = {
        'chat_id': update['message']['from']['id'],
        'text': "I can hear you!",
    }
    requests.post(get_url('sendMessage'), data=data)
@csrf_exempt
def process_update(request, r_bot_token):
    ''' Method that is called from telegram-bot'''
    # Only act on POSTs whose URL token matches the configured bot token.
    if request.method == 'POST' and r_bot_token == bot_token:
        update = json.loads(request.body.decode('utf-8'))
        if 'message' in update:
            # .get() lets a message without a 'text' key fall through to the
            # generic reply instead of raising KeyError.
            if update['message'].get('text') == 'give me news':
                new_bitcoin_price(update)
            else:
                process_message(update)
    # Every request is answered with 200, whether or not it was processed.
    return HttpResponse(status=200)
bitconin_api_uri = 'https://api.coinmarketcap.com/v2/ticker/1/?convert=EUR'
# response = requests.get(bitconin_api_uri)
def get_latest_bitcoin_price():
    """Fetch the current bitcoin quote from ``bitconin_api_uri``.

    Returns:
        A ``(euro_price, date)`` tuple: the EUR price as a float and the
        response timestamp formatted as ``'%Y-%m-%d %H:%M:%S'``.
    """
    # requests waits indefinitely unless a timeout is given.
    response = requests.get(bitconin_api_uri, timeout=10)
    response_json = response.json()
    euro_price = float(response_json['data']['quotes']['EUR']['price'])
    timestamp = int(response_json['metadata']['timestamp'])
    date = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
    return euro_price, date
def new_bitcoin_price(update):
    """Send the latest bitcoin price in EUR to the sender of *update*."""
    chat_id = update['message']['from']['id']
    euro_price, date = get_latest_bitcoin_price()
    data = {
        'chat_id': chat_id,
        'text': "Aktuel ({}) beträgt der Preis {:.2f}€".format(date, euro_price),
    }
    requests.post(get_url('sendMessage'), data=data)
Edit 2018-08-13:
I think the solution would be celery-beat and channels. Does anyone know a good tutorial?
One of my teammates uses django-celery-beat, that is available at https://github.com/celery/django-celery-beat to do this and he gave me some excellent feedback from it. You can schedule the celery tasks using the crontab syntax.
I had same issue, there are several typical approaches: Celery, Django-Channels, etc.
But you can avoid them all with simple approach: https://docs.djangoproject.com/en/2.1/howto/custom-management-commands/
I have used django commands in my project to run periodically tasks to rebuild users statistics:
Implement your own application command. For example, if your application is named myapp and you have placed my_periodic_task.py in the myapp/management/commands folder, you can run your task once by typing python manage.py my_periodic_task
Place a new file beside manage.py, for example background.py, with the following code:
-
import os
import sys
from subprocess import call
from time import sleep

BASE = os.path.dirname(__file__)
MANAGE_BASE = os.path.join(BASE, 'manage.py')

# Run the management command forever, pausing YOUR_TIMEOUT seconds before each
# run. YOUR_TIMEOUT is a placeholder: define it before running this script.
while True:
    sleep(YOUR_TIMEOUT)
    # sys.executable reuses the interpreter (and virtualenv) running this script.
    call([sys.executable, MANAGE_BASE, 'my_periodic_task'])
Run your server for example: python background.py & python manage.py runserver 0.0.0.0:8000

Simply Import Task with Celery

I have a simple view which uploads CSV data to a mapped model and populates the data. This works perfect, but now I want to integrate Celery and I'm really struggling to get the following task to work. I'm trying Celery with Django and Amazon SQS.
This is the main part of my view.py which runs the task:
def upload(request):
    """On a valid POST, store the uploaded file and queue the CSV import task."""
    # If we had a POST then get the request post values.
    if request.method == 'POST':
        form = ContactUploadForm(request.POST, request.FILES)
        # Check we have valid data
        if form.is_valid():
            filename = handle_uploaded_file(request.FILES['file'])
            import_csv.delay(filename)
def handle_uploaded_file(f):
    """Write the uploaded file *f* to ``name.csv`` chunk by chunk.

    NOTE: nothing is returned, so a caller that assigns the result gets ``None``.
    """
    with open('name.csv', 'wb+') as destination:
        for chunk in f.chunks():
            destination.write(chunk)
This was my 1st attempt at the task.py
@task
def import_csv(filename):
    """First attempt: hand *filename* straight to the model's file importer."""
    ContactCSVModel.import_from_file(filename)
Which gives the error in the celery log: AttributeError: 'NoneType' object has no attribute 'seek'
My second attempt I think won't work because it actually trying to upload the file to SQS and gives SQSError: 413 Request Entity Too Large. I'm assuming this is not what I want to do at all, its a task and I don't want to upload the file to SQS.
2nd attempt at task.py
@task
def import_csv(filename):
    """Second attempt: open *filename* and pass the file object as ``data``."""
    ContactCSVModel.import_data(data = open(filename))
3rd attempted at task.py by passing in the request instead
@task
def import_csv(request):
    """Third attempt: receive the whole request and save the upload inside the task."""
    filename = handle_uploaded_file(request.FILES['file'])
    ContactCSVModel.import_data(data = open(filename))
This give the error **Can't pickle <type 'cStringIO.StringO'>: attribute lookup cStringIO.StringO failed**
How can I achieve this task? I'm sure it's something very simple :) As you can see I have tried a few different things above to create this task.
Following this example: http://codeinthehole.com/writing/use-models-for-uploads/
Create a new model to handle the file upload and use celery to run the import, this way the task is just the job id
@task
def process_upload(upload_id):
    """Load the Uploads row with id *upload_id* and run its ``process()`` method.

    Only the id travels through the queue; the task fetches the row itself.
    """
    upload = Uploads.objects.get(id=upload_id)
    upload.process()

how to unit test file upload in django

In my django app, I have a view which accomplishes file upload.The core snippet is like this
...
if request.method == 'POST':
    # dict.has_key() no longer exists on Python 3; `in` works on both.
    if 'file' in request.FILES:
        file = request.FILES['file']
        # Copy the upload into the destination folder under its original name.
        with open(settings.destfolder+'/%s' % file.name, 'wb+') as dest:
            for chunk in file.chunks():
                dest.write(chunk)
I would like to unit test the view.I am planning to test the happy path as well as the fail path..ie,the case where the request.FILES has no key 'file' , case where request.FILES['file'] has None..
How do I set up the post data for the happy path?Can somebody tell me?
I used to do the same with open('some_file.txt') as fp: but then I needed images, videos and other real files in the repo and also I was testing a part of a Django core component that is well tested, so currently this is what I have been doing:
from django.core.files.uploadedfile import SimpleUploadedFile
def test_upload_video(self):
    """Post an in-memory fake video to the view under the ``video`` field."""
    video = SimpleUploadedFile("file.mp4", "file_content", content_type="video/mp4")
    self.client.post(reverse('app:some_view'), {'video': video})
    # some important assertions ...
In Python 3.5+ you need to use bytes object instead of str. Change "file_content" to b"file_content"
It's been working fine, SimpleUploadedFile creates an InMemoryFile that behaves like a regular upload and you can pick the name, content and content type.
From Django docs on Client.post:
Submitting files is a special case. To POST a file, you need only
provide the file field name as a key, and a file handle to the file
you wish to upload as a value. For example:
c = Client()
# Binary mode, so the client reads the attachment as bytes.
with open('wishlist.doc', 'rb') as fp:
    c.post('/customers/wishes/', {'name': 'fred', 'attachment': fp})
I recommend you to take a look at Django RequestFactory. It's the best way to mock data provided in the request.
Said that, I found several flaws in your code.
"unit" testing means to test just one "unit" of functionality. So,
if you want to test that view you'd be testing the view, and the file
system, ergo, not really unit test. To make this point more clear. If
you run that test, and the view works fine, but you don't have
permissions to save that file, your test would fail because of that.
Other important thing is test speed. If you're doing something like
TDD the speed of execution of your tests is really important.
Accessing any I/O is not a good idea.
So, I recommend you to refactor your view to use a function like:
def upload_file_to_location(request, location=None): # Can use the default configured
And do some mocking on that. You can use Python Mock.
PS: You could also use Django Test Client But that would mean that you're adding another thing more to test, because that client make use of Sessions, middlewares, etc. Nothing similar to Unit Testing.
I do something like this for my own event related application but you should have more than enough code to get on with your own use case
import tempfile, csv, os
class UploadPaperTest(TestCase):
    """Upload test for the paper-import view, using a generated CSV file."""

    def generate_file(self):
        """Write a small CSV to ``test.csv`` and return the (closed) file object.

        Callers only use the returned object's ``name``.
        """
        # On Python 3 the csv module needs a text-mode file opened with
        # newline=''; the with-block closes the handle even if a write fails.
        with open('test.csv', 'w', newline='') as myfile:
            wr = csv.writer(myfile)
            wr.writerow(('Paper ID','Paper Title', 'Authors'))
            wr.writerow(('1','Title1', 'Author1'))
            wr.writerow(('2','Title2', 'Author2'))
            wr.writerow(('3','Title3', 'Author3'))
        return myfile

    def setUp(self):
        self.user = create_fuser()
        self.profile = ProfileFactory(user=self.user)
        self.event = EventFactory()
        self.client = Client()
        self.module = ModuleFactory()
        self.event_module = EventModule.objects.get_or_create(event=self.event,
                                                              module=self.module)[0]
        add_to_admin(self.event, self.user)

    def test_paper_upload(self):
        """Reject an invalid payload, then accept the generated CSV upload."""
        response = self.client.login(username=self.user.email, password='foz')
        self.assertTrue(response)
        myfile = self.generate_file()
        file_path = myfile.name
        url = reverse('registration_upload_papers', args=[self.event.slug])
        # post wrong data type
        # NOTE(review): `i` is not defined anywhere in this snippet; supply the
        # invalid payload that was intended here.
        post_data = {'uploaded_file': i}
        response = self.client.post(url, post_data)
        self.assertContains(response, 'File type is not supported.')
        # Keep the handle open only for the upload so it is closed before the
        # file is removed below.
        with open(file_path, "r") as f:
            post_data['uploaded_file'] = f
            response = self.client.post(url, post_data)
        import_file = SubmissionImportFile.objects.all()[0]
        self.assertEqual(SubmissionImportFile.objects.all().count(), 1)
        #self.assertEqual(import_file.uploaded_file.name, 'files/registration/{0}'.format(file_path))
        os.remove(myfile.name)
        file_path = import_file.uploaded_file.path
        os.remove(file_path)
I did something like that :
from django.core.files.uploadedfile import SimpleUploadedFile
from django.test import TestCase
from django.core.urlresolvers import reverse
from django.core.files import File
from django.utils.six import BytesIO
from .forms import UploadImageForm
from PIL import Image
from io import StringIO
def create_image(storage, filename, size=(100, 100), image_mode='RGB', image_format='PNG'):
    """
    Generate a test image, returning the filename that it was saved as.

    If ``storage`` is ``None``, the BytesIO containing the image data
    will be passed instead.
    """
    data = BytesIO()
    Image.new(image_mode, size).save(data, image_format)
    # Rewind so the image can be read from the start.
    data.seek(0)
    if not storage:
        return data
    # ContentFile is not among the imports listed with this snippet, so it is
    # imported here, where it is needed.
    from django.core.files.base import ContentFile
    image_file = ContentFile(data.read())
    return storage.save(filename, image_file)
class UploadImageTests(TestCase):
    """Form-upload test that posts a generated in-memory image."""

    def setUp(self):
        super(UploadImageTests, self).setUp()

    def test_valid_form(self):
        '''
        valid post data should redirect
        The expected behavior is to show the image
        '''
        url = reverse('image')
        avatar = create_image(None, 'avatar.png')
        avatar_file = SimpleUploadedFile('front.png', avatar.getvalue())
        data = {'image': avatar_file}
        response = self.client.post(url, data, follow=True)
        image_src = response.context.get('image_src')
        self.assertEqual(response.status_code, 200)
        self.assertTrue(image_src)
        # Pass the response explicitly: called with only a template name,
        # assertTemplateUsed returns a context manager and checks nothing.
        self.assertTemplateUsed(response, 'content_upload/result_image.html')
create_image function will create image so you don't need to give static path of image.
Note : You can update code as per you code.
This code for Python 3.6.
from rest_framework.test import force_authenticate
from rest_framework.test import APIRequestFactory
factory = APIRequestFactory()
user = User.objects.get(username='#####')
view = <your_view_name>.as_view()
with open('<file_name>.pdf', 'rb') as fp:
    request=factory.post('<url_path>',{'file_name':fp})
    # Attach the user to the request directly, then call the view with it.
    force_authenticate(request, user)
    response = view(request)
As mentioned in Django's official documentation:
Submitting files is a special case. To POST a file, you need only provide the file field name as a key, and a file handle to the file you wish to upload as a value. For example:
c = Client()
# Binary mode, so the client reads the attachment as bytes.
with open('wishlist.doc', 'rb') as fp:
    c.post('/customers/wishes/', {'name': 'fred', 'attachment': fp})
More Information: How to check if the file is passed as an argument to some function?
While testing, sometimes we want to make sure that the file is passed as an argument to some function.
e.g.
...
class AnyView(CreateView):
    ...

    def post(self, request, *args, **kwargs):
        attachment = request.FILES['attachment']
        # pass the file as an argument
        my_function(attachment)
        ...
In tests, use Python's mock something like this:
# Mock 'my_function' and then check the following:
response = do_a_post_request()
self.assertEqual(mock_my_function.call_count, 1)
self.assertEqual(
mock_my_function.call_args,
call(response.wsgi_request.FILES['attachment']),
)
if you want to add other data with file upload then follow the below method
# The with-block closes the file once the request has been sent.
with open('path/to/file.txt', 'r', encoding='utf-8') as file:
    data = {
        'file_name_to_receive_on_backend': file,
        'param1': 1,
        'param2': 2,
        # ... any further parameters
    }
    response = self.client.post("/url/to/view", data, format='multipart')
Only file_name_to_receive_on_backend will be received as a file; the other params are received as normal POST parameters.
In Django 1.7 there's an issue with the TestCase which can be resolved by using open(filepath, 'rb'), but when using the test client we have no control over it. I think it's probably best to ensure file.read() always returns bytes.
source: https://code.djangoproject.com/ticket/23912, by KevinEtienne
Without rb option, a TypeError is raised:
TypeError: sequence item 4: expected bytes, bytearray, or an object with the buffer interface, str found
from django.core.files.uploadedfile import SimpleUploadedFile
from django.test import Client
from requests import Response

client = Client()
# Read in binary mode so SimpleUploadedFile receives bytes.
with open(template_path, 'rb') as f:
    file = SimpleUploadedFile('Name of the django file', f.read())
# NOTE(review): the Django test client returns a Django response object, not a
# requests.Response; the annotation below looks incorrect. Confirm and adjust.
response: Response = client.post(url, format='multipart', data={'file': file})
Hope this helps.
Very handy solution with mock
from django.test import TestCase, override_settings
#use your own client request factory
from my_framework.test import APIClient
from django.core.files import File
import tempfile
from pathlib import Path
import mock
image_mock = mock.MagicMock(spec=File)
image_mock.name = 'image.png' # or smt else
class MyTest(TestCase):
    """Posts a mocked file object to the endpoint as multipart data."""

    # I assume we want to put this file in storage
    # so to avoid putting garbage in our MEDIA_ROOT
    # we're using temporary storage for test purposes
    @override_settings(MEDIA_ROOT=Path(tempfile.gettempdir()))
    def test_send_file(self):
        client = APIClient()
        client.post(
            '/endpoint/',
            {'file':image_mock},
            format="multipart"
        )
I am using Python==3.8.2 , Django==3.0.4, djangorestframework==3.11.0
I tried self.client.post but got a Resolver404 exception.
Following worked for me:
import requests
upload_url='www.some.com/oaisjdoasjd' # your url to upload
with open('/home/xyz/video1.webm', 'rb') as video_file:
    # if it was a text file we would perhaps do
    # file = video_file.read()
    # The open file object is passed as the request body.
    response_upload = requests.put(
        upload_url,
        data=video_file,
        headers={'content-type': 'video/webm'}
    )
I am using django rest framework and I had to test the upload of multiple files.
I finally get it by using format="multipart" in my APIClient.post request.
from rest_framework.test import APIClient
...
self.client = APIClient()
with open('./photo.jpg', 'rb') as fp:
    # 'images' maps to a list, so further open files can be added to it.
    resp = self.client.post('/upload/',
                            {'images': [fp]},
                            format="multipart")
I am using GraphQL, upload for test:
with open('test.jpg', 'rb') as fp:
    response = self.client.execute(query, variables, data={'image': [fp]})
code in class mutation
@classmethod
def mutate(cls, root, info, **kwargs):
    """Copy an uploaded ``image`` (if any) into *kwargs* and get-or-create the row.

    The row is looked up by ``kwargs["id"]``; the full kwargs serve as defaults
    when it has to be created.
    """
    # Only set the key when the request actually contains an "image" file.
    if image := info.context.FILES.get("image", None):
        kwargs["image"] = image
    TestingMainModel.objects.get_or_create(
        id=kwargs["id"],
        defaults=kwargs
    )