Django: upload, read, and write back a large Excel file

I am new to Django and I need my app to allow users to upload Excel files. On the server side I read the Excel file cell by cell, append some values, translate the values, write them back to an Excel file and return it as a download. I am able to do this for small files, but for large files I get a timeout error. Please see the code below.
def translatedoc(request):
    data = ""
    convrowstr = ""
    if request.method == 'POST':
        response = StreamingHttpResponse(content_type='application/vnd.ms-excel')
        try:
            form = fileUpload(request.POST, request.FILES)
            if form.is_valid():
                input_file = request.FILES.get('file')
                sl = request.POST.get('fsl')
                if sl == "Detect Language":
                    sl = "auto"
                else:
                    # get sl code from database
                    sl = languagecode.objects.filter(Language=sl).values_list('code')
                    sl = str(sl[0][0])
                # get tl code from database
                tl = languagecode.objects.filter(Language=request.POST.get('ftl')).values_list('code')
                wb = xlrd.open_workbook(file_contents=input_file.read())
                wb_sheet = wb.sheet_by_index(0)
                for rownum in range(0, wb_sheet.nrows):
                    convstr = ""
                    for colnum in range(0, wb_sheet.ncols):
                        try:
                            rw = wb_sheet.cell_value(rownum, colnum)
                            if type(rw) == float or type(rw) == int:
                                convstr = convstr + '<td>' + str(rw)
                            else:
                                convstr = convstr + '<td>' + rw
                        except Exception as e:
                            pass
                    if len(convstr) + len(convrowstr) > 20000:
                        # translate if the length of the document exceeds the limit
                        # call google api module
                        data = data + translate(convrowstr, sl, str(tl[0][0]))
                        convrowstr = ""
                    if rownum == wb_sheet.nrows - 1:
                        convrowstr = convrowstr + "<tr>" + convstr
                        # translate for first or last
                        # call google api module
                        data = data + translate(convrowstr, sl, str(tl[0][0]))
                        convrowstr = ""
                    convrowstr = convrowstr + "<tr>" + convstr
                    log.error(rownum)
                if len(data) > 1:
                    sio = StringIO.StringIO()
                    try:
                        workbook = xlwt.Workbook()
                        sheet = workbook.add_sheet("output")
                        row = 0
                        for rw in data.split("<tr>")[1:]:
                            col = 0
                            for cl in rw.split("<td>")[1:]:
                                try:
                                    sheet.write(row, col, cl.split("<b>")[1].split("</b>")[0])
                                except Exception as e:
                                    pass
                                col += 1
                            row += 1
                        workbook.save(sio)
                        sio.seek(0)
                        sv = sio.getvalue()
                        response['Content-Disposition'] = 'attachment; filename=Output.xls'
                        return response
                    except Exception as e:
                        log.error(e)
        except Exception as e:
            log.error(e)

You can do this through Celery for large file uploads: read and process the file inside a Celery task, so the work happens outside the request/response cycle and the request itself does not time out.
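A minimal sketch of that idea, assuming Celery is already configured for the project (the task name, form handling and result handling here are placeholders, not the code from the question):

import xlrd
from celery import shared_task
from django.core.files.storage import FileSystemStorage

@shared_task
def translate_workbook(path, sl, tl):
    # the heavy xlrd/translate/xlwt work runs in the worker, outside the request cycle
    wb = xlrd.open_workbook(path)
    sheet = wb.sheet_by_index(0)
    # ... read cells, call the translate API, write the output workbook to disk ...
    return path

def translatedoc(request):
    if request.method == 'POST':
        fs = FileSystemStorage()
        name = fs.save(request.FILES['file'].name, request.FILES['file'])
        # queue the job and return immediately; the user downloads the result
        # later (e.g. from a status/result view) instead of waiting on this request
        translate_workbook.delay(fs.path(name), request.POST.get('fsl'), request.POST.get('ftl'))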

Related

Save a file from requests using django filesystem

I'm currently trying to save a file via requests; it's rather large, so I'm streaming it instead.
I'm unsure how exactly to do this, as I keep getting different errors. This is what I have so far:
def download_file(url, matte_upload_path, matte_servers, job_name, count):
    local_filename = url.split('/')[-1]
    url = "%s/static/downloads/%s_matte/%s/%s" % (matte_servers[0], job_name, count, local_filename)
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        fs = FileSystemStorage(location=matte_upload_path)
        print(matte_upload_path, 'matte path upload')
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
            fs.save(local_filename, f)
    return local_filename
but it returns
io.UnsupportedOperation: read
I'm basically trying to have requests save the file to a specific location via Django; any help would be appreciated.
I was able to solve this by writing the streamed requests response to a tempfile and then saving that via the FileSystemStorage:
local_filename = url.split('/')[-1]
url = "%s/static/downloads/%s_matte/%s/%s" % (matte_servers[0], job_name, count, local_filename)
response = requests.get(url, stream=True)
fs = FileSystemStorage(location=matte_upload_path)
lf = tempfile.NamedTemporaryFile()

# Read the streamed file in sections
for block in response.iter_content(1024 * 8):
    # If there is no more data, stop
    if not block:
        break
    # Write the block to the temporary file
    lf.write(block)

fs.save(local_filename, lf)
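A slightly more explicit variant of the same idea (a sketch reusing the variable names above): rewind the temp file and wrap it in Django's File object before handing it to the storage backend, so the full contents are copied.

import tempfile
import requests
from django.core.files import File
from django.core.files.storage import FileSystemStorage

response = requests.get(url, stream=True)
fs = FileSystemStorage(location=matte_upload_path)

with tempfile.NamedTemporaryFile() as lf:
    for block in response.iter_content(1024 * 8):
        lf.write(block)
    lf.flush()
    lf.seek(0)  # rewind so fs.save copies the whole file rather than an empty tail
    fs.save(local_filename, File(lf, name=local_filename))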

download log - modify and use last line

I'm trying to shorten or simplify my code.
I want to download a log file from an internal server which is updated every 10 seconds, but I'm only running my script every 10 or 15 minutes.
The log file is semicolon separated and has many rows in it I don't use. My workflow is as follows:
get the current date in YYYYMMDD format
download the file
wait for the file to finish downloading
trim the file down to the rows I need
process only the last line of the file
delete the files
I'm new to Python; if you could help me shorten/simplify my code into fewer steps, I would be thankful.
import urllib
import time
import csv
from datetime import date

today = str(date.today())
url = "http://localserver" + today + ".log"

urllib.urlretrieve(url, "output.log")
time.sleep(15)

with open("output.log", "rb") as source:
    rdr = csv.reader(source, delimiter=';')
    with open("result.log", "wb") as result:
        wtr = csv.writer(result)
        for r in rdr:
            wtr.writerow((r[0], r[1], r[2], r[3], r[4], r[5], r[15], r[38], r[39], r[42], r[54], r[90], r[91], r[92], r[111], r[116], r[121], r[122], r[123], r[124]))

with open('result.log') as myfile:
    print(list(myfile)[-1])  # how do I access certain rows here?
You could make use of the requests module, as below. The timeout can be increased depending on how long the download takes to complete. Furthermore, your two with open statements can be consolidated into one. What is more, to avoid loading the whole file into memory at once, we can make use of the iter_lines generator. Note that stream=True must be set so the content is streamed rather than downloaded up front.
from datetime import date
import csv
import requests

# Declare variables
today = str(date.today())
url = "http://localserver" + today + ".log"
outfile = 'output.log'

# Instead of waiting for 15 seconds explicitly, use the requests module
# with a timeout parameter
response = requests.get(url, timeout=15, stream=True)
if response.status_code != 200:
    print('Failed to get data:', response.status_code)

with open(outfile, 'w') as dest:
    writer = csv.writer(dest)
    # Take only the last line of the response, without saving the file to disk first
    line = list(response.iter_lines())[-1]
    # Decode the response to a string and split it line by line
    reader = csv.reader(line.decode('utf-8').splitlines(), delimiter=';')
    # Read the split content and write the wanted columns to the file
    for r in reader:
        writer.writerow((r[0], r[1], r[2], r[3], r[4], r[5], r[15], r[38], r[39], r[42], r[54], r[90], r[91], r[92],
                         r[111], r[116], r[121], r[122], r[123], r[124]))
print('File written successfully: ' + outfile)
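If memory is a concern, a small variation on the above (a sketch reusing the same url, outfile and column positions) walks response.iter_lines() and keeps only the most recent line, so the whole log is never held in a list:

import csv
import requests

response = requests.get(url, timeout=15, stream=True)
response.raise_for_status()

last_line = None
for raw in response.iter_lines():
    if raw:                 # skip keep-alive/blank lines
        last_line = raw

if last_line is not None:
    # parse just the final record and keep the columns of interest
    fields = next(csv.reader([last_line.decode('utf-8')], delimiter=';'))
    wanted = (0, 1, 2, 3, 4, 5, 15, 38, 39, 42, 54, 90, 91, 92,
              111, 116, 121, 122, 123, 124)
    with open(outfile, 'w') as dest:
        csv.writer(dest).writerow([fields[i] for i in wanted])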

Is there any faster way to download multiple files from S3 to a local folder?

I am trying to download 12,000 files from an S3 bucket using a Jupyter notebook, and the download is estimated to take 21 hours. This is because each file is downloaded one at a time. Can we run multiple downloads in parallel so I can speed up the process?
Currently, I am using the following code to download all the files:
### Get unique full-resolution image basenames
images = df['full_resolution_image_basename'].unique()
print(f'No. of unique full-resolution images: {len(images)}')

### Create a folder for full-resolution images
images_dir = './images/'
os.makedirs(images_dir, exist_ok=True)

### Download images
images_str = "','".join(images)
limiting_clause = f"CONTAINS(ARRAY['{images_str}'], full_resolution_image_basename)"
_ = download_full_resolution_images(images_dir, limiting_clause=limiting_clause)
See the code below. This will only work with Python 3.6+, because of the f-strings (PEP 498); use a different method of string formatting for older versions of Python.
Provide the relative_path, bucket_name and s3_object_keys. In addition, max_workers is optional; if it is not provided, the number defaults to 5 times the number of processors on the machine.
Most of the code for this answer came from an answer to How to create an async generator in Python?,
which in turn is based on this example documented in the library.
import boto3
import os
from concurrent import futures

relative_path = './images'
bucket_name = 'bucket_name'
s3_object_keys = []  # List of S3 object keys
max_workers = 5

abs_path = os.path.abspath(relative_path)
s3 = boto3.client('s3')

def fetch(key):
    file = f'{abs_path}/{key}'
    os.makedirs(file, exist_ok=True)
    with open(file, 'wb') as data:
        s3.download_fileobj(bucket_name, key, data)
    return file

def fetch_all(keys):
    with futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_to_key = {executor.submit(fetch, key): key for key in keys}
        print("All URLs submitted.")
        for future in futures.as_completed(future_to_key):
            key = future_to_key[future]
            exception = future.exception()
            if not exception:
                yield key, future.result()
            else:
                yield key, exception

for key, result in fetch_all(s3_object_keys):
    print(f'key: {key} result: {result}')
Thank you for this. I had over 9,000 JPEG images that I needed to download from my S3 bucket. I tried to incorporate this directly into my Colab Pro notebook but wasn't able to get it to work; I kept getting an "Errno 21: Is a directory" error.
I had to change two things: 1) add a makedirs call to create the directory I want, and 2) use mknod instead of makedirs to create the file itself.
fetch_all is almost the same, except for a small edit so that max_workers actually takes effect. s3c is just my boto3.client with my keys and all.
My download time went from 30+ minutes to 5 minutes with 1,000 workers.
os.makedirs('/*some dir you want*/*prefix*')

def fetch(key):
    file = f'{abs_path}/{key}'
    os.mknod(file, mode=384)
    with open(file, 'wb') as data:
        s3c.download_fileobj(bucket_name, key, data)
    return file

def fetch_all(keys):
    with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_key = {executor.submit(fetch, key): key for key in keys}
        print("All URLs submitted.")
        for future in futures.as_completed(future_to_key):
            key = future_to_key[future]
            exception = future.exception()
            if not exception:
                yield key, future.result()
            else:
                yield key, exception
You can try this out; it is fast:
import os
from datetime import datetime
from multiprocessing import Pool

import boto3

bucket_name = 'BUCKET_NAME'
prefix = 'PREFIX'
local_dir = './downloads/'  # PUT YOUR LOCAL DIR
max_process = 20            # CAN BE CHANGED
debug_en = True

# pass your credentials and region name
s3_client = boto3.client('s3', aws_access_key_id=' ',
                         aws_secret_access_key=' ', region_name=' ')

def downfiles(bucket_name, src_obj, dest_path):
    try:
        s3_client.download_file(bucket_name, src_obj, dest_path)
        if debug_en:
            print("[debug] downloading object: %s to %s" % (src_obj, dest_path))
    except:
        pass

def download_dir(bucket_name, sub_prefix):
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=sub_prefix)
    pool = Pool(max_process)
    print(pool)
    mp_data = []
    for page in pages:
        if 'Contents' in page:
            for obj in page['Contents']:
                src_obj = obj['Key']
                dest_path = local_dir + src_obj
                mp_data.append((bucket_name, src_obj, dest_path))
                os.path.dirname(dest_path) and os.makedirs(os.path.dirname(dest_path), exist_ok=True)
    pool.starmap(downfiles, mp_data)
    return len(mp_data)

if __name__ == '__main__':
    print("starting script...")
    start_time = datetime.now()
    s3_dirs = [prefix]  # list of prefixes to download
    total_files = 0
    for s3_dir in s3_dirs:
        print("[Information] %s directory is downloading" % s3_dir)
        no_files = download_dir(bucket_name, s3_dir)
        total_files = total_files + no_files
    end_time = datetime.now()
    print('Duration: {}'.format(end_time - start_time))
    print('Total File numbers: %d' % total_files)
    print("ended")

multiple openpyxl xlsx workbooks into one .zip file for download

I am trying to get some xlsx files from a form. I load them using openpyxl and do some data processing, and finally I need to send all of the processed xlsx files back to the user as a zip.
Here is an example of what I have done so far:
if form.is_valid():
    s = StringIO.StringIO()
    zf = zipfile.ZipFile(s, mode="w")
    for xlsx in request.FILES.getlist('xlsxs'):
        element_column = "G"
        element_row = 16
        massar_column = "C"
        massar_row_start = 18
        loop = column_index_from_string(element_column)
        while loop <= ws.max_column:
            for i in range(massar_row_start, ws.max_row + 1):
                # ...
                ws["%s%s" % (element_column, i)] = 0
                # ...
            loop += 2
            element_column = get_column_letter(loop)
        buf = save_virtual_workbook(wb)
        zf.write(buf)  # or zf.write(wb)
    zf.close()
    response = HttpResponse(s.getvalue(), content_type="application/x-zip-compressed")
    response['Content-Disposition'] = "attachment; filename=notes.zip"
    return response
I get the error
TypeError at My_view
stat() argument 1 must be encoded string without null bytes, not str
Thanks in advance for any help you can offer.
save_virtual_workbook returns a bytestream - source.
You are passing this value to ZipFile.write which is expecting a filename.
I think you should be using ZipFile.writestr, and you need to provide a filename that will be used inside the archive. I'm not sure how you are getting the error message you see, but this is the first mistake I can see.
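A minimal sketch of that change (assuming the workbooks are opened with openpyxl's load_workbook; the member names inside the archive are made up here):

s = StringIO.StringIO()
zf = zipfile.ZipFile(s, mode="w")
for index, xlsx in enumerate(request.FILES.getlist('xlsxs')):
    wb = load_workbook(xlsx)
    # ... same processing as in the question ...
    buf = save_virtual_workbook(wb)                 # bytes of the finished workbook
    zf.writestr('processed_%d.xlsx' % index, buf)   # name inside the archive + the data
zf.close()
response = HttpResponse(s.getvalue(), content_type="application/x-zip-compressed")
response['Content-Disposition'] = "attachment; filename=notes.zip"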

Uploading video to YouTube and adding it to playlist using YouTube Data API v3 in Python

I wrote a script to upload a video to YouTube using the YouTube Data API v3 in Python, with the help of the example code.
I also wrote another script to add an uploaded video to a playlist using the same YouTube Data API v3, which can be seen here.
After that I wrote a single script that both uploads the video and adds it to a playlist. In it I took care of authentication and scopes, but I am still getting a permission error. Here is my new script:
#!/usr/bin/python

import httplib
import httplib2
import os
import random
import sys
import time

from apiclient.discovery import build
from apiclient.errors import HttpError
from apiclient.http import MediaFileUpload
from oauth2client.file import Storage
from oauth2client.client import flow_from_clientsecrets
from oauth2client.tools import run

# Explicitly tell the underlying HTTP transport library not to retry, since
# we are handling retry logic ourselves.
httplib2.RETRIES = 1

# Maximum number of times to retry before giving up.
MAX_RETRIES = 10

# Always retry when these exceptions are raised.
RETRIABLE_EXCEPTIONS = (httplib2.HttpLib2Error, IOError, httplib.NotConnected,
                        httplib.IncompleteRead, httplib.ImproperConnectionState,
                        httplib.CannotSendRequest, httplib.CannotSendHeader,
                        httplib.ResponseNotReady, httplib.BadStatusLine)

# Always retry when an apiclient.errors.HttpError with one of these status
# codes is raised.
RETRIABLE_STATUS_CODES = [500, 502, 503, 504]

CLIENT_SECRETS_FILE = "client_secrets.json"

# A limited OAuth 2 access scope that allows for uploading files, but not other
# types of account access.
YOUTUBE_UPLOAD_SCOPE = "https://www.googleapis.com/auth/youtube.upload"
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

# Helpful message to display if the CLIENT_SECRETS_FILE is missing.
MISSING_CLIENT_SECRETS_MESSAGE = """
WARNING: Please configure OAuth 2.0

To make this sample run you will need to populate the client_secrets.json file
found at:

%s

with information from the APIs Console
https://code.google.com/apis/console#access

For more information about the client_secrets.json file format, please visit:
https://developers.google.com/api-client-library/python/guide/aaa_client_secrets
""" % os.path.abspath(os.path.join(os.path.dirname(__file__),
                                   CLIENT_SECRETS_FILE))


def get_authenticated_service():
    flow = flow_from_clientsecrets(CLIENT_SECRETS_FILE, scope=YOUTUBE_UPLOAD_SCOPE,
                                   message=MISSING_CLIENT_SECRETS_MESSAGE)
    storage = Storage("%s-oauth2.json" % sys.argv[0])
    credentials = storage.get()
    if credentials is None or credentials.invalid:
        credentials = run(flow, storage)
    return build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                 http=credentials.authorize(httplib2.Http()))


def initialize_upload(title, description, keywords, privacyStatus, file):
    youtube = get_authenticated_service()
    tags = None
    if keywords:
        tags = keywords.split(",")
    insert_request = youtube.videos().insert(
        part="snippet,status",
        body=dict(
            snippet=dict(
                title=title,
                description=description,
                tags=tags,
                categoryId='26'
            ),
            status=dict(
                privacyStatus=privacyStatus
            )
        ),
        # chunksize=-1 means that the entire file will be uploaded in a single
        # HTTP request. (If the upload fails, it will still be retried where it
        # left off.) This is usually a best practice, but if you're using Python
        # older than 2.6 or if you're running on App Engine, you should set the
        # chunksize to something like 1024 * 1024 (1 megabyte).
        media_body=MediaFileUpload(file, chunksize=-1, resumable=True)
    )
    vid = resumable_upload(insert_request)

    # Here I added lines to add the video to a playlist
    # add_video_to_playlist(youtube, vid, "PL2JW1S4IMwYubm06iDKfDsmWVB-J8funQ")
    # youtube = get_authenticated_service()
    add_video_request = youtube.playlistItems().insert(
        part="snippet",
        body={
            'snippet': {
                'playlistId': "PL2JW1S4IMwYubm06iDKfDsmWVB-J8funQ",
                'resourceId': {
                    'kind': 'youtube#video',
                    'videoId': vid
                }
                # 'position': 0
            }
        }
    ).execute()
    # return the new video id so the caller can use it
    return vid


def resumable_upload(insert_request):
    response = None
    error = None
    retry = 0
    vid = None
    while response is None:
        try:
            print "Uploading file..."
            status, response = insert_request.next_chunk()
            if 'id' in response:
                print "'%s' (video id: %s) was successfully uploaded." % (
                    title, response['id'])
                vid = response['id']
            else:
                exit("The upload failed with an unexpected response: %s" % response)
        except HttpError, e:
            if e.resp.status in RETRIABLE_STATUS_CODES:
                error = "A retriable HTTP error %d occurred:\n%s" % (e.resp.status,
                                                                     e.content)
            else:
                raise
        except RETRIABLE_EXCEPTIONS, e:
            error = "A retriable error occurred: %s" % e
        if error is not None:
            print error
            retry += 1
            if retry > MAX_RETRIES:
                exit("No longer attempting to retry.")
            max_sleep = 2 ** retry
            sleep_seconds = random.random() * max_sleep
            print "Sleeping %f seconds and then retrying..." % sleep_seconds
            time.sleep(sleep_seconds)
    return vid


if __name__ == '__main__':
    title = "sample title"
    description = "sample description"
    keywords = "keyword1,keyword2,keyword3"
    privacyStatus = "public"
    file = "myfile.mp4"
    vid = initialize_upload(title, description, keywords, privacyStatus, file)
    print 'video ID is :', vid
I am not able to figure out what is wrong. I am getting a permission error, yet both scripts work fine independently.
Could anyone help me figure out where I am wrong, or how to upload a video and add it to a playlist in one script?
I found the answer: the two independent scripts actually use different scopes.
The scope for uploading is "https://www.googleapis.com/auth/youtube.upload".
The scope for adding to a playlist is "https://www.googleapis.com/auth/youtube".
Since the scopes are different, I had to handle authentication for each separately.
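For reference, if you would rather keep a single set of credentials, flow_from_clientsecrets also accepts a list of scopes, so one flow can request both at once. This is a sketch of that alternative (reusing the constants from the script above), not the separate-authentication approach described here; previously stored upload-only credentials would need to be re-granted:

YOUTUBE_SCOPES = [
    "https://www.googleapis.com/auth/youtube.upload",  # upload videos
    "https://www.googleapis.com/auth/youtube",         # manage playlists
]

def get_authenticated_service():
    flow = flow_from_clientsecrets(CLIENT_SECRETS_FILE, scope=YOUTUBE_SCOPES,
                                   message=MISSING_CLIENT_SECRETS_MESSAGE)
    # use a fresh storage file (or delete the old one) so the broader scopes
    # are actually granted instead of reusing the upload-only token
    storage = Storage("%s-oauth2.json" % sys.argv[0])
    credentials = storage.get()
    if credentials is None or credentials.invalid:
        credentials = run(flow, storage)
    return build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                 http=credentials.authorize(httplib2.Http()))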