The requirement states that the Lambda function must check the zip file for any excluded file extensions, which are defined in the function.
I have outlined the steps needed for a successful run:
1. Validate the zip file and make sure it doesn't contain any of the excluded extensions. This step runs and the validation is performed.
2. Unzip the file into an 'unzipped' folder in the same directory.
All of the above steps happen, but I am getting an AttributeError in my code, which is outlined below. Any ideas/solutions are greatly appreciated.
import json
import zipfile
import os
import boto3
from urllib.parse import unquote_plus
import io
import re
import gzip

exclude_list = [".exe", ".scr", ".vbs", ".js", ".xml", "docm", ".xps"]

sns = boto3.client('sns')

def read_nested_zip(tf, bucket, key, s3_client):
    print(key)
    print("search for.zip:", re.search(r'\.zip', key, re.IGNORECASE))
    ## need to add exception handling
    ##if re.search(r'\.gzip$', key, re.IGNORECASE):
    ##    print('gzip file found')
    ##    fil = gzip.GzipFile(tf, mode='rb')
    if re.search(r'\.zip$', key, re.IGNORECASE):
        print('zip file found')
        fil = zipfile.ZipFile(tf, "r").namelist()
    else:
        fil = ()
        print('no file found')
    print(fil)
    ##with fil as zipf:
    ## try to narrow scope - run loop else exit
    for file in fil:
        print(file)
        if re.search(r'(\.zip|)$', file, re.IGNORECASE):
            childzip = io.BytesIO(fil.read(file))
            read_nested_zip(childzip, bucket, key, s3_client)
        else:
            if any(x in file.lower() for x in exclude_list):
                print("Binary, dont load")
                print(file)
                print(bucket)
                print(key)
                env = bucket.split('-')[2].upper()
                # Copy the parent zip to a separate folder and remove it from the path
                copy_source = {'Bucket': bucket, 'Key': key}
                s3_client.copy_object(Bucket=bucket, CopySource=copy_source, Key='do_not_load_' + key)
                s3_client.delete_object(Bucket=bucket, Key=key)
                sns.publish(
                    TopicArn='ARN',
                    Subject=env + ': S3 upload warning: Non standard File encountered ',
                    Message='Non standard File encountered' + key + ' uploaded to bucket ' + bucket + ' The file has been moved to ' + 'do_not_load_' + key
                )
            else:
                print("File in supported formats, can be loaded " + file)
                #folder = re.sub(r"\/[^/]+$", "", key)
                folder = "/".join(key.split("/", 2)[:2]) + "/unzipped"
                print(folder)
                print("Bucket is " + bucket)
                print("file to copy is " + file)
                buffer = io.BytesIO(fil.read(file))
                s3_resource = boto3.resource('s3')
                s3_resource.meta.client.upload_fileobj(buffer, Bucket=bucket, Key=folder + '/' + file)
                s3_resource.Object(bucket, folder + '/' + file).wait_until_exists()

def lambda_handler(event, context):
    print(event)
    for record in event['Records']:
        s3_client = boto3.client('s3')
        key = unquote_plus(record['s3']['object']['key'])
        print(key)
        print(type(key))
        size = record['s3']['object']['size']
        bucket = record['s3']['bucket']['name']
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        print(obj)
        putObjects = []
        with io.BytesIO(obj["Body"].read()) as tf:
            # rewind the file
            #tf.seek(0)
            read_nested_zip(tf, bucket, key, s3_client)
Error code"[ERROR] AttributeError: 'list' object has no attribute 'read'
Traceback (most recent call last):
File "/var/task/lambda_function.py", line 85, in lambda_handler
read_nested_zip(tf, bucket, key, s3_client)
File "/var/task/lambda_function.py", line 35, in read_nested_zip
childzip = io.BytesIO(fil.read())
Things I tried:
1. childzip = io.BytesIO(fil.read(file))
   # tried switching to childzip = io.BytesIO(fil.read()) - still failed
2. Changed it to childzip = io.BytesIO(fil)
[ERROR] AttributeError: module 'zipfile' has no attribute 'read'
Traceback (most recent call last):
  File "/var/task/lambda_function.py", line 85, in lambda_handler
    read_nested_zip(tf, bucket, key, s3_client)
  File "/var/task/lambda_function.py", line 25, in read_nested_zip
    fil = zipfile.read(tf, "r").namelist()
Any ideas are appreciated. Best
As long as the ZIP file is not too big, I'd suggest downloading the ZIP file to the Lambda function's /tmp folder and then using the zipfile context manager to simplify accessing the ZIP file. Alternatively, you can stream the ZIP file but probably still use the context manager.
Note that I've included code that specifically reads the byte content of a file from within the ZIP file. See bytes = myzip.read(name) below.
For example:
import json
import os
import zipfile
import boto3
from urllib.parse import unquote_plus

ZIP_NAME = "/tmp/local.zip"
EXCLUDE_LIST = [".exe", ".scr", ".vbs", ".js", ".xml", "docm", ".xps"]

s3 = boto3.client("s3")

def process_zip(bucket, key):
    s3.download_file(bucket, key, ZIP_NAME)

    with zipfile.ZipFile(ZIP_NAME, "r") as myzip:
        namelist = myzip.namelist()

        for name in namelist:
            print("Zip contains:", name)

        extensions = [os.path.splitext(name)[1] for name in namelist]
        print("Extensions:", extensions)

        if any(extension in EXCLUDE_LIST for extension in extensions):
            print("Banned extensions present in:", extensions)
            os.remove(ZIP_NAME)
            return

        for name in namelist:
            print("Zip read:", name)
            bytes = myzip.read(name)
            # your code here ...

    os.remove(ZIP_NAME)

def lambda_handler(event, context):
    for record in event.get("Records", []):
        key = unquote_plus(record["s3"]["object"]["key"])
        bucket = record["s3"]["bucket"]["name"]

        if os.path.splitext(key)[1] == ".zip":
            process_zip(bucket, key)

    return {"statusCode": 200, "body": json.dumps("OK")}
Related
I have a file with URLs in my S3 bucket. I would like to use a Python Lambda function to upload the files from those URLs to the S3 bucket.
For example, the file I uploaded to S3 contains:
http://...
http://...
Each line corresponds to a file to be uploaded into S3.
Here is the code:
import json
import urllib.parse
import boto3
import requests
import os
from gzip import GzipFile
from io import TextIOWrapper
import requests

print('Loading functions')
s3 = boto3.client('s3')

def get_file_seqs(response):
    try:
        size = response['ResponseMetadata']['HTTPHeaders']['content-length']
        print("[+] Size retrieved")
        return size
    except:
        print("[-] Size can not be retrieved")

def lambda_handler(event, context):
    # Defining bucket objects
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')

    # get file from s3
    print('[+] Getting file from S3 bucket')
    response = s3.get_object(Bucket=bucket, Key=key)

    try:
        # checking file size
        print('[+] Checking file size')
        file_size = get_file_seqs(response)
        if file_size == 0:
            print('File size is equal to 0')
            return False
        else:
            # create new directories
            print('[+] Creating new directories')
            bucket_name = "triggersnextflow"
            directories = ['backups/sample/', 'backups/control/']
            # loop to create new dirs
            for dirs in directories:
                s3.put_object(Bucket=bucket_name, Key=dirs, Body='')
            # NOW I WOULD LIKE TO DOWNLOAD THE FILES FROM THE URLS INSIDE S3 OBJECT
            # return true
            return True
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        raise e
Download an S3 object to a file:
import boto3
s3 = boto3.resource('s3')
s3.meta.client.download_file('mybucket', 'hello.txt', '/tmp/hello.txt')
You will find a great resource of information here:
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.download_file
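To address the original question (uploading the files listed in the S3 object), one possible approach is to read the listing line by line and stream each URL into S3 with upload_fileobj. This is only a minimal sketch, assuming the URLs are publicly reachable; the helper name and target prefix are hypothetical:

import boto3
import requests

s3 = boto3.client('s3')

def upload_urls_from_listing(bucket, key, target_prefix='backups/sample/'):
    # Hypothetical helper: read the listing file (one URL per line) straight from S3
    body = s3.get_object(Bucket=bucket, Key=key)['Body'].read().decode('utf-8')
    urls = [line.strip() for line in body.splitlines() if line.strip()]
    for url in urls:
        filename = url.rstrip('/').split('/')[-1] or 'unnamed'
        # Stream the remote file into S3 without holding it all in memory
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            r.raw.decode_content = True
            s3.upload_fileobj(r.raw, bucket, target_prefix + filename)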
I have tried the code below, but I am not able to convert the data from JSON to CSV. Can someone please help me?
import boto3
import botocore
import csv

def lambda_handler(event, context):
    BUCKET_NAME = 'name of the bucket'  # replace with your bucket name
    KEY = 'OUTPUT.csv'  # replace with your object key
    json_data = [{"id": "1", "name": "test"}, {"id": "2", "name": "good"}]

    with open("data.csv", "w") as file:
        csv_file = csv.writer(file)
        csv_file.writerow(['id', 'name'])
        for item in data:
            csv_file.writerow([item.get('id'), item.get('name')])
    csv_binary = open('data.csv', 'rb').read()

    try:
        obj = s3.Object(BUCKET_NAME, KEY)
        obj.put(Body=csv_binary)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
        else:
            raise

    s3client = boto3.client('s3')
    try:
        download_url = s3client.generate_presigned_url(
            'get_object',
            Params={
                'Bucket': BUCKET_NAME,
                'Key': KEY
            },
            ExpiresIn=3600
        )
        return {"csv_link": download_url}
    except Exception as e:
        raise utils_exception.ErrorResponse(400, e, Log)
Here is the response I am getting for the above code:
{
    "errorMessage": "[Errno 30] Read-only file system: 'data.csv'",
    "errorType": "OSError",
    "stackTrace": [
        "  File \"/var/task/lambda_function.py\", line 8, in lambda_handler\n    with open(\"data.csv\", \"wb\") as file:\n"
    ]
}
In AWS Lambda, you can only create files in the /tmp/ directory. Therefore, use:
with open("/tmp/data.csv", "w") as file:
A maximum of 512 MB of storage is provided in /tmp, so it is a good idea to delete any temporary files so they do not interfere with future executions of the Lambda function.
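If the CSV is small, another option is to skip the filesystem entirely and build the CSV in memory. A minimal sketch, assuming a boto3 client named s3client as in the question and a hypothetical helper name:

import csv
import io

import boto3

s3client = boto3.client('s3')

def write_csv_to_s3(bucket, key, json_data):
    # Build the CSV in an in-memory buffer instead of a file on disk
    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow(['id', 'name'])
    for item in json_data:
        writer.writerow([item.get('id'), item.get('name')])
    # Upload the CSV bytes directly to S3
    s3client.put_object(Bucket=bucket, Key=key, Body=buf.getvalue().encode('utf-8'))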
I am trying to create a simple event-driven AWS Lambda Python function to extract a ZIP or GZIP attachment from an email stored in S3 by another service (such as Amazon SES).
from __future__ import print_function

import email
import zipfile
import os
import gzip
import string
import boto3
import urllib

print('Loading function')

s3 = boto3.client('s3')
s3r = boto3.resource('s3')

xmlDir = "/tmp/output/"

outputBucket = ""  # Set here for a separate bucket, otherwise it is set to the event's bucket
outputPrefix = "xml/"  # Should end with /

def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.unquote_plus(event['Records'][0]['s3']['object']['key']).decode('utf8')

    try:
        # Set outputBucket if required
        if not outputBucket:
            global outputBucket
            outputBucket = bucket

        # Use waiter to ensure the file is persisted
        waiter = s3.get_waiter('object_exists')
        waiter.wait(Bucket=bucket, Key=key)

        response = s3r.Bucket(bucket).Object(key)

        # Read the raw text file into an Email Object
        msg = email.message_from_string(response.get()["Body"].read())

        if len(msg.get_payload()) == 2:
            # Create directory for XML files (makes debugging easier)
            if os.path.isdir(xmlDir) == False:
                os.mkdir(xmlDir)

            # The first attachment
            attachment = msg.get_payload()[1]

            # Extract the attachment into /tmp/output
            extract_attachment(attachment)

            # Upload the XML files to S3
            upload_resulting_files_to_s3()
        else:
            print("Could not see file/attachment.")

        return 0
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist '
              'and your bucket is in the same region as this '
              'function.'.format(key, bucket))
        raise e

def extract_attachment(attachment):
    # Process filename.zip attachments
    if "gzip" in attachment.get_content_type():
        contentdisp = string.split(attachment.get('Content-Disposition'), '=')
        fname = contentdisp[1].replace('\"', '')
        open('/tmp/' + contentdisp[1], 'wb').write(attachment.get_payload(decode=True))
        # This assumes we have filename.xml.gz, if we get this wrong, we will just
        # ignore the report
        xmlname = fname[:-3]
        open(xmlDir + xmlname, 'wb').write(gzip.open('/tmp/' + contentdisp[1], 'rb').read())

    # Process filename.xml.gz attachments (Providers not complying to standards)
    elif "zip" in attachment.get_content_type():
        open('/tmp/attachment.zip', 'wb').write(attachment.get_payload(decode=True))
        with zipfile.ZipFile('/tmp/attachment.zip', "r") as z:
            z.extractall(xmlDir)
    else:
        print('Skipping ' + attachment.get_content_type())

def upload_resulting_files_to_s3():
    # Put all XML back into S3 (Covers non-compliant cases if a ZIP contains multiple results)
    for fileName in os.listdir(xmlDir):
        if fileName.endswith(".xml"):
            print("Uploading: " + fileName)  # File name to upload
            s3r.meta.client.upload_file(xmlDir + '/' + fileName, outputBucket, outputPrefix + fileName)
On running the function I am getting this error:
'Records': KeyError
Traceback (most recent call last):
  File "/var/task/lambda_function.py", line 25, in lambda_handler
    for record in event["Records"]:
KeyError: 'Records'
I tried googling and found a few posts telling me to add a Mapping Template (https://intellipaat.com/community/18329/keyerror-records-in-aws-s3-lambda-trigger, "KeyError: 'Records'" in AWS S3 - Lambda trigger). Following that link, I am getting some other error:
'query': KeyError
Traceback (most recent call last):
  File "/var/task/lambda_function.py", line 24, in lambda_handler
    for record in event['query']['Records']:
KeyError: 'query'
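A KeyError on 'Records' means the event object handed to the handler is not an S3 event notification (for example, a console test event or an API Gateway mapping template rather than an S3 trigger). A minimal debugging sketch, assuming the standard S3 notification structure, that logs the incoming event and guards against the missing key:

import json

def lambda_handler(event, context):
    # Log the raw event to see exactly what the trigger sends
    print(json.dumps(event))
    records = event.get('Records', [])
    if not records:
        print("No 'Records' key - this invocation did not come from an S3 event notification")
        return
    for record in records:
        bucket = record['s3']['bucket']['name']
        key = record['s3']['object']['key']
        print(bucket, key)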
I'm trying to index PDF documents that are uploaded to an S3 bucket. My Lambda function is working fine up to the PDF extraction part; it establishes a connection with the Elasticsearch endpoint, but while uploading data to Elasticsearch for indexing it throws an error. Please find the Lambda function code below. Please help me with this. Thanks in advance.
from __future__ import print_function
import json
import urllib
import boto3
import slate
import elasticsearch
import datetime

es_endpoint = 'search-sdjsf-zrtisx]sdaswasfsjmtsyuih3awvu.us-east-1.es.amazonaws.com'
es_index = 'pdf_text_extracts'
es_type = 'document'

print('Loading function')

s3 = boto3.client('s3')

# prepare a dict to hold our document data
doc_data = {}
doc_data['insert_time'] = str(datetime.datetime.isoformat(datetime.datetime.now()))

def lambda_handler(event, context):
    # print("Received event: " + json.dumps(event, indent=2))

    # Get the object from the event and show its content type
    bucket = event['Records'][0]['s3']['bucket']['name']
    object_key = urllib.unquote_plus(event['Records'][0]['s3']['object']['key']).decode('utf8')
    try:
        # get the file data from s3
        temp_pdf_file = open('/tmp/tempfile.pdf', 'w')
        response = s3.get_object(Bucket=bucket, Key=object_key)
        print("CONTENT TYPE: " + response['ContentType'])
        # return response['ContentType']

        temp_pdf_file.write(response['Body'].read())  # write the object data to a local file; will be passed to slate
        temp_pdf_file.close()  # close the temporary file for now

        # pull the text from the temporary PDF file using slate
        print("Extracting data from: " + object_key)
        with open('/tmp/tempfile.pdf') as temp_pdf_file:
            doc = slate.PDF(temp_pdf_file)

        # store document data to dict
        doc_data['source_pdf_name'] = object_key
        doc_data['document_text'] = doc[0]  # we're only worried about page 1 at this point
        # datj=json.dumps(doc_data)
        # z=json.loads(datj)
        # print(z)
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist '
              'and your bucket is in the same region as this '
              'function.'.format(object_key, bucket))
        raise e

    # put the data in ES
    # try:
    es = elasticsearch.Elasticsearch([{'host': es_endpoint, 'port': 443, 'use_ssl': True}])  # hold off on validating certs
    es_response = es.index(index=es_index, doc_type=es_type, body=doc_data)
    print('Data posted to ES: ' + str(es_response))
    # except Exception as e:
    #     print('Data post to ES failed: ' + str(e))
    #     raise e

    return "Done"
I have removed the try and except in the last block to find the actual error, and it's throwing the error below while trying to upload data to Elasticsearch.
Traceback (most recent call last):
  File "/var/runtime/awslambda/bootstrap.py", line 576, in <module>
    main()
  File "/var/runtime/awslambda/bootstrap.py", line 571, in main
    handle_event_request(request_handler, invokeid, event_body, context_objs, invoked_function_arn)
  File "/var/runtime/awslambda/bootstrap.py", line 264, in handle_event_request
    result = report_fault_helper(invokeid, sys.exc_info(), None)
  File "/var/runtime/awslambda/bootstrap.py", line 315, in report_fault_helper
    msgs = [str(value), etype.__name__]
Remove the return "Done" at the end, that's not allowed in a Lambda environment.
I have the following script to upload a file to Google Drive, using Python 2.7. As it is now, it will upload a new copy of the file, but I want the existing file updated/overwritten. I can't find help in the Google Drive API references and guides for Python. Any suggestions?
from __future__ import print_function
import os

from apiclient.discovery import build
from httplib2 import Http
from oauth2client import file, client, tools

try:
    import argparse
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
except ImportError:
    flags = None

# Gain access to Google Drive
SCOPES = 'https://www.googleapis.com/auth/drive.file'
store = file.Storage('storage.json')
creds = store.get()
if not creds or creds.invalid:
    flow = client.flow_from_clientsecrets('client_secret.json', SCOPES)
    creds = tools.run_flow(flow, store, flags) \
        if flags else tools.run(flow, store)
DRIVE = build('drive', 'v3', http=creds.authorize(Http()))

# The file that is being uploaded
FILES = (
    ('all-gm-keys.txt', 'application/vnd.google-apps.document'),  # in Google Doc format
)

# Where the file ends up on Google Drive
for filename, mimeType in FILES:
    folder_id = '0B6V-MONTYPYTHONROCKS-lTcXc'  # Not the real folder id
    metadata = {'name': filename, 'parents': [folder_id]}
    if mimeType:
        metadata['mimeType'] = mimeType
    res = DRIVE.files().create(body=metadata, media_body=filename).execute()
    if res:
        print('Uploaded "%s" (%s)' % (filename, res['mimeType']))
I think that you are looking for the update method. Here is a link to the documentation. There is an example of overwriting the file in Python.
I think that using the official Google client API instead of raw HTTP requests should make your task easier.
from apiclient import errors
from apiclient.http import MediaFileUpload
# ...

def update_file(service, file_id, new_title, new_description, new_mime_type,
                new_filename, new_revision):
    """Update an existing file's metadata and content.

    Args:
        service: Drive API service instance.
        file_id: ID of the file to update.
        new_title: New title for the file.
        new_description: New description for the file.
        new_mime_type: New MIME type for the file.
        new_filename: Filename of the new content to upload.
        new_revision: Whether or not to create a new revision for this file.
    Returns:
        Updated file metadata if successful, None otherwise.
    """
    try:
        # First retrieve the file from the API.
        file = service.files().get(fileId=file_id).execute()

        # File's new metadata.
        file['title'] = new_title
        file['description'] = new_description
        file['mimeType'] = new_mime_type

        # File's new content.
        media_body = MediaFileUpload(
            new_filename, mimetype=new_mime_type, resumable=True)

        # Send the request to the API.
        updated_file = service.files().update(
            fileId=file_id,
            body=file,
            newRevision=new_revision,
            media_body=media_body).execute()
        return updated_file
    except errors.HttpError, error:
        print 'An error occurred: %s' % error
        return None
Link to the example: https://developers.google.com/drive/api/v2/reference/files/update#examples
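Note that the example above is written against the Drive v2 API, while the script in the question builds a v3 service; v3 uses 'name' instead of 'title' and has no newRevision parameter. A minimal v3 sketch of the same idea, assuming file_id is the ID of the previously uploaded file (overwrite_file is a hypothetical helper, not part of the official docs):

import os
from apiclient.http import MediaFileUpload

def overwrite_file(drive, file_id, filename, mime_type):
    # Hypothetical helper: replace the content (and name) of an existing Drive file
    media_body = MediaFileUpload(filename, mimetype=mime_type, resumable=True)
    metadata = {'name': os.path.basename(filename)}
    return drive.files().update(
        fileId=file_id,
        body=metadata,
        media_body=media_body).execute()

# e.g. overwrite_file(DRIVE, 'your-file-id', 'all-gm-keys.txt', 'application/vnd.google-apps.document')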