The requirement states that the Lambda function must check the zip file for any excluded file extensions defined in the function.
The steps needed for a successful run are:
1. Validate the zip file and make sure it doesn't contain any of the excluded extensions. This step appears to run and the validation is performed.
2. Unzip the file.
3. Extract the contents into an 'unzipped' folder in the same directory.
All of the above steps are happening, but I'm getting an AttributeError in the code outlined below. Any ideas/solutions are greatly appreciated.
import json
import zipfile
import os
import boto3
from urllib.parse import unquote_plus
import io
import re
import gzip
exclude_list = [".exe", ".scr", ".vbs", ".js", ".xml", "docm", ".xps"]
sns = boto3.client('sns')

def read_nested_zip(tf, bucket, key, s3_client):
    print(key)
    print("search for.zip:", re.search(r'\.zip', key, re.IGNORECASE))
    ## need to add exception handling
    ##if re.search(r'\.gzip$', key, re.IGNORECASE):
    ##    print('gzip file found')
    ##    fil = gzip.GzipFile(tf, mode='rb')
    if re.search(r'\.zip$', key, re.IGNORECASE):
        print('zip file found')
        fil = zipfile.ZipFile(tf, "r").namelist()
    else:
        fil = ()
        print('no file found')
    print(fil)
    ##with fil as zipf:
    ## try to narrow scope - run loop else exit
    for file in fil:
        print(file)
        if re.search(r'(\.zip|)$', file, re.IGNORECASE):
            childzip = io.BytesIO(fil.read(file))
            read_nested_zip(childzip, bucket, key, s3_client)
        else:
            if any(x in file.lower() for x in exclude_list):
                print("Binary, dont load")
                print(file)
                print(bucket)
                print(key)
                env = bucket.split('-')[2].upper()
                # Copy the parent zip to a separate folder and remove it from the path
                copy_source = {'Bucket': bucket, 'Key': key}
                s3_client.copy_object(Bucket=bucket, CopySource=copy_source, Key='do_not_load_' + key)
                s3_client.delete_object(Bucket=bucket, Key=key)
                sns.publish(
                    TopicArn='ARN',
                    Subject=env + ': S3 upload warning: Non standard File encountered ',
                    Message='Non standard File encountered' + key + ' uploaded to bucket ' + bucket + ' The file has been moved to ' + 'do_not_load_' + key
                )
            else:
                print("File in supported formats, can be loaded " + file)
                #folder = re.sub(r"\/[^/]+$", "", key)
                folder = "/".join(key.split("/", 2)[:2]) + "/unzipped"
                print(folder)
                print("Bucket is " + bucket)
                print("file to copy is " + file)
                buffer = io.BytesIO(fil.read(file))
                s3_resource = boto3.resource('s3')
                s3_resource.meta.client.upload_fileobj(buffer, Bucket=bucket, Key=folder + '/' + file)
                s3_resource.Object(bucket, folder + '/' + file).wait_until_exists()

def lambda_handler(event, context):
    print(event)
    for record in event['Records']:
        s3_client = boto3.client('s3')
        key = unquote_plus(record['s3']['object']['key'])
        print(key)
        print(type(key))
        size = record['s3']['object']['size']
        bucket = record['s3']['bucket']['name']
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        print(obj)
        putObjects = []
        with io.BytesIO(obj["Body"].read()) as tf:
            # rewind the file
            #tf.seek(0)
            read_nested_zip(tf, bucket, key, s3_client)
Error code:
[ERROR] AttributeError: 'list' object has no attribute 'read'
Traceback (most recent call last):
  File "/var/task/lambda_function.py", line 85, in lambda_handler
    read_nested_zip(tf, bucket, key, s3_client)
  File "/var/task/lambda_function.py", line 35, in read_nested_zip
    childzip = io.BytesIO(fil.read())
Things I tried:
1. Switched childzip = io.BytesIO(fil.read(file)) to childzip = io.BytesIO(fil.read()) -- still failed.
2. Changed it to childzip = io.BytesIO(fil), which failed with:
[ERROR] AttributeError: module 'zipfile' has no attribute 'read'
Traceback (most recent call last):
  File "/var/task/lambda_function.py", line 85, in lambda_handler
    read_nested_zip(tf, bucket, key, s3_client)
  File "/var/task/lambda_function.py", line 25, in read_nested_zip
    fil = zipfile.read(tf, "r").namelist()
Any ideas are appreciated. Best
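For what it's worth, the first AttributeError comes from fil being the plain list returned by namelist(), which has no read() method, and in the second attempt zipfile.read does not exist either (read is a method of a ZipFile instance). A possible fix, sketched below with simplified behaviour (the recursion passes the member name rather than the original key, and the exclude/upload logic is left as a placeholder), is to keep the ZipFile object itself and call .read(name) on it:

import io
import re
import zipfile

def read_nested_zip(tf, bucket, key, s3_client):
    # namelist() only gives the entry names; keep the ZipFile object around
    # so its .read(name) method is available.
    if not re.search(r'\.zip$', key, re.IGNORECASE):
        print('no zip file found')
        return
    with zipfile.ZipFile(tf, "r") as zipf:
        for name in zipf.namelist():
            if name.lower().endswith('.zip'):
                # Nested zip: read its bytes from the open archive and recurse.
                childzip = io.BytesIO(zipf.read(name))
                read_nested_zip(childzip, bucket, name, s3_client)
            else:
                data = zipf.read(name)
                # extension checks and uploads from the original function go here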
As long as the ZIP file is not too big, I'd suggest downloading the ZIP file to the Lambda function's /tmp folder and then using the zipfile context manager to simplify accessing the ZIP file. Alternatively, you can stream the ZIP file but probably still use the context manager.
Note that I've included code that specifically reads the byte content of a file from within the ZIP file. See bytes = myzip.read(name) below.
For example:
import json
import os
import zipfile
import boto3
from urllib.parse import unquote_plus

ZIP_NAME = "/tmp/local.zip"
EXCLUDE_LIST = [".exe", ".scr", ".vbs", ".js", ".xml", ".docm", ".xps"]

s3 = boto3.client("s3")

def process_zip(bucket, key):
    s3.download_file(bucket, key, ZIP_NAME)

    with zipfile.ZipFile(ZIP_NAME, "r") as myzip:
        namelist = myzip.namelist()

        for name in namelist:
            print("Zip contains:", name)

        extensions = [os.path.splitext(name)[1] for name in namelist]
        print("Extensions:", extensions)

        if any(extension in EXCLUDE_LIST for extension in extensions):
            print("Banned extensions present in:", extensions)
            os.remove(ZIP_NAME)
            return

        for name in namelist:
            print("Zip read:", name)
            bytes = myzip.read(name)
            # your code here ...

    os.remove(ZIP_NAME)

def lambda_handler(event, context):
    for record in event.get("Records", []):
        key = unquote_plus(record["s3"]["object"]["key"])
        bucket = record["s3"]["bucket"]["name"]

        if os.path.splitext(key)[1] == ".zip":
            process_zip(bucket, key)

    return {"statusCode": 200, "body": json.dumps("OK")}
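For the streaming alternative mentioned above, a rough, untested sketch (assuming the archive is small enough to hold in memory) wraps the S3 object body in a BytesIO and passes it straight to the zipfile context manager, skipping the /tmp download:

import io
import zipfile

import boto3

s3 = boto3.client("s3")

def process_zip_in_memory(bucket, key):
    # Pull the whole object into memory instead of downloading it to /tmp.
    body = s3.get_object(Bucket=bucket, Key=key)["Body"].read()

    with zipfile.ZipFile(io.BytesIO(body), "r") as myzip:
        for name in myzip.namelist():
            data = myzip.read(name)  # bytes of each member
            # your code here ...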
I have a basic Lambda function that tries to get the contents of an object in the bucket, but I'm getting the errors below.
import json
import urllib.parse
import boto3

print('Loading function')

s3 = boto3.client('s3')

def lambda_handler(event, context):
    #print("Received event: " + json.dumps(event, indent=2))

    # Get the object from the event and show its content type
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    try:
        response = s3.get_object(Bucket=bucket, Key=key)
        print("CONTENT TYPE: " + response['ContentType'])
        return response['ContentType']
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        raise e
Here is the error message when I run the Lambda function.
Error message
{
    "errorMessage": "'Records'",
    "errorType": "KeyError",
    "requestId": "5c89bb8e-a70e-4c33-ba00-43174095544e",
    "stackTrace": [
        " File \"/var/task/lambda_function.py\", line 13, in lambda_handler\n bucket = event['Records'][0]['s3']['bucket']['name']\n"
    ]
}
Function Logs
START RequestId: 5c89bb8e-a70e-4c33-ba00-43174095544e Version: $LATEST
[ERROR] KeyError: 'Records'
Traceback (most recent call last):
File "/var/task/lambda_function.py", line 13, in lambda_handler
bucket = event['Records'][0]['s3']['bucket']['name']
END RequestId: 5c89bb8e-a70e-4c33-ba00-43174095544e
REPORT RequestId: 5c89bb8e-a70e-4c33-ba00-43174095544e Duration: 1.89 ms Billed Duration: 2 ms Memory Size: 128 MB Max Memory Used: 69 MB Init Duration: 356.28 ms
The problem is that
event['Records'][0]['s3']['bucket']['name']
doesn't exist in the event you are passing in. Check the event object when the function is actually triggered from S3. If you want to test in the console, you need to pass a similarly shaped object as the event.
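For example, a minimal local/console test could pass a hand-built, S3-shaped event to the handler (the bucket name and key below are placeholders):

sample_event = {
    "Records": [
        {
            "s3": {
                "bucket": {"name": "my-example-bucket"},   # placeholder
                "object": {"key": "path/to/example.csv"}   # placeholder
            }
        }
    ]
}

print(lambda_handler(sample_event, None))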
I'm using the Lambda code below to read data from an S3 bucket; the Lambda function is triggered once a file is created in the S3 bucket.
import json
import urllib.parse
import boto3

print('Loading function')

s3 = boto3.client('s3')

def lambda_handler(event, context):
    #print("Received event: " + json.dumps(event, indent=2))

    # Get the object from the event and show its content type
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    print("bucket= ", bucket)
    print("key= ", key)
    try:
        response = s3.get_object(Bucket=bucket, Key=key)
        print("CONTENT TYPE: " + response['ContentType'])
        return response['ContentType']
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        raise e
The uploaded file is a CSV file that contains 50 comma-delimited records.
I need to stream the received data into an AWS Kinesis Data Stream called test-stream once the file is uploaded to the S3 bucket. Any help please?
Thanks.
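One possible approach, offered only as a rough sketch (the partition key choice and batching here are assumptions; only the stream name test-stream comes from the question): read the object from the get_object response, split it into lines, and push them to Kinesis with put_records.

import urllib.parse

import boto3

s3 = boto3.client('s3')
kinesis = boto3.client('kinesis')

def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')

    # Read the CSV as text and split it into lines (one record per line).
    body = s3.get_object(Bucket=bucket, Key=key)['Body'].read().decode('utf-8')
    lines = [line for line in body.splitlines() if line]

    # put_records accepts up to 500 records per call, so 50 records fit in one batch.
    kinesis.put_records(
        StreamName='test-stream',
        Records=[
            {'Data': line.encode('utf-8'), 'PartitionKey': key}  # partition key is an assumption
            for line in lines
        ]
    )

    return {'records_sent': len(lines)}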
I have tried the code below, but I am not able to convert the data from JSON to CSV. Can someone please help me?
import boto3
import botocore
import csv

def lambda_handler(event, context):
    BUCKET_NAME = 'name of the bucket'  # replace with your bucket name
    KEY = 'OUTPUT.csv'  # replace with your object key
    json_data = [{"id": "1", "name": "test"}, {"id": "2", "name": "good"}]

    with open("data.csv", "w") as file:
        csv_file = csv.writer(file)
        csv_file.writerow(['id', 'name'])
        for item in json_data:
            csv_file.writerow([item.get('id'), item.get('name')])
    csv_binary = open('data.csv', 'rb').read()

    try:
        obj = s3.Object(BUCKET_NAME, KEY)
        obj.put(Body=csv_binary)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.")
        else:
            raise

    s3client = boto3.client('s3')
    try:
        download_url = s3client.generate_presigned_url(
            'get_object',
            Params={
                'Bucket': BUCKET_NAME,
                'Key': KEY
            },
            ExpiresIn=3600
        )
        return {"csv_link": download_url}
    except Exception as e:
        raise utils_exception.ErrorResponse(400, e, Log)
Here is the response I am getting for the above code:
{
    "errorMessage": "[Errno 30] Read-only file system: 'data.csv'",
    "errorType": "OSError",
    "stackTrace": [
        " File \"/var/task/lambda_function.py\", line 8, in lambda_handler\n with open(\"data.csv\", \"wb\") as file:\n"
    ]
}
In AWS Lambda, you can only create files in the /tmp/ directory. Therefore, use:
with open("/tmp/data.csv", "w") as file:
A maximum of 512 MB is provided by default, so it is a good idea to delete any temporary files so they do not interfere with future invocations of the Lambda function.
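A small sketch of that pattern, reusing the CSV-writing part of the question's code with the path moved to /tmp and the file removed afterwards:

import csv
import os

TMP_CSV = "/tmp/data.csv"  # /tmp is Lambda's only writable path

def write_csv(json_data):
    # Write the CSV into /tmp, the writable scratch space in Lambda.
    with open(TMP_CSV, "w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(['id', 'name'])
        for item in json_data:
            writer.writerow([item.get('id'), item.get('name')])

    with open(TMP_CSV, "rb") as file:
        csv_binary = file.read()

    # Remove the temporary file so it does not linger into later invocations.
    os.remove(TMP_CSV)
    return csv_binary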
So I'm writing a Lambda function, which is triggered by an S3 PUT:
import datetime
import boto3
import botocore
#boto3.set_stream_logger('botocore', level='DEBUG')

def lambda_handler(event, context):
    src_bucket_name = event['Records'][0]['s3']['bucket']['name']
    print(src_bucket_name)

    file = event['Records'][0]['s3']['object']['key']
    split_string = file.split('/')
    file_string = split_string[-1].split('_')

    fecha_str = event['Records'][0]['eventTime']
    fecha_real = datetime.datetime.strptime(fecha_str, '%Y-%m-%dT%H:%M:%S.%fZ') + datetime.timedelta(hours=-6)

    new_path = 'PATH/' + file_string[0].lower() + '/' + str(fecha_real.year) + '/' + str(fecha_real.month) + '/' + split_string[-1]

    s3 = boto3.resource('s3')
    s3_client = boto3.client('s3')
    copy_source = {
        'Bucket': src_bucket_name,
        'Key': file
    }
    s3.meta.client.copy(copy_source, DST_BUCKET_NAME, new_path)
When I run the code I get:
ClientError: An error occurred (404) when calling the HeadObject operation: Not Found
The file does exist in the source bucket.
Could you please tell me what I am doing wrong?
EDIT:
I gave admin permissions to the role I'm using and I'm still getting the same error.
UPDATE - CLOSED:
I deleted the role, made a new one, and changed the copy part of the code to this:
copy_source = {
    'Bucket': src_bucket_name,
    'Key': file
}
r = s3_client.copy_object(
    Bucket=DST_BUCKET_NAME,
    CopySource=copy_source,
    Key=new_path
)
and it worked!
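One more thing that can cause this kind of 404 from HeadObject (an assumption, not something confirmed above): S3 event keys are URL-encoded, so keys containing spaces or special characters need to be decoded before being used, as the other handlers in this thread do with unquote_plus. A tiny illustration:

from urllib.parse import unquote_plus

# A key with spaces arrives URL-encoded in the S3 event payload.
raw_key = 'reports/2019+Q1/My+File.txt'   # hypothetical example value
key = unquote_plus(raw_key)
print(key)   # reports/2019 Q1/My File.txt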