I have a Lambda function that moves files from one S3 bucket to another:
import json
import boto3
from datetime import datetime, timedelta
def lambda_handler(event, context):
    """Move objects older than one hour from the source bucket to the destination bucket.

    Lists the source bucket page by page. Every object whose ``LastModified``
    is more than one hour in the past is copied to the destination bucket
    under the same key and then deleted from the source bucket.
    ``event`` and ``context`` are not used.
    """
    # TODO implement
    SOURCE_BUCKET = 'source-bucket'
    DESTINATION_BUCKET = 'destination-bucket'
    s3_client = boto3.client('s3')
    # Create a reusable Paginator
    paginator = s3_client.get_paginator('list_objects_v2')
    # Create a PageIterator from the Paginator
    page_iterator = paginator.paginate(Bucket=SOURCE_BUCKET)
    # Loop through each object, looking for ones older than a given time period
    for page in page_iterator:
        # NOTE(review): direct indexing raises KeyError when a page carries no
        # 'Contents' key (this is the line reported in the KeyError traceback).
        for object in page['Contents']:
            if object['LastModified'] < datetime.now().astimezone() - timedelta(hours=1): # <-- Change time period here
                print(f"Moving {object['Key']}")
                # Copy object
                s3_client.copy_object(
                    ACL='bucket-owner-full-control',
                    Bucket=DESTINATION_BUCKET,
                    Key=object['Key'],
                    CopySource={'Bucket':SOURCE_BUCKET, 'Key':object['Key']}
                )
                # Delete original object
                s3_client.delete_object(Bucket=SOURCE_BUCKET, Key=object['Key'])
I am getting this error:
Response:
{
"errorMessage": "'Contents'",
"errorType": "KeyError",
"stackTrace": [
" File \"/var/task/lambda_function.py\", line 21, in lambda_handler\n for object in page['Contents']:\n"
]
}
Request ID:
"518e0f39-63e4-43df-842d-b73d56f83cd8"
Function Logs:
START RequestId: 518e0f39-63e4-43df-842d-b73d56f83cd8 Version: $LATEST
[ERROR] KeyError: 'Contents'
Traceback (most recent call last):
File "/var/task/lambda_function.py", line 21, in lambda_handler
for object in page['Contents']:END RequestId: 518e0f39-63e4-43df-842d-b73d56f83cd8
REPORT RequestId: 518e0f39-63e4-43df-842d-b73d56f83cd8 Duration: 1611.00 ms Billed Duration: 1700 ms Memory Size: 128 MB Max Memory Used: 76 MB Init Duration: 248.12 ms
Can someone help here? It has moved all the files, but it is still giving me an error.
This code assumes that the key Contents is always returned. If there are no objects in the bucket, the key will not exist.
Add a simple if "Contents" in page check to handle the case where it is missing.
So your function code might look like this:
import json
import boto3
from datetime import datetime, timedelta
def lambda_handler(event, context):
    """Move objects older than one hour from the source bucket to the destination bucket.

    Lists the source bucket page by page. Every object whose ``LastModified``
    is older than the cutoff is copied to the destination bucket under the
    same key and then deleted from the source bucket. Pages without a
    ``Contents`` key (nothing listed) are reported and skipped instead of
    raising ``KeyError``.

    Args:
        event: Lambda invocation event (unused).
        context: Lambda context object (unused).
    """
    SOURCE_BUCKET = 'source-bucket'
    DESTINATION_BUCKET = 'destination-bucket'
    s3_client = boto3.client('s3')

    # Compute the cutoff once so every object is judged against the same
    # instant, however long the copy/delete calls take.
    cutoff = datetime.now().astimezone() - timedelta(hours=1)  # <-- Change time period here

    # Create a reusable Paginator and iterate over its pages
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=SOURCE_BUCKET):
        if "Contents" in page:
            for s3_object in page['Contents']:
                if s3_object['LastModified'] >= cutoff:
                    continue  # too recent, leave it in place
                key = s3_object['Key']
                print(f"Moving {key}")
                # Copy object
                s3_client.copy_object(
                    ACL='bucket-owner-full-control',
                    Bucket=DESTINATION_BUCKET,
                    Key=key,
                    CopySource={'Bucket': SOURCE_BUCKET, 'Key': key},
                )
                # Delete original object only after the copy call returned
                s3_client.delete_object(Bucket=SOURCE_BUCKET, Key=key)
        else:
            print("No Contents key for page!")
Related
Requirement states that the lambda function must check the zipfile for any excluded file extensions which have been defined in the function.
I have outlined the steps which are needed for a successful run.
I need to validate it and make sure that the zip file doesn't have the bad extensions. This step seems to be running and the validation is being run.
The file needs to be unzipped.
The file should be unzipped in an 'unzipped' folder in the same directory.
All the above steps are occurring but I seem to be getting an attribute error in my code which has been outlined below. Any ideas/ solutions are greatly appreciated.
import json
import zipfile
import os
import boto3
from urllib.parse import unquote_plus
import io
import re
import gzip
exclude_list = [".exe", ".scr", ".vbs", ".js", ".xml", "docm", ".xps"]
sns = boto3.client('sns' )
def read_nested_zip(tf, bucket, key, s3_client):
    """Inspect the members of a ZIP and either quarantine the upload or unzip it.

    ``tf`` is the file-like object holding the archive, ``bucket``/``key``
    identify the uploaded object, and ``s3_client`` is used for the copy and
    delete calls. Members whose name contains an entry of ``exclude_list``
    cause the uploaded object to be copied to ``'do_not_load_' + key``,
    deleted, and reported through SNS; other members are uploaded under an
    ``unzipped`` folder derived from ``key``.
    """
    print(key)
    print ("search for.zip:",re.search(r'\.zip', key, re.IGNORECASE))
    ## need to add exception handling
    ##if re.search(r'\.gzip$', key, re.IGNORECASE):
    ## print ('gzip file found')
    ## fil = gzip.GzipFile(tf, mode='rb')
    if re.search(r'\.zip$', key, re.IGNORECASE):
        print ('zip file found')
        # NOTE(review): only the list of member names is kept here; the
        # ZipFile object itself is discarded.
        fil = zipfile.ZipFile(tf, "r").namelist()
    else:
        fil = ()
        print ('no file found')
    print (fil)
    ##with fil as zipf:
    ##try to narrow scope - run loop else exit
    for file in fil:
        print(file)
        # NOTE(review): the pattern has an empty alternative, so it matches
        # every member name and the else branch is never taken.
        if re.search(r'(\.zip|)$', file, re.IGNORECASE):
            # NOTE(review): `fil` is a list of names, so `.read` raises the
            # reported AttributeError ('list' object has no attribute 'read').
            childzip = io.BytesIO(fil.read(file))
            # The recursive call receives the parent's `key`, not the member name.
            read_nested_zip(childzip, bucket, key, s3_client)
        else:
            # Substring match of each excluded entry against the lower-cased member name
            if any(x in file.lower() for x in exclude_list):
                print("Binary, dont load")
                print(file)
                print(bucket)
                print(key)
                # Third dash-separated part of the bucket name, upper-cased
                # (presumably the environment name -- confirm against the bucket naming scheme)
                env = bucket.split('-')[2].upper()
                # Copy the parent zip to a separate folder and remove it from the path
                copy_source = {'Bucket': bucket, 'Key': key}
                s3_client.copy_object(Bucket=bucket, CopySource=copy_source, Key='do_not_load_'+key)
                s3_client.delete_object(Bucket = bucket, Key = key)
                sns.publish(
                    TopicArn = 'ARN',
                    Subject = env + ': S3 upload warning: Non standard File encountered ',
                    Message = 'Non standard File encountered' + key + ' uploaded to bucket ' + bucket + ' The file has been moved to ' + 'do_not_load_'+key
                )
            else:
                print("File in supported formats, can be loaded " + file)
                #folder = re.sub(r"\/[^/]+$", "",key)
                # First two '/'-separated components of the key, plus "/unzipped"
                folder = "/".join(key.split("/", 2)[:2]) + "/unzipped"
                print(folder)
                print("Bucket is "+ bucket)
                print("file to copy is "+ file)
                # NOTE(review): same `.read` call on the name list as above
                buffer = io.BytesIO(fil.read(file))
                s3_resource = boto3.resource('s3')
                s3_resource.meta.client.upload_fileobj(buffer,Bucket=bucket,Key= folder + '/' + file)
                # Block until the uploaded object is visible
                s3_resource.Object(bucket, folder + '/' + file).wait_until_exists()
def lambda_handler(event, context):
    """Fetch every S3 object named in the event and pass it to ``read_nested_zip``.

    For each record the URL-encoded object key is decoded, the object body is
    read fully into memory, and the resulting buffer is handed to
    ``read_nested_zip`` together with the bucket, key and S3 client.

    Args:
        event: S3 notification event; must contain a ``Records`` list.
        context: Lambda context object (unused).
    """
    print(event)
    # One client serves all records; creating it per record is unnecessary
    s3_client = boto3.client('s3')
    for record in event['Records']:
        # Keys arrive URL-encoded in S3 notifications (spaces as '+')
        key = unquote_plus(record['s3']['object']['key'])
        print(key)
        print (type(key))
        bucket = record['s3']['bucket']['name']
        obj = s3_client.get_object(Bucket=bucket, Key=key)
        print(obj)
        with io.BytesIO(obj["Body"].read()) as tf:
            read_nested_zip(tf, bucket, key, s3_client)
Error: [ERROR] AttributeError: 'list' object has no attribute 'read'
Traceback (most recent call last):
File "/var/task/lambda_function.py", line 85, in lambda_handler
read_nested_zip(tf, bucket, key, s3_client)
File "/var/task/lambda_function.py", line 35, in read_nested_zip
childzip = io.BytesIO(fil.read())
Things I tried:
1.
childzip = io.BytesIO(fil.read(file))
#tried switching the childzip = io.BytesIO(fil.read()) #still failed
changed
childzip = io.BytesIO(fil)
[ERROR] AttributeError: module 'zipfile' has no attribute 'read'
Traceback (most recent call last):
File "/var/task/lambda_function.py", line 85, in lambda_handler
read_nested_zip(tf, bucket, key, s3_client)
File "/var/task/lambda_function.py", line 25, in read_nested_zip
fil = zipfile.read(tf, "r").namelist()
Any ideas are appreciated. Best
As long as the ZIP file is not too big, I'd suggest downloading the ZIP file to the Lambda function's /tmp folder and then using the zipfile context manager to simplify accessing the ZIP file. Alternatively, you can stream the ZIP file but probably still use the context manager.
Note that I've included code that specifically reads the byte content of a file from within the ZIP file. See bytes = myzip.read(name) below.
For example:
import json
import os
import zipfile
import boto3
from urllib.parse import unquote_plus
# Local scratch path for the downloaded archive
ZIP_NAME = "/tmp/local.zip"
# Extensions that cause a ZIP to be rejected. Every entry carries its leading
# dot because entries are compared with os.path.splitext() results.
EXCLUDE_LIST = [".exe", ".scr", ".vbs", ".js", ".xml", ".docm", ".xps"]
s3 = boto3.client("s3")
def process_zip(bucket, key):
    """Download a ZIP from S3, reject it if it holds a banned extension, else read each member.

    The archive is downloaded to ``ZIP_NAME``. If any member's extension is in
    ``EXCLUDE_LIST`` the archive is left unprocessed; otherwise every member
    is read into memory in turn. The local copy is removed in all cases,
    including when reading the archive raises.

    Args:
        bucket: Name of the S3 bucket holding the archive.
        key: Object key of the archive.
    """
    s3.download_file(bucket, key, ZIP_NAME)
    try:
        with zipfile.ZipFile(ZIP_NAME, "r") as myzip:
            namelist = myzip.namelist()
            for name in namelist:
                print("Zip contains:", name)

            extensions = [os.path.splitext(name)[1] for name in namelist]
            print("Extensions:", extensions)

            # Compare case-insensitively and without the leading dot, so that
            # "SETUP.EXE" is caught and a list entry written without its dot
            # still matches.
            banned = {entry.lower().lstrip(".") for entry in EXCLUDE_LIST}
            if any(ext and ext.lower().lstrip(".") in banned for ext in extensions):
                print("Banned extensions present in:", extensions)
                return

            for name in namelist:
                print("Zip read:", name)
                bytes = myzip.read(name)
                # your code here ...
    finally:
        # Runs after the archive has been closed, on every exit path
        os.remove(ZIP_NAME)
def lambda_handler(event, context):
    """Run ``process_zip`` for every ``.zip`` object named in an S3 event.

    Args:
        event: S3 notification event. A missing ``Records`` key is treated as
            an empty list, so a hand-written test event does not raise.
        context: Lambda context object (unused).

    Returns:
        A dict with ``statusCode`` 200 and a JSON-encoded ``"OK"`` body.
    """
    for record in event.get("Records", []):
        # Keys arrive URL-encoded in S3 notifications (spaces as '+')
        key = unquote_plus(record["s3"]["object"]["key"])
        bucket = record["s3"]["bucket"]["name"]
        # Case-insensitive so that "ARCHIVE.ZIP" is processed as well
        if os.path.splitext(key)[1].lower() == ".zip":
            process_zip(bucket, key)
    return {"statusCode": 200, "body": json.dumps("OK")}
This is a basic Lambda function that tries to get the contents of the bucket, but I am getting errors:
import json
import urllib.parse
import boto3
print('Loading function')
s3 = boto3.client('s3')
def lambda_handler(event, context):
    """Print and return the content type of the S3 object named in the event.

    Only the first entry of ``event['Records']`` is read. Any exception from
    the ``get_object`` call is printed together with a hint and re-raised.
    """
    #print("Received event: " + json.dumps(event, indent=2))
    # Get the object from the event and show its content type
    # NOTE(review): an event without a 'Records' key raises KeyError on the
    # next line (this is the line reported in the traceback).
    bucket = event['Records'][0]['s3']['bucket']['name']
    # The key is URL-encoded in the event; decode it before calling S3
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    try:
        response = s3.get_object(Bucket=bucket, Key=key)
        print("CONTENT TYPE: " + response['ContentType'])
        return response['ContentType']
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        raise e
Here is the error message when i run the lambda function.
Error message
{
"errorMessage": "'Records'",
"errorType": "KeyError",
"requestId": "5c89bb8e-a70e-4c33-ba00-43174095544e",
"stackTrace": [
" File \"/var/task/lambda_function.py\", line 13, in lambda_handler\n bucket = event['Records'][0]['s3']['bucket']['name']\n"
]
}
Function Logs
START RequestId: 5c89bb8e-a70e-4c33-ba00-43174095544e Version: $LATEST
[ERROR] KeyError: 'Records'
Traceback (most recent call last):
File "/var/task/lambda_function.py", line 13, in lambda_handler
bucket = event['Records'][0]['s3']['bucket']['name']
END RequestId: 5c89bb8e-a70e-4c33-ba00-43174095544e
REPORT RequestId: 5c89bb8e-a70e-4c33-ba00-43174095544e Duration: 1.89 ms Billed Duration: 2 ms Memory Size: 128 MB Max Memory Used: 69 MB Init Duration: 356.28 ms
The problem is that
bucket = event['Records'][0]['s3']['bucket']['name']
doesn't exist in the event you passed: the event has no 'Records' key. Check the event object when it has been triggered from S3. If you want to test in the console, you need to pass a similarly shaped object as the event.
I'm trying to create a simple event-driven AWS Lambda Python function to extract a ZIP or GZIP attachment from an email stored in S3 by another service (such as Amazon SES).
from __future__ import print_function
import email
import zipfile
import os
import gzip
import string
import boto3
import urllib
print('Loading function')
s3 = boto3.client('s3')
s3r = boto3.resource('s3')
xmlDir = "/tmp/output/"
outputBucket = "" # Set here for a seperate bucket otherwise it is set to the events bucket
outputPrefix = "xml/" # Should end with /
def lambda_handler(event, context):
    """Extract the attachment of an email stored in S3 and upload the resulting XML files.

    Reads the first record of the S3 event, waits until the object exists,
    parses it as an email and, when the message payload has exactly two
    parts, extracts the second part and uploads the produced XML files.

    NOTE: this handler targets Python 2 (``urllib.unquote_plus(...).decode``
    and ``email.message_from_string`` on the raw body).

    Args:
        event: S3 notification event; must contain a ``Records`` list.
        context: Lambda context object (unused).

    Returns:
        0 on success.

    Raises:
        Exception: whatever the S3 calls or the extraction raise, after
            printing a diagnostic message.
    """
    # Must be declared before the first use of the name in this function;
    # declaring it after reading `outputBucket` is a SyntaxError on Python 3.
    global outputBucket

    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.unquote_plus(event['Records'][0]['s3']['object']['key']).decode('utf8')
    try:
        # Set outputBucket if required
        if not outputBucket:
            outputBucket = bucket

        # Use waiter to ensure the file is persisted
        waiter = s3.get_waiter('object_exists')
        waiter.wait(Bucket=bucket, Key=key)
        response = s3r.Bucket(bucket).Object(key)

        # Read the raw text file into a Email Object
        msg = email.message_from_string(response.get()["Body"].read())
        payload = msg.get_payload()
        if len(payload) == 2:
            # Create directory for XML files (makes debugging easier)
            if not os.path.isdir(xmlDir):
                os.mkdir(xmlDir)
            # The first attachment
            attachment = payload[1]
            # Extract the attachment into /tmp/output
            extract_attachment(attachment)
            # Upload the XML files to S3
            upload_resulting_files_to_s3()
        else:
            print("Could not see file/attachment.")
        return 0
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist '
              'and your bucket is in the same region as this '
              'function.'.format(key, bucket))
        # Bare raise keeps the original traceback
        raise
def extract_attachment(attachment):
    """Unpack a gzip or zip email attachment into ``xmlDir``.

    A gzip attachment is written to ``/tmp`` under its own file name and
    decompressed into ``xmlDir`` with the trailing ``.gz`` removed. A zip
    attachment is written to ``/tmp/attachment.zip`` and extracted into
    ``xmlDir``. Any other content type is reported and skipped.

    Args:
        attachment: An ``email.message.Message`` part.
    """
    content_type = attachment.get_content_type()

    # Process filename.xml.gz attachments (Providers not complying to standards)
    if "gzip" in content_type:
        # get_filename() parses the Content-Disposition header (quoting and
        # extra parameters included); basename() keeps a sender-supplied name
        # from pointing outside /tmp.
        fname = os.path.basename(attachment.get_filename() or '')
        if not fname:
            print('Skipping ' + content_type)
            return
        gz_path = '/tmp/' + fname
        with open(gz_path, 'wb') as gz_file:
            gz_file.write(attachment.get_payload(decode=True))
        # This assumes we have filename.xml.gz, if we get this wrong, we will just
        # ignore the report
        xmlname = fname[:-3]
        with gzip.open(gz_path, 'rb') as compressed:
            with open(xmlDir + xmlname, 'wb') as xml_file:
                xml_file.write(compressed.read())

    # Process filename.zip attachments
    elif "zip" in content_type:
        with open('/tmp/attachment.zip', 'wb') as zip_file:
            zip_file.write(attachment.get_payload(decode=True))
        with zipfile.ZipFile('/tmp/attachment.zip', "r") as z:
            z.extractall(xmlDir)

    else:
        print('Skipping ' + content_type)
def upload_resulting_files_to_s3():
    """Upload every ``.xml`` file in ``xmlDir`` to ``outputBucket`` under ``outputPrefix``."""
    # Put all XML back into S3 (Covers non-compliant cases if a ZIP contains multiple results)
    for fileName in os.listdir(xmlDir):
        if not fileName.endswith(".xml"):
            continue
        print("Uploading: " + fileName)  # File name to upload
        # os.path.join avoids the doubled separator that plain concatenation
        # produces when xmlDir already ends with '/'
        local_path = os.path.join(xmlDir, fileName)
        s3r.meta.client.upload_file(local_path, outputBucket, outputPrefix + fileName)
On running the function I'm getting this error:
'Records': KeyError
Traceback (most recent call last):
File "/var/task/lambda_function.py", line 25, in lambda_handler
for record in event["Records"]:
KeyError: 'Records'
I tried googling and found a few posts telling me to add a Mapping Template -- https://intellipaat.com/community/18329/keyerror-records-in-aws-s3-lambda-trigger
("KeyError: 'Records'" in AWS S3 - Lambda trigger).
I followed this link, but now I'm getting a different error:
'query': KeyError
Traceback (most recent call last):
File "/var/task/lambda_function.py", line 24, in lambda_handler
for record in event['query']['Records']:
KeyError: 'query'
I'm trying to index PDF documents that are uploaded to an S3 bucket. My Lambda function works fine up to the PDF extraction part. It establishes a connection with the Elasticsearch endpoint, but it throws an error while uploading the data to Elasticsearch for indexing. Please find the Lambda function code below. Please help me with this. Thanks in advance.
from __future__ import print_function
import json
import urllib
import boto3
import slate
import elasticsearch
import datetime
# Elasticsearch target: endpoint host, index name and document type
es_endpoint = 'search-sdjsf-zrtisx]sdaswasfsjmtsyuih3awvu.us-east-1.es.amazonaws.com'
es_index = 'pdf_text_extracts'
es_type = 'document'
print('Loading function')
s3 = boto3.client('s3')
# prepare a dict to hold our document data
doc_data = {}
# Set once at import time, not per invocation
doc_data['insert_time'] = str(datetime.datetime.isoformat(datetime.datetime.now()))
def lambda_handler(event, context):
    """Extract page 1 of an uploaded PDF and index its text in Elasticsearch.

    Reads the first record of the S3 event, saves the object to
    ``/tmp/tempfile.pdf``, extracts its text with ``slate`` and stores the
    object key and the first page's text in the module-level ``doc_data``
    dict, which is then posted to the ``es_index`` index.

    NOTE: this handler targets Python 2 (``urllib.unquote_plus(...).decode``).

    Args:
        event: S3 notification event; must contain a ``Records`` list.
        context: Lambda context object (unused).

    Returns:
        The string ``"Done"``.

    Raises:
        Exception: whatever the S3 download or PDF extraction raise, after
            printing a diagnostic message; errors from the Elasticsearch
            call propagate unchanged.
    """
    #print("Received event: " + json.dumps(event, indent=2))
    # Get the object from the event and show its content type
    bucket = event['Records'][0]['s3']['bucket']['name']
    object_key = urllib.unquote_plus(event['Records'][0]['s3']['object']['key']).decode('utf8')
    try:
        # get the file data from s3
        response = s3.get_object(Bucket=bucket, Key=object_key)
        print("CONTENT TYPE: " + response['ContentType'])
        # return response['ContentType']
        # write the object data to a local file; will be passed to slate.
        # Binary mode, because a PDF is not text; the with-block closes the
        # file even if the write fails.
        with open('/tmp/tempfile.pdf', 'wb') as temp_pdf_file:
            temp_pdf_file.write(response['Body'].read())
        # pull the text from the temporary PDF file using slate
        print("Extracting data from: " + object_key)
        with open('/tmp/tempfile.pdf', 'rb') as temp_pdf_file:
            doc = slate.PDF(temp_pdf_file)
        # store document data to dict
        doc_data['source_pdf_name'] = object_key
        doc_data['document_text'] = doc[0]  # we're only worried about page 1 at this point
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist '
              'and your bucket is in the same region as this '
              'function.'.format(object_key, bucket))
        # Bare raise keeps the original traceback
        raise
    # put the data in ES
    #try:
    es = elasticsearch.Elasticsearch([{'host': es_endpoint, 'port': 443,
                                       'use_ssl': True}])  # hold off on validating certs
    es_response = es.index(index=es_index, doc_type=es_type, body=doc_data)
    print('Data posted to ES: ' + str(es_response))
    #except Exception as e:
    #print('Data post to ES failed: ' + str(e))
    #raise e
    return "Done"
I have removed the try and except in the last block to find the actual error, and it throws the error below while trying to upload data to Elasticsearch.
Traceback (most recent call last):
File "/var/runtime/awslambda/bootstrap.py", line 576, in <module>
main()
File "/var/runtime/awslambda/bootstrap.py", line 571, in main
handle_event_request(request_handler, invokeid, event_body, context_objs,
invoked_function_arn)
File "/var/runtime/awslambda/bootstrap.py", line 264, in
handle_event_request
result = report_fault_helper(invokeid, sys.exc_info(), None)
File "/var/runtime/awslambda/bootstrap.py", line 315, in report_fault_helper
msgs = [str(value), etype.__name__]
Remove the return "Done" at the end, that's not allowed in a Lambda environment.
I tried using the below script to create snapshots for instances having the tag name [Backup or backup] according to https://serverlesscode.com/post/lambda-schedule-ebs-snapshot-backups/ && https://serverlesscode.com/post/lambda-schedule-ebs-snapshot-backups-2/
I have successfully created the snapshots with DeleteOn tags as described in the first and second links. The latter part of the second link explains how to delete those snapshots on the specified date. Based on that code, I have the following to delete the snapshots after 7 days.
This is the code :
import boto3
import re
import datetime
ec = boto3.client('ec2')
iam = boto3.client('iam')
def lambda_handler(event, context):
    """Delete the snapshots returned for a DeleteOn tag filter set to today's date.

    Queries snapshots owned by ``account_ids`` with a ``tag-key`` filter of
    ``DeleteOn`` and a ``tag-value`` filter of today's date (YYYY-MM-DD),
    then deletes each snapshot in the response. ``event`` and ``context``
    are not used.
    """
    # NOTE(review): list() on a string yields one element per character, not
    # a single account id.
    account_ids = list('123456789011')
    try:
        iam.get_user()
    except Exception as e:
        # Tries to pull an account id out of the exception text.
        # NOTE(review): re.search returns None when the text contains no
        # 'arn:aws:sts::<digits>' substring, so .groups() raises the reported
        # AttributeError.
        account_ids.append(re.search(r'(arn:aws:sts::)([0-9]+)', str(e)).groups()[1])
    delete_on = datetime.date.today().strftime('%Y-%m-%d')
    filters = [
        {'Name': 'tag-key', 'Values': ['DeleteOn']},
        {'Name': 'tag-value', 'Values': [delete_on]},
    ]
    snapshot_response = ec.describe_snapshots(OwnerIds=account_ids, Filters=filters)
    for snap in snapshot_response['Snapshots']:
        print "Deleting snapshot %s" % snap['SnapshotId']
        ec.delete_snapshot(SnapshotId=snap['SnapshotId'])
By doing this, I get the following error :
'NoneType' object has no attribute 'groups': AttributeError
Traceback (most recent call last):
File "/var/task/lambda_function.py", line 27, in lambda_handler
account_ids.append(re.search(r'(arn:aws:sts::)([0-9]+)', str(e)).groups()[1])
AttributeError: 'NoneType' object has no attribute 'groups'
I solved it by updating a part of my above code this way:
def lambda_handler(event, context):
account_ids = ['123456789011']