Copying S3 objects from one account to another using Lambda (Python) - amazon-web-services

I'm using boto3 to copy files from an S3 bucket in one account to another, and I need functionality similar to aws s3 sync. Please see my code. My company has decided to 'PULL' from the other account's S3 bucket (the source account). Please don't suggest replication, S3 Batch, S3-triggered Lambda, etc.; we have gone through all these options and my management does not want any configuration done on the source side. Can you please review this code and let me know if it works for thousands of objects? The source bucket has nearly 10,000 objects. We will create this Lambda function in the destination account and create a CloudWatch Events rule to trigger it once a day.
I am checking the ETag so that modified files are copied across when this function is triggered.
Edit: I simplified my code just to check that pagination works. It works if I don't call client.copy(). If I add that line inside the for loop, then after reading three or four objects it throws "errorMessage": "2021-08-07T15:29:07.827Z 82757747-7b72-4f29-ae9f-22e95f969d6c Task timed out after 3.00 seconds". Please advise. Note that the 'test/' folder in my source bucket has around 1,100 objects.
import boto3
import os
import logging
import botocore

logger = logging.getLogger()
logger.setLevel(os.getenv('debug_level', 'INFO'))
client = boto3.client('s3')

def handler(event, context):
    main(event, logger)

def main(event, logger):
    try:
        SOURCE_BUCKET = os.environ.get('SRC_BUCKET')
        DEST_BUCKET = os.environ.get('DST_BUCKET')
        REGION = os.environ.get('REGION')
        prefix = 'test/'
        # Create a reusable Paginator
        paginator = client.get_paginator('list_objects_v2')
        print('after paginator')
        # Create a PageIterator from the Paginator
        page_iterator = paginator.paginate(Bucket=SOURCE_BUCKET, Prefix=prefix)
        print('after page iterator')
        index = 0
        for page in page_iterator:
            for obj in page['Contents']:
                index += 1
                print("I am looking for {} in the source bucket".format(obj['ETag']))
                copy_source = {'Bucket': SOURCE_BUCKET, 'Key': obj['Key']}
                client.copy(copy_source, DEST_BUCKET, obj['Key'])
        logger.info("number of objects copied {}:".format(index))
    except botocore.exceptions.ClientError as e:
        raise
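For what it's worth, a newly created Lambda function defaults to a 3-second timeout, which explains the error message above; copying ~1,100 objects will not fit in that window. Besides raising the timeout in the function configuration, one hedged option is to watch the remaining execution time through the context object and stop cleanly before the hard kill. A minimal sketch that reuses the module-level client and logger from the snippet above (the copy_prefix helper name and the 10-second safety margin are my own choices, not part of the original code):
def copy_prefix(context, source_bucket, dest_bucket, prefix):
    # Iterate the source listing and copy until we run low on time.
    paginator = client.get_paginator('list_objects_v2')
    copied = 0
    for page in paginator.paginate(Bucket=source_bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            # context.get_remaining_time_in_millis() is provided by Lambda;
            # stop with ~10 seconds to spare instead of being killed mid-copy.
            if context.get_remaining_time_in_millis() < 10_000:
                logger.warning("Stopping early after %d copies; remaining objects "
                               "will be picked up on the next scheduled run.", copied)
                return copied
            client.copy({'Bucket': source_bucket, 'Key': obj['Key']},
                        dest_bucket, obj['Key'])
            copied += 1
    return copied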

This version works fine if I increase the Lambda timeout to 15 minutes and the memory to 512 MB. It checks whether the source object already exists in the destination before copying.
import boto3
import os
import logging
import botocore
from botocore.client import Config

logger = logging.getLogger()
logger.setLevel(os.getenv('debug_level', 'INFO'))
config = Config(connect_timeout=5, retries={'max_attempts': 0})
client = boto3.client('s3', config=config)
# client = boto3.client('s3')

def handler(event, context):
    main(event, logger)

def main(event, logger):
    try:
        DEST_BUCKET = os.environ.get('DST_BUCKET')
        SOURCE_BUCKET = os.environ.get('SRC_BUCKET')
        REGION = os.environ.get('REGION')
        prefix = ''
        # Create a reusable Paginator
        paginator = client.get_paginator('list_objects_v2')
        print('after paginator')
        # Create a PageIterator from the Paginator
        page_iterator_src = paginator.paginate(Bucket=SOURCE_BUCKET, Prefix=prefix)
        page_iterator_dest = paginator.paginate(Bucket=DEST_BUCKET, Prefix=prefix)
        print('after page iterator')
        index = 0
        for page_source in page_iterator_src:
            for obj_src in page_source['Contents']:
                flag = "FALSE"
                for page_dest in page_iterator_dest:
                    for obj_dest in page_dest['Contents']:
                        # checks if source ETag already exists in destination
                        if obj_src['ETag'] in obj_dest['ETag']:
                            flag = "TRUE"
                            break
                    if flag == "TRUE":
                        break
                if flag != "TRUE":
                    index += 1
                    client.copy_object(Bucket=DEST_BUCKET, CopySource={'Bucket': SOURCE_BUCKET, 'Key': obj_src['Key']}, Key=obj_src['Key'])
                    print("source ETag {} and destination ETag {}".format(obj_src['ETag'], obj_dest['ETag']))
                    print("source Key {} and destination Key {}".format(obj_src['Key'], obj_dest['Key']))
        print("Number of objects copied {}".format(index))
        logger.info("number of objects copied {}:".format(index))
    except botocore.exceptions.ClientError as e:
        raise
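A side note on the nested loops: re-walking the destination paginator for every single source object is most likely what makes the function slow enough to need the 15-minute timeout. Listing each bucket once and comparing against an in-memory set is much cheaper. A rough sketch, assuming the same module-level client and logger as above and that matching on (Key, ETag) is an acceptable definition of "already copied" (the sync_new_objects name is mine):
def sync_new_objects(source_bucket, dest_bucket, prefix=''):
    paginator = client.get_paginator('list_objects_v2')

    # One pass over the destination: remember what is already there.
    existing = set()
    for page in paginator.paginate(Bucket=dest_bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            existing.add((obj['Key'], obj['ETag']))

    # One pass over the source: copy anything missing or changed.
    copied = 0
    for page in paginator.paginate(Bucket=source_bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            if (obj['Key'], obj['ETag']) not in existing:
                client.copy({'Bucket': source_bucket, 'Key': obj['Key']},
                            dest_bucket, obj['Key'])
                copied += 1
    logger.info("number of objects copied: %d", copied)
    return copied
One caveat: ETags of multipart uploads are not plain MD5 checksums, so an object uploaded in parts at the source may never produce a matching ETag at the destination; comparing Key together with Size and LastModified is a common fallback.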

Related

Lambda task timeout but no application log

I have a Python Lambda that is triggered by S3 uploads to a specific folder. The Lambda function processes the uploaded file and writes the output to another folder in the same S3 bucket.
The issue is that when I do a bulk upload using the AWS console, some files do not get processed. I ended up setting up a dead-letter queue to catch these invocations. While inspecting a message in the queue, I found a request ID which I tried to locate in the Lambda logs.
These are the logs for the request ID:
Now the odd part is that in the Python code, the first line after the imports is print('Loading function'), which does not show up in the Lambda log.
I've added the Python code here. It should still print "Processing file name: " + key, which is inside the handler, right?
import urllib.parse
from datetime import datetime
import boto3
from constants import CONTENT_TYPE, XML_EXTENSION, VALIDATING
from xml_process import *
from s3Integration import download_file

print('Loading function')

s3 = boto3.client('s3')

def lambda_handler(event, context):
    # Get the object from the event and show its content type
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    print("Processing file name: " + key)
    try:
        response = s3.get_object(Bucket=bucket, Key=key)
        xml_content = response["Body"].read()
        content_type = response["ContentType"]
        tree = ET.fromstring(xml_content)
        key_file_name = key.split("/")[1]
        # Creating a temporary copy by downloading file to get the namespaces
        temp_file_name = "/tmp/" + key_file_name
        download_file(key, temp_file_name)
        namespaces = {node[0]: node[1] for _, node in ET.iterparse(temp_file_name, events=['start-ns'])}
        for name, value in namespaces.items():
            ET.register_namespace(name, value)
        # Preparing path for file processing
        processed_file = key_file_name.split(".")[0] + "_processed." + key_file_name.split(".")[1]
        print(processed_file, "processed")
        db_record = XMLMapping(file_path=key,
                               processed_file_path=processed_file,
                               uploaded_by="lambda",
                               status=VALIDATING, uploaded_date=datetime.now(), is_active=True)
        session.add(db_record)
        session.commit()
        if key_file_name.split(".")[1] == XML_EXTENSION:
            if content_type in CONTENT_TYPE:
                xml_parse(tree, db_record, processed_file, True)
            else:
                print("Content Type is not valid. Provided value: ", content_type)
        else:
            print("File extension is not valid. Provided extension: ", key_file_name.split(".")[1])
        return "success"
    except Exception as e:
        print(e)
        raise e
I don't think it's a permission issue, as other files uploaded in the same batch were processed successfully.
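As an aside, one thing that can make this kind of investigation easier (a suggestion of mine, not part of the original code): log the invocation's request ID, available as context.aws_request_id, as the very first statement inside the handler, so the request ID found in the dead-letter queue message can be matched against CloudWatch Logs even when the invocation later times out before printing anything else.
import json
import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)

def lambda_handler(event, context):
    # Emit the request ID and the incoming keys before doing any real work,
    # so a DLQ'd invocation can be correlated with its log stream.
    logger.info("request_id=%s", context.aws_request_id)
    keys = [r['s3']['object']['key'] for r in event.get('Records', [])]
    logger.info("incoming keys: %s", json.dumps(keys))
    # ... rest of the processing from the handler above ...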

Botocore Stubber - Unable to locate credentials

I'm working on unit tests for my Lambda, which gets some files from S3, processes them, and loads data from them into DynamoDB. I created botocore stubbers to use during the tests, but I get botocore.exceptions.NoCredentialsError: Unable to locate credentials
My Lambda handler code:
s3_client = boto3.client('s3')
ddb_client = boto3.resource('dynamodb', region_name='eu-west-1')

def lambda_handler(event, context):
    for record in event['Records']:
        s3_event = record.get('s3')
        bucket = s3_event.get('bucket', {}).get('name', '')
        file_key = s3_event.get('object', {}).get('key', '')
        file = s3_client.get_object(Bucket=bucket, Key=file_key)
and the tests file:
class TestLambda(unittest.TestCase):
    def setUp(self) -> None:
        self.session = botocore.session.get_session()
        # S3 Stubber Set Up
        self.s3_client = self.session.create_client('s3', region_name='eu-west-1')
        self.s3_stubber = Stubber(self.s3_client)
        # DDB Stubber Set Up
        self.ddb_resource = boto3.resource('dynamodb', region_name='eu-west-1')
        self.ddb_stubber = Stubber(self.ddb_resource.meta.client)

    def test_s3_to_ddb_handler(self) -> None:
        event = {}
        with self.s3_stubber:
            with self.ddb_stubber:
                response = s3_to_ddb_handler.s3_to_ddb_handler(event, ANY)
The issue seems to be that an actual call to AWS resources is made, which shouldn't be the case; the stubber should be used instead. How can I force that?
You need to call .activate() on your Stubber instances: https://botocore.amazonaws.com/v1/documentation/api/latest/reference/stubber.html#botocore.stub.Stubber.activate
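For illustration, a minimal sketch of a fully stubbed get_object call, with the response queued via add_response and the stubber explicitly activated (the expected parameters and the CSV payload are made up for the example). Note that a Stubber only intercepts calls made through the exact client object it wraps, so the handler has to use self.s3_client (for example via injection or monkeypatching) rather than its own module-level client:
import io
import unittest

import botocore.session
from botocore.response import StreamingBody
from botocore.stub import Stubber, ANY

class TestLambda(unittest.TestCase):
    def setUp(self) -> None:
        session = botocore.session.get_session()
        self.s3_client = session.create_client('s3', region_name='eu-west-1')
        self.s3_stubber = Stubber(self.s3_client)
        body = b'col1,col2\n1,2\n'
        # Queue the response that get_object should return during the test
        self.s3_stubber.add_response(
            'get_object',
            {'Body': StreamingBody(io.BytesIO(body), len(body))},
            {'Bucket': ANY, 'Key': ANY},
        )
        self.s3_stubber.activate()  # per the linked docs; a 'with' block also activates

    def tearDown(self) -> None:
        self.s3_stubber.deactivate()

    def test_s3_to_ddb_handler(self) -> None:
        # Call the handler here, making sure it uses self.s3_client;
        # otherwise its module-level boto3 client still looks for real credentials.
        pass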

Share JPEG file stored on S3 via URL instead of downloading

I have recently completed this tutorial from AWS on how to create a thumbnail generator using Lambda and S3: https://docs.aws.amazon.com/lambda/latest/dg/with-s3-tutorial.html . Basically, I'm uploading an image file to my '-source' bucket, and then Lambda generates a thumbnail and uploads it to my '-thumbnail' bucket.
Everything works as expected. However, I wanted to use the S3 object URL in the '-thumbnail' bucket so that I can load the image from there for a small app I'm building. The issue I'm having is that the URL doesn't display the image in the browser but instead downloads the file. This causes my app to error out.
I did some research and learned that I had to change the content type to image/jpeg and also make the object public using an ACL. This works for all of the other buckets I have except the one that holds the thumbnails. I have recreated this bucket several times. I even copied the settings from my existing buckets and compared the settings to all the other buckets, and they appear to be the same.
I wanted to reach out and see if anyone has run into this type of issue before, or if there is something I might be missing.
Here is the code I'm using to generate the thumbnail.
import boto3
from boto3.dynamodb.conditions import Key, Attr
import os
import sys
import uuid
import urllib.parse
from urllib.parse import unquote_plus
from PIL.Image import core as _imaging
import PIL.Image

s3 = boto3.client('s3')
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table(os.environ['DB_TABLE_NAME'])

def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    recordId = key
    tmpkey = key.replace('/', '')
    download_path = '/tmp/{}{}'.format(uuid.uuid4(), tmpkey)
    upload_path = '/tmp/resized-{}'.format(tmpkey)
    try:
        s3.download_file(bucket, key, download_path)
        resize_image(download_path, upload_path)
        bucket = bucket.replace('source', 'thumbnail')
        s3.upload_file(upload_path, bucket, key)
        print(f"Thumbnail created and uploaded to {bucket} successfully.")
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        raise e
    else:
        s3.put_object_acl(ACL='public-read',
                          Bucket=bucket,
                          Key=key)
        # create image url to add to dynamo
        url = f"https://postreader-thumbnail.s3.us-west-2.amazonaws.com/{key}"
        print(url)
        # create record id to update the appropriate record in the 'Posts' table
        recordId = key.replace('.jpeg', '')
        # add the image_url column along with the image url as the value
        table.update_item(
            Key={'id': recordId},
            UpdateExpression="SET #statusAtt = :statusValue, #img_urlAtt = :img_urlValue",
            ExpressionAttributeValues={':statusValue': 'UPDATED', ':img_urlValue': url},
            ExpressionAttributeNames={'#statusAtt': 'status', '#img_urlAtt': 'img_url'},
        )

def resize_image(image_path, resized_path):
    with PIL.Image.open(image_path) as image:
        # change to standard/hard-coded size
        image.thumbnail(tuple(x / 2 for x in image.size))
        image.save(resized_path)
This could happen if the Content-Type of the file you're uploading is binary/octet-stream. You can modify your script as below to provide a custom content type while uploading:
s3.upload_file(upload_path, bucket, key, ExtraArgs={'ContentType': "image/jpeg"})
After more troubleshooting, the issue was apparently related to the bucket's name. I created a new bucket with a different name than it had previously, and after doing so I was able to upload and share images without issue.
I edited my code so that the Lambda uploads to the new bucket name, and I am now able to share the image via URL without it downloading.

Dump Lambda output to CSV and have it emailed as an attachment

I have a Lambda function that generates a list of untagged buckets in an AWS environment. Currently I send the output directly to a Slack channel. Instead, I would like to have my Lambda dump the output to a CSV file and send it as a report. Here is the code; let me know if you need any other details.
import boto3
from botocore.exceptions import ClientError
import urllib3
import json

http = urllib3.PoolManager()

def lambda_handler(event, context):
    # Printing the S3 buckets with no tags
    s3 = boto3.client('s3')
    s3_re = boto3.resource('s3')
    buckets = []
    print('Printing buckets with no tags..')
    for bucket in s3_re.buckets.all():
        s3_bucket = bucket
        s3_bucket_name = s3_bucket.name
        try:
            response = s3.get_bucket_tagging(Bucket=s3_bucket_name)
        except ClientError:
            buckets.append(bucket)
            print(bucket)
    for bucket in buckets:
        data = {"text": "%s bucket has no tags" % (bucket)}
        r = http.request("POST", "https://hooks.slack.com/services/~/~/~",
                         body=json.dumps(data),
                         headers={"Content-Type": "application/json"})
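One commonly used approach, sketched under the assumption that SES is set up in the account (the sender and recipient addresses below are placeholders that must be SES-verified, the send_report helper name is mine, and the Lambda role needs ses:SendRawEmail permission): write the rows to a CSV file under /tmp, attach it to a MIME message, and send it with the SES send_raw_email API. The function expects the buckets list collected in the handler above (bucket objects with a .name attribute).
import csv
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import boto3

def send_report(untagged_buckets, sender='sender@example.com', recipient='recipient@example.com'):
    # Write the bucket names to a CSV file in Lambda's writable /tmp space
    report_path = '/tmp/untagged_buckets.csv'
    with open(report_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['bucket_name'])
        for bucket in untagged_buckets:
            writer.writerow([bucket.name])

    # Build a multipart email with the CSV attached
    msg = MIMEMultipart()
    msg['Subject'] = 'Untagged S3 buckets report'
    msg['From'] = sender
    msg['To'] = recipient
    msg.attach(MIMEText('Attached is the list of untagged buckets.'))
    with open(report_path, 'rb') as f:
        attachment = MIMEApplication(f.read())
    attachment.add_header('Content-Disposition', 'attachment', filename='untagged_buckets.csv')
    msg.attach(attachment)

    # Send via SES; both addresses must be verified unless SES is out of the sandbox
    ses = boto3.client('ses')
    ses.send_raw_email(Source=sender,
                       Destinations=[recipient],
                       RawMessage={'Data': msg.as_string()})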

Does boto3 have a credential cache comparable to awscli?

With the AWS CLI there's a credential cache in ~/.aws/cli/cache, which allows me to cache credentials for a while. This is very helpful when using MFA. Does boto3 have a similar capability, or do I have to explicitly cache my credentials returned from session = boto3.session.Session(profile_name='CTO:Admin')?
It is already there.
http://boto3.readthedocs.org/en/latest/guide/configuration.html#assume-role-provider
When you specify a profile that has an IAM role configuration, boto3 will make an AssumeRole call to retrieve temporary credentials. Subsequent boto3 API calls will use the cached temporary credentials until they expire, at which point boto3 will automatically refresh them. boto3 does not write these temporary credentials to disk; they are only cached in memory within a single session, and all clients created from that session share the same temporary credentials.
I created a Python library that provides this for you - see https://github.com/mixja/boto3-session-cache
Example:
import boto3_session_cache
# This returns a regular boto3 client object with the underlying session configured with local credential cache
client = boto3_session_cache.client('ecs')
ecs_clusters = client.list_clusters()
To summarise the above points, here is a working example:
from os import path
import os
import sys
import json
import datetime
from distutils.spawn import find_executable
from botocore.exceptions import ProfileNotFound
import boto3
import botocore

def json_encoder(obj):
    """JSON encoder that formats datetimes as ISO8601 format."""
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    else:
        return obj

class JSONFileCache(object):
    """JSON file cache.

    This provides a dict like interface that stores JSON serializable
    objects.

    The objects are serialized to JSON and stored in a file. These
    values can be retrieved at a later time.
    """

    CACHE_DIR = path.expanduser(path.join('~', '.aws', 'ansible-ec2', 'cache'))

    def __init__(self, working_dir=CACHE_DIR):
        self._working_dir = working_dir

    def __contains__(self, cache_key):
        actual_key = self._convert_cache_key(cache_key)
        return path.isfile(actual_key)

    def __getitem__(self, cache_key):
        """Retrieve value from a cache key."""
        actual_key = self._convert_cache_key(cache_key)
        try:
            with open(actual_key) as f:
                return json.load(f)
        except (OSError, ValueError, IOError):
            raise KeyError(cache_key)

    def __setitem__(self, cache_key, value):
        full_key = self._convert_cache_key(cache_key)
        try:
            file_content = json.dumps(value, default=json_encoder)
        except (TypeError, ValueError):
            raise ValueError("Value cannot be cached, must be "
                             "JSON serializable: %s" % value)
        if not path.isdir(self._working_dir):
            os.makedirs(self._working_dir)
        with os.fdopen(os.open(full_key,
                               os.O_WRONLY | os.O_CREAT, 0o600), 'w') as f:
            f.truncate()
            f.write(file_content)

    def _convert_cache_key(self, cache_key):
        full_path = path.join(self._working_dir, cache_key + '.json')
        return full_path

session = boto3.session.Session()
try:
    cred_chain = session._session.get_component('credential_provider')
except ProfileNotFound:
    print("Invalid Profile")
    sys.exit(1)

provider = cred_chain.get_provider('assume-role')
provider.cache = JSONFileCache()

# Do something with the session...
ec2 = session.resource('ec2')
Originally, the credential caching and automatic renewal of temporary credentials was part of the AWS CLI, but this commit (and some subsequent ones) moved that functionality into botocore, which means it is now available in boto3 as well.
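For instance, a minimal sketch of reusing the AWS CLI's cache directory from boto3, assuming a botocore version recent enough to ship botocore.utils.JSONFileCache and the CLI's default cache location of ~/.aws/cli/cache (the profile name is just the one from the question):
import os

import boto3
import botocore.session
from botocore.utils import JSONFileCache

# Point the assume-role provider at the AWS CLI's cache so that credentials
# cached by `aws` (including MFA-based ones) are reused, and vice versa.
cli_cache_dir = os.path.join(os.path.expanduser('~'), '.aws', 'cli', 'cache')

botocore_session = botocore.session.Session(profile='CTO:Admin')
botocore_session.get_component('credential_provider') \
    .get_provider('assume-role').cache = JSONFileCache(cli_cache_dir)

session = boto3.session.Session(botocore_session=botocore_session)
s3 = session.client('s3')
print(s3.list_buckets()['Buckets'])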