I would like to export a DynamoDB table to an S3 bucket in CSV format using Python (Boto3)

This question has been asked earlier in the following link:
How to write dynamodb scan data's in CSV and upload to s3 bucket using python?
I have amended the code as advised in the comments. The code looks as follows:
import csv
import boto3
import json

dynamodb = boto3.resource('dynamodb')
db = dynamodb.Table('employee_details')

def lambda_handler(event, context):
    AWS_BUCKET_NAME = 'session5cloudfront'
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(AWS_BUCKET_NAME)
    path = '/tmp/' + 'employees.csv'
    try:
        response = db.scan()
        myFile = open(path, 'w')
        for i in response['Items']:
            csv.register_dialect('myDialect', delimiter=' ', quoting=csv.QUOTE_NONE)
            with myFile:
                writer = csv.writer(myFile, dialect='myDialect')
                writer.writerows(i)
            print(i)
    except:
        print("error")
    bucket.put_object(
        ACL='public-read',
        ContentType='application/csv',
        Key=path,
        # Body=json.dumps(i),
    )
    # print("here")
    body = {
        "uploaded": "true",
        "bucket": AWS_BUCKET_NAME,
        "path": path,
    }
    # print("then here")
    return {
        "statusCode": 200,
        "body": json.dumps(body)
    }
I am a novice; please help me fix this code, as it is having a problem inserting data into the file created in the S3 bucket.
Thanks

I have revised the code to be simpler and to also handle paginated responses for tables with more than 1MB of data:
import csv
import boto3
import json

TABLE_NAME = 'employee_details'
OUTPUT_BUCKET = 'my-bucket'
TEMP_FILENAME = '/tmp/employees.csv'
OUTPUT_KEY = 'employees.csv'

s3_resource = boto3.resource('s3')
dynamodb_resource = boto3.resource('dynamodb')
table = dynamodb_resource.Table(TABLE_NAME)

def lambda_handler(event, context):
    with open(TEMP_FILENAME, 'w') as output_file:
        writer = csv.writer(output_file)
        header = True
        first_page = True

        # Paginate through the scan results
        while True:
            # Scan the DynamoDB table
            if first_page:
                response = table.scan()
                first_page = False
            else:
                response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])

            for item in response['Items']:
                # Write the header row once, using the first item's attribute names
                if header:
                    writer.writerow(item.keys())
                    header = False
                writer.writerow(item.values())

            # Last page?
            if 'LastEvaluatedKey' not in response:
                break

    # Upload the temp file to S3
    s3_resource.Bucket(OUTPUT_BUCKET).upload_file(TEMP_FILENAME, OUTPUT_KEY)
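Note that the header row above is taken from the first item's keys; because DynamoDB items need not all have the same attributes, later rows may not line up with that header. A minimal sketch, assuming you can list the columns you care about up front (the FIELDNAMES tuple below is hypothetical), that uses csv.DictWriter to keep a stable column order:

import csv

# Hypothetical column list; adjust to your table's attributes.
FIELDNAMES = ('emp_id', 'name', 'department')

def write_items(items, path='/tmp/employees.csv'):
    # Missing attributes are written as blanks; unexpected ones are ignored.
    with open(path, 'w', newline='') as output_file:
        writer = csv.DictWriter(output_file, fieldnames=FIELDNAMES, extrasaction='ignore')
        writer.writeheader()
        for item in items:
            writer.writerow(item)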

Related

How to move files from one s3 to another recursively?

I have a Lambda to copy files (sample code below); once the files are copied, I delete them from the source. Is there a way to move the files instead, and also do it recursively, to maintain the folder or prefix in S3?
import json
import boto3

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    source_bucket_name = event['Records'][0]['s3']['bucket']['name']
    file_name = event['Records'][0]['s3']['object']['key']
    destination_bucket_name = 'gfg-destination-bucket'
    copy_object = {'Bucket': source_bucket_name, 'Key': file_name}
    s3_client.copy_object(CopySource=copy_object, Bucket=destination_bucket_name, Key=file_name)
    return {
        'statusCode': 200,
        'body': json.dumps('Success')
    }
There is no direct method for moving a file from a source S3 bucket to a destination S3 bucket, so you have to copy the file and then delete it from the source:
import json
import boto3

def lambda_handler(event, context):
    SOURCE_BUCKET = 'source-test-file'
    DESTINATION_BUCKET = 'destiation-test-file'

    s3_resource = boto3.resource('s3')
    s3_client = boto3.client('s3')
    bucket = s3_resource.Bucket(SOURCE_BUCKET)
    dest_bucket = s3_resource.Bucket(DESTINATION_BUCKET)

    # count = bucket.objects.filter(Prefix=SOURCE_BUCKET)
    count = bucket.objects.all()

    # Count the total number of objects in the source bucket
    totalCount = 0
    length = len(list(count))
    print('result :', length)

    if length == 0:
        print("bucket is empty")
    else:
        print("bucket is not empty")
        for object in bucket.objects.all():
            print("print :", object.key)
            key1 = object.key
            sourceObject = {'Bucket': SOURCE_BUCKET, 'Key': object.key}
            # Copy the object to the destination bucket under the same key
            s3_resource.meta.client.copy(sourceObject, DESTINATION_BUCKET, key1)
            # Delete the object from the source bucket after copying it
            s3_resource.Object(SOURCE_BUCKET, key1).delete()
            totalCount += 1
            print("file Moved")

Dump lambda output to CSV and have it emailed as an attachment

I have a Lambda function that generates a list of untagged buckets in an AWS environment. Currently I send the output directly to a Slack channel. Instead, I would like the Lambda to dump the output to a CSV file and send it as a report. Here is the code; let me know if you need any other details.
import boto3
from botocore.exceptions import ClientError
import urllib3
import json

http = urllib3.PoolManager()

def lambda_handler(event, context):
    # Print the S3 buckets with no tags
    s3 = boto3.client('s3')
    s3_re = boto3.resource('s3')
    buckets = []
    print('Printing buckets with no tags..')
    for bucket in s3_re.buckets.all():
        s3_bucket = bucket
        s3_bucket_name = s3_bucket.name
        try:
            response = s3.get_bucket_tagging(Bucket=s3_bucket_name)
        except ClientError:
            buckets.append(bucket)
            print(bucket)
    for bucket in buckets:
        data = {"text": "%s bucket has no tags" % (bucket)}
        r = http.request("POST", "https://hooks.slack.com/services/~/~/~",
                         body=json.dumps(data),
                         headers={"Content-Type": "application/json"})

Lambda Copy Image from one bucket to another after resize

I am receiving images in an S3 bucket. Using a Lambda function, I want to resize each image to a thumbnail and copy the thumbnail into another S3 bucket. The following is the code:
import json
import boto3
import ast
from urllib.request import urlopen
import time
from boto3.dynamodb.conditions import Key, Attr
from PIL import Image

s3_client = boto3.client('s3')
s3_res = boto3.resource('s3')

def lambda_handler(event, context):
    client = boto3.resource("dynamodb")
    tnlBuck = s3_res.Bucket('aivuthumbnail')
    for record in event['Records']:
        bucket = record['s3']['bucket']['name']
        ikey = record['s3']['object']['key']
        params = {'Bucket': bucket, 'Key': ikey}
        proj = ikey.split('_')[0]
        outlet = ikey.split('_')[1]
        parameter = ikey.split('_')[2]
        dat = ikey.split('_')[3]
        table = client.Table("telescopeImageReceipt")
        table.put_item(Item={'image': ikey, 'project': proj, 'outlet': outlet, 'parameter': parameter, 'date': dat})
        url = s3_client.generate_presigned_url(ClientMethod='get_object', Params=params)
        with urlopen(url) as conn:
            image = Image.open(conn)
            MAX_SIZE = (100, 100)
            image.thumbnail(MAX_SIZE)
            image.copy("Bucket":tnlBuck)
I have changed the last line to various combinations, but nothing works. The Lambda function has full access to S3, DynamoDB and CloudWatch Logs.
The following are some of the options I tried and the error messages I got:
Option tried: tnlBuck.copy(image, ikey)
Error: Expecting dictionary formatted: {"Bucket": bucket_name, "Key": key} but got <PIL.JpegImagePlugin.JpegImageFile image
Option tried: s3_client.copy({"Bucket":tnlBuck, "Key":ikey})
Error: TypeError: copy() missing 2 required positional arguments: 'Bucket' and 'Key'
Option tried: image.copy({"Bucket":tnlBuck, "Key":ikey})
Error: TypeError: copy() takes 1 positional argument but 2 were given
Other options had more or less similar errors or threw a syntax error.
You need to write the thumbnail bytes to an object in the S3 bucket rather than calling copy on the PIL Image object.
Your code should be changed to this:
import json
import io
import boto3
import ast
from urllib.request import urlopen
import time
from boto3.dynamodb.conditions import Key, Attr
from PIL import Image

s3_client = boto3.client('s3')
s3_res = boto3.resource('s3')

def lambda_handler(event, context):
    client = boto3.resource("dynamodb")
    tnl_bucket = s3_res.Bucket('aivuthumbnail')
    for record in event['Records']:
        bucket = record['s3']['bucket']['name']
        ikey = record['s3']['object']['key']
        params = {'Bucket': bucket, 'Key': ikey}
        proj = ikey.split('_')[0]
        outlet = ikey.split('_')[1]
        parameter = ikey.split('_')[2]
        dat = ikey.split('_')[3]
        table = client.Table("telescopeImageReceipt")
        table.put_item(Item={'image': ikey, 'project': proj, 'outlet': outlet, 'parameter': parameter, 'date': dat})
        url = s3_client.generate_presigned_url(ClientMethod='get_object', Params=params)
        with urlopen(url) as conn:
            image = Image.open(conn)
            MAX_SIZE = (100, 100)
            image.thumbnail(MAX_SIZE)
            # Save the thumbnail to an in-memory buffer and upload it to the thumbnail bucket
            img_bytes = io.BytesIO()
            image.save(img_bytes, format='JPEG')
            img_bytes.seek(0)
            tnl_bucket.Object(ikey).put(Body=img_bytes.read())
You should use tnl_bucket to create a new object from the thumbnailed image bytes:
img_bytes = io.BytesIO()
image.save(img_bytes, format='JPEG')
img_bytes.seek(0)
tnl_bucket.Object(ikey).put(Body=img_bytes.read())
PIL can save to a file path or to a BytesIO object. You need to return to the beginning of the stream with .seek(0) so it can be read from the start to get the bytes for the put method.
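Equivalently, you could hand the buffer to the higher-level transfer helper instead of calling put; a minimal sketch, assuming the same img_bytes buffer and tnl_bucket from the snippet above:

img_bytes.seek(0)  # rewind before handing the buffer off
tnl_bucket.upload_fileobj(img_bytes, ikey, ExtraArgs={'ContentType': 'image/jpeg'})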

Get information about a file uploaded to S3

I have created a Lambda function that sends an email whenever a file is uploaded to an S3 bucket, but now I want to include all the information related to that file, such as the name, size, and date and time of upload, and, if possible, where it came from.
I have all this information in the AWS console, but I want to have it in the email body.
I am using the Serverless Framework, v1.22.0.
Here is my code:
import json
import boto3
import botocore
import logging
import sys
import os
import traceback
from botocore.exceptions import ClientError
from pprint import pprint
from time import strftime, gmtime

email_from = '********#*****.com'
email_to = '********#*****.com'
email_subject = 'new event on s3 '
email_body = 'a new file is uploaded'

# Set up simple logging for INFO
logger = logging.getLogger()
logger.setLevel(logging.INFO)

from botocore.exceptions import ClientError

def sthree(event, context):
    """Send email whenever a file is uploaded to S3"""
    body = {}
    status_code = 200
    email_body = str(context)
    try:
        s3 = boto3.client('s3')
        ses = boto3.client('ses')
        ses.send_email(Source=email_from,
                       Destination={'ToAddresses': [email_to, ], },
                       Message={'Subject': {'Data': email_subject}, 'Body': {'Text': {'Data': email_body}}}
                       )
    except Exception as e:
        print(traceback.format_exc())
        status_code = 500
        body["message"] = json.dumps(e)
    response = {
        "statusCode": 200,
        "body": json.dumps(body)
    }
    return response
Here is the event json structure sent by S3 upon object creation:
http://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html
You can get the file names, sizes and source ip like this:
for record in event['Records']:
    filename = record['s3']['object']['key']
    filesize = record['s3']['object']['size']
    source = record['requestParameters']['sourceIPAddress']
    eventTime = record['eventTime']
import boto3

def lambda_handler(event, context):
    s3 = boto3.client('s3')
    email_from = 'XXXXXXXXX#XXX.com'
    email_to = 'XXXXXXXXX#XXX.com'
    email_subject = 'new event on s3'
    email_body = ("File Name :" + event[u'Records'][0][u's3'][u'object'][u'key'] + "\n"
                  + "File Size :" + str(event[u'Records'][0][u's3'][u'object'][u'size']) + "\n"
                  + "Upload Time :" + event[u'Records'][0][u'eventTime'] + "\n"
                  + "User Details :" + event[u'Records'][0][u'userIdentity'][u'principalId'])
    ses = boto3.client('ses')
    ses.send_email(Source=email_from,
                   Destination={'ToAddresses': [email_to, ], },
                   Message={'Subject': {'Data': email_subject}, 'Body': {'Text': {'Data': email_body}}}
                   )
    print("Function execution Completed !!!")

DynamoDB pagination using Boto3

We are using boto3 for our DynamoDB, and we need to do a full scan of our tables. Based on another post, to do that we need to use pagination. However, we are unable to find a working sample of pagination. Here is what we did:
import boto3

client_setting = boto3.client('dynamodb', region_name='ap-southeast-2')
paginator = client_setting.get_paginator('scan')
esk = {}
data = []
unconverted_ga = ourQuery(params1, params2)
for page in unconverted_ga:
    data.append(page)
    esk = page['LastEvaluatedKey']
We don't know exactly how to use the esk as the ExclusiveStartKey of our next query. What should the expected value of the ExclusiveStartKey parameter be? We are still new to DynamoDB and there are many things we need to learn, including this. Thanks!
From the answer by Tay B at https://stackoverflow.com/a/38619425/3176550
import boto3

dynamodb = boto3.resource('dynamodb',
                          aws_session_token=aws_session_token,
                          aws_access_key_id=aws_access_key_id,
                          aws_secret_access_key=aws_secret_access_key,
                          region_name=region
                          )
table = dynamodb.Table('widgetsTableName')

response = table.scan()
data = response['Items']

# Keep scanning until LastEvaluatedKey is no longer returned
while 'LastEvaluatedKey' in response:
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    data.extend(response['Items'])
After an hour of searching, I've finally found a better solution. For those who are new to DynamoDB, we shouldn't miss this: http://docs.aws.amazon.com/amazondynamodb/latest/gettingstartedguide/GettingStarted.Python.04.html
from __future__ import print_function  # Python 2/3 compatibility
import boto3
import json
import decimal
from boto3.dynamodb.conditions import Key, Attr

# Helper class to convert a DynamoDB item to JSON.
class DecimalEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, decimal.Decimal):
            if o % 1 > 0:
                return float(o)
            else:
                return int(o)
        return super(DecimalEncoder, self).default(o)

dynamodb = boto3.resource('dynamodb', region_name='us-west-2', endpoint_url="http://localhost:8000")
table = dynamodb.Table('Movies')

fe = Key('year').between(1950, 1959)
pe = "#yr, title, info.rating"
# Expression Attribute Names for Projection Expression only.
ean = {"#yr": "year", }
esk = None

response = table.scan(
    FilterExpression=fe,
    ProjectionExpression=pe,
    ExpressionAttributeNames=ean
)
for i in response['Items']:
    print(json.dumps(i, cls=DecimalEncoder))

# As long as LastEvaluatedKey is in the response, there are still items left to fetch
while 'LastEvaluatedKey' in response:
    response = table.scan(
        ProjectionExpression=pe,
        FilterExpression=fe,
        ExpressionAttributeNames=ean,
        ExclusiveStartKey=response['LastEvaluatedKey']
    )
    for i in response['Items']:
        print(json.dumps(i, cls=DecimalEncoder))
You can try the following code:
esk = None
while True:
    scan_generator = YourTableName.scan(max_results=10, exclusive_start_key=esk)
    for item in scan_generator:
        pass  # your code for processing
    else:
        # condition to check if the entire table is scanned
        break
    # Load the last keys
    esk = scan_generator.kwargs['exclusive_start_key'].values()
Here is the reference documentation link.
Hope that helps
Bit more verbose but I like it.
def fetch_from_table(last_key=None):
    if last_key:
        response = table.query(
            IndexName='advertCatalogIdx',
            KeyConditionExpression=Key('sk').eq('CATALOG'),
            Limit=5,
            ExclusiveStartKey=last_key
        )
    else:
        response = table.query(
            IndexName='advertCatalogIdx',
            KeyConditionExpression=Key('sk').eq('CATALOG'),
            Limit=5
        )
    # print(response)
    for item in response['Items']:
        print(item['address'])
    print('***************************')
    return response.get('LastEvaluatedKey')

last_key = fetch_from_table()
while last_key is not None:
    print("Running again : ")
    last_key = fetch_from_table(last_key)
import sys
import boto3

client = boto3.client('dynamodb')

marker = None
while True:
    paginator = client.get_paginator('list_tables')
    page_iterator = paginator.paginate(
        PaginationConfig={
            'MaxItems': 1000,
            'PageSize': 100,
            'StartingToken': marker})
    for page in page_iterator:
        tables = page['TableNames']
        for table in tables:
            print(table)
    try:
        marker = page['NextToken']
    except KeyError:
        sys.exit()
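Since the question is about paginating a Scan of table items rather than listing tables, here is a minimal sketch using boto3's built-in scan paginator, which handles LastEvaluatedKey/ExclusiveStartKey for you; the table name below is a placeholder, and note that the low-level client returns items as DynamoDB attribute-value maps:

import boto3

client = boto3.client('dynamodb', region_name='ap-southeast-2')

TABLE_NAME = 'your-table-name'  # placeholder; replace with your own table

# The 'scan' paginator keeps requesting pages until the table is fully scanned.
paginator = client.get_paginator('scan')
items = []
for page in paginator.paginate(TableName=TABLE_NAME):
    items.extend(page['Items'])

print('Total items scanned:', len(items))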