csv file from S3 to Kinesis Streams using Lambda Function in Python - amazon-web-services

I'm using the below Lambda code to read data from an S3 bucket that triggers the lambda function once a file is created into the S3 bucket.
import json
import urllib.parse
import boto3
print('Loading function')
s3 = boto3.client('s3')
def lambda_handler(event, context):
#print("Received event: " + json.dumps(event, indent=2))
# Get the object from the event and show its content type
bucket = event['Records'][0]['s3']['bucket']['name']
key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
print("bucket= ", bucket)
print("key= ", key)
try:
response = s3.get_object(Bucket=bucket, Key=key)
print("CONTENT TYPE: " + response['ContentType'])
return response['ContentType']
except Exception as e:
print(e)
print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
raise e
The uploaded file is a csv file that contains 50 comma delimited 50 records.
I need to stream this received data into an AWS Kinesis Data Stream called test-stream once the file is uploaded to the S3 bucket.. Any help please?
Thanks..

Related

How to upload a list of files from the web directly to s3 bucket

I have a file with urls in my s3 bucket. I would like to use a python lambda function to upload the url files to s3 bucket.
For example my uploaded file to s3 contains:
http://...
http://...
Each line corresponds to a file to be uploaded into s3.
Here is the code:
import json
import urllib.parse
import boto3
import requests
import os
from gzip import GzipFile
from io import TextIOWrapper
import requests
print('Loading functions')
s3 = boto3.client('s3')
def get_file_seqs(response):
try:
size = response['ResponseMetadata']['HTTPHeaders']['content-length']
print("[+] Size retrieved")
return size
except:
print("[-] Size can not be retrieved")
def lambda_handler(event, context):
# Defining bucket objects
bucket = event['Records'][0]['s3']['bucket']['name']
key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
#get file from s3
print('[+] Getting file from S3 bucket')
response = s3.get_object(Bucket=bucket, Key=key)
try:
#checking file size
print('[+] Checking file size')
file_size = get_file_seqs(response)
if file_size == 0:
print('File size is equal to 0')
return False
else:
#create new directories
print('[+] Creating new directories')
bucket_name = "triggersnextflow"
directories = ['backups/sample/', 'backups/control/']
#loop to create new dirs
for dirs in directories:
s3.put_object(Bucket = bucket_name, Key = dirs, Body = '')
#NOW I WOULD LIKE TO DOWNLOAD THE FILES FROM THE URLS INSIDE S3 OBJECT
#return true
return True
except Exception as e:
print(e)
print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
raise e
Download an S3 object to a file:
import boto3
s3 = boto3.resource('s3')
s3.meta.client.download_file('mybucket', 'hello.txt', '/tmp/hello.txt')
You will find great resource of information here:
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.download_file

S3 boto3 corrupts file

I have the following function:
def upload_file(file_name, bucket, object_name=None):
"""Upload a file to an S3 bucket -> from aws docs
:param file_name: File to upload
:param bucket: Bucket to upload to
:param object_name: S3 object name. If not specified then file_name is used
:return: True if file was uploaded, else False
"""
# If S3 object_name was not specified, use file_name
if object_name is None:
object_name = file_name
# Upload the file
s3_client = boto3.client('s3')
try:
response = s3_client.upload_file(file_name, bucket, object_name)
except ClientError as e:
logging.error(e)
return False
return True
I am trying to upload a html file to an S3 bucket acting as a webserver. When I manually upload the html file to S3, it works as expected, and displays the page when I navigate to the S3 bucket's URL.
If I programmatically upload the file using the above function, the html file will no longer be hosted, and my browser will attempt to download a XZ file.
Am I missing a parameter or something?
Courtesy of #jarmod, I learned I was setting an incorrect content-type.
Here is the updated function to upload an HTML file as text/html.
def upload_file(file_name, bucket, object_name=None):
"""Upload a file to an S3 bucket -> from aws docs
:param file_name: File to upload
:param bucket: Bucket to upload to
:param object_name: S3 object name. If not specified then file_name is used
:return: True if file was uploaded, else False
"""
# If S3 object_name was not specified, use file_name
if object_name is None:
object_name = file_name
# Upload the file
s3_client = boto3.client('s3')
try:
response = s3_client.upload_file(file_name, bucket, object_name, ExtraArgs={'ContentType': "text/html"})
except ClientError as e:
logging.error(e)
return False
return True

Upload files to s3 bucket using python gives Access Denied

I created a Python script that should upload a file from my local ec2 to the s3 bucket
import boto3
s3 = boto3.resource('s3')
data = open('backupFile.txt', 'rb')
s3.Bucket('mlsd').put_object(Key='backupFile.txt', Body=data)
I went to AWS account details and got the credentials.
I executed aws configure to set credentials on my EC2.
Hear is the output of the credentials using aws configure list:
I went to .aws/credentials and pasted access_key_id, secret_access_key, and token
I ensured that the token is not expired.
When I ran the script, I got the following output:
Not sure what the problem is.
Boto3 detects your credentials in possible locations, as described here, so it should find your access_key_id and secret_access_key
Make sure the user whose access_key_id you use has the access to S3 bucket.
I tried this code example and it works:
import logging
import boto3
from botocore.exceptions import ClientError
def upload_file(file_name, bucket, object_name=None):
"""Upload a file to an S3 bucket
:param file_name: File to upload
:param bucket: Bucket to upload to
:param object_name: S3 object name. If not specified then file_name is used
:return: True if file was uploaded, else False
"""
# If S3 object_name was not specified, use file_name
if object_name is None:
object_name = file_name
# Upload the file
s3_client = boto3.client('s3')
try:
response = s3_client.upload_file(file_name, bucket, object_name)
except ClientError as e:
logging.error(e)
return False
return True

Convert json to csv from one S3 bucket and upload to another S3 bucket through AWS Lambda

I have tried below code but I am not able to convert the data from json to csv. Can someone please help me?
import boto3
import botocore
import csv
def lambda_handler(event, context):
BUCKET_NAME = 'name of the bucket' # replace with your bucket name
KEY = 'OUTPUT.csv' # replace with your object key
json_data = [{"id":"1","name":"test"},{"id":"2","name":"good"}]
with open("data.csv", "w") as file:
csv_file = csv.writer(file)
csv_file.writerow(['id', 'name'])
for item in data:
csv_file.writerow([item.get('id'),item.get('name')])
csv_binary = open('data.csv', 'rb').read()
try:
obj = s3.Object(BUCKET_NAME, KEY)
obj.put(Body=csv_binary)
except botocore.exceptions.ClientError as e:
if e.response['Error']['Code'] == "404":
print("The object does not exist.")
else:
raise
s3client = boto3.client('s3')
try:
download_url = s3client.generate_presigned_url(
'get_object',
Params={
'Bucket': BUCKET_NAME,
'Key': KEY
},
ExpiresIn=3600
)
return {"csv_link": download_url}
except Exception as e:
raise utils_exception.ErrorResponse(400, e, Log)
Here is the response I am getting for the above code:
{
"errorMessage": "[Errno 30] Read-only file system: 'data.csv'",
"errorType": "OSError",
"stackTrace": [
" File \"/var/task/lambda_function.py\", line 8, in lambda_handler\n with open(\"data.csv\", \"wb\") as file:\n"
]
}
In AWS Lambda, you can only create files in the /tmp/ directory. Therefore, use:
with open("/tmp/data.csv", "w") as file:
A maximum of 512MB is provided, so it is a good idea to delete any temporary files so they do not interfere with future executions of the Lambda function.

AWS Lambda bootstrap.py file is throwing error while trying to upload data to elastic search

I'm trying to index pdf documents that are uploaded to s3 bucket. My lambda function is working fine til PDF extraction part. it's establishing connection with elastic search endpoint and while uploading data elastic search for indexing, it's throwing error. Please find lambda function code below. Please help me with this. Thanks in advance.
from __future__ import print_function
import json
import urllib
import boto3
import slate
import elasticsearch
import datetime
es_endpoint = 'search-sdjsf-zrtisx]sdaswasfsjmtsyuih3awvu.us-east-
1.es.amazonaws.com'
es_index = 'pdf_text_extracts'
es_type = 'document'
print('Loading function')
s3 = boto3.client('s3')
# prepare a dict to hold our document data
doc_data = {}
doc_data['insert_time'] =
str(datetime.datetime.isoformat(datetime.datetime.now()))
def lambda_handler(event, context):
#print("Received event: " + json.dumps(event, indent=2))
# Get the object from the event and show its content type
bucket = event['Records'][0]['s3']['bucket']['name']
object_key = urllib.unquote_plus(event['Records'][0]['s3']['object']
['key']).decode('utf8')
try:
# get the file data from s3
temp_pdf_file = open('/tmp/tempfile.pdf', 'w')
response = s3.get_object(Bucket=bucket, Key=object_key)
print("CONTENT TYPE: " + response['ContentType'])
# return response['ContentType']
temp_pdf_file.write(response['Body'].read()) # write the object data
to a local file; will be passed to slate
temp_pdf_file.close() # close the temporary file for now
# pull the text from the temporary PDF file using slate
print("Extracting data from: " + object_key)
with open('/tmp/tempfile.pdf') as temp_pdf_file:
doc = slate.PDF(temp_pdf_file)
# store document data to dict
doc_data['source_pdf_name'] = object_key
doc_data['document_text'] = doc[0] # we're only worried about page 1
at this point
#datj=json.dumps(doc_data)
#z=json.loads(datj)
#print(z)
except Exception as e:
print(e)
print('Error getting object {} from bucket {}. Make sure they exist
and your bucket is in the same region as this
function.'.format(object_key, bucket))
raise e
# put the data in ES
#try:
es = elasticsearch.Elasticsearch([{'host': es_endpoint, 'port': 443,
'use_ssl': True}]) # hold off on validating certs
es_response = es.index(index=es_index, doc_type=es_type, body=doc_data)
print('Data posted to ES: ' + str(es_response))
#except Exception as e:
#print('Data post to ES failed: ' + str(e))
#raise e
return "Done"
I have removed try and except in last block to find the actual error and its throwing the below error while trying to upload data to elastic search.
Traceback (most recent call last):
File "/var/runtime/awslambda/bootstrap.py", line 576, in <module>
main()
File "/var/runtime/awslambda/bootstrap.py", line 571, in main
handle_event_request(request_handler, invokeid, event_body, context_objs,
invoked_function_arn)
File "/var/runtime/awslambda/bootstrap.py", line 264, in
handle_event_request
result = report_fault_helper(invokeid, sys.exc_info(), None)
File "/var/runtime/awslambda/bootstrap.py", line 315, in report_fault_helper
msgs = [str(value), etype.__name__]
Remove the return "Done" at the end, that's not allowed in a Lambda environment.