Trying to get all RDS instances with boto3 - does not return all RDS instances.
When I look at my RDS instances in Oregon (us-west-2), I see the following:
However, if I run the below Python3 script, I only get one result:
$ python3 ./stackoverflow.py
RDS instances in Oregon
------------------------------
aurora-5-7-yasmin.cazdggrmkpt1.us-west-2.rds.amazonaws.com qa test db.t2.small aurora-5-7-yasmin
$
Can you suggest a way to get boto3 to display all RDS instances?
$ cat ./stackoverflow.py
import collections
import boto3
import datetime
import pygsheets
REGIONS = ('us-west-2',)
REGIONS_H = ('Oregon',)
currentDT = str(datetime.datetime.now())
def create_spreadsheet(outh_file, spreadsheet_name = "AWS usage"):
client = pygsheets.authorize(outh_file=outh_file, outh_nonlocal=True)
client.list_ssheets(parent_id=None)
spread_sheet = client.create(spreadsheet_name)
return spread_sheet
def rds_worksheet_creation(spread_sheet):
for i in range(len(REGIONS)):
region = REGIONS[i]
region_h = REGIONS_H[i]
print()
print("{} instances in {}".format("RDS", region_h))
print("------------------------------")
client = boto3.client('rds', region_name=region)
db_instances = client.describe_db_instances()
for i in range(len(db_instances)):
j = i - 1
try:
DBName = db_instances['DBInstances'][j]['DBName']
MasterUsername = db_instances['DBInstances'][0]['MasterUsername']
DBInstanceClass = db_instances['DBInstances'][0]['DBInstanceClass']
DBInstanceIdentifier = db_instances['DBInstances'][0]['DBInstanceIdentifier']
Endpoint = db_instances['DBInstances'][0]['Endpoint']
Address = db_instances['DBInstances'][0]['Endpoint']['Address']
print("{} {} {} {} {}".format(Address, MasterUsername, DBName, DBInstanceClass,
DBInstanceIdentifier))
except KeyError:
continue
if __name__ == "__main__":
spread_sheet = create_spreadsheet(spreadsheet_name = "AWS usage", outh_file = '../client_secret.json')
spread_sheet.link(syncToCloud=False)
rds_worksheet_creation(spread_sheet)
$ cat ../client_secret.json
{"installed":{"client_id":"362799999999-uml0m2XX4v999999mr2s03XX9g8l9odi.apps.googleusercontent.com","project_id":"amiable-shuttle-198516","auth_uri":"https://accounts.google.com/o/oauth2/auth","token_uri":"https://accounts.google.com/o/oauth2/token","auth_provider_x509_cert_url":"https://www.googleapis.com/oauth2/v1/certs","client_secret":"XXXXxQH434Qg-xxxx99_n0vW","redirect_uris":["urn:ietf:wg:oauth:2.0:oob","http://localhost"]}}
$
Edit 1:
Following Michael's comment, I changed the script to the following, but even though one more related line appeared, most of the RDS instances are still not returned:
$ python3 ./stackoverflow.py
RDS instances in Oregon
------------------------------
aurora-5-7-yasmin.cazdggrmkpt1.us-west-2.rds.amazonaws.com qa +++ DBName gave KeyError +++ db.t2.small aurora-5-7-yasmin
aurora-5-7-yasmin.cazdggrmkpt1.us-west-2.rds.amazonaws.com qa test db.t2.small aurora-5-7-yasmin
$
$ cat ./stackoverflow.py
import collections
import boto3
import datetime
import pygsheets
REGIONS = ('us-west-2',)
REGIONS_H = ('Oregon',)
currentDT = str(datetime.datetime.now())
def create_spreadsheet(outh_file, spreadsheet_name = "AWS usage"):
client = pygsheets.authorize(outh_file=outh_file, outh_nonlocal=True)
client.list_ssheets(parent_id=None)
spread_sheet = client.create(spreadsheet_name)
return spread_sheet
def rds_worksheet_creation(spread_sheet):
for i in range(len(REGIONS)):
region = REGIONS[i]
region_h = REGIONS_H[i]
print()
print("{} instances in {}".format("RDS", region_h))
print("------------------------------")
client = boto3.client('rds', region_name=region)
db_instances = client.describe_db_instances()
for i in range(len(db_instances)):
j = i - 1
try:
DBName = db_instances['DBInstances'][j]['DBName']
except KeyError:
DBName = "+++ DBName gave KeyError +++"
MasterUsername = db_instances['DBInstances'][0]['MasterUsername']
DBInstanceClass = db_instances['DBInstances'][0]['DBInstanceClass']
DBInstanceIdentifier = db_instances['DBInstances'][0]['DBInstanceIdentifier']
Endpoint = db_instances['DBInstances'][0]['Endpoint']
Address = db_instances['DBInstances'][0]['Endpoint']['Address']
print("{} {} {} {} {}".format(Address, MasterUsername, DBName, DBInstanceClass,
DBInstanceIdentifier))
if __name__ == "__main__":
spread_sheet = create_spreadsheet(spreadsheet_name = "AWS usage", outh_file = '../client_secret.json')
spread_sheet.link(syncToCloud=False)
rds_worksheet_creation(spread_sheet)
You have an error in your original code, but if you want this code to scale to a large number of instances (it is unlikely you'll need this), then you'll want to use something like the following:
import boto3
available_regions = boto3.Session().get_available_regions('rds')
for region in available_regions:
rds = boto3.client('rds', region_name=region)
paginator = rds.get_paginator('describe_db_instances').paginate()
for page in paginator:
for dbinstance in page['DBInstances']:
print("{DBInstanceClass}".format(**dbinstance))
You can get rid of the paginator and just use the outer region loop if you know each region will have fewer than 100 instances (describe_db_instances returns at most 100 records per call):
for region in available_regions:
rds = boto3.client('rds', region_name=region)
for dbinstance in rds.describe_db_instances()['DBInstances']:
print("{DBInstanceClass}".format(**dbinstance))
Additionally you can provide a simple
dbinstance.get('DBName', 'No Name Set')
instead of excepting around the KeyError.
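Putting those pieces together, here is a minimal sketch (untested) of the per-region loop with the paginator and .get(), printing the same fields as the question; the region tuple is the one from the question:
import boto3

REGIONS = ('us-west-2',)

for region in REGIONS:
    rds = boto3.client('rds', region_name=region)
    for page in rds.get_paginator('describe_db_instances').paginate():
        for db in page['DBInstances']:
            print(db['Endpoint']['Address'],
                  db['MasterUsername'],
                  db.get('DBName', 'No Name Set'),   # avoids the KeyError for instances without a DBName
                  db['DBInstanceClass'],
                  db['DBInstanceIdentifier'])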
Your for loop range only evaluates to 2 because db_instances is a dict, so len() counts its top-level keys rather than the instances.
Instead of
for i in range(len(db_instances)):
It should be
for i in range(len(db_instances['DBInstances'])):
which is the list of instances, giving the loop the correct length to iterate over.
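Applied to the question's code, a minimal sketch (untested) of the corrected loop would look like this; note that the per-item lookups should also use the loop index rather than always element [0]:
import boto3

client = boto3.client('rds', region_name='us-west-2')
db_instances = client.describe_db_instances()
for i in range(len(db_instances['DBInstances'])):
    instance = db_instances['DBInstances'][i]   # index each instance instead of always reading element [0]
    print(instance['DBInstanceIdentifier'], instance['DBInstanceClass'])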
This code will list all RDS instances present in the account (in the client's default region). Try the following:
#!/usr/bin/env python
import boto3
client = boto3.client('rds')
response = client.describe_db_instances()
for i in response['DBInstances']:
db_name = i.get('DBName', 'N/A')  # DBName is not set for every instance
db_instance_name = i['DBInstanceIdentifier']
db_type = i['DBInstanceClass']
db_storage = i['AllocatedStorage']
db_engine = i['Engine']
print(db_instance_name, db_type, db_storage, db_engine)
FYI, the more Pythonic way to do loops in this case would be:
for instance in db_instances['DBInstances']:
MasterUsername = instance['MasterUsername']
DBInstanceClass = instance['DBInstanceClass']
etc.
This avoids the need for i-type iterators.
I know this page lists the instance types that are based on the Nitro system, but I would like to get the list dynamically with the CLI (for example, using aws ec2 describe-instances). Is it possible to get Nitro-based instance types other than by parsing the static page? If so, could you tell me how?
You'd have to write a bit of additional code to get that information. aws ec2 describe-instances will give you the InstanceType property. You should use a programming language to parse the JSON, extract InstanceType, and then call describe-instance-types: https://docs.aws.amazon.com/cli/latest/reference/ec2/describe-instance-types.html?highlight=nitro
From the JSON you get back, extract hypervisor. That'll give you Nitro if the instance is Nitro.
Here's some Python code that might work. I have not tested it fully, but you can tweak it to get the results you want.
"""List all EC2 instances"""
import boto3
def ec2_connection():
"""Connect to AWS using API"""
region = 'us-east-2'
aws_key = 'xxx'
aws_secret = 'xxx'
session = boto3.Session(
aws_access_key_id = aws_key,
aws_secret_access_key = aws_secret
)
ec2 = session.client('ec2', region_name = region)
return ec2
def get_reservations(ec2):
"""Get a list of instances as a dictionary"""
response = ec2.describe_instances()
return response['Reservations']
def process_instances(reservations, ec2):
"""Print a colorful list of IPs and instances"""
if len(reservations) == 0:
print('No instance found. Quitting')
return
for reservation in reservations:
for instance in reservation['Instances']:
# get friendly name of the server
# only try this for mysql1.local server
friendly_name = get_friendly_name(instance)
if friendly_name.lower() != 'mysql1.local':
continue
# get the hypervisor based on the instance type
instance_type = get_instance_info(instance['InstanceType'], ec2)
# print findings
print(f'{friendly_name} // {instance["InstanceType"]} is {instance_type}')
break
def get_instance_info(instance_type, ec2):
"""Get hypervisor from the instance type"""
response = ec2.describe_instance_types(
InstanceTypes=[instance_type]
)
return response['InstanceTypes'][0]['Hypervisor']
def get_friendly_name(instance):
"""Get friendly name of the instance"""
tags = instance.get('Tags', [])  # some instances have no tags at all
for tag in tags:
if tag['Key'] == 'Name':
return tag['Value']
return 'Unknown'
def run():
"""Main method to call"""
ec2 = ec2_connection()
reservations = get_reservations(ec2)
process_instances(reservations, ec2)
if __name__ == '__main__':
run()
print('Done')
In the above answer, the statement "From the JSON you get back, extract hypervisor. That'll give you Nitro if the instance is Nitro" is no longer accurate.
As per the latest AWS documentation,
hypervisor - The hypervisor type of the instance (ovm | xen ). The value xen is used for both Xen and Nitro hypervisors.
Cleaned up, verified working code below:
# Get all instance types that run on Nitro hypervisor
import boto3
def get_nitro_instance_types():
"""Get all instance types that run on Nitro hypervisor"""
ec2 = boto3.client('ec2', region_name = 'us-east-1')
response = ec2.describe_instance_types(
Filters=[
{
'Name': 'hypervisor',
'Values': [
'nitro',
]
},
],
)
instance_types = []
for instance_type in response['InstanceTypes']:
instance_types.append(instance_type['InstanceType'])
return instance_types
get_nitro_instance_types()
Example output as of 12/06/2022 below:
['r5dn.8xlarge', 'x2iedn.xlarge', 'r6id.2xlarge', 'r6gd.medium',
'm5zn.2xlarge', 'r6idn.16xlarge', 'c6a.48xlarge', 'm5a.16xlarge',
'im4gn.2xlarge', 'c6gn.16xlarge', 'c6in.24xlarge', 'r5ad.24xlarge',
'r6i.xlarge', 'c6i.32xlarge', 'x2iedn.2xlarge', 'r6id.xlarge',
'i3en.24xlarge', 'i3en.12xlarge', 'm5d.8xlarge', 'c6i.8xlarge',
'r6g.large', 'm6gd.4xlarge', 'r6a.2xlarge', 'x2iezn.4xlarge',
'c6i.large', 'r6in.24xlarge', 'm6gd.xlarge', 'm5dn.2xlarge',
'd3en.2xlarge', 'c6id.8xlarge', 'm6a.large', 'is4gen.xlarge',
'r6g.8xlarge', 'm6idn.large', 'm6a.2xlarge', 'c6i.4xlarge',
'i4i.16xlarge', 'm5zn.6xlarge', 'm5.8xlarge', 'm6id.xlarge',
'm5n.16xlarge', 'c6g.16xlarge', 'r5n.12xlarge', 't4g.nano',
'm5ad.12xlarge', 'r6in.12xlarge', 'm6idn.12xlarge', 'g5.2xlarge',
'trn1.32xlarge', 'x2gd.8xlarge', 'is4gen.4xlarge', 'r6gd.xlarge',
'r5a.xlarge', 'r5a.2xlarge', 'c5ad.24xlarge', 'r6a.xlarge',
'r6g.medium', 'm6id.12xlarge', 'r6idn.2xlarge', 'c5n.2xlarge',
'g5.4xlarge', 'm5d.xlarge', 'i3en.3xlarge', 'r5.24xlarge',
'r6gd.2xlarge', 'c5d.large', 'm6gd.12xlarge', 'm6id.2xlarge',
'm6i.large', 'z1d.2xlarge', 'm5a.4xlarge', 'm5a.2xlarge',
'c6in.xlarge', 'r6id.16xlarge', 'c7g.8xlarge', 'm5dn.12xlarge',
'm6gd.medium', 'im4gn.8xlarge', 'm5dn.large', 'c5ad.4xlarge',
'r6g.16xlarge', 'c6a.24xlarge', 'c6a.16xlarge']
"""List all EC2 instances"""
import boto3
def ec2_connection():
"""Connect to AWS using API"""
region = 'us-east-2'
aws_key = 'xxx'
aws_secret = 'xxx'
session = boto3.Session(
aws_access_key_id = aws_key,
aws_secret_access_key = aws_secret
)
ec2 = session.client('ec2', region_name = region)
return ec2
def get_reservations(ec2):
"""Get a list of instances as a dictionary"""
response = ec2.describe_instances()
return response['Reservations']
def process_instances(reservations, ec2):
"""Print a colorful list of IPs and instances"""
if len(reservations) == 0:
print('No instance found. Quitting')
return
for reservation in reservations:
for instance in reservation['Instances']:
# get friendly name of the server
# only try this for mysql1.local server
friendly_name = get_friendly_name(instance)
if friendly_name.lower() != 'mysql1.local':
continue
# get the hypervisor based on the instance type
instance_type = get_instance_info(instance['InstanceType'], ec2)
# print findings
print(f'{friendly_name} // {instance["InstanceType"]} is {instance_type}')
break
def get_instance_info(instance_type, ec2):
"""Get hypervisor from the instance type"""
response = ec2.describe_instance_types(
InstanceTypes=[instance_type]
)
return response['InstanceTypes'][0]['Hypervisor']
def get_friendly_name(instance):
"""Get friendly name of the instance"""
tags = instance['Tags']
for tag in tags:
if tag['Key'] == 'Name':
return tag['Value']
return 'Unknown'
def run():
"""Main method to call"""
ec2 = ec2_connection()
reservations = get_reservations(ec2)
process_instances(reservations, ec2)
if name == 'main':
run()
print('Done')
We've got a number of stored procedures that have been built in Redshift on AWS.
We need to download and upload these stored procedures so that they can be kept in GITHUB as a means of tracking changes.
These procedures eventually need to be part of a cloudformation template so that the infrastructure can be maintained as well.
Ideally this could be done using the AWS CLI but there doesn't seem to be a command to do that.
How are AWS Redshift stored procedures managed in an automation/CI-CD environment?
I have a portion of a working solution.
import json
import psycopg2
import os
def run_sql_commands(input_command, database="mydatabasename", host_port=1234, host_url="datawarehouse.redshift.amazonaws.com"):
"""
:param input_command: sql string to execute
:param database: which database to run the query
:return:
"""
results = None
# db_user and db_pass will need to be set as environment variables
db_user = os.environ["db_user"]
db_pass = os.environ["db_pass"]
db_host = host_url
db_port = host_port
db_name = database
try:
conn = psycopg2.connect(
"dbname={} port={} user={} host={} password={}".format(db_name, db_port, db_user, db_host, db_pass))
cursor = conn.cursor()
cursor.execute(input_command)
results = cursor.fetchall()
except Exception as e:
return None
return results
def get_arg_type(oid, database):
sql = f"SELECT typname FROM pg_catalog.pg_type WHERE oid={oid}"
r = run_sql_commands(sql, database)
return r[0][0]
def download_all_procedures(database, file_location="./local-code-store"):
get_all_stored_procedure_sql = """
SELECT
n.nspname,
b.usename,
p.proname,
p.proargnames,
p.proargtypes,
p.prosrc
FROM
pg_catalog.pg_namespace n
JOIN pg_catalog.pg_proc p ON
pronamespace = n.oid
JOIN pg_user b ON
b.usesysid = p.proowner
WHERE
nspname NOT IN ('information_schema', 'pg_catalog');
"""
r = run_sql_commands(get_all_stored_procedure_sql, database)
counted_items=[]
for item in r:
table = item[0]
author = item[1]
procedure_name = item[2]
procedure_arguments = item[3]
procedure_argtypes = item[4]
procedure = item[5]
t_list = []
for this_oid in procedure_argtypes.split():
t = get_arg_type(this_oid, database=database)  # use the database passed to this function rather than a hard-coded name
t_list.append(t)
meta_data = {'table': table,
'author': author,
'arguments': procedure_arguments,
'argument_types': t_list}
filename = f'{file_location}/{database}/Schemas/{table}/{procedure_name}.sql'
os.makedirs(os.path.dirname(filename), exist_ok=True)
f = open(filename, 'w')
count = f.write(procedure.replace('\r\n', '\n'))
f.close()
filename = f'{file_location}/{database}/Schemas/{table}/{procedure_name}.json'
os.makedirs(os.path.dirname(filename), exist_ok=True)
f = open(filename, 'w')
counted_items.append(f.write(json.dumps(meta_data)))
f.close()
return counted_items
if __name__ == "__main__":
print("Starting...")
c = download_all_procedures("mydatabase")
print("... finished")
I am trying to get instances that have tag name 'ttl' and tag values 'older than today'.
Our users tag their instances with a future date (yyyy.mm.dd) so that the script does not automatically delete them before that date. The filter below should get only instances whose tag value is smaller than today's date. Is there a way to filter the tag values based on this logic -> get me instances that are < today?
today = datetime.today().strftime('%Y.%m.%d')
filters = [{'Name': 'tag:ttl','Values': ['<today']},{'Name': 'instance-state-name','Values': ['running']}]
The full code looks like this:
import boto3
import logging
#setup simple logging for INFO
logger = logging.getLogger()
logger.setLevel(logging.INFO)
#define the connection
ec2 = boto3.resource('ec2')
def lambda_handler(event, context):
filters = [{'Name': 'tag:ttl','Values': ['<today']},{'Name': 'instance-state-name','Values': ['running']}]
#locate all running instances
instances = ec2.instances.filter(Filters=filters)  # apply the filters defined above
RunningInstances = [instance.id for instance in instances]
print (RunningInstances)
#make sure there are actually instances to shut down.
if len(RunningInstances) > 0:
shuttingDown = ec2.instances.filter(InstanceIds=RunningInstances).stop()
print "shuttingDown"
else:
print "Nothing to see here"
You can't compare dates in the query as you specified. You need to write some code comparing the dates in your python script and take action. Roughly like below:
import datetime
import boto3
# Connect to EC2
ec2 = boto3.resource('ec2')
def get_ttl(instance):
for tag in instance.tags:
if 'ttl' in tag['Key']:
ttl = tag['Value']
return ttl
def lambda_handler(event,context):
running_instances = ec2.instances.filter(Filters=[{
'Name': 'instance-state-name',
'Values': ['running']}])
for instance in running_instances:
ttl = get_ttl(instance)
if ttl:
if datetime.date.today() > datetime.datetime.strptime(ttl, "%Y-%m-%d").date():
print('stopping instance')
else:
print("nothing to do here")
Basically, it revolves around comparing dates:
In [30]: datestring = '2021-02-22'
In [31]: datetime.date.today() == datetime.datetime.strptime(datestring, "%Y-%m-%d").date()
Out[31]: False
In [32]: datetime.date.today() < datetime.datetime.strptime(datestring, "%Y-%m-%d").date()
Out[32]: True
In [33]: datetime.date.today() > datetime.datetime.strptime(datestring, "%Y-%m-%d").date()
Out[33]: False
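One detail to watch when adapting this to the question: the ttl tags there use the yyyy.mm.dd format, so the parse format should match. A minimal sketch:
import datetime

ttl = '2021.02.22'   # example value in the question's yyyy.mm.dd tag format
if datetime.date.today() > datetime.datetime.strptime(ttl, "%Y.%m.%d").date():
    print('stopping instance')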
I can see the following information regarding the RDS instance:
I want to know how I can get the value of the current activity using boto3. The current value, as shown in the screenshot below, is 0.
I tried
response = client.describe_db_instances()
But it didn't return the number of active connections.
You can get that data from CloudWatch. RDS sends its state information there and only renders a few metrics in the RDS dashboard.
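For example, a minimal sketch (untested; the instance identifier and region are placeholders) that reads the DatabaseConnections metric for one instance over the last few minutes:
import datetime
import boto3

cloudwatch = boto3.client('cloudwatch', region_name='us-east-1')
response = cloudwatch.get_metric_statistics(
    Namespace='AWS/RDS',
    MetricName='DatabaseConnections',
    Dimensions=[{'Name': 'DBInstanceIdentifier', 'Value': 'my-db-instance'}],  # placeholder identifier
    StartTime=datetime.datetime.utcnow() - datetime.timedelta(minutes=15),
    EndTime=datetime.datetime.utcnow(),
    Period=300,
    Statistics=['Average'],
)
for point in sorted(response['Datapoints'], key=lambda p: p['Timestamp']):
    print(point['Timestamp'], point['Average'])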
@ivan thanks for the directions.
I created following python script to get information about instances with 0 connections and delete them after that. I hope it helps someone.
import datetime
import boto3
class RDSTermination:
# Standard constructor for the RDSTermination class
def __init__(self, cloudwatch_object, rds_object):
self.cloudwatch_object = cloudwatch_object
self.rds_object = rds_object
# Getters and setters for the instance variables
@property
def cloudwatch_object(self):
return self._cloudwatch_object
@cloudwatch_object.setter
def cloudwatch_object(self, cloudwatch_object):
self._cloudwatch_object = cloudwatch_object
@property
def rds_object(self):
return self._rds_object
@rds_object.setter
def rds_object(self, rds_object):
self._rds_object = rds_object
# Fetch connections details for all the RDS instances.Filter the list and return
# only those instances which are having 0 connections at the time of this script run
def _get_instance_connection_info(self):
rds_instances_connection_details = {}
response = self.cloudwatch_object.get_metric_data(
MetricDataQueries=[
{
'Id': 'fetching_data_for_something',
'Expression': "SEARCH('{AWS/RDS,DBInstanceIdentifier} MetricName=\"DatabaseConnections\"', 'Average', 300)",
'ReturnData': True
},
],
EndTime=datetime.datetime.utcnow(),
StartTime=datetime.datetime.utcnow() - datetime.timedelta(hours=2),
ScanBy='TimestampDescending',
MaxDatapoints=123
)
# response is of type dictionary with MetricDataResults as key
for instance_info in response['MetricDataResults']:
if len(instance_info['Timestamps']) > 0:
rds_instances_connection_details[instance_info['Label']] = instance_info['Values'][-1]
return rds_instances_connection_details
# Fetches a list of all instances and their status.
def _fetch_all_rds_instance_state(self):
all_rds_instance_state = {}
response = self.rds_object.describe_db_instances()
instance_details = response['DBInstances']
for instance in instance_details:
all_rds_instance_state[instance['DBInstanceIdentifier']] = instance['DBInstanceStatus']
return all_rds_instance_state
# We further refine the list and remove instances which are stopped. We will work on
# Instances with Available state only
def _get_instance_allowed_for_deletion(self):
instances = self._get_instance_connection_info()
all_instance_state = self._fetch_all_rds_instance_state()
instances_to_delete = []
try:
for instance_name in instances.keys():
if instances[instance_name] == 0.0 and all_instance_state[instance_name] == 'available':
instances_to_delete.append(instance_name)
except BaseException:
print("Check if instance connection_info is empty")
return instances_to_delete
# Function to delete the instances reported in the final list. It deletes instances with 0 connections
# and status 'available'
def terminate_rds_instances(self, dry_run=True):
if dry_run:
message = 'DRY-RUN'
else:
message = 'DELETE'
rdsnames = self._get_instance_allowed_for_deletion()
if len(rdsnames) > 0:
for rdsname in rdsnames:
try:
response = self.rds_object.describe_db_instances(
DBInstanceIdentifier=rdsname
)
termination_protection = response['DBInstances'][0]['DeletionProtection']
except BaseException as e:
print('[ERROR]: reading details' + str(e))
exit(1)
if termination_protection is True:
try:
print("Removing delete termination for {}".format(rdsname))
if not dry_run:
response = self.rds_object.modify_db_instance(
DBInstanceIdentifier=rdsname,
DeletionProtection=False
)
except BaseException as e:
print(
"[ERROR]: Could not modify db termination protection "
"due to following error:\n " + str(
e))
exit(1)
try:
if not dry_run:
print("i got executed")
response = self.rds_object.delete_db_instance(
DBInstanceIdentifier=rdsname,
SkipFinalSnapshot=True,
)
print('[{}]: RDS instance {} deleted'.format(message, rdsname))
except BaseException:
print("[ERROR]: {} rds instance not found".format(rdsname))
else:
print("No RDS instance marked for deletion")
if __name__ == "__main__":
cloud_watch_object = boto3.client('cloudwatch', region_name='us-east-1')
rds_object = boto3.client('rds', region_name='us-east-1')
rds_termination_object = RDSTermination(cloud_watch_object, rds_object)
rds_termination_object.terminate_rds_instances(dry_run=True)
Is there a way to send EMR logs to CloudWatch instead of S3? We would like to have all our services' logs in one location. It seems like the only thing you can do is set up alarms for monitoring, but that doesn't cover logging.
https://docs.aws.amazon.com/emr/latest/ManagementGuide/UsingEMR_ViewingMetrics.html
Would I have to install the CloudWatch agent on the nodes in the cluster? https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/AgentReference.html
You can install the CloudWatch agent via EMR's bootstrap configuration and configure it to watch log directories. It then starts pushing logs to Amazon CloudWatch Logs.
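For example, the agent's configuration file would need a logs section along these lines (the file path and log group name below are placeholders, not EMR defaults):
{
  "logs": {
    "logs_collected": {
      "files": {
        "collect_list": [
          {
            "file_path": "/var/log/my-app/my-app.log",
            "log_group_name": "my-emr-cluster-logs",
            "log_stream_name": "{instance_id}"
          }
        ]
      }
    }
  }
}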
You can read the logs from S3 and push them to CloudWatch using boto3, then delete them from S3 if you no longer need them. In some use cases the stdout.gz log needs to be in CloudWatch for monitoring purposes.
boto3 documentation on put_log_events
import boto3
import botocore.session
import logging
import time
import datetime
import gzip

# The functions below log errors; define the module-level logger they refer to
logger = logging.getLogger(__name__)
def get_session(service_name):
session = botocore.session.get_session()
aws_access_key_id = session.get_credentials().access_key
aws_secret_access_key = session.get_credentials().secret_key
aws_session_token = session.get_credentials().token
region = session.get_config_variable('region')
return boto3.client(
service_name = service_name,
region_name = region,
aws_access_key_id = aws_access_key_id,
aws_secret_access_key = aws_secret_access_key,
aws_session_token = aws_session_token
)
def get_log_file(s3, bucket, key):
log_file = None
try:
obj = s3.get_object(Bucket=bucket, Key=key)
compressed_body = obj['Body'].read()
log_file = gzip.decompress(compressed_body)
except Exception as e:
logger.error(f"Error reading from bucket : {e}")
raise
return log_file
def create_log_events(logs, batch_size):
log_event_batch = []
log_event_batch_collection = []
try:
for line in logs.splitlines():
log_event = {'timestamp': int(round(time.time() * 1000)), 'message':line.decode('utf-8')}
if len(log_event_batch) < batch_size:
log_event_batch.append(log_event)
else:
log_event_batch_collection.append(log_event_batch)
log_event_batch = []
log_event_batch.append(log_event)
except Exception as e:
logger.error(f"Error creating log events : {e}")
raise
log_event_batch_collection.append(log_event_batch)
return log_event_batch_collection
def create_log_stream_and_push_log_events(logs, log_group, log_stream, log_event_batch_collection, delay):
response = logs.create_log_stream(logGroupName=log_group, logStreamName=log_stream)
seq_token = None
try:
for log_event_batch in log_event_batch_collection:
log_event = {
'logGroupName': log_group,
'logStreamName': log_stream,
'logEvents': log_event_batch
}
if seq_token:
log_event['sequenceToken'] = seq_token
response = logs.put_log_events(**log_event)
seq_token = response['nextSequenceToken']
time.sleep(delay)
except Exception as e:
logger.error(f"Error pushing log events : {e}")
raise
The caller function
def main():
s3 = get_session('s3')
logs = get_session('logs')
BUCKET_NAME = 'Your_Bucket_Name'
KEY = 'logs/emr/Path_To_Log/stdout.gz'
BATCH_SIZE = 10000 #According to boto3 docs
PUSH_DELAY = 0.2 #According to boto3 docs
LOG_GROUP='test_log_group' #Destination log group
LOG_STREAM='{}-{}'.format(time.strftime('%Y-%m-%d'),'logstream.log')
log_file = get_log_file(s3, BUCKET_NAME, KEY)
log_event_batch_collection = create_log_events(log_file, BATCH_SIZE)
create_log_stream_and_push_log_events(logs, LOG_GROUP, LOG_STREAM, log_event_batch_collection, PUSH_DELAY)
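The snippet defines main() but never calls it; to run it as a script, an entry-point guard like the following is needed:
if __name__ == '__main__':
    main()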