EMR Cluster: AutoScaling Policy For Instance Group Could Not Attach And Failed

I am trying to automate EMR cluster creation through boto3. Unfortunately, I'm getting the following warning:
The Auto Scaling policy for instance group ig-MI0ANZ0C3WNN in Amazon EMR cluster j-BS3Y2OAO65R6 (qidv2_historical_3.0.1) could not attach and failed at 2021-09-20 17:41 UTC.
I cannot figure out what the issue is. The call was adapted from an AWS CLI command that didn't raise any warnings or issues, but after transitioning to boto3 I started getting this auto-scaling policy warning:
cluster_id = self.boto_client().run_job_flow(
    Name=self.cluster_name,
    LogUri='s3n://aws-logs',
    JobFlowRole='EMR_EC2_DefaultRole',
    ReleaseLabel=self.release_label,
    Applications=[
        {'Name': 'Spark'}, {'Name': 'Hive'}, {'Name': 'Hadoop'}, {'Name': 'Pig'}, {'Name': 'Hue'},
        {'Name': 'Zeppelin'}, {'Name': 'Livy'}, {'Name': 'JupyterHub'}, {'Name': 'Tensorflow'}
    ],
    AutoScalingRole='EMR_AutoScaling_DefaultRole',
    BootstrapActions=[
        {
            'Name': 'Custom action',
            'ScriptBootstrapAction': {
                'Path': 's3://ml-data/emr-bootstrap_spk3.0.1.sh'
            }
        }
    ],
    ServiceRole='EMR_DefaultRole',
    ScaleDownBehavior='TERMINATE_AT_TASK_COMPLETION',
    EbsRootVolumeSize=25,
    Steps=[
        {
            'Name': 'Setup Debugging',
            'ActionOnFailure': 'TERMINATE_CLUSTER',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': ['state-pusher-script']
            }
        },
        {
            'Name': 'Setup - Sync with S3',
            'ActionOnFailure': 'CANCEL_AND_WAIT',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': ['aws', 's3', 'sync',
                         's3://ch-ml-data/',
                         '/mnt/src/']
            }
        },
        {
            'Name': 'Spark Application',
            'ActionOnFailure': 'CANCEL_AND_WAIT',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': ['cd /mnt/src; bash spark_jobs/qid_pipeline_historical_run.sh']
            }
        }
    ],
    Configurations=[
        {
            'Classification': 'zeppelin-env',
            'Properties': {},
            'Configurations': [
                {
                    'Classification': 'export',
                    'Properties': {
                        'ZEPPELIN_PORT': '8890',
                        'HADOOP_CONF_DIR': '/etc/hadoop/conf',
                        'ZEPPELIN_LOG_DIR': '/var/log/zeppelin',
                        'ZEPPELIN_PID': '$ZEPPELIN_PID_DIR/zeppelin.pid',
                        'MASTER': 'yarn-client',
                        'SPARK_SUBMIT_OPTIONS': "$SPARK_SUBMIT_OPTIONS --conf 'spark.executorEnv.PYTHONPATH=/usr/lib/spark/python/lib/py4j-src.zip:/usr/lib/spark/python/:<CPS>{{PWD}}/pyspark.zip<CPS>{{PWD}}/py4j-src.zip' --conf spark.yarn.isPython=true",
                        'PYSPARK_DRIVER_PYTHON': '/mnt/anaconda3/envs/question-identification-v2/bin/python',
                        'ZEPPELIN_NOTEBOOK_USER': 'user',
                        'CLASSPATH': ':/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar',
                        'ZEPPELIN_PID_DIR': '/var/run/zeppelin',
                        'PYSPARK_PYTHON': '/mnt/anaconda3/envs/question-identification-v2/bin/python',
                        'SPARK_HOME': '/usr/lib/spark',
                        'ZEPPELIN_NOTEBOOK_S3_BUCKET': 'ch-ml-data',
                        'ZEPPELIN_WAR_TEMPDIR': '/var/run/zeppelin/webapps',
                        'ZEPPELIN_CONF_DIR': '/etc/zeppelin/conf',
                        'ZEPPELIN_NOTEBOOK_STORAGE': 'org.apache.zeppelin.notebook.repo.S3NotebookRepo',
                        'ZEPPELIN_NOTEBOOK_DIR': '/var/lib/zeppelin/notebook',
                        'ZEPPELIN_ADDR': '0.0.0.0'
                    }
                }
            ]
        },
        {
            'Classification': 'hive-site',
            'Properties': {
                'hive.metastore.client.factory.class': 'com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory'
            }
        },
        {
            'Classification': 'spark-hive-site',
            'Properties': {
                'hive.metastore.client.factory.class': 'com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory'
            }
        }
    ],
    Instances={
        'Ec2KeyName': 'emr-temporary',
        'KeepJobFlowAliveWhenNoSteps': False,
        'TerminationProtected': False,
        'Ec2SubnetId': 'subnet-063735e4fa63e3bac',
        'AdditionalSlaveSecurityGroups': ["sg-012970517d0a88bae", "sg-01813cf2115b55874", "sg-04563fc7e8ed9e1ec", "sg-07ab30655981361ad"],
        'ServiceAccessSecurityGroup': 'sg-00dd6e63d7004176d',
        'EmrManagedSlaveSecurityGroup': 'sg-048b83d1a20550b43',
        'EmrManagedMasterSecurityGroup': 'sg-017402b74e879aaa5',
        'AdditionalMasterSecurityGroups': ["sg-012970517d0a88bae", "sg-01813cf2115b55874", "sg-04563fc7e8ed9e1ec", "sg-07ab30655981361ad"],
        'InstanceGroups': [
            {
                'Name': 'Task',
                'InstanceRole': 'TASK',
                'InstanceType': 'i3.2xlarge',
                'InstanceCount': 1
            },
            {
                'Name': 'Master - 1',
                'InstanceRole': 'MASTER',
                'InstanceType': 'i3.2xlarge',
                'InstanceCount': 1,
            },
            {
                'Name': 'Core - 2',
                'InstanceRole': 'CORE',
                'InstanceType': 'i3.2xlarge',
                'InstanceCount': 1,
                'Market': 'SPOT',
                'AutoScalingPolicy': {
                    'Constraints': {
                        'MinCapacity': 3,
                        'MaxCapacity': 100
                    },
                    'Rules': [
                        {
                            'Name': 'memory',
                            'Description': '',
                            'Action': {
                                'SimpleScalingPolicyConfiguration': {
                                    'ScalingAdjustment': 10,
                                    'CoolDown': 300,
                                    'AdjustmentType': 'CHANGE_IN_CAPACITY'
                                }
                            },
                            'Trigger': {
                                'CloudWatchAlarmDefinition': {
                                    'MetricName': 'YARNMemoryAvailablePercentage',
                                    'ComparisonOperator': 'LESS_THAN',
                                    'Statistic': 'AVERAGE',
                                    'Period': 300,
                                    'EvaluationPeriods': 2,
                                    'Unit': 'PERCENT',
                                    'Namespace': 'AWS/ElasticMapReduce',
                                    'Threshold': 25,
                                    'Dimensions': [
                                        {
                                            'Value': '${emr.clusterId}',
                                            'Key': 'JobFlowId'
                                        }
                                    ]
                                }
                            }
                        },
                        {
                            'Name': 'mem',
                            'Description': '',
                            'Action': {
                                'SimpleScalingPolicyConfiguration': {
                                    'ScalingAdjustment': -5,
                                    'CoolDown': 300,
                                    'AdjustmentType': 'CHANGE_IN_CAPACITY'
                                }
                            },
                            'Trigger': {
                                'CloudWatchAlarmDefinition': {
                                    'MetricName': 'YARNMemoryAvailablePercentage',
                                    'ComparisonOperator': 'GREATER_THAN_OR_EQUAL',
                                    'Statistic': 'AVERAGE',
                                    'Period': 300,
                                    'EvaluationPeriods': 18,
                                    'Unit': 'PERCENT',
                                    'Namespace': 'AWS/ElasticMapReduce',
                                    'Threshold': 50,
                                    'Dimensions': [
                                        {
                                            'Value': '${emr.clusterId}',
                                            'Key': 'JobFlowId'
                                        }
                                    ],
                                }
                            }
                        }
                    ]
                }
            }
        ]
    }
)
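One way to dig into why the policy could not attach is to read the auto-scaling policy status that EMR records on the instance group after the cluster comes up. Below is a minimal sketch, assuming the usual list_instance_groups response layout; note also that run_job_flow returns a dict, so the cluster_id variable above actually holds the whole response and the real cluster id sits under 'JobFlowId'.
import boto3

emr = boto3.client('emr')

# run_job_flow returns a dict; the actual cluster id is under 'JobFlowId'
job_flow_id = cluster_id['JobFlowId']

for group in emr.list_instance_groups(ClusterId=job_flow_id)['InstanceGroups']:
    policy = group.get('AutoScalingPolicy')
    if policy:
        status = policy.get('Status', {})
        reason = status.get('StateChangeReason', {})
        # Expect State to be ATTACHED on success, FAILED otherwise;
        # the Message usually explains why the attach failed
        print(group['Name'], status.get('State'), reason.get('Message'))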

Related

Boto3 create glue triggers with different types in one workflow

Can anyone please guide me through the steps to create multiple trigger types, one conditional and one scheduled, in a single workflow?
So far I have used the create_trigger function, but I am not sure how to address the requirement above.
I tried the syntax below, but it didn't work:
response = client.create_trigger(
    Name='two_triggers',
    WorkflowName='wf_With_two_tirggers',
    Type='SCHEDULED',
    Schedule='cron(0 12 * * ? *)',
    Actions=[
        {
            'JobName': 'abc_dev',
            'Arguments': {
                'string': 'string'
            },
            'Timeout': 123,
            'SecurityConfiguration': 'string',
            'NotificationProperty': {
                'NotifyDelayAfter': 123
            },
            'Trigger': 'string'
        },
    ],
    Type='CONDITIONAL',
    Predicate={
        'Logical': 'ANY',
        'Conditions': [
            {
                'LogicalOperator': 'EQUALS',
                'JobName': 'def_dev',
                'State': 'SUCCEEDED'
            },
        ]
    },
    Actions=[
        {
            'JobName': 'ghi_dev',
            'Arguments': {
                'string': 'string'
            },
            'Timeout': 123,
            'SecurityConfiguration': 'string',
            'NotificationProperty': {
                'NotifyDelayAfter': 123
            },
            'CrawlerName': 'string'
        },
    ],
    Description='string',
    StartOnCreation=True,
    Tags={
        'string': 'string'
    }
)
Below is the design of the workflow I am struggling to write code for. I tried the code above for this design using boto3, but it didn't work.
I figured out an answer. Below is the code for the design given in the question:
import boto3
import os
import logging

glue = boto3.client(service_name="glue", region_name='us-east-1')

response = glue.create_workflow(
    Name="dual_trigger_wf")

response1 = glue.create_trigger(
    Name="trigger_one_to_many",
    WorkflowName="dual_trigger_wf",
    Type="SCHEDULED",
    Schedule="cron(0 8 * * ? *)",
    Actions=[
        {
            "JobName": "abc",
            "Arguments": {"string": "string"},
            "Timeout": 123,
            "SecurityConfiguration": "string",
            "NotificationProperty": {"NotifyDelayAfter": 123},
        },
        {
            "JobName": "def",
            "Arguments": {"string": "string"},
            "Timeout": 123,
            "SecurityConfiguration": "string",
            "NotificationProperty": {"NotifyDelayAfter": 123},
        },
    ],
    Description="string",
    StartOnCreation=False,
)

response2 = glue.create_trigger(
    Name="trigger_many_to_one",
    WorkflowName="dual_trigger_wf",
    Type="CONDITIONAL",
    Predicate={
        "Logical": "AND",
        "Conditions": [
            {
                "LogicalOperator": "EQUALS",
                "JobName": "abc",
                "State": "SUCCEEDED",
            },
            {
                "LogicalOperator": "EQUALS",
                "JobName": "def",
                "State": "SUCCEEDED",
            },
        ],
    },
    Actions=[
        {
            "JobName": "ghi",
            "Arguments": {"string": "string"},
            "Timeout": 123,
            "SecurityConfiguration": "string",
            "NotificationProperty": {"NotifyDelayAfter": 123},
        }
    ],
    Description="string",
    StartOnCreation=False,
)
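A short usage note on the code above, as a sketch reusing the same workflow and trigger names: both triggers are created with StartOnCreation=False, so the scheduled trigger still has to be activated before its cron expression fires, and get_workflow can confirm that both triggers ended up in the workflow graph.
# Activate the scheduled trigger (it was created with StartOnCreation=False)
glue.start_trigger(Name="trigger_one_to_many")

# Or kick the workflow off immediately instead of waiting for the schedule
run = glue.start_workflow_run(Name="dual_trigger_wf")
print(run["RunId"])

# Confirm both triggers are attached to the workflow graph
wf = glue.get_workflow(Name="dual_trigger_wf", IncludeGraph=True)
print([node["Name"] for node in wf["Workflow"]["Graph"]["Nodes"]
       if node["Type"] == "TRIGGER"])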

Airflow EMR DAG succeeds but cluster not started

I am trying to start an AWS EMR cluster and submit a step using EmrCreateJobFlowOperator and EmrAddStepsOperator. Both of my tasks succeed, but the cluster is never launched, not even without the step.
Both tasks change to the succeeded status.
Here is my code:
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'retry_delay': timedelta(minutes=2),
    'start_date': datetime(2019, 1, 1),
    'end_date': datetime(2019, 2, 1),
    'depends_on_past': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

step_args = ["spark-submit", '../test.py']

step = [{"Name": "what_you_do-" + time.strftime("%Y%m%d-%H:%M"),
         'ActionOnFailure': 'CONTINUE',
         'HadoopJarStep': {
             'Jar': 's3n://elasticmapreduce/libs/script-runner/script-runner.jar',
             'Args': step_args
         }
         }]

JOB_FLOW_OVERRIDES = {
    'Instances': {
        'InstanceGroups': [
            {
                'InstanceRole': 'MASTER',
                'InstanceType': 'm4.large',
                'InstanceCount': 1
            },
            {
                'InstanceRole': 'CORE',
                'InstanceType': 'm4.large',
                'InstanceCount': 2,
            }
        ]},
    'Name': 'airflow-monthly_agg_custom',
    'BootstrapActions': [{
        'Name': 'Install',
        'ScriptBootstrapAction': {
            'Path': 's3://dep-buck/bootstrap.sh'
        }
    }],
    'Configurations': [
        {
            "Classification": "spark-env",
            "Configurations": [
                {
                    "Classification": "export",
                    "Properties": {
                        "PYSPARK_PYTHON": "/usr/bin/python3"
                    }
                }
            ]
        }
    ]}

dag = DAG('emr_job_flow_automatic_steps_7',
          default_args=default_args,
schedule_interval="#daily",
          max_active_runs=1,
          # schedule_interval='*/1 * * * *',
          catchup=True,
          # dagrun_timeout=timedelta(seconds=10)
          )

cluster_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow2',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag
)

step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow2', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=step,
    dag=dag
)

cluster_creator.set_downstream(step_adder)
I have tried to search for examples or good documentation, but there isn't much beyond the function definitions on the Airflow site.
For "create job flow" I have this log repeated several times.
For "add step" I have this in the log.
The problem was mainly about user visibility and the region: the cluster was being started in the default region, so I had to change the properties below.
Airflow UI > Admin > Connections > aws_default > Extra:
{"region_name": "the region I was watching in the EC2 console"}
Airflow UI > Admin > Connections > emr_default > Extra:
"VisibleToAllUsers": true,

AWS Pricing API not yielding prices for the given search criteria

I am using the AWS boto3 Pricing API to get instance prices.
But I am not getting results for the combination (us-west-2, r3.2xlarge, Linux, no pre-installed software, tenancy = Shared).
Here is my code:
pricing = boto3.client('pricing', region_name='us-east-1')
hourlyTermCode = 'JRTCKXETXF'
rateCode = '6YS6EN2CT7'
token = ''
while True:
    paginator = pricing.get_paginator('get_products')
    pages = paginator.paginate(
        ServiceCode='AmazonEC2',
        Filters=[
            {'Type': 'TERM_MATCH', 'Field': 'operatingSystem', 'Value': 'Linux'},
            {'Type': 'TERM_MATCH', 'Field': 'location', 'Value': 'US West (Oregon)'}
        ],
        PaginationConfig={
            'StartingToken': token
        }
    )
    for response in pages:
        for price in response['PriceList']:
            resp = json.loads(price)
            product = resp['product']  # ['attributes']['']
            sku = product['sku']
            if product['productFamily'] == 'Compute Instance':
                if str(product['attributes']['instanceType']) == str(amazon_instance_type):
                    if str(product['attributes']['operatingSystem']) == 'Linux':
                        if str(product['attributes']['preInstalledSw']) == 'NA':
                            if str(product['attributes']['tenancy']) == 'Shared':
                                sku_key = resp['terms']['OnDemand'].get(sku)
                                if sku_key:
                                    price = sku_key[sku + '.' + hourlyTermCode + '.' + rateCode]['pricePerUnit']['USD']
                                    print 'here 7'
                                    print price
    try:
        token = response['NextToken']
    except KeyError:
        pass
This works:
import json
import boto3

client = boto3.client('pricing', region_name='us-east-1')

response = client.get_products(
    ServiceCode='AmazonEC2',
    Filters=[
        {'Type': 'TERM_MATCH', 'Field': 'operatingSystem', 'Value': 'Linux'},
        {'Type': 'TERM_MATCH', 'Field': 'location', 'Value': 'US West (Oregon)'},
        {'Type': 'TERM_MATCH', 'Field': 'instanceType', 'Value': 'r3.2xlarge'},
        {'Type': 'TERM_MATCH', 'Field': 'tenancy', 'Value': 'Shared'},
        {'Type': 'TERM_MATCH', 'Field': 'preInstalledSw', 'Value': 'NA'}
    ]
)

for pricelist_json in response['PriceList']:
    pricelist = json.loads(pricelist_json)
    product = pricelist['product']
    if product['productFamily'] == 'Compute Instance':
        print pricelist['terms']['OnDemand'].values()[0]['priceDimensions'].values()[0][u'pricePerUnit']['USD']
It is based on the output of:
{u'FormatVersion': u'aws_v1', u'PriceList': [u'{
"product": {
"productFamily": "Compute Instance",
"attributes": {
"enhancedNetworkingSupported": "Yes",
"memory": "61 GiB",
"vcpu": "8",
"capacitystatus": "Used",
"locationType": "AWS Region",
"storage": "1 x 160 SSD",
"instanceFamily": "Memory optimized",
"operatingSystem": "Linux",
"physicalProcessor": "Intel Xeon E5-2670 v2 (Ivy Bridge)",
"clockSpeed": "2.5 GHz",
"ecu": "26",
"networkPerformance": "High",
"servicename": "Amazon Elastic Compute Cloud",
"instanceType": "r3.2xlarge",
"tenancy": "Shared",
"usagetype": "USW2-BoxUsage:r3.2xlarge",
"normalizationSizeFactor": "16",
"processorFeatures": "Intel AVX; Intel Turbo",
"servicecode": "AmazonEC2",
"licenseModel": "No License required",
"currentGeneration": "No",
"preInstalledSw": "NA",
"location": "US West (Oregon)",
"processorArchitecture": "64-bit",
"operation": "RunInstances"
},
"sku": "GMTWE5CTY4FEUYDN"
},
"serviceCode": "AmazonEC2",
"terms": {
"OnDemand": {
"GMTWE5CTY4FEUYDN.JRTCKXETXF": {
"priceDimensions": {
"GMTWE5CTY4FEUYDN.JRTCKXETXF.6YS6EN2CT7": {
"unit": "Hrs",
"endRange": "Inf",
"description": "$0.665 per On Demand Linux r3.2xlarge Instance Hour",
"appliesTo": [],
"rateCode": "GMTWE5CTY4FEUYDN.JRTCKXETXF.6YS6EN2CT7",
"beginRange": "0",
"pricePerUnit": {
"USD": "0.6650000000"
}
}
},
"sku": "GMTWE5CTY4FEUYDN",
"effectiveDate": "2018-07-01T00:00:00Z",
"offerTermCode": "JRTCKXETXF",
"termAttributes": {}
}
},
...
},
"version": "20180726190848",
"publicationDate": "2018-07-26T19:08:48Z"
}'
]
}
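For anyone running this on Python 3, the only change needed to the extraction above is that dict views are not indexable there; a minimal sketch reusing the same response:
# Python 3 variant of the price extraction above: wrap dict views in list()
for pricelist_json in response['PriceList']:
    pricelist = json.loads(pricelist_json)
    if pricelist['product']['productFamily'] == 'Compute Instance':
        on_demand_term = list(pricelist['terms']['OnDemand'].values())[0]
        price_dimension = list(on_demand_term['priceDimensions'].values())[0]
        print(price_dimension['pricePerUnit']['USD'])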

AWS Boto3 EMR Software Settings Configuration From S3

When you create a new AWS EMR cluster through the AWS Management Console, you can provide JSON software configurations. You can put the JSON file in an S3 bucket and point the software settings at that S3 location via the following field.
I need to do this through the AWS Python SDK (Boto3) library, but I don't see where to do it among the available fields in their example:
response = client.run_job_flow(
Name='string',
LogUri='string',
AdditionalInfo='string',
AmiVersion='string',
ReleaseLabel='string',
Instances={
'MasterInstanceType': 'string',
'SlaveInstanceType': 'string',
'InstanceCount': 123,
'InstanceGroups': [
{
'Name': 'string',
'Market': 'ON_DEMAND'|'SPOT',
'InstanceRole': 'MASTER'|'CORE'|'TASK',
'BidPrice': 'string',
'InstanceType': 'string',
'InstanceCount': 123,
'Configurations': [
{
'Classification': 'string',
'Configurations': {'... recursive ...'},
'Properties': {
'string': 'string'
}
},
],
'EbsConfiguration': {
'EbsBlockDeviceConfigs': [
{
'VolumeSpecification': {
'VolumeType': 'string',
'Iops': 123,
'SizeInGB': 123
},
'VolumesPerInstance': 123
},
],
'EbsOptimized': True|False
},
'AutoScalingPolicy': {
'Constraints': {
'MinCapacity': 123,
'MaxCapacity': 123
},
'Rules': [
{
'Name': 'string',
'Description': 'string',
'Action': {
'Market': 'ON_DEMAND'|'SPOT',
'SimpleScalingPolicyConfiguration': {
'AdjustmentType': 'CHANGE_IN_CAPACITY'|'PERCENT_CHANGE_IN_CAPACITY'|'EXACT_CAPACITY',
'ScalingAdjustment': 123,
'CoolDown': 123
}
},
'Trigger': {
'CloudWatchAlarmDefinition': {
'ComparisonOperator': 'GREATER_THAN_OR_EQUAL'|'GREATER_THAN'|'LESS_THAN'|'LESS_THAN_OR_EQUAL',
'EvaluationPeriods': 123,
'MetricName': 'string',
'Namespace': 'string',
'Period': 123,
'Statistic': 'SAMPLE_COUNT'|'AVERAGE'|'SUM'|'MINIMUM'|'MAXIMUM',
'Threshold': 123.0,
'Unit': 'NONE'|'SECONDS'|'MICRO_SECONDS'|'MILLI_SECONDS'|'BYTES'|'KILO_BYTES'|'MEGA_BYTES'|'GIGA_BYTES'|'TERA_BYTES'|'BITS'|'KILO_BITS'|'MEGA_BITS'|'GIGA_BITS'|'TERA_BITS'|'PERCENT'|'COUNT'|'BYTES_PER_SECOND'|'KILO_BYTES_PER_SECOND'|'MEGA_BYTES_PER_SECOND'|'GIGA_BYTES_PER_SECOND'|'TERA_BYTES_PER_SECOND'|'BITS_PER_SECOND'|'KILO_BITS_PER_SECOND'|'MEGA_BITS_PER_SECOND'|'GIGA_BITS_PER_SECOND'|'TERA_BITS_PER_SECOND'|'COUNT_PER_SECOND',
'Dimensions': [
{
'Key': 'string',
'Value': 'string'
},
]
}
}
},
]
}
},
],
'InstanceFleets': [
{
'Name': 'string',
'InstanceFleetType': 'MASTER'|'CORE'|'TASK',
'TargetOnDemandCapacity': 123,
'TargetSpotCapacity': 123,
'InstanceTypeConfigs': [
{
'InstanceType': 'string',
'WeightedCapacity': 123,
'BidPrice': 'string',
'BidPriceAsPercentageOfOnDemandPrice': 123.0,
'EbsConfiguration': {
'EbsBlockDeviceConfigs': [
{
'VolumeSpecification': {
'VolumeType': 'string',
'Iops': 123,
'SizeInGB': 123
},
'VolumesPerInstance': 123
},
],
'EbsOptimized': True|False
},
'Configurations': [
{
'Classification': 'string',
'Configurations': {'... recursive ...'},
'Properties': {
'string': 'string'
}
},
]
},
],
'LaunchSpecifications': {
'SpotSpecification': {
'TimeoutDurationMinutes': 123,
'TimeoutAction': 'SWITCH_TO_ON_DEMAND'|'TERMINATE_CLUSTER',
'BlockDurationMinutes': 123
}
}
},
],
'Ec2KeyName': 'string',
'Placement': {
'AvailabilityZone': 'string',
'AvailabilityZones': [
'string',
]
},
'KeepJobFlowAliveWhenNoSteps': True|False,
'TerminationProtected': True|False,
'HadoopVersion': 'string',
'Ec2SubnetId': 'string',
'Ec2SubnetIds': [
'string',
],
'EmrManagedMasterSecurityGroup': 'string',
'EmrManagedSlaveSecurityGroup': 'string',
'ServiceAccessSecurityGroup': 'string',
'AdditionalMasterSecurityGroups': [
'string',
],
'AdditionalSlaveSecurityGroups': [
'string',
]
},
Steps=[
{
'Name': 'string',
'ActionOnFailure': 'TERMINATE_JOB_FLOW'|'TERMINATE_CLUSTER'|'CANCEL_AND_WAIT'|'CONTINUE',
'HadoopJarStep': {
'Properties': [
{
'Key': 'string',
'Value': 'string'
},
],
'Jar': 'string',
'MainClass': 'string',
'Args': [
'string',
]
}
},
],
BootstrapActions=[
{
'Name': 'string',
'ScriptBootstrapAction': {
'Path': 'string',
'Args': [
'string',
]
}
},
],
SupportedProducts=[
'string',
],
NewSupportedProducts=[
{
'Name': 'string',
'Args': [
'string',
]
},
],
Applications=[
{
'Name': 'string',
'Version': 'string',
'Args': [
'string',
],
'AdditionalInfo': {
'string': 'string'
}
},
],
Configurations=[
{
'Classification': 'string',
'Configurations': {'... recursive ...'},
'Properties': {
'string': 'string'
}
},
],
VisibleToAllUsers=True|False,
JobFlowRole='string',
ServiceRole='string',
Tags=[
{
'Key': 'string',
'Value': 'string'
},
],
SecurityConfiguration='string',
AutoScalingRole='string',
ScaleDownBehavior='TERMINATE_AT_INSTANCE_HOUR'|'TERMINATE_AT_TASK_COMPLETION',
CustomAmiId='string',
EbsRootVolumeSize=123,
RepoUpgradeOnBoot='SECURITY'|'NONE',
KerberosAttributes={
'Realm': 'string',
'KdcAdminPassword': 'string',
'CrossRealmTrustPrincipalPassword': 'string',
'ADDomainJoinUser': 'string',
'ADDomainJoinPassword': 'string'
}
)
How can I provide an S3 bucket location that has the Software Configuration JSON file for creating an EMR cluster through the Boto3 library?
Right now the boto3 SDK can't directly import the configuration settings from S3 for you as part of the run_job_flow() function. You would need to set up an S3 client in boto3, download the data as an S3 object, and then update the Configurations list in your EMR dictionary with the JSON data from your S3 file.
An example of how to download a JSON file from S3 and then load it into memory as a Python dict can be found over here: Reading an JSON file from S3 using Python boto3
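Here is a minimal sketch of that approach; the bucket, key, and the remaining run_job_flow arguments are placeholders, and the JSON file is assumed to contain the same Configurations list you would paste into the console:
import json
import boto3

s3 = boto3.client('s3')
emr = boto3.client('emr', region_name='us-east-1')

# Hypothetical bucket/key holding the same JSON you would point the console at
obj = s3.get_object(Bucket='mybucket', Key='myfolder/myConfig.json')
configurations = json.loads(obj['Body'].read().decode('utf-8'))

response = emr.run_job_flow(
    Name='cluster-with-s3-config',
    ReleaseLabel='emr-5.14.0',
    Instances={
        'MasterInstanceType': 'm4.large',
        'SlaveInstanceType': 'm4.large',
        'InstanceCount': 2,
        'KeepJobFlowAliveWhenNoSteps': True,
    },
    Applications=[{'Name': 'Hive'}],
    Configurations=configurations,  # the list loaded from S3
    JobFlowRole='EMR_EC2_DefaultRole',
    ServiceRole='EMR_DefaultRole',
)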
The Configuring Applications - Amazon EMR documentation says:
Supplying a Configuration in the Console
To supply a configuration, you navigate to the Create cluster page and choose Edit software settings. You can then enter the configuration directly (in JSON or using shorthand syntax demonstrated in shadow text) in the console or provide an Amazon S3 URI for a file with a JSON Configurations object.
That seems to be the capability you showed in your question.
The documentation then shows how you can do it via the CLI:
aws emr create-cluster --use-default-roles --release-label emr-5.14.0 --instance-type m4.large --instance-count 2 --applications Name=Hive --configurations https://s3.amazonaws.com/mybucket/myfolder/myConfig.json
This maps to the Configurations options in the JSON you show above:
'Configurations': [
    {
        'Classification': 'string',
        'Configurations': {'... recursive ...'},
        'Properties': {
            'string': 'string'
        }
    },
]
Configurations: A configuration classification that applies when provisioning cluster instances, which can include configurations for applications and software that run on the cluster.
It would contain settings such as:
[
    {
        "Classification": "core-site",
        "Properties": {
            "hadoop.security.groups.cache.secs": "250"
        }
    },
    {
        "Classification": "mapred-site",
        "Properties": {
            "mapred.tasktracker.map.tasks.maximum": "2",
            "mapreduce.map.sort.spill.percent": "0.90",
            "mapreduce.tasktracker.reduce.tasks.maximum": "5"
        }
    }
]
Short answer: Configurations

Getting Dict response - Boto3

I am trying to get the SnapshotId from the output below, with no success. I can get the value of the AMI description and the value of the AMI ID.
{
'Images': [
{
'Architecture': 'i386'|'x86_64',
'CreationDate': 'string',
'ImageId': 'string',
'ImageLocation': 'string',
'ImageType': 'machine'|'kernel'|'ramdisk',
'Public': True|False,
'KernelId': 'string',
'OwnerId': 'string',
'Platform': 'Windows',
'ProductCodes': [
{
'ProductCodeId': 'string',
'ProductCodeType': 'devpay'|'marketplace'
},
],
'RamdiskId': 'string',
'State': 'pending'|'available'|'invalid'|'deregistered'|'transient'|'failed'|'error',
'BlockDeviceMappings': [
{
'DeviceName': 'string',
'VirtualName': 'string',
'Ebs': {
'Encrypted': True|False,
'DeleteOnTermination': True|False,
'Iops': 123,
'SnapshotId': 'string',
'VolumeSize': 123,
'VolumeType': 'standard'|'io1'|'gp2'|'sc1'|'st1'
},
'NoDevice': 'string'
},
],
'Description': 'string',
'EnaSupport': True|False,
'Hypervisor': 'ovm'|'xen',
'ImageOwnerAlias': 'string',
'Name': 'string',
'RootDeviceName': 'string',
'RootDeviceType': 'ebs'|'instance-store',
'SriovNetSupport': 'string',
'StateReason': {
'Code': 'string',
'Message': 'string'
},
'Tags': [
{
'Key': 'string',
'Value': 'string'
},
],
'VirtualizationType': 'hvm'|'paravirtual'
},
]
}
Using the following code:
import boto3

client = boto3.client('ec2', region_name='us-east-1')


def verifica_imagem(imagem):
    amiresponse = client.describe_images(
        Filters=[
            {
                'Name': 'description',
                'Values': [
                    imagem,
                ]
            },
        ],
        DryRun=False
    )
    try:
        data = str(amiresponse['Images'][0]['Description'])
        ami_id = str(amiresponse['Images'][0]['ImageId'])
        snapshot_id = str(amiresponse['Images'][0]['SnapshotId'])
    except:
        print "AMI not exists! Exiting...."
        return 1


verifica_imagem('IMAGE_XXXXXXX')
I can't understand how to use the SnapshotId key. I have tried:
snapshot_id = str(amiresponse['Images']['BlockDeviceMappings']['Ebs'][0]['SnapshotId']) but that is not working either.
Images and BlockDeviceMappings are arrays, while Ebs is a dict.
Use this to fetch the value of SnapshotId:
snapshot_id = amiresponse['Images'][0]['BlockDeviceMappings'][0]['Ebs']['SnapshotId']
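An AMI can also carry more than one block device mapping, and instance-store mappings have no Ebs key, so a slightly more defensive sketch that collects every snapshot id from the first image looks like this:
image = amiresponse['Images'][0]
snapshot_ids = [
    mapping['Ebs']['SnapshotId']
    for mapping in image.get('BlockDeviceMappings', [])
    if 'Ebs' in mapping  # skip ephemeral/instance-store mappings
]
print(snapshot_ids)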