Airflow EMR DAG succeeds but cluster not started - amazon-web-services

I am trying to start an AWS EMR cluster and submit a step using EmrCreateJobFlowOperator and EmrAddStepsOperator. Both of my tasks succeed, but the cluster is never launched, not even without the step.
Both tasks change to the succeeded status.
Here is my code:
# imports for the snippet (contrib paths, as used in older Airflow versions)
from datetime import datetime, timedelta
import time

from airflow import DAG
from airflow.contrib.operators.emr_create_job_flow_operator import EmrCreateJobFlowOperator
from airflow.contrib.operators.emr_add_steps_operator import EmrAddStepsOperator

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 1, 1),
    'end_date': datetime(2019, 2, 1),
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}
step_args = ["spark-submit", '../test.py']
step = [{
    "Name": "what_you_do-" + time.strftime("%Y%m%d-%H:%M"),
    'ActionOnFailure': 'CONTINUE',
    'HadoopJarStep': {
        'Jar': 's3n://elasticmapreduce/libs/script-runner/script-runner.jar',
        'Args': step_args
    }
}]
JOB_FLOW_OVERRIDES = {
    'Instances': {
        'InstanceGroups': [
            {
                'InstanceRole': 'MASTER',
                'InstanceType': 'm4.large',
                'InstanceCount': 1
            },
            {
                'InstanceRole': 'CORE',
                'InstanceType': 'm4.large',
                'InstanceCount': 2,
            }
        ]
    },
    'Name': 'airflow-monthly_agg_custom',
    'BootstrapActions': [{
        'Name': 'Install',
        'ScriptBootstrapAction': {
            'Path': 's3://dep-buck/bootstrap.sh'
        }
    }],
    'Configurations': [
        {
            "Classification": "spark-env",
            "Configurations": [
                {
                    "Classification": "export",
                    "Properties": {
                        "PYSPARK_PYTHON": "/usr/bin/python3"
                    }
                }
            ]
        }
    ]
}
dag = DAG('emr_job_flow_automatic_steps_7',
          default_args=default_args,
          schedule_interval="@daily",
          max_active_runs=1,
          # schedule_interval='*/1 * * * *',
          catchup=True,
          # dagrun_timeout=timedelta(seconds=10)
          )
cluster_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow2',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
    aws_conn_id='aws_default',
    emr_conn_id='emr_default',
    dag=dag
)
step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow2', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=step,
    dag=dag
)
cluster_creator.set_downstream(step_adder)
I have tried searching for examples or good documentation, but there isn't much beyond the function definitions on the Airflow site.
For "create job flow" I have this log repeated several times.
For "add step" I have this in the log:

The problem was mainly about visibility to users and the region: the cluster was being started in the default region, so I had to change the properties below.
Airflow UI > Admin > Connections > aws_default > Extra:
{"region_name": "the region I was watching in the EC2 console"}
Airflow UI > Admin > Connections > emr_default > Extra (add to the existing JSON):
"VisibleToAllUsers": true

Related

How to send RequestTag in Lambda function to spin up EMR cluster

My Lambda function is supposed to spin up a transient EMR cluster. I am getting the error below:
"errorMessage": "An error occurred (AccessDeniedException) when calling the RunJobFlow operation: User: arn:aws:sts::111111111115:assumed-role/lambda-eks-role/transient_job is not authorized to perform: elasticmapreduce:RunJobFlow on resource: arn:aws:elasticmapreduce:ap-southeast-1:111111111115:cluster/* because no identity-based policy allows the elasticmapreduce:RunJobFlow action",
The above is a result of the IAM role condition below:
{
    "Sid": "RunJobFlowExplicitlyWithEMRManagedTag",
    "Effect": "Allow",
    "Action": [
        "elasticmapreduce:RunJobFlow"
    ],
    "Resource": "*",
    "Condition": {
        "StringEquals": {
            "aws:RequestTag/for-use-with-amazon-emr-managed-policies": "true"
        }
    }
}
I was told to pass the above tag (i.e., "for-use-with-amazon-emr-managed-policies": "true") when I create my cluster. How do I do that? Every time I search for Lambda and RequestTag, I don't find anything relevant.
FYI, I have no privilege to change the IAM roles. I was told by the admin, and the exact words were: "Can you add this tag ("for-use-with-amazon-emr-managed-policies": "true") to the cluster you are creating?"
I believe I have to add the tag in the function launch_transient_emr(), but I have no idea where exactly and how (and I'm still searching online for any relevant information). Any guidance is appreciated.
A snippet of my Lambda Code:
import json
import os
import boto3
from datetime import datetime
import urllib.parse
### Steps Configs (under function 'get_emr_step')
TODAY_DATE = datetime.today().strftime("%Y%m%d") + datetime.today().strftime("%H%M%s")[:-3]
JOB_TYPE_MAPPING = {
    'cowrie': {
        'job-script-path': 's3://bucket-test-transient/transient-job-scripts/emr_type1_job.py',
        'output_file_name': 'type1-results/'
    },
    'suricata': {
        'job-script-path': 's3://bucket-test-transient/transient-job-scripts/emr_type2_job.py',
        'output_file_name': 'type2-results/'
    }
}
### EMR Job Running Configs (under function 'launch_transient_emr')
CLUSTER_NAME = 'transient_emr_cluster_'+TODAY_DATE # TODO: insert some cluster name
LOGURI = os.environ['LOGURI']
RELEASE_LABEL = os.environ['RELEASE_LABEL']
EBS_ROOT_VOLUME_SIZE = int(os.environ['EBS_ROOT_VOLUME_SIZE'])  # boto3 expects an integer here, not a string
# Instance Variables
MASTER_INSTANCE_TYPE = os.environ['MASTER_INSTANCE_TYPE']
SLAVE_INSTANCE_TYPE = os.environ['SLAVE_INSTANCE_TYPE']
INSTANCE_COUNT = int(os.environ['INSTANCE_COUNT'])  # boto3 expects an integer here, not a string
EC2_SUBNET_ID = os.environ['EC2_SUBNET_ID']
# Roles
JOB_FLOW_ROLE = os.environ['JOB_FLOW_ROLE']
SERVICE_ROLE = os.environ['SERVICE_ROLE']
# Bootstrap
BOOTSTRAP_PATH = os.environ['BOOTSTRAP_PATH']
# Output File Configs
OUTPUT_BUCKET_NAME = os.environ['OUTPUT_BUCKET_NAME']
def get_emr_step(job_type, source_bucket_name, source_key):
    job_date = source_key.split("/")[1]
    spark_steps = [
        {
            "Name": job_type + "-daily-job-" + job_date + "-" + TODAY_DATE,
            "ActionOnFailure": "TERMINATE_CLUSTER",
            "HadoopJarStep": {
                "Jar": "command-runner.jar",
                "Args": [
                    "sudo",
                    "spark-submit",
                    "--deploy-mode",
                    "client",
                    JOB_TYPE_MAPPING[job_type]["job-script-path"],
                    "--input_bucket_name",
                    source_bucket_name,
                    "--input_key_name",
                    source_key,
                    "--output_bucket_name",
                    OUTPUT_BUCKET_NAME,
                    "--output_file_name",
                    JOB_TYPE_MAPPING[job_type]["output_file_name"]
                ],
            }
        }
    ]
    return spark_steps
def launch_transient_emr(spark_steps):
    client = get_emr_client()
    response = client.run_job_flow(
        Name=CLUSTER_NAME,
        LogUri=LOGURI,
        ReleaseLabel=RELEASE_LABEL,
        EbsRootVolumeSize=EBS_ROOT_VOLUME_SIZE,
        Instances={
            'MasterInstanceType': MASTER_INSTANCE_TYPE,
            'SlaveInstanceType': SLAVE_INSTANCE_TYPE,
            'InstanceCount': INSTANCE_COUNT,
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False,
            'Ec2SubnetId': EC2_SUBNET_ID
        },
        Applications=[{'Name': 'Spark'}],
        Configurations=[
            {
                'Classification': 'spark-hive-site',
                'Properties': {
                    'hive.metastore.client.factory.class': 'com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory'
                }
            },
            {
                "Classification": "spark",
                "Properties": {
                    "maximizeResourceAllocation": "true"
                }
            },
            {
                "Classification": "spark-defaults",
                "Properties": {
                    "spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT": "1",
                    "spark.network.timeout": "1500"
                }
            },
            {
                "Classification": "hdfs-site",
                "Properties": {
                    "dfs.replication": "2"
                }
            },
            {
                "Classification": "livy-conf",
                "Properties": {
                    "livy.server.session.timeout": "10h"
                }
            },
            {
                "Classification": "emrfs-site",
                "Properties": {
                    "fs.s3.maxConnections": "100"
                }
            }
        ],
        VisibleToAllUsers=True,
        JobFlowRole=JOB_FLOW_ROLE,
        ServiceRole=SERVICE_ROLE,
        Steps=spark_steps,
        BootstrapActions=[
            {
                'Name': 'string',
                'ScriptBootstrapAction': {
                    'Path': BOOTSTRAP_PATH
                }
            }
        ]
    )
    return response
def get_emr_client():
    return boto3.client("emr")

def lambda_handler(event, context):
    # Get the object from the event and show its content type
    source_bucket_name = event['Records'][0]['s3']['bucket']['name']
    source_key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    try:
        job_type = 'type1' if 'type1' in source_key else 'type2'
        spark_steps = get_emr_step(job_type, source_bucket_name, source_key)
        response = launch_transient_emr(spark_steps)
        return {"status": "Successfully launched EMR cluster"}
    except Exception as e:
        print(e)
        raise e
Take a look here: https://docs.aws.amazon.com/emr/latest/APIReference/API_RunJobFlow.html
Focus on the Tags request parameter.
The key you need is for-use-with-amazon-emr-managed-policies and the value is true. You can follow the same approach you did for the steps. Your admin is right.
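For example, a minimal sketch of the addition, assuming the rest of launch_transient_emr() stays as above (the variable name below is made up for illustration):
# Hedged sketch: the tag that satisfies the IAM condition
# aws:RequestTag/for-use-with-amazon-emr-managed-policies shown earlier.
emr_managed_policy_tags = [
    {'Key': 'for-use-with-amazon-emr-managed-policies', 'Value': 'true'},
]
Pass it as Tags=emr_managed_policy_tags in the existing client.run_job_flow(...) call, alongside Name, Instances, Steps and the other arguments; RunJobFlow accepts a Tags list of Key/Value pairs.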

I want to get metrics of multiple ec2s at once from aws cloudwatch

With AWS CloudWatch I can use get_metric_data and get_metric_statistics for a single EC2 instance.
However, I have not been able to fetch information for multiple EC2 instances at once.
How can I get metrics for multiple EC2 instances with one API call?
The metrics should be separated per instance.
Example:
Get single EC2 data (success):
import boto3
from datetime import datetime, timedelta
from dateutil.tz import tzutc
session = boto3.Session(profile_name="XXXXXXXXXX")
client = session.client('cloudwatch', 'ap-northeast-2')
response = client.get_metric_data(
MetricDataQueries=[
{
'Id': 'cpuUtilization',
'MetricStat': {
'Metric': {
'Namespace': 'AWS/EC2',
'MetricName': 'CPUUtilization',
'Dimensions': [
{
'Name': 'InstanceId',
'Value': 'i-XXXXXXXXX'
},
]
},
'Period': 60,
'Stat': 'Average'
}
}
],
StartTime=datetime(2022, 7, 7, 0, 0, 0, tzinfo=tzutc()),
EndTime=datetime(2022, 7, 7, 23, 59, 59, tzinfo=tzutc())
)
print(response)
RESULT:
{'MetricDataResults': [{'Id': 'cpuUtilization',
Get multiple EC2 data (fail):
import boto3
from datetime import datetime, timedelta
from dateutil.tz import tzutc
session = boto3.Session(profile_name="XXXXXXXXXX")
client = session.client('cloudwatch', 'ap-northeast-2')
response = client.get_metric_data(
MetricDataQueries=[
{
'Id': 'cpuUtilization',
'MetricStat': {
'Metric': {
'Namespace': 'AWS/EC2',
'MetricName': 'CPUUtilization',
'Dimensions': [
{
'Name': 'InstanceId',
'Value': 'i-XXXXXXXXX'
},
]
},
'Period': 60,
'Stat': 'Average'
}
},
{
'Id': 'cpuUtilization',
'MetricStat': {
'Metric': {
'Namespace': 'AWS/EC2',
'MetricName': 'CPUUtilization',
'Dimensions': [
{
'Name': 'InstanceId',
'Value': 'i-XXXXXXXXX'
},
]
},
'Period': 60,
'Stat': 'Average'
}
}
],
StartTime=datetime(2022, 7, 7, 0, 0, 0, tzinfo=tzutc()),
EndTime=datetime(2022, 7, 7, 23, 59, 59, tzinfo=tzutc())
)
print(response)
RESULT:
botocore.exceptions.ClientError: An error occurred (ValidationError) when calling the GetMetricData operation: The values for parameter id in MetricDataQueries are not unique.
AWS-SDK Cloudwatch MaxQueryTimeRangeExceed
I was able to find the answer I was looking for in the link above, but it has a 3-hour limit. If you have another good way, please share.
const AWS = require("aws-sdk");
AWS.config.loadFromPath("./config.json");
var cloudwatch = new AWS.CloudWatch({apiVersion: "2010-08-01"});
var params = {
StartTime: new Date('june 06, 2022 17:30'),
EndTime: new Date('june 06, 2022 18:00'),
MetricDataQueries: [
{
Id: 'q1',
Expression: "SELECT AVG(CPUUtilization) FROM SCHEMA(\"AWS/EC2\", InstanceId) WHERE InstanceId = 'i-**********'",
Period: '600'
},
],
};
cloudwatch.getMetricData(params, function(err, data) {
if (err) console.log(err, err.stack);
else console.log(data);
});
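If the 3-hour limit of the SELECT-based query above is a problem, another option is to stay with plain MetricStat queries and simply give each instance its own query Id; the ValidationError earlier is only about the duplicate 'cpuUtilization' Id. A minimal sketch in boto3, with placeholder profile and instance IDs:
import boto3
from datetime import datetime
from dateutil.tz import tzutc

# Hedged sketch: one MetricStat query per instance, each with a unique Id.
instance_ids = ['i-XXXXXXXXX1', 'i-XXXXXXXXX2']

session = boto3.Session(profile_name="XXXXXXXXXX")
client = session.client('cloudwatch', 'ap-northeast-2')

queries = [
    {
        'Id': f'cpu_{idx}',    # Ids must be unique and start with a lowercase letter
        'Label': instance_id,  # keeps the results separated per instance
        'MetricStat': {
            'Metric': {
                'Namespace': 'AWS/EC2',
                'MetricName': 'CPUUtilization',
                'Dimensions': [{'Name': 'InstanceId', 'Value': instance_id}],
            },
            'Period': 60,
            'Stat': 'Average',
        },
    }
    for idx, instance_id in enumerate(instance_ids)
]

response = client.get_metric_data(
    MetricDataQueries=queries,
    StartTime=datetime(2022, 7, 7, 0, 0, 0, tzinfo=tzutc()),
    EndTime=datetime(2022, 7, 7, 23, 59, 59, tzinfo=tzutc()),
)
for result in response['MetricDataResults']:
    print(result['Label'], result['Values'][:5])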

EMR Cluster: AutoScaling Policy For Instance Group Could Not Attach And Failed

I am trying to automate the EMR cluster creation through boto3. Unfortunately, I'm getting the following warning:
The Auto Scaling policy for instance group ig-MI0ANZ0C3WNN in Amazon EMR cluster j-BS3Y2OAO65R6 (qidv2_historical_3.0.1) could not attach and failed at 2021-09-20 17:41 UTC.
I cannot figure out what the issue is. This was adapted from an AWS CLI command, which didn't raise any warnings or issues, but after transitioning to boto3 I started getting this autoscaling policy warning.
cluster_id = self.boto_client().run_job_flow(
Name=self.cluster_name,
LogUri='s3n://aws-logs',
JobFlowRole='EMR_EC2_DefaultRole',
ReleaseLabel=self.release_label,
Applications=[{'Name': 'Spark'},{'Name': 'Hive'},{'Name': 'Hadoop'},{'Name': 'Pig'},{'Name': 'Hue'},
{'Name': 'Zeppelin'},{'Name': 'Livy'},{'Name': 'JupyterHub'},{'Name': 'Tensorflow'}
],
AutoScalingRole='EMR_AutoScaling_DefaultRole',
BootstrapActions=[
{
'Name': 'Custom action',
'ScriptBootstrapAction': {
'Path': 's3://ml-data/emr-bootstrap_spk3.0.1.sh'
}
}
],
ServiceRole='EMR_DefaultRole',
ScaleDownBehavior='TERMINATE_AT_TASK_COMPLETION',
EbsRootVolumeSize=25,
Steps=[
{
'Name': 'Setup Debugging',
'ActionOnFailure': 'TERMINATE_CLUSTER',
'HadoopJarStep': {
'Jar': 'command-runner.jar',
'Args': ['state-pusher-script']
}
},
{
'Name': 'Setup - Sync with S3',
'ActionOnFailure': 'CANCEL_AND_WAIT',
'HadoopJarStep': {
'Jar': 'command-runner.jar',
'Args': ['aws', 's3', 'sync',
's3://ch-ml-data/',
'/mnt/src/']
}
},
{
'Name': 'Spark Application',
'ActionOnFailure': 'CANCEL_AND_WAIT',
'HadoopJarStep': {
'Jar': 'command-runner.jar',
'Args': ['cd /mnt/src; bash spark_jobs/qid_pipeline_historical_run.sh']
}
}
],
Configurations=[
{
'Classification': 'zeppelin-env',
'Properties': {},
'Configurations': [
{
'Classification': 'export',
'Properties': {
'ZEPPELIN_PORT': '8890',
'HADOOP_CONF_DIR': '/etc/hadoop/conf',
'ZEPPELIN_LOG_DIR': '/var/log/zeppelin',
'ZEPPELIN_PID': '$ZEPPELIN_PID_DIR/zeppelin.pid',
'MASTER': 'yarn-client',
'SPARK_SUBMIT_OPTIONS': "$SPARK_SUBMIT_OPTIONS --conf '\''spark.executorEnv.PYTHONPATH=/usr/lib/spark/python/lib/py4j-src.zip:/usr/lib/spark/python/:<CPS>{{PWD}}/pyspark.zip<CPS>{{PWD}}/py4j-src.zip'\'' --conf spark.yarn.isPython=true",
'PYSPARK_DRIVER_PYTHON': '/mnt/anaconda3/envs/question-identification-v2/bin/python',
'ZEPPELIN_NOTEBOOK_USER': 'user',
'CLASSPATH': ':/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar',
'ZEPPELIN_PID_DIR': '/var/run/zeppelin',
'PYSPARK_PYTHON': '/mnt/anaconda3/envs/question-identification-v2/bin/python',
'SPARK_HOME': '/usr/lib/spark',
'ZEPPELIN_NOTEBOOK_S3_BUCKET': 'ch-ml-data',
'ZEPPELIN_WAR_TEMPDIR': '/var/run/zeppelin/webapps',
'ZEPPELIN_CONF_DIR': '/etc/zeppelin/conf',
'ZEPPELIN_NOTEBOOK_STORAGE': 'org.apache.zeppelin.notebook.repo.S3NotebookRepo',
'ZEPPELIN_NOTEBOOK_DIR': '/var/lib/zeppelin/notebook',
'ZEPPELIN_ADDR': '0.0.0.0'
}
}
]
},
{
'Classification': 'hive-site',
'Properties': {
'hive.metastore.client.factory.class': 'com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory'
}
},
{
'Classification': 'spark-hive-site',
'Properties': {
'hive.metastore.client.factory.class': 'com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory'
}
}
],
Instances={
'Ec2KeyName': 'emr-temporary',
'KeepJobFlowAliveWhenNoSteps': False,
'TerminationProtected': False,
'Ec2SubnetId': 'subnet-063735e4fa63e3bac',
'AdditionalSlaveSecurityGroups': ["sg-012970517d0a88bae", "sg-01813cf2115b55874", "sg-04563fc7e8ed9e1ec", "sg-07ab30655981361ad"],
'ServiceAccessSecurityGroup': 'sg-00dd6e63d7004176d',
'EmrManagedSlaveSecurityGroup': 'sg-048b83d1a20550b43',
'EmrManagedMasterSecurityGroup': 'sg-017402b74e879aaa5',
'AdditionalMasterSecurityGroups': ["sg-012970517d0a88bae", "sg-01813cf2115b55874", "sg-04563fc7e8ed9e1ec", "sg-07ab30655981361ad"],
'InstanceGroups': [
{
'Name': 'Task',
'InstanceRole': 'TASK',
'InstanceType': 'i3.2xlarge',
'InstanceCount': 1
},
{
'Name': 'Master - 1',
'InstanceRole': 'MASTER',
'InstanceType': 'i3.2xlarge',
'InstanceCount': 1,
},
{
'Name': 'Core - 2',
'InstanceRole': 'CORE',
'InstanceType': 'i3.2xlarge',
'InstanceCount': 1,
'Market': 'SPOT',
'AutoScalingPolicy': {
'Constraints': {
'MinCapacity': 3,
'MaxCapacity': 100
},
'Rules': [
{
'Name': 'memory',
'Description': '',
'Action': {
'SimpleScalingPolicyConfiguration': {
'ScalingAdjustment': 10,
'CoolDown': 300,
'AdjustmentType': 'CHANGE_IN_CAPACITY'
}
},
'Trigger': {
'CloudWatchAlarmDefinition': {
'MetricName': 'YARNMemoryAvailablePercentage',
'ComparisonOperator': 'LESS_THAN',
'Statistic': 'AVERAGE',
'Period': 300,
'EvaluationPeriods': 2,
'Unit': 'PERCENT',
'Namespace': 'AWS/ElasticMapReduce',
'Threshold': 25,
'Dimensions': [
{
'Value': '${emr.clusterId}',
'Key': 'JobFlowId'
}
]
}
}
},
{
'Name': 'mem',
'Description': '',
'Action': {
'SimpleScalingPolicyConfiguration': {
'ScalingAdjustment': -5,
'CoolDown': 300,
'AdjustmentType': 'CHANGE_IN_CAPACITY'
}
},
'Trigger': {
'CloudWatchAlarmDefinition': {
'MetricName': 'YARNMemoryAvailablePercentage',
'ComparisonOperator': 'GREATER_THAN_OR_EQUAL',
'Statistic': 'AVERAGE',
'Period': 300,
'EvaluationPeriods': 18,
'Unit': 'PERCENT',
'Namespace': 'AWS/ElasticMapReduce',
'Threshold': 50,
'Dimensions': [
{
'Value': '${emr.clusterId}',
'Key': 'JobFlowId'
}
],
}
}
}
]
}
}
]
}
)

Airflow DAG EMR EmrCreateJobFlowOperator Doesn't Do Anything

I'm trying to run an Airflow DAG which creates an EMR cluster, adds some steps, checks them, and finally terminates the EMR cluster that was created.
But when I run the DAG, it stays in the running state continuously and doesn't show any error or log.
Can anyone tell me what I'm doing wrong here?
Is there any missing parameter that I should add?
Or is it a problem with the DAG schedule?
import airflow
from datetime import timedelta
from airflow import DAG
from airflow.contrib.operators.emr_create_job_flow_operator import EmrCreateJobFlowOperator
from airflow.contrib.operators.emr_add_steps_operator import EmrAddStepsOperator
from airflow.contrib.sensors.emr_step_sensor import EmrStepSensor
from airflow.contrib.operators.emr_terminate_job_flow_operator import EmrTerminateJobFlowOperator
DEFAULT_ARGS = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': airflow.utils.dates.days_ago(2),
'email': ['airflow@example.com'],
'email_on_failure': False,
'email_on_retry': False
}
HIVE_CLOUDFRONT = [
{
'Name': 'cloudfront',
'ActionOnFailure': 'CONTINUE',
'HadoopJarStep': {
'Jar': 'command-runner.jar',
'Args': [
'hive-script',
'--run-hive-script',
'--args',
'-f',
's3://BUCKET/xnder/scripts/Hive_CloudFront.q',
'-d',
'INPUT=s3://BUCKET/',
'-d',
'OUTPUT=s3://BUCKET/output5/'
]
}
}
]
JOB_FLOW_OVERRIDES = {
'Name' : 'test1212',
'LogUri' : 's3://BUCKET/log.txt',
'ReleaseLabel' : 'emr-4.1.0',
'Instances' : {
'InstanceGroups': [
{
'Name': 'Master nodes',
'Market': 'ON_DEMAND',
'InstanceRole': 'MASTER',
'InstanceType': 'm1.large',
'InstanceCount': 1,
},
{
'Name': 'Slave nodes',
'Market': 'ON_DEMAND',
'InstanceRole': 'CORE',
'InstanceType': 'm1.large',
'InstanceCount': 1,
}
],
'KeepJobFlowAliveWhenNoSteps': True,
'TerminationProtected': False
},
'Applications':[{
'Name': 'Hadoop'
}],
'JobFlowRole':'EMR_EC2_DefaultRole',
'ServiceRole':'EMR_DefaultRole'
}
dag = DAG(
'emr_test_manual',
default_args=DEFAULT_ARGS,
dagrun_timeout=timedelta(hours=2),
#schedule_interval='0 3 * * *'
#schedule_interval=timedelta(seconds=10)
schedule_interval='@once'
)
cluster_creator = EmrCreateJobFlowOperator(
task_id='create_job_flow_cluster',
job_flow_overrides=JOB_FLOW_OVERRIDES,
aws_conn_id='aws_default',
emr_conn_id='emr_default',
dag=dag
)
step_adder = EmrAddStepsOperator(
task_id='add_steps',
job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
aws_conn_id='aws_default',
steps=HIVE_CLOUDFRONT,
dag=dag
)
step_checker = EmrStepSensor(
task_id='watch_step',
job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
step_id="{{ task_instance.xcom_pull('add_steps', key='return_value')[0] }}",
aws_conn_id='aws_default',
dag=dag
)
cluster_remover = EmrTerminateJobFlowOperator(
task_id='remove_cluster',
job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
aws_conn_id='aws_default',
dag=dag
)
cluster_creator.set_downstream(step_adder)
step_adder.set_downstream(step_checker)
step_checker.set_downstream(cluster_remover)
Remove the field 'Market': 'ON_DEMAND' from the instance groups.
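A minimal sketch of the change (the variable name is only for illustration; in the DAG this stays the 'Instances' value inside JOB_FLOW_OVERRIDES, with nothing added and only the 'Market' keys dropped):
INSTANCES_WITHOUT_MARKET = {
    'InstanceGroups': [
        {
            'Name': 'Master nodes',
            'InstanceRole': 'MASTER',
            'InstanceType': 'm1.large',
            'InstanceCount': 1,
        },
        {
            'Name': 'Slave nodes',
            'InstanceRole': 'CORE',
            'InstanceType': 'm1.large',
            'InstanceCount': 1,
        }
    ],
    'KeepJobFlowAliveWhenNoSteps': True,
    'TerminationProtected': False
}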

AWS Pricing API not yielding prices for the given search criteria

I am using the AWS boto3 Pricing API to get instance prices.
But I am not getting results for the combination (us-west-2, r3.2xlarge, Linux, no pre-installed software, tenancy = Shared).
Here is my code:
import json
import boto3

pricing = boto3.client('pricing', region_name='us-east-1')
hourlyTermCode = 'JRTCKXETXF'
rateCode = '6YS6EN2CT7'
token = ''
while True:
    paginator = pricing.get_paginator('get_products')
    pages = paginator.paginate(
        ServiceCode='AmazonEC2',
        Filters=[
            {'Type': 'TERM_MATCH', 'Field': 'operatingSystem', 'Value': 'Linux'},
            {'Type': 'TERM_MATCH', 'Field': 'location', 'Value': 'US West (Oregon)'}
        ],
        PaginationConfig={
            'StartingToken': token
        }
    )
    for response in pages:
        for price in response['PriceList']:
            resp = json.loads(price)
            product = resp['product']  # ['attributes']['']
            sku = product['sku']
            if product['productFamily'] == 'Compute Instance':
                if str(product['attributes']['instanceType']) == str(amazon_instance_type):
                    if str(product['attributes']['operatingSystem']) == 'Linux':
                        if str(product['attributes']['preInstalledSw']) == 'NA':
                            if str(product['attributes']['tenancy']) == 'Shared':
                                sku_key = resp['terms']['OnDemand'].get(sku)
                                if sku_key:
                                    price = sku_key[sku + '.' + hourlyTermCode + '.' + rateCode]['pricePerUnit']['USD']
                                    print 'here 7'
                                    print price
    try:
        token = response['NextToken']
    except KeyError:
        pass
This works:
import json
import boto3

client = boto3.client('pricing', region_name='us-east-1')
response = client.get_products(
    ServiceCode='AmazonEC2',
    Filters=[
        {'Type': 'TERM_MATCH', 'Field': 'operatingSystem', 'Value': 'Linux'},
        {'Type': 'TERM_MATCH', 'Field': 'location', 'Value': 'US West (Oregon)'},
        {'Type': 'TERM_MATCH', 'Field': 'instanceType', 'Value': 'r3.2xlarge'},
        {'Type': 'TERM_MATCH', 'Field': 'tenancy', 'Value': 'Shared'},
        {'Type': 'TERM_MATCH', 'Field': 'preInstalledSw', 'Value': 'NA'}
    ]
)
for pricelist_json in response['PriceList']:
    pricelist = json.loads(pricelist_json)
    product = pricelist['product']
    if product['productFamily'] == 'Compute Instance':
        print pricelist['terms']['OnDemand'].values()[0]['priceDimensions'].values()[0][u'pricePerUnit']['USD']
It is based on the output of:
{u'FormatVersion': u'aws_v1', u'PriceList': [u'{
"product": {
"productFamily": "Compute Instance",
"attributes": {
"enhancedNetworkingSupported": "Yes",
"memory": "61 GiB",
"vcpu": "8",
"capacitystatus": "Used",
"locationType": "AWS Region",
"storage": "1 x 160 SSD",
"instanceFamily": "Memory optimized",
"operatingSystem": "Linux",
"physicalProcessor": "Intel Xeon E5-2670 v2 (Ivy Bridge)",
"clockSpeed": "2.5 GHz",
"ecu": "26",
"networkPerformance": "High",
"servicename": "Amazon Elastic Compute Cloud",
"instanceType": "r3.2xlarge",
"tenancy": "Shared",
"usagetype": "USW2-BoxUsage:r3.2xlarge",
"normalizationSizeFactor": "16",
"processorFeatures": "Intel AVX; Intel Turbo",
"servicecode": "AmazonEC2",
"licenseModel": "No License required",
"currentGeneration": "No",
"preInstalledSw": "NA",
"location": "US West (Oregon)",
"processorArchitecture": "64-bit",
"operation": "RunInstances"
},
"sku": "GMTWE5CTY4FEUYDN"
},
"serviceCode": "AmazonEC2",
"terms": {
"OnDemand": {
"GMTWE5CTY4FEUYDN.JRTCKXETXF": {
"priceDimensions": {
"GMTWE5CTY4FEUYDN.JRTCKXETXF.6YS6EN2CT7": {
"unit": "Hrs",
"endRange": "Inf",
"description": "$0.665 per On Demand Linux r3.2xlarge Instance Hour",
"appliesTo": [],
"rateCode": "GMTWE5CTY4FEUYDN.JRTCKXETXF.6YS6EN2CT7",
"beginRange": "0",
"pricePerUnit": {
"USD": "0.6650000000"
}
}
},
"sku": "GMTWE5CTY4FEUYDN",
"effectiveDate": "2018-07-01T00:00:00Z",
"offerTermCode": "JRTCKXETXF",
"termAttributes": {}
}
},
...
},
"version": "20180726190848",
"publicationDate": "2018-07-26T19:08:48Z"
}'
]
}
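For Python 3, a minimal variant of the working snippet above might look like this; the only real change is unpacking the nested OnDemand terms with next(iter(...)) instead of the Python 2 .values()[0] indexing:
import json
import boto3

client = boto3.client('pricing', region_name='us-east-1')

response = client.get_products(
    ServiceCode='AmazonEC2',
    Filters=[
        {'Type': 'TERM_MATCH', 'Field': 'operatingSystem', 'Value': 'Linux'},
        {'Type': 'TERM_MATCH', 'Field': 'location', 'Value': 'US West (Oregon)'},
        {'Type': 'TERM_MATCH', 'Field': 'instanceType', 'Value': 'r3.2xlarge'},
        {'Type': 'TERM_MATCH', 'Field': 'tenancy', 'Value': 'Shared'},
        {'Type': 'TERM_MATCH', 'Field': 'preInstalledSw', 'Value': 'NA'}
    ]
)

for pricelist_json in response['PriceList']:
    pricelist = json.loads(pricelist_json)
    if pricelist['product']['productFamily'] == 'Compute Instance':
        # Walk OnDemand -> (single offer term) -> priceDimensions -> (single dimension)
        on_demand = next(iter(pricelist['terms']['OnDemand'].values()))
        dimension = next(iter(on_demand['priceDimensions'].values()))
        print(dimension['pricePerUnit']['USD'])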