Airflow DAG EMR EmrCreateJobFlowOperator doesn't do anything - amazon-web-services

I'm trying to run an Airflow DAG which creates an EMR cluster, adds some steps, checks them, and finally terminates the EMR cluster that was created.
But when I run the DAG, it stays in the running status continuously and doesn't show any error or log.
Can anyone tell me what I'm doing wrong here?
Is there any missing parameter that I should add?
Or is it a problem with the DAG schedule?
import airflow
from datetime import timedelta
from airflow import DAG
from airflow.contrib.operators.emr_create_job_flow_operator import EmrCreateJobFlowOperator
from airflow.contrib.operators.emr_add_steps_operator import EmrAddStepsOperator
from airflow.contrib.sensors.emr_step_sensor import EmrStepSensor
from airflow.contrib.operators.emr_terminate_job_flow_operator import EmrTerminateJobFlowOperator
DEFAULT_ARGS = {
'owner': 'airflow',
'depends_on_past': False,
'start_date': airflow.utils.dates.days_ago(2),
'email': ['airflow@example.com'],
'email_on_failure': False,
'email_on_retry': False
}
HIVE_CLOUDFRONT = [
{
'Name': 'cloudfront',
'ActionOnFailure': 'CONTINUE',
'HadoopJarStep': {
'Jar': 'command-runner.jar',
'Args': [
'hive-script',
'--run-hive-script',
'--args',
'-f',
's3://BUCKET/xnder/scripts/Hive_CloudFront.q',
'-d',
'INPUT=s3://BUCKET/',
'-d',
'OUTPUT=s3://BUCKET/output5/'
]
}
}
]
JOB_FLOW_OVERRIDES = {
'Name' : 'test1212',
'LogUri' : 's3://BUCKET/log.txt',
'ReleaseLabel' : 'emr-4.1.0',
'Instances' : {
'InstanceGroups': [
{
'Name': 'Master nodes',
'Market': 'ON_DEMAND',
'InstanceRole': 'MASTER',
'InstanceType': 'm1.large',
'InstanceCount': 1,
},
{
'Name': 'Slave nodes',
'Market': 'ON_DEMAND',
'InstanceRole': 'CORE',
'InstanceType': 'm1.large',
'InstanceCount': 1,
}
],
'KeepJobFlowAliveWhenNoSteps': True,
'TerminationProtected': False
},
'Applications':[{
'Name': 'Hadoop'
}],
'JobFlowRole':'EMR_EC2_DefaultRole',
'ServiceRole':'EMR_DefaultRole'
}
dag = DAG(
'emr_test_manual',
default_args=DEFAULT_ARGS,
dagrun_timeout=timedelta(hours=2),
#schedule_interval='0 3 * * *'
#schedule_interval=timedelta(seconds=10)
schedule_interval='@once'
)
cluster_creator = EmrCreateJobFlowOperator(
task_id='create_job_flow_cluster',
job_flow_overrides=JOB_FLOW_OVERRIDES,
aws_conn_id='aws_default',
emr_conn_id='emr_default',
dag=dag
)
step_adder = EmrAddStepsOperator(
task_id='add_steps',
job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
aws_conn_id='aws_default',
steps=HIVE_CLOUDFRONT,
dag=dag
)
step_checker = EmrStepSensor(
task_id='watch_step',
job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
step_id="{{ task_instance.xcom_pull('add_steps', key='return_value')[0] }}",
aws_conn_id='aws_default',
dag=dag
)
cluster_remover = EmrTerminateJobFlowOperator(
task_id='remove_cluster',
job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
aws_conn_id='aws_default',
dag=dag
)
cluster_creator.set_downstream(step_adder)
step_adder.set_downstream(step_checker)
step_checker.set_downstream(cluster_remover)

Remove the field 'Market': 'ON_DEMAND' from the instance group definitions in JOB_FLOW_OVERRIDES.
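For reference, a minimal sketch of the same InstanceGroups block with that field removed (the rest of JOB_FLOW_OVERRIDES unchanged):
'InstanceGroups': [
    {
        'Name': 'Master nodes',
        'InstanceRole': 'MASTER',
        'InstanceType': 'm1.large',
        'InstanceCount': 1,
    },
    {
        'Name': 'Slave nodes',
        'InstanceRole': 'CORE',
        'InstanceType': 'm1.large',
        'InstanceCount': 1,
    }
],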

Related

EMR Cluster: AutoScaling Policy For Instance Group Could Not Attach And Failed

I am trying to automate the EMR cluster creation through boto3. Unfortunately, I'm getting the following warning:
The Auto Scaling policy for instance group ig-MI0ANZ0C3WNN in Amazon EMR cluster j-BS3Y2OAO65R6 (qidv2_historical_3.0.1) could not attach and failed at 2021-09-20 17:41 UTC.
I cannot figure out what the issue is. This was adapted from an AWS CLI command which didn't raise any warnings or issues, but after transitioning to boto3 I started getting this auto-scaling policy warning:
cluster_id = self.boto_client().run_job_flow(
Name=self.cluster_name,
LogUri='s3n://aws-logs',
JobFlowRole='EMR_EC2_DefaultRole',
ReleaseLabel=self.release_label,
Applications=[{'Name': 'Spark'},{'Name': 'Hive'},{'Name': 'Hadoop'},{'Name': 'Pig'},{'Name': 'Hue'},
{'Name': 'Zeppelin'},{'Name': 'Livy'},{'Name': 'JupyterHub'},{'Name': 'Tensorflow'}
],
AutoScalingRole='EMR_AutoScaling_DefaultRole',
BootstrapActions=[
{
'Name': 'Custom action',
'ScriptBootstrapAction': {
'Path': 's3://ml-data/emr-bootstrap_spk3.0.1.sh'
}
}
],
ServiceRole='EMR_DefaultRole',
ScaleDownBehavior='TERMINATE_AT_TASK_COMPLETION',
EbsRootVolumeSize=25,
Steps=[
{
'Name': 'Setup Debugging',
'ActionOnFailure': 'TERMINATE_CLUSTER',
'HadoopJarStep': {
'Jar': 'command-runner.jar',
'Args': ['state-pusher-script']
}
},
{
'Name': 'Setup - Sync with S3',
'ActionOnFailure': 'CANCEL_AND_WAIT',
'HadoopJarStep': {
'Jar': 'command-runner.jar',
'Args': ['aws', 's3', 'sync',
's3://ch-ml-data/',
'/mnt/src/']
}
},
{
'Name': 'Spark Application',
'ActionOnFailure': 'CANCEL_AND_WAIT',
'HadoopJarStep': {
'Jar': 'command-runner.jar',
'Args': ['cd /mnt/src; bash spark_jobs/qid_pipeline_historical_run.sh']
}
}
],
Configurations=[
{
'Classification': 'zeppelin-env',
'Properties': {},
'Configurations': [
{
'Classification': 'export',
'Properties': {
'ZEPPELIN_PORT': '8890',
'HADOOP_CONF_DIR': '/etc/hadoop/conf',
'ZEPPELIN_LOG_DIR': '/var/log/zeppelin',
'ZEPPELIN_PID': '$ZEPPELIN_PID_DIR/zeppelin.pid',
'MASTER': 'yarn-client',
'SPARK_SUBMIT_OPTIONS': "$SPARK_SUBMIT_OPTIONS --conf '\''spark.executorEnv.PYTHONPATH=/usr/lib/spark/python/lib/py4j-src.zip:/usr/lib/spark/python/:<CPS>{{PWD}}/pyspark.zip<CPS>{{PWD}}/py4j-src.zip'\'' --conf spark.yarn.isPython=true",
'PYSPARK_DRIVER_PYTHON': '/mnt/anaconda3/envs/question-identification-v2/bin/python',
'ZEPPELIN_NOTEBOOK_USER': 'user',
'CLASSPATH': ':/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar',
'ZEPPELIN_PID_DIR': '/var/run/zeppelin',
'PYSPARK_PYTHON': '/mnt/anaconda3/envs/question-identification-v2/bin/python',
'SPARK_HOME': '/usr/lib/spark',
'ZEPPELIN_NOTEBOOK_S3_BUCKET': 'ch-ml-data',
'ZEPPELIN_WAR_TEMPDIR': '/var/run/zeppelin/webapps',
'ZEPPELIN_CONF_DIR': '/etc/zeppelin/conf',
'ZEPPELIN_NOTEBOOK_STORAGE': 'org.apache.zeppelin.notebook.repo.S3NotebookRepo',
'ZEPPELIN_NOTEBOOK_DIR': '/var/lib/zeppelin/notebook',
'ZEPPELIN_ADDR': '0.0.0.0'
}
}
]
},
{
'Classification': 'hive-site',
'Properties': {
'hive.metastore.client.factory.class': 'com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory'
}
},
{
'Classification': 'spark-hive-site',
'Properties': {
'hive.metastore.client.factory.class': 'com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory'
}
}
],
Instances={
'Ec2KeyName': 'emr-temporary',
'KeepJobFlowAliveWhenNoSteps': False,
'TerminationProtected': False,
'Ec2SubnetId': 'subnet-063735e4fa63e3bac',
'AdditionalSlaveSecurityGroups': ["sg-012970517d0a88bae", "sg-01813cf2115b55874", "sg-04563fc7e8ed9e1ec", "sg-07ab30655981361ad"],
'ServiceAccessSecurityGroup': 'sg-00dd6e63d7004176d',
'EmrManagedSlaveSecurityGroup': 'sg-048b83d1a20550b43',
'EmrManagedMasterSecurityGroup': 'sg-017402b74e879aaa5',
'AdditionalMasterSecurityGroups': ["sg-012970517d0a88bae", "sg-01813cf2115b55874", "sg-04563fc7e8ed9e1ec", "sg-07ab30655981361ad"],
'InstanceGroups': [
{
'Name': 'Task',
'InstanceRole': 'TASK',
'InstanceType': 'i3.2xlarge',
'InstanceCount': 1
},
{
'Name': 'Master - 1',
'InstanceRole': 'MASTER',
'InstanceType': 'i3.2xlarge',
'InstanceCount': 1,
},
{
'Name': 'Core - 2',
'InstanceRole': 'CORE',
'InstanceType': 'i3.2xlarge',
'InstanceCount': 1,
'Market': 'SPOT',
'AutoScalingPolicy': {
'Constraints': {
'MinCapacity': 3,
'MaxCapacity': 100
},
'Rules': [
{
'Name': 'memory',
'Description': '',
'Action': {
'SimpleScalingPolicyConfiguration': {
'ScalingAdjustment': 10,
'CoolDown': 300,
'AdjustmentType': 'CHANGE_IN_CAPACITY'
}
},
'Trigger': {
'CloudWatchAlarmDefinition': {
'MetricName': 'YARNMemoryAvailablePercentage',
'ComparisonOperator': 'LESS_THAN',
'Statistic': 'AVERAGE',
'Period': 300,
'EvaluationPeriods': 2,
'Unit': 'PERCENT',
'Namespace': 'AWS/ElasticMapReduce',
'Threshold': 25,
'Dimensions': [
{
'Value': '${emr.clusterId}',
'Key': 'JobFlowId'
}
]
}
}
},
{
'Name': 'mem',
'Description': '',
'Action': {
'SimpleScalingPolicyConfiguration': {
'ScalingAdjustment': -5,
'CoolDown': 300,
'AdjustmentType': 'CHANGE_IN_CAPACITY'
}
},
'Trigger': {
'CloudWatchAlarmDefinition': {
'MetricName': 'YARNMemoryAvailablePercentage',
'ComparisonOperator': 'GREATER_THAN_OR_EQUAL',
'Statistic': 'AVERAGE',
'Period': 300,
'EvaluationPeriods': 18,
'Unit': 'PERCENT',
'Namespace': 'AWS/ElasticMapReduce',
'Threshold': 50,
'Dimensions': [
{
'Value': '${emr.clusterId}',
'Key': 'JobFlowId'
}
],
}
}
}
]
}
}
]
}
)
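There is no accepted fix here, but one diagnostic option is to read back the instance groups after the cluster starts and print the auto-scaling policy status. This is only a sketch, assuming the same region/credentials as the client above and that cluster_id holds the run_job_flow response dict:
import boto3

emr = boto3.client('emr')
job_flow_id = cluster_id['JobFlowId']  # run_job_flow returns {'JobFlowId': 'j-...'}

# Each instance group that carries an AutoScalingPolicy reports a Status with a
# State (e.g. ATTACHED or FAILED) and a StateChangeReason explaining why.
for group in emr.list_instance_groups(ClusterId=job_flow_id)['InstanceGroups']:
    policy = group.get('AutoScalingPolicy')
    if policy:
        status = policy.get('Status', {})
        print(group['Name'], status.get('State'), status.get('StateChangeReason'))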

Unable to delete route53 record set via boto3

I have the following boto3 script:
import boto3
ChangeBatch={
'Changes': [
{
'Action': 'DELETE',
'ResourceRecordSet': {
'Name': 'test.example.com.',
'Region': 'us-west-1',
'SetIdentifier': 'test1',
'AliasTarget': {
'HostedZoneId': '**675',
'DNSName': 'testexample.example.com.',
'EvaluateTargetHealth': 'True'
},
'HealthCheckId': '**-**-**-675'
}
}
]
}
When I run the above code it does not delete anything. This is a latency-based routing policy. I'm not sure what I'm doing wrong; I checked online and looked at the AWS documentation, and this is the suggested way to delete a record set.
I figured it out. It was missing the Type field.
import boto3
ChangeBatch={
'Changes': [
{
'Action': 'DELETE',
'ResourceRecordSet': {
'Name': 'test.example.com.',
'Region': 'us-west-1',
'Type': 'A',
'SetIdentifier': 'test1',
'AliasTarget': {
'HostedZoneId': '**675',
'DNSName': 'testexample.example.com.',
'EvaluateTargetHealth': 'True'
},
'HealthCheckId': '**-**-**-675'
}
}
]
}
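For completeness: the ChangeBatch dict by itself does nothing until it is passed to change_resource_record_sets. A minimal sketch of that call (the hosted zone ID below is a placeholder for the zone that owns the record):
import boto3

client = boto3.client('route53')

# Submit the DELETE change to the hosted zone containing test.example.com.
response = client.change_resource_record_sets(
    HostedZoneId='Z_PLACEHOLDER_ZONE_ID',
    ChangeBatch=ChangeBatch
)
print(response['ChangeInfo']['Status'])  # PENDING until the change propagates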

Airflow EMR DAG succeeds but cluster not started

I am trying to start an AWS EMR cluster and submit a step using EmrCreateJobFlowOperator and EmrAddStepsOperator. Both of my steps succeed, but the cluster is never launched, not even without a step.
Both of the tasks change to the succeeded status.
Here is my code:
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'retry_delay': timedelta(minutes=2),
'start_date': datetime(2019, 1, 1),
'end_date': datetime(2019, 2, 1),
'depends_on_past': False,
'retries': 1,
'retry_delay': timedelta(minutes=5)
}
step_args = ["spark-submit", '../test.py']
step = [{"Name": "what_you_do-" + time.strftime("%Y%m%d-%H:%M"),
'ActionOnFailure': 'CONTINUE',
'HadoopJarStep': {
'Jar': 's3n://elasticmapreduce/libs/script-runner/script-runner.jar',
'Args': step_args
}
}]
JOB_FLOW_OVERRIDES = {
'Instances': {
'InstanceGroups': [
{
'InstanceRole': 'MASTER',
'InstanceType': 'm4.large',
'InstanceCount': 1
},
{
'InstanceRole': 'CORE',
'InstanceType': 'm4.large',
'InstanceCount': 2,
}
]},
'Name':'airflow-monthly_agg_custom',
'BootstrapActions':[{
'Name': 'Install',
'ScriptBootstrapAction': {
'Path': 's3://dep-buck/bootstrap.sh'
}
}],
'Configurations': [
{
"Classification": "spark-env",
"Configurations": [
{
"Classification": "export",
"Properties": {
"PYSPARK_PYTHON": "/usr/bin/python3"
}
}
]
}
]}
dag = DAG('emr_job_flow_automatic_steps_7',
default_args=default_args,
schedule_interval="#daily",
max_active_runs=1,
# schedule_interval='*/1 * * * *',
catchup=True,
# dagrun_timeout=timedelta(seconds=10)
)
cluster_creator = EmrCreateJobFlowOperator(
task_id='create_job_flow2',
job_flow_overrides=JOB_FLOW_OVERRIDES,
aws_conn_id='aws_default',
emr_conn_id='emr_default',
dag=dag
)
step_adder = EmrAddStepsOperator(
task_id='add_steps',
job_flow_id="{{ task_instance.xcom_pull('create_job_flow2', key='return_value') }}",
aws_conn_id='aws_default',
steps=step,
dag=dag
)
cluster_creator.set_downstream(step_adder)
I have tried to search for examples or good documentation, but there isn't much except the function definitions on the Airflow site.
For the create job flow task I have this log repeated several times.
For the "add step" task I have this in the log.
The problem was mainly about visibility to users and the region: it was starting the cluster in the default region, so I had to change the properties below.
Airflow UI > Admin > Connections > aws_default > Extra
{"region_name": "the region I was watching in the EC2 console"}
Airflow UI > Admin > Connections > emr_default > Extra
"VisibleToAllUsers": true,

AWS Pricing API not yielding prices for the given search criteria

I am using the AWS boto3 Pricing API to get the prices of instances.
But I am not getting results for the combination (us-west-2, r3.2xlarge, Linux, no pre-installed software, tenancy = Shared).
Here is my code:
pricing = boto3.client('pricing', region_name='us-east-1')
hourlyTermCode = 'JRTCKXETXF'
rateCode = '6YS6EN2CT7'
token = ''
while True:
paginator = pricing.get_paginator('get_products')
pages = paginator.paginate(
ServiceCode='AmazonEC2',
Filters=[
{'Type': 'TERM_MATCH', 'Field': 'operatingSystem', 'Value': 'Linux'},
{'Type': 'TERM_MATCH', 'Field': 'location', 'Value': 'US West (Oregon)'}
],
PaginationConfig={
'StartingToken':token
}
)
for response in pages:
for price in response['PriceList']:
resp = json.loads(price)
product = resp['product'] # ['attributes']['']
sku = product['sku']
if product['productFamily'] == 'Compute Instance':
if str(product['attributes']['instanceType']) == str(amazon_instance_type) :
if str(product['attributes']['operatingSystem']) == 'Linux':
if str(product['attributes']['preInstalledSw']) == 'NA':
if str(product['attributes']['tenancy']) == 'Shared':
sku_key = resp['terms']['OnDemand'].get(sku)
if sku_key:
price = sku_key[sku + '.' + hourlyTermCode + '.' + rateCode]['pricePerUnit']['USD']
print 'here 7'
print price
try:
token = response['NextToken']
except KeyError:
pass
This works:
import json
import boto3
client = boto3.client('pricing', region_name='us-east-1')
response = client.get_products(
ServiceCode='AmazonEC2',
Filters=[
{'Type': 'TERM_MATCH', 'Field': 'operatingSystem', 'Value': 'Linux'},
{'Type': 'TERM_MATCH', 'Field': 'location', 'Value': 'US West (Oregon)'},
{'Type': 'TERM_MATCH', 'Field': 'instanceType', 'Value': 'r3.2xlarge'},
{'Type': 'TERM_MATCH', 'Field': 'tenancy', 'Value': 'Shared'},
{'Type': 'TERM_MATCH', 'Field': 'preInstalledSw', 'Value': 'NA'}
]
)
for pricelist_json in response['PriceList']:
pricelist = json.loads(pricelist_json)
product = pricelist['product']
if product['productFamily'] == 'Compute Instance':
print pricelist['terms']['OnDemand'].values()[0]['priceDimensions'].values()[0][u'pricePerUnit']['USD']
It is based on the output of:
{u'FormatVersion': u'aws_v1', u'PriceList': [u'{
"product": {
"productFamily": "Compute Instance",
"attributes": {
"enhancedNetworkingSupported": "Yes",
"memory": "61 GiB",
"vcpu": "8",
"capacitystatus": "Used",
"locationType": "AWS Region",
"storage": "1 x 160 SSD",
"instanceFamily": "Memory optimized",
"operatingSystem": "Linux",
"physicalProcessor": "Intel Xeon E5-2670 v2 (Ivy Bridge)",
"clockSpeed": "2.5 GHz",
"ecu": "26",
"networkPerformance": "High",
"servicename": "Amazon Elastic Compute Cloud",
"instanceType": "r3.2xlarge",
"tenancy": "Shared",
"usagetype": "USW2-BoxUsage:r3.2xlarge",
"normalizationSizeFactor": "16",
"processorFeatures": "Intel AVX; Intel Turbo",
"servicecode": "AmazonEC2",
"licenseModel": "No License required",
"currentGeneration": "No",
"preInstalledSw": "NA",
"location": "US West (Oregon)",
"processorArchitecture": "64-bit",
"operation": "RunInstances"
},
"sku": "GMTWE5CTY4FEUYDN"
},
"serviceCode": "AmazonEC2",
"terms": {
"OnDemand": {
"GMTWE5CTY4FEUYDN.JRTCKXETXF": {
"priceDimensions": {
"GMTWE5CTY4FEUYDN.JRTCKXETXF.6YS6EN2CT7": {
"unit": "Hrs",
"endRange": "Inf",
"description": "$0.665 per On Demand Linux r3.2xlarge Instance Hour",
"appliesTo": [],
"rateCode": "GMTWE5CTY4FEUYDN.JRTCKXETXF.6YS6EN2CT7",
"beginRange": "0",
"pricePerUnit": {
"USD": "0.6650000000"
}
}
},
"sku": "GMTWE5CTY4FEUYDN",
"effectiveDate": "2018-07-01T00:00:00Z",
"offerTermCode": "JRTCKXETXF",
"termAttributes": {}
}
},
...
},
"version": "20180726190848",
"publicationDate": "2018-07-26T19:08:48Z"
}'
]
}
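As a side note, the working snippet above is Python 2 (print statement, indexing values() directly). A minimal Python 3 sketch of the same query, wrapping values() in list():
import json
import boto3

client = boto3.client('pricing', region_name='us-east-1')
response = client.get_products(
    ServiceCode='AmazonEC2',
    Filters=[
        {'Type': 'TERM_MATCH', 'Field': 'operatingSystem', 'Value': 'Linux'},
        {'Type': 'TERM_MATCH', 'Field': 'location', 'Value': 'US West (Oregon)'},
        {'Type': 'TERM_MATCH', 'Field': 'instanceType', 'Value': 'r3.2xlarge'},
        {'Type': 'TERM_MATCH', 'Field': 'tenancy', 'Value': 'Shared'},
        {'Type': 'TERM_MATCH', 'Field': 'preInstalledSw', 'Value': 'NA'}
    ]
)
for pricelist_json in response['PriceList']:
    pricelist = json.loads(pricelist_json)
    if pricelist['product']['productFamily'] == 'Compute Instance':
        # OnDemand term -> price dimension -> USD price per unit
        term = list(pricelist['terms']['OnDemand'].values())[0]
        dimension = list(term['priceDimensions'].values())[0]
        print(dimension['pricePerUnit']['USD'])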

Getting Dict response - Boto3

I am trying to get the SnapshotId from the output below, with no success. I can get the value of the AMI description and the value of the AMI ID.
{
'Images': [
{
'Architecture': 'i386'|'x86_64',
'CreationDate': 'string',
'ImageId': 'string',
'ImageLocation': 'string',
'ImageType': 'machine'|'kernel'|'ramdisk',
'Public': True|False,
'KernelId': 'string',
'OwnerId': 'string',
'Platform': 'Windows',
'ProductCodes': [
{
'ProductCodeId': 'string',
'ProductCodeType': 'devpay'|'marketplace'
},
],
'RamdiskId': 'string',
'State': 'pending'|'available'|'invalid'|'deregistered'|'transient'|'failed'|'error',
'BlockDeviceMappings': [
{
'DeviceName': 'string',
'VirtualName': 'string',
'Ebs': {
'Encrypted': True|False,
'DeleteOnTermination': True|False,
'Iops': 123,
'SnapshotId': 'string',
'VolumeSize': 123,
'VolumeType': 'standard'|'io1'|'gp2'|'sc1'|'st1'
},
'NoDevice': 'string'
},
],
'Description': 'string',
'EnaSupport': True|False,
'Hypervisor': 'ovm'|'xen',
'ImageOwnerAlias': 'string',
'Name': 'string',
'RootDeviceName': 'string',
'RootDeviceType': 'ebs'|'instance-store',
'SriovNetSupport': 'string',
'StateReason': {
'Code': 'string',
'Message': 'string'
},
'Tags': [
{
'Key': 'string',
'Value': 'string'
},
],
'VirtualizationType': 'hvm'|'paravirtual'
},
]
}
Using the following code:
import boto3
client = boto3.client('ec2', region_name='us-east-1')
def verifica_imagem(imagem):
amiresponse = client.describe_images(
Filters=[
{
'Name': 'description',
'Values': [
imagem,
]
},
],
DryRun=False
)
try:
data = str(amiresponse['Images'][0]['Description'])
ami_id = str(amiresponse['Images'][0]['ImageId'])
snapshot_id = str(amiresponse['Images'][0]['SnapshotId'])
except:
print "AMI not exists! Exiting...."
return 1
verifica_imagem('IMAGE_XXXXXXX')
I can't understand how to use the SnapshotId key. I have tried:
snapshot_id = str(amiresponse['Images']['BlockDeviceMappings']['Ebs'][0]['SnapshotId']) but that is not working either.
The values of Images and BlockDeviceMappings are lists, and Ebs is a dict.
Use this to fetch the value of SnapshotId:
snapshot_id = amiresponse['Images'][0]['BlockDeviceMappings'][0]['Ebs']['SnapshotId']
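Note that not every entry in BlockDeviceMappings has an Ebs key (ephemeral/instance-store mappings don't), so a slightly more defensive sketch is:
image = amiresponse['Images'][0]
snapshot_ids = [
    mapping['Ebs']['SnapshotId']
    for mapping in image.get('BlockDeviceMappings', [])
    if 'Ebs' in mapping  # skip ephemeral / NoDevice mappings with no EBS volume
]
print(snapshot_ids)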