I created a simple Step Function as follows:
Start -> Start EMR cluster & submit job -> End
I want to find a mechanism to identify whether my Spark step completed successfully or not.
I am able to start the EMR cluster and attach a Spark job to it, which completes successfully and terminates the cluster.
I followed the steps in this link:
Creating AWS EMR cluster with spark step using lambda function fails with "Local file does not exist"
Now I am looking to get the status; the job poller will tell me whether the EMR cluster was created successfully or not.
I am looking for ways to find out the Spark job status.
from botocore.vendored import requests
import boto3
import json

def lambda_handler(event, context):
    conn = boto3.client("emr")
    cluster_id = conn.run_job_flow(
        Name='xyz',
        ServiceRole='xyz',
        JobFlowRole='asd',
        VisibleToAllUsers=True,
        LogUri='<location>',
        ReleaseLabel='emr-5.16.0',
        Instances={
            'Ec2SubnetId': 'xyz',
            'InstanceGroups': [
                {
                    'Name': 'Master',
                    'Market': 'ON_DEMAND',
                    'InstanceRole': 'MASTER',
                    'InstanceType': 'm4.xlarge',
                    'InstanceCount': 1,
                }
            ],
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False,
        },
        Applications=[
            {'Name': 'Spark'},
            {'Name': 'Hadoop'}
        ],
        Steps=[{
            'Name': "mystep",
            'ActionOnFailure': 'TERMINATE_CLUSTER',
            'HadoopJarStep': {
                'Jar': 'jar',
                'Args': [
                    <insert args>, jar, mainclass
                ]
            }
        }]
    )
    return cluster_id
You can use the CLI or SDK to list all steps for the cluster and then describe a particular step to get its status.
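For example, with boto3 (a minimal sketch; the cluster ID and step ID below are placeholders for the values you get back from run_job_flow and list_steps):

import boto3

emr = boto3.client("emr")

# List all steps for the cluster; the cluster ID is the 'JobFlowId' returned by run_job_flow.
for step in emr.list_steps(ClusterId="j-XXXXXXXXXXXXX")["Steps"]:
    print(step["Id"], step["Name"], step["Status"]["State"])

# Describe one particular step to get its state (PENDING, RUNNING, COMPLETED, FAILED, ...).
response = emr.describe_step(ClusterId="j-XXXXXXXXXXXXX", StepId="s-XXXXXXXXXXXXX")
print(response["Step"]["Status"]["State"])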
Related
I have built, using boto3, a workflow that creates a compute environment, creates a job queue, registers a job definition, and finally submits a job. Running the 'ls' command works fine; however, the command 'docker run hello-world' does not work.
Code to create comp env:
response = client.create_compute_environment(
    computeEnvironmentName=com_env_name,
    type='MANAGED',
    state='ENABLED',
    computeResources={
        'type': 'EC2',
        'allocationStrategy': 'BEST_FIT',
        'minvCpus': 0,
        'maxvCpus': 5,
        'instanceTypes': [
            'c3.large',
        ],
        'ec2Configuration': [{
            'imageType': 'ECS_AL2',
        }],
        'subnets': [
            subnet_id,
        ],
        'securityGroupIds': [
            sec_gr_id,
        ],
        'instanceRole': 'ecsInstanceRole',
    },
    serviceRole='arn:aws:iam::blabla'
)
The job queue is defined as:
response = batch_client.create_job_queue(
    jobQueueName=queue_name,
    state='ENABLED',
    priority=1,
    computeEnvironmentOrder=[
        {
            'order': 1,
            'computeEnvironment': com_env_name
        },
    ],
)
My goal is to run 'docker run hello-world'. The job definition is defined as follows:
response = batch.register_job_definition(
    jobDefinitionName=job_def_name,
    type='container',
    containerProperties={
        'image': 'custom-image',
        'memory': 2048,
        'vcpus': 2,
        'command': ['ls'],
        'environment': [
            {
                'name': "DOCKER_HOST",
                'value': "unix:///var/run/docker.sock"
            },
        ],
        'volumes': [
            {
                'host': {
                    'sourcePath': '//var/run/docker.sock'
                },
                'name': 'docker'
            }],
        'mountPoints': [
            {
                'containerPath': '/var/run/docker.sock',
                'sourceVolume': 'docker'
            }],
    },
)
Are the volumes and mount points set properly? What is missing? Is there a connection between the Docker daemons that needs to be established? The output error after submitting the job is:
CannotStartContainerError: Error response from daemon: OCI runtime create failed: container_linux.go:380: starting container process caused: exec: "docker run hello-world": executable file not found in $PATH: unknown
The code for job submission is:
response = batch.submit_job(
    jobDefinition=job_def_name,
    jobName=job_nom,
    jobQueue=job_queue_name,
    containerOverrides={
        'command': ['docker run hello-world',]
    }
)
I notice two things:
You have an extra slash in your sourcePath.
The error message you get seems to indicate that the docker executable doesn't exist in the image you're running. You'll need to use an image that supports Docker-in-Docker, such as the standard docker image.
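For illustration only, a minimal sketch of the job definition with those two changes applied (the 'docker' image is the standard Docker Hub image and the job definition name is hypothetical, not something from your setup):

import boto3

batch = boto3.client('batch')

response = batch.register_job_definition(
    jobDefinitionName='docker-in-docker-sketch',    # hypothetical name
    type='container',
    containerProperties={
        'image': 'docker',          # standard docker image, which ships the docker CLI
        'memory': 2048,
        'vcpus': 2,
        'volumes': [{
            'host': {'sourcePath': '/var/run/docker.sock'},   # single leading slash
            'name': 'docker'
        }],
        'mountPoints': [{
            'containerPath': '/var/run/docker.sock',
            'sourceVolume': 'docker'
        }],
    },
)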
I would like to trigger an EMR Spark job with Python code through AWS Lambda after an S3 event fires. I would appreciate it if anyone could share the configuration/command to invoke the EMR Spark job from an AWS Lambda function.
Since this question is very generic, I will try to give example code for doing this. You will have to change certain parameters based on your actual values.
The way I generally do this is to place the main handler function in one file, say lambda_handler.py, and all the configuration and steps of the EMR cluster in a file named emr_configuration_and_steps.py.
Please check the code snippet below for lambda_handler.py
import boto3
import emr_configuration_and_steps
import logging
import traceback

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(levelname)s:%(name)s:%(message)s')


def create_emr(name):
    try:
        emr = boto3.client('emr')
        cluster_id = emr.run_job_flow(
            Name=name,
            VisibleToAllUsers=emr_configuration_and_steps.visible_to_all_users,
            LogUri=emr_configuration_and_steps.log_uri,
            ReleaseLabel=emr_configuration_and_steps.release_label,
            Applications=emr_configuration_and_steps.applications,
            Tags=emr_configuration_and_steps.tags,
            Instances=emr_configuration_and_steps.instances,
            Steps=emr_configuration_and_steps.steps,
            Configurations=emr_configuration_and_steps.configurations,
            ScaleDownBehavior=emr_configuration_and_steps.scale_down_behavior,
            ServiceRole=emr_configuration_and_steps.service_role,
            JobFlowRole=emr_configuration_and_steps.job_flow_role
        )
        logger.info("EMR is created successfully")
        return cluster_id['JobFlowId']
    except Exception as e:
        traceback.print_exc()
        raise Exception(e)


def lambda_handler(event, context):
    logger.info("starting the lambda function for spawning EMR")
    try:
        emr_cluster_id = create_emr('Name of Your EMR')
        logger.info("emr_cluster_id is = " + emr_cluster_id)
    except Exception as e:
        logger.error("Exception at some step in the process " + str(e))
Now the second file (emr_configuration_and_steps.py), which has all the configuration, would look like this.
visible_to_all_users = True
log_uri = 's3://your-s3-log-path-here/'
release_label = 'emr-5.29.0'
applications = [{'Name': 'Spark'}, {'Name': 'Hadoop'}]
tags = [
    {'Key': 'Project', 'Value': 'Your-Project Name'},
    {'Key': 'Service', 'Value': 'Your-Service Name'},
    {'Key': 'Environment', 'Value': 'Development'}
]

instances = {
    'Ec2KeyName': 'Your-key-name',
    'Ec2SubnetId': 'your-subnet-name',
    'InstanceFleets': [
        {
            "InstanceFleetType": "MASTER",
            "TargetOnDemandCapacity": 1,
            "TargetSpotCapacity": 0,
            "InstanceTypeConfigs": [
                {
                    "WeightedCapacity": 1,
                    "BidPriceAsPercentageOfOnDemandPrice": 100,
                    "InstanceType": "m3.xlarge"
                }
            ],
            "Name": "Master Node"
        },
        {
            "InstanceFleetType": "CORE",
            "TargetSpotCapacity": 8,
            "InstanceTypeConfigs": [
                {
                    "WeightedCapacity": 8,
                    "BidPriceAsPercentageOfOnDemandPrice": 50,
                    "InstanceType": "m3.xlarge"
                }
            ],
            "Name": "Core Node"
        },
    ],
    'KeepJobFlowAliveWhenNoSteps': False
}

steps = [
    {
        'Name': 'Setup Hadoop Debugging',
        'ActionOnFailure': 'TERMINATE_CLUSTER',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': ['state-pusher-script']
        }
    },
    {
        "Name": "Active Marker for digital panel",
        "ActionOnFailure": 'TERMINATE_CLUSTER',
        'HadoopJarStep': {
            "Jar": "command-runner.jar",
            "Args": [
                "spark-submit",
                "--deploy-mode",
                "cluster",
                "--driver-memory", "4g",
                "--executor-memory", "4g",
                "--executor-cores", "2",
                "--class", "your-main-class-full-path-name",
                "s3://your-jar-path-SNAPSHOT-jar-with-dependencies.jar"
            ]
        }
    }
]

configurations = [
    {
        "Classification": "spark-log4j",
        "Properties": {
            "log4j.logger.root": "INFO",
            "log4j.logger.org": "INFO",
            "log4j.logger.com": "INFO"
        }
    }
]

scale_down_behavior = 'TERMINATE_AT_TASK_COMPLETION'
service_role = 'EMR_DefaultRole'
job_flow_role = 'EMR_EC2_DefaultRole'
Please adjust the paths and names according to your use case. To deploy this, you need to install boto3 and package/zip these two files into a zip archive and upload it to your Lambda function. With this you should be able to spawn the EMR cluster.
Compute environments created via boto3 are not displayed in the AWS console. I can see them in the batch_client.describe_compute_environments() call response:
{
    'computeEnvironmentName': 'name',
    'computeEnvironmentArn': 'arn:aws:batch:us-east-1:<ID>:compute-environment/ml-retraining-compute-env-second',
    'ecsClusterArn': 'arn:aws:ecs:us-east-1:<ID>:cluster/ml-retraining-compute-env-second_Batch_b18fcd09-8d7e-351b-bc0f-13ffa83a6b15',
    'type': 'MANAGED',
    'state': 'ENABLED',
    'status': 'INVALID',
    'statusReason': "CLIENT_ERROR - The security group 'sg-2436d85c' does not exist",
    'computeResources': {
        'type': 'EC2',
        'minvCpus': 0,
        'maxvCpus': 512,
        'desiredvCpus': 24,
        'instanceTypes': [
            'optimal'
        ],
        'subnets': [
            'subnet-fa22de86'
        ],
        'securityGroupIds': [
            'sg-2436d85c'
        ],
        'instanceRole': 'arn:aws:iam::<ID>:instance-profile/ecsInstanceRole',
        'tags': {
            'component': 'ukai-training-pipeline',
            'product': 'Cormorant',
            'jira_project_team': 'CORPRJ',
            'business_unit': 'Threat Systems Products',
            'created_by': 'ml-pipeline'
        }
    },
    'serviceRole': 'arn:aws:iam::<ID>:role/AWSBatchServiceRole'
}
but the Compute Environments table on the Batch page in the AWS console UI does not show anything. The table is empty. When I try to create a compute environment with the same name again via a boto3 call, I get this response:
ERROR - Error setting compute environment: An error occurred
(ClientException) when calling the CreateComputeEnvironment operation: Object already exists.
Based on the comments, the issue was the use of a different region in the console.
The solution was to switch the console to the region where the compute environment was created.
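For example, a minimal sketch of pinning the client to an explicit region (the region name is a placeholder) so the environments you create end up where the console is looking:

import boto3

# Use the same region the console is set to (top-right region selector).
batch_client = boto3.client('batch', region_name='us-east-1')

for env in batch_client.describe_compute_environments()['computeEnvironments']:
    print(env['computeEnvironmentName'], env['status'])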
My first attempt to create an EMR cluster using a Lambda function fails with the error below. I intend to use script-runner.jar to initiate a Python script located in an S3 bucket. Can somebody help me understand this error? What exactly am I missing?
2019-11-21T20:34:59.990Z INFO Ensure step 1 jar file s3a://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar
INFO Failed to download: s3a://<region>.elasticmapreduce/libs/script-runner/script-runner.jar
java.io.IOException: Unable to download 's3a://<region>.elasticmapreduce/libs/script-runner/script-runner.jar'. Only s3 + local files are supported
at aws157.instancecontroller.util.S3Wrapper.fetchHadoopFileToLocal(S3Wrapper.java:353)
at aws157.instancecontroller.master.steprunner.HadoopJarStepRunner$Runner.<init>(HadoopJarStepRunner.java:243)
at aws157.instancecontroller.master.steprunner.HadoopJarStepRunner.createRunner(HadoopJarStepRunner.java:152)
at aws157.instancecontroller.master.steprunner.HadoopJarStepRunner.createRunner(HadoopJarStepRunner.java:146)
at aws157.instancecontroller.master.steprunner.StepExecutor.runStep(StepExecutor.java:136)
at aws157.instancecontroller.master.steprunner.StepExecutor.run(StepExecutor.java:70)
at aws157.instancecontroller.master.steprunner.StepExecutionManager.enqueueStep(StepExecutionManager.java:246)
at aws157.instancecontroller.master.steprunner.StepExecutionManager.doRun(StepExecutionManager.java:193)
at aws157.instancecontroller.master.steprunner.StepExecutionManager.access$000(StepExecutionManager.java:33)
at aws157.instancecontroller.master.steprunner.StepExecutionManager$1.run(StepExecutionManager.java:94)
My loosely written lambda function is below:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import boto3
import datetime


def lambda_handler(event, context):
    print('Creating EMR')
    connection = boto3.client('emr', region_name='us-east-1')
    print(event)
    cluster_id = connection.run_job_flow(
        Name='MyTest',
        VisibleToAllUsers=True,
        JobFlowRole='EMR_EC2_DefaultRole',
        ServiceRole='EMR_DefaultRole',
        LogUri='s3://bucket-emr/logs',
        ReleaseLabel='emr-5.21.0',
        Applications=[{'Name': 'Hadoop'}, {'Name': 'Spark'}],
        Instances={
            'InstanceGroups': [{
                'Name': 'Master nodes',
                'Market': 'ON_DEMAND',
                'InstanceRole': 'MASTER',
                'InstanceType': 'm3.xlarge',
                'InstanceCount': 1,
            }, {
                'Name': 'Slave nodes',
                'Market': 'SPOT',
                'InstanceRole': 'CORE',
                'InstanceType': 'm3.xlarge',
                'InstanceCount': 2,
            }],
            'KeepJobFlowAliveWhenNoSteps': True,
            'Ec2KeyName': 'keys-kvp',
            'Ec2SubnetId': 'subnet-dsb65490',
            'EmrManagedMasterSecurityGroup': 'sg-0daa54d041d1033',
            'EmrManagedSlaveSecurityGroup': 'sg-0daa54d041d1033',
        },
        Configurations=[{
            "Classification": "spark-env",
            "Properties": {},
            "Configurations": [{
                "Classification": "export",
                "Properties": {
                    "PYSPARK_PYTHON": "python36",
                    "PYSPARK_DRIVER_PYTHON": "python36"
                }
            }]
        }],
        Steps=[{
            'Name': 'mystep',
            'ActionOnFailure': 'TERMINATE_CLUSTER',
            'HadoopJarStep': {
                'Jar': 's3a://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
                'Args': [
                    '/home/hadoop/spark/bin/spark-submit', '--deploy-mode', 'cluster', '--master', 'yarn', 's3a://inscape-script/wordcount.py',
                ]
            }
        }]
    )
    return 'Started cluster {}'.format(cluster_id)
What am I missing in creating the cluster? Thanks in advance.
Can you try changing your 'Jar' argument to this instead:
'Jar': 's3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hadoop-script.html
You can also try using command-runner by changing that 'Jar' argument to
/var/lib/aws/emr/step-runner/hadoop-jars/command-runner.jar
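For reference, a minimal sketch of the step rewritten to use command-runner (the spark-submit options and script path are taken from your snippet and are placeholders to adjust):

Steps=[{
    'Name': 'mystep',
    'ActionOnFailure': 'TERMINATE_CLUSTER',
    'HadoopJarStep': {
        # command-runner runs the command on the master node; spark-submit is on the PATH there.
        'Jar': '/var/lib/aws/emr/step-runner/hadoop-jars/command-runner.jar',
        'Args': [
            'spark-submit', '--deploy-mode', 'cluster', '--master', 'yarn',
            's3://inscape-script/wordcount.py',
        ]
    }
}]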
I want to execute a spark-submit job on an AWS EMR cluster based on a file upload event on S3. I am using an AWS Lambda function to capture the event, but I have no idea how to submit a spark-submit job to the EMR cluster from the Lambda function.
Most of the answers that I searched talk about adding a step to the EMR cluster, but I do not know whether I can add a step that fires "spark-submit --with args".
You can; I had to do the same thing last week!
Using boto3 for Python (other languages would definitely have a similar solution), you can either start a cluster with the defined step, or attach a step to an already running cluster.
Defining the cluster with the step
import boto3


def lambda_handler(event, context):
    conn = boto3.client("emr")
    cluster_id = conn.run_job_flow(
        Name='ClusterName',
        ServiceRole='EMR_DefaultRole',
        JobFlowRole='EMR_EC2_DefaultRole',
        VisibleToAllUsers=True,
        LogUri='s3n://some-log-uri/elasticmapreduce/',
        ReleaseLabel='emr-5.8.0',
        Instances={
            'InstanceGroups': [
                {
                    'Name': 'Master nodes',
                    'Market': 'ON_DEMAND',
                    'InstanceRole': 'MASTER',
                    'InstanceType': 'm3.xlarge',
                    'InstanceCount': 1,
                },
                {
                    'Name': 'Slave nodes',
                    'Market': 'ON_DEMAND',
                    'InstanceRole': 'CORE',
                    'InstanceType': 'm3.xlarge',
                    'InstanceCount': 2,
                }
            ],
            'Ec2KeyName': 'key-name',
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False
        },
        Applications=[{
            'Name': 'Spark'
        }],
        Configurations=[{
            "Classification": "spark-env",
            "Properties": {},
            "Configurations": [{
                "Classification": "export",
                "Properties": {
                    "PYSPARK_PYTHON": "python35",
                    "PYSPARK_DRIVER_PYTHON": "python35"
                }
            }]
        }],
        BootstrapActions=[{
            'Name': 'Install',
            'ScriptBootstrapAction': {
                'Path': 's3://path/to/bootstrap.script'
            }
        }],
        Steps=[{
            'Name': 'StepName',
            'ActionOnFailure': 'TERMINATE_CLUSTER',
            'HadoopJarStep': {
                'Jar': 's3n://elasticmapreduce/libs/script-runner/script-runner.jar',
                'Args': [
                    "/usr/bin/spark-submit", "--deploy-mode", "cluster",
                    's3://path/to/code.file', '-i', 'input_arg',
                    '-o', 'output_arg'
                ]
            }
        }],
    )
    return "Started cluster {}".format(cluster_id)
Attaching a step to an already running cluster
As per here
import sys
import time

import boto3


def lambda_handler(event, context):
    conn = boto3.client("emr")
    # chooses the first cluster which is Running or Waiting
    # possibly can also choose by name or already have the cluster id
    clusters = conn.list_clusters()
    # choose the correct cluster
    clusters = [c["Id"] for c in clusters["Clusters"]
                if c["Status"]["State"] in ["RUNNING", "WAITING"]]
    if not clusters:
        sys.stderr.write("No valid clusters\n")
        sys.exit(1)
    # take the first relevant cluster
    cluster_id = clusters[0]
    # code location on your emr master node
    CODE_DIR = "/home/hadoop/code/"
    # spark configuration example
    step_args = ["/usr/bin/spark-submit", "--spark-conf", "your-configuration",
                 CODE_DIR + "your_file.py", '--your-parameters', 'parameters']
    step = {"Name": "what_you_do-" + time.strftime("%Y%m%d-%H:%M"),
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 's3n://elasticmapreduce/libs/script-runner/script-runner.jar',
                'Args': step_args
            }}
    action = conn.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step])
    return "Added step: %s" % (action)
AWS Lambda function Python code if you want to execute a Spark jar using the spark-submit command (this submits the job through the Livy REST endpoint on the EMR master node):
from botocore.vendored import requests
import json


def lambda_handler(event, context):
    headers = {"content-type": "application/json"}
    url = 'http://ip-address.ec2.internal:8998/batches'
    payload = {
        # Main application jar; the additional driver jars go in 'jars'
        # (the original listed all three jars in one string).
        'file': 's3://Bucket/Orchestration/SparkCode.jar',
        'jars': [
            's3://Bucket/Orchestration/RedshiftJDBC41.jar',
            's3://Bucket/Orchestration/mysql-connector-java-8.0.12.jar'
        ],
        'className': 'Main Class Name',
        'args': [event.get('rootPath')]
    }
    res = requests.post(url, data=json.dumps(payload), headers=headers, verify=False)
    json_data = json.loads(res.text)
    return json_data.get('id')
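The id returned is the Livy batch id; a minimal follow-up sketch for polling the job status afterwards (same placeholder host as above):

def get_batch_state(batch_id):
    # Livy exposes GET /batches/{id}/state; states include starting, running, success, dead.
    url = 'http://ip-address.ec2.internal:8998/batches/{}/state'.format(batch_id)
    res = requests.get(url, headers={"content-type": "application/json"}, verify=False)
    return json.loads(res.text).get('state')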