I would like to trigger an EMR Spark job written in Python from AWS Lambda after an S3 event fires. I would appreciate it if anyone could share the configuration/commands to invoke the EMR Spark job from the AWS Lambda function.
Since this question is very generic, I will try to give example code for doing this. You will have to change certain parameters based upon your actual values.
The way I generally do this is to place the main handler function in one file, say lambda_handler.py, and all the EMR configuration and steps in a file named emr_configuration_and_steps.py.
Please check the code snippet below for lambda_handler.py:
import boto3
import emr_configuration_and_steps
import logging
import traceback

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(levelname)s:%(name)s:%(message)s')


def create_emr(name):
    try:
        emr = boto3.client('emr')
        cluster_id = emr.run_job_flow(
            Name=name,
            VisibleToAllUsers=emr_configuration_and_steps.visible_to_all_users,
            LogUri=emr_configuration_and_steps.log_uri,
            ReleaseLabel=emr_configuration_and_steps.release_label,
            Applications=emr_configuration_and_steps.applications,
            Tags=emr_configuration_and_steps.tags,
            Instances=emr_configuration_and_steps.instances,
            Steps=emr_configuration_and_steps.steps,
            Configurations=emr_configuration_and_steps.configurations,
            ScaleDownBehavior=emr_configuration_and_steps.scale_down_behavior,
            ServiceRole=emr_configuration_and_steps.service_role,
            JobFlowRole=emr_configuration_and_steps.job_flow_role
        )
        logger.info("EMR is created successfully")
        return cluster_id['JobFlowId']
    except Exception as e:
        traceback.print_exc()
        raise Exception(e)


def lambda_handler(event, context):
    logger.info("starting the lambda function for spawning EMR")
    try:
        emr_cluster_id = create_emr('Name of Your EMR')
        logger.info("emr_cluster_id is = " + emr_cluster_id)
    except Exception as e:
        logger.error("Exception at some step in the process " + str(e))
Now the second file (emr_configuration_and_steps.py), which holds all the configuration, looks like this:
visible_to_all_users = True
log_uri = 's3://your-s3-log-path-here/'
release_label = 'emr-5.29.0'
applications = [{'Name': 'Spark'}, {'Name': 'Hadoop'}]
tags = [
    {'Key': 'Project', 'Value': 'Your-Project Name'},
    {'Key': 'Service', 'Value': 'Your-Service Name'},
    {'Key': 'Environment', 'Value': 'Development'}
]
instances = {
    'Ec2KeyName': 'Your-key-name',
    'Ec2SubnetId': 'your-subnet-name',
    'InstanceFleets': [
        {
            "InstanceFleetType": "MASTER",
            "TargetOnDemandCapacity": 1,
            "TargetSpotCapacity": 0,
            "InstanceTypeConfigs": [
                {
                    "WeightedCapacity": 1,
                    "BidPriceAsPercentageOfOnDemandPrice": 100,
                    "InstanceType": "m3.xlarge"
                }
            ],
            "Name": "Master Node"
        },
        {
            "InstanceFleetType": "CORE",
            "TargetSpotCapacity": 8,
            "InstanceTypeConfigs": [
                {
                    "WeightedCapacity": 8,
                    "BidPriceAsPercentageOfOnDemandPrice": 50,
                    "InstanceType": "m3.xlarge"
                }
            ],
            "Name": "Core Node"
        },
    ],
    'KeepJobFlowAliveWhenNoSteps': False
}
steps = [
    {
        'Name': 'Setup Hadoop Debugging',
        'ActionOnFailure': 'TERMINATE_CLUSTER',
        'HadoopJarStep': {
            'Jar': 'command-runner.jar',
            'Args': ['state-pusher-script']
        }
    },
    {
        "Name": "Active Marker for digital panel",
        "ActionOnFailure": 'TERMINATE_CLUSTER',
        'HadoopJarStep': {
            "Jar": "command-runner.jar",
            "Args": [
                "spark-submit",
                "--deploy-mode",
                "cluster",
                "--driver-memory", "4g",
                "--executor-memory", "4g",
                "--executor-cores", "2",
                "--class", "your-main-class-full-path-name",
                "s3://your-jar-path-SNAPSHOT-jar-with-dependencies.jar"
            ]
        }
    }
]
configurations = [
    {
        "Classification": "spark-log4j",
        "Properties": {
            "log4j.logger.root": "INFO",
            "log4j.logger.org": "INFO",
            "log4j.logger.com": "INFO"
        }
    }
]
scale_down_behavior = 'TERMINATE_AT_TASK_COMPLETION'
service_role = 'EMR_DefaultRole'
job_flow_role = 'EMR_EC2_DefaultRole'
Please adjust the paths and names according to your use case. To deploy this, package/zip these two files (plus boto3 if your Lambda runtime does not already bundle it) and upload the zip to your Lambda function. With that, you should be able to spawn the EMR cluster.
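Since the question is about triggering off an S3 event, here is a minimal, hedged sketch of how the lambda_handler in the file above could be extended to read the bucket and object key out of the S3 notification and forward the uploaded object's path to the spark-submit step. It is written as a drop-in replacement for the lambda_handler function in lambda_handler.py, so logger, create_emr and emr_configuration_and_steps are the ones already defined or imported there; appending the path to the step's Args (and the idea that your Spark job reads it as its last argument) is an assumption for illustration, not part of the original setup.

from urllib.parse import unquote_plus


def lambda_handler(event, context):
    logger.info("starting the lambda function for spawning EMR")
    try:
        # Standard S3 event notification: one record per created object.
        record = event['Records'][0]['s3']
        bucket = record['bucket']['name']
        key = unquote_plus(record['object']['key'])
        s3_path = "s3://{}/{}".format(bucket, key)
        logger.info("triggered by %s", s3_path)

        # Assumption: forward the uploaded object's path to the spark-submit
        # step (the second entry in steps) before the cluster is created.
        emr_configuration_and_steps.steps[1]['HadoopJarStep']['Args'].append(s3_path)

        emr_cluster_id = create_emr('Name of Your EMR')
        logger.info("emr_cluster_id is = " + emr_cluster_id)
    except Exception as e:
        logger.error("Exception at some step in the process " + str(e))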
Related
I'm trying to upload a video from an S3 bucket to YouTube and getting back strange output that implies a successful post but doesn't include anything I expected. Also, I set attributes like title and description in my code, but as you can see from the output, they aren't actually being set.
Example Output:
{
    "id": "-pfZ_BNH9kg",
    "snippet": {
        "channelId": "UCZ5AUe-rp3rXKeFS0yx4ZBA",
        "title": "unknown",
        "channelTitle": "Patrick Hanford",
        "publishedAt": "2020-04-30T19:22:15.000Z",
        "thumbnails": {
            "high": {
                "url": "https://i.ytimg.com/vi/-pfZ_BNH9kg/hqdefault.jpg",
                "height": 360,
                "width": 480
            },
            "default": {
                "url": "https://i.ytimg.com/vi/-pfZ_BNH9kg/default.jpg",
                "height": 90,
                "width": 120
            },
            "medium": {
                "url": "https://i.ytimg.com/vi/-pfZ_BNH9kg/mqdefault.jpg",
                "height": 180,
                "width": 320
            }
        },
        "localized": {
            "title": "unknown",
            "description": ""
        },
        "liveBroadcastContent": "none",
        "categoryId": "20",
        "description": ""
    },
    "etag": "Dn5xIderbhAnUk5TAW0qkFFir0M/3T1YGvGo1YyaTKtTpl8JrJqWS4M",
    "status": {
        "embeddable": true,
        "privacyStatus": "public",
        "uploadStatus": "uploaded",
        "publicStatsViewable": true,
        "license": "youtube"
    },
    "kind": "youtube#video"
}
Upload Code:
def post(self, attempts=None):
    TEST_VIDEO = "http://streamon-perm.s3.amazonaws.com/WPHM-48k-pl-33366.mp4"
    headers = {"Content-Type": "video/mp4"}
    upload_request_body = {
        "snippet": {
            "title": "Test Video Upload",
            "description": "This is a test of uploading videos.",
            "categoryId": "22",
        },
        "status": {
            "privacyStatus": "public"
        },
        "fileDetails": {
            "fileName": TEST_VIDEO,
            "fileType": "video"
        }
    }
    params = {
        "access_token": self.google_token.get("access_token", None),
        "id": self.google_token.get("id_token", None),
        "part": "snippet, status"
    }
    extra = {
        "client_id": self.client_id,
        "client_secret": self.client_secret
    }
    google_oauth_session = OAuth2Session(
        self.client_id,
        token=self.google_token,
        auto_refresh_url=self.token_url,
        auto_refresh_kwargs=extra,
        token_updater=self._save_token
    )
    upload_response = google_oauth_session.post(
        self.video_post_url,
        headers=headers,
        json=upload_request_body,
        params=params
    )
    logger.info("Response from VIDEO UPLOAD: %s", repr(upload_response.content))
    return True
I have also tried downloading the file from S3 and uploading the file directly, and I get the same result. Without proper error messages or anything to go on, I'm really not sure what to try next. Any help is greatly appreciated.
I have also tried using requests by itself rather than oauthlib, with exactly the same result:
def post(self, attempts=None):
    if attempts is None:
        attempts = 0
    if self.neutered:
        msg = "Youtube post() disabled by ENVIRONMENT variables."
        logger.info(msg)
        return msg
    logger.info("Youtube post() entered with attempt # %s", self.post_attempts)
    if self.google_token is None:
        self.google_token = self._set_google_token()
        attempts += 1
        self.post(attempts=attempts)
    headers = {
        "Content-Type": "video/mp4",
        "client_id": self.client_id,
        "client_secret": self.client_secret,
        "Authorization": "Bearer " + self.google_token["access_token"]
    }
    params = {
        "access_token": self.google_token.get("access_token", None),
        "id": self.google_token.get("id_token", None),
        "part": "snippet, status"
    }
    upload_request_body = {
        "snippet": {
            "title": "Test Video Upload",
            "description": "This is a test of uploading videos from POST.",
            "categoryId": "22",
        },
        "status": {
            "privacyStatus": "public"
        },
        "fileDetails": {
            "fileName": TEST_VIDEO,
            "fileType": "video"
        }
    }
    upload_response = requests.post(
        self.video_post_url,
        params=params,
        headers=headers,
        json=upload_request_body
    )
    logger.info("Response from VIDEO UPLOAD: %s", repr(upload_response.content))
    return True
You probably have this issue because you are not actually sending the file. upload_request_body.fileDetails.fileName is not the place for the link/file; it's just a descriptive attribute.
Have you tried the auto-generated code from https://developers.google.com/youtube/v3/code_samples/code_snippets?
This is what you can get there:
# -*- coding: utf-8 -*-

# Sample Python code for youtube.videos.insert
# NOTES:
# 1. This sample code uploads a file and can't be executed via this interface.
#    To test this code, you must run it locally using your own API credentials.
#    See: https://developers.google.com/explorer-help/guides/code_samples#python
# 2. This example makes a simple upload request. We recommend that you consider
#    using resumable uploads instead, particularly if you are transferring large
#    files or there's a high likelihood of a network interruption or other
#    transmission failure. To learn more about resumable uploads, see:
#    https://developers.google.com/api-client-library/python/guide/media_upload

import os

import googleapiclient.discovery
from googleapiclient.http import MediaFileUpload


def main():
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    api_service_name = "youtube"
    api_version = "v3"
    DEVELOPER_KEY = "YOUR_API_KEY"

    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, developerKey=DEVELOPER_KEY)

    request = youtube.videos().insert(
        part="snippet,status",
        body={
            "fileDetails": {
                "fileName": "qwer",
                "fileType": "video"
            },
            "snippet": {
                "categoryId": "22",
                "description": "This is a test of uploading videos.",
                "title": "Test Video Upload"
            },
            "status": {
                "privacyStatus": "public"
            }
        },
        # TODO: For this request to work, you must replace "YOUR_FILE"
        #       with a pointer to the actual file you are uploading.
        media_body=MediaFileUpload("YOUR_FILE")
    )
    response = request.execute()

    print(response)


if __name__ == "__main__":
    main()
I believe it should work.
Or is there any reason not to use googleapiclient?
I'm trying to upload a video from an S3 bucket to YouTube
I doubt that you can upload files from other sites directly to YouTube. You are probably stuck with uploading files from your own server/drive. I've looked around on the Internet, but all I found is that you can't (although you could in the past). And one can imagine a lot of reasons why this is not allowed (mostly copyright, but not exclusively).
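If the source video lives in S3, the practical workaround is therefore to download it to local disk with boto3 first and only then hand the local file to the YouTube API via MediaFileUpload. Below is a rough sketch under that assumption; the bucket/key constants and the get_authenticated_youtube() helper are placeholders standing in for however you build the authorized youtube client, not code from the question.

import boto3
from googleapiclient.http import MediaFileUpload

# Placeholder: however you construct the authorized googleapiclient "youtube"
# resource (OAuth2 flow, refresh token on disk, etc.).
from my_auth import get_authenticated_youtube

BUCKET = "streamon-perm"        # assumed bucket name
KEY = "WPHM-48k-pl-33366.mp4"   # assumed object key
LOCAL_PATH = "/tmp/video.mp4"   # on Lambda, /tmp is the only writable path


def upload_from_s3():
    # 1. Download the object to local disk first; YouTube will not fetch a URL for you.
    boto3.client("s3").download_file(BUCKET, KEY, LOCAL_PATH)

    # 2. Upload the local file; resumable=True is recommended for larger videos.
    youtube = get_authenticated_youtube()
    request = youtube.videos().insert(
        part="snippet,status",
        body={
            "snippet": {
                "title": "Test Video Upload",
                "description": "This is a test of uploading videos.",
                "categoryId": "22",
            },
            "status": {"privacyStatus": "public"},
        },
        media_body=MediaFileUpload(LOCAL_PATH, chunksize=-1, resumable=True),
    )
    return request.execute()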
Update:
Admittedly, that was not an exhaustive code snippet, especially considering that you need OAuth2.
But here is another one:
https://github.com/youtube/api-samples/blob/master/python/upload_video.py
And yet another:
https://developers.google.com/youtube/v3/guides/uploading_a_video
Both use OAuth2. There you can also find information on client_secrets.json:
{
    "web": {
        "client_id": "[[INSERT CLIENT ID HERE]]",
        "client_secret": "[[INSERT CLIENT SECRET HERE]]",
        "redirect_uris": [],
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "token_uri": "https://accounts.google.com/o/oauth2/token"
    }
}
You can also check out some real-life projects, for example this one: https://github.com/HA6Bots/Automatic-Youtube-Reddit-Text-To-Speech-Video-Generator-and-Uploader/tree/master/Youtube%20Bot%20Video%20Generator
My first attempt to create an EMR cluster using a Lambda function fails with the error below. I intend to use script-runner.jar to launch a Python script located in an S3 bucket. Can somebody help me understand this error? What exactly am I missing?
2019-11-21T20:34:59.990Z INFO Ensure step 1 jar file s3a://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar
INFO Failed to download: s3a://<region>.elasticmapreduce/libs/script-runner/script-runner.jar
java.io.IOException: Unable to download 's3a://<region>.elasticmapreduce/libs/script-runner/script-runner.jar'. Only s3 + local files are supported
at aws157.instancecontroller.util.S3Wrapper.fetchHadoopFileToLocal(S3Wrapper.java:353)
at aws157.instancecontroller.master.steprunner.HadoopJarStepRunner$Runner.<init>(HadoopJarStepRunner.java:243)
at aws157.instancecontroller.master.steprunner.HadoopJarStepRunner.createRunner(HadoopJarStepRunner.java:152)
at aws157.instancecontroller.master.steprunner.HadoopJarStepRunner.createRunner(HadoopJarStepRunner.java:146)
at aws157.instancecontroller.master.steprunner.StepExecutor.runStep(StepExecutor.java:136)
at aws157.instancecontroller.master.steprunner.StepExecutor.run(StepExecutor.java:70)
at aws157.instancecontroller.master.steprunner.StepExecutionManager.enqueueStep(StepExecutionManager.java:246)
at aws157.instancecontroller.master.steprunner.StepExecutionManager.doRun(StepExecutionManager.java:193)
at aws157.instancecontroller.master.steprunner.StepExecutionManager.access$000(StepExecutionManager.java:33)
at aws157.instancecontroller.master.steprunner.StepExecutionManager$1.run(StepExecutionManager.java:94)
My loosely written lambda function is below:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import boto3
import datetime


def lambda_handler(event, context):
    print('Creating EMR')
    connection = boto3.client('emr', region_name='us-east-1')
    print(event)
    cluster_id = connection.run_job_flow(
        Name='MyTest',
        VisibleToAllUsers=True,
        JobFlowRole='EMR_EC2_DefaultRole',
        ServiceRole='EMR_DefaultRole',
        LogUri='s3://bucket-emr/logs',
        ReleaseLabel='emr-5.21.0',
        Applications=[{'Name': 'Hadoop'}, {'Name': 'Spark'}],
        Instances={
            'InstanceGroups': [{
                'Name': 'Master nodes',
                'Market': 'ON_DEMAND',
                'InstanceRole': 'MASTER',
                'InstanceType': 'm3.xlarge',
                'InstanceCount': 1,
            }, {
                'Name': 'Slave nodes',
                'Market': 'SPOT',
                'InstanceRole': 'CORE',
                'InstanceType': 'm3.xlarge',
                'InstanceCount': 2,
            }],
            'KeepJobFlowAliveWhenNoSteps': True,
            'Ec2KeyName': 'keys-kvp',
            'Ec2SubnetId': 'subnet-dsb65490',
            'EmrManagedMasterSecurityGroup': 'sg-0daa54d041d1033',
            'EmrManagedSlaveSecurityGroup': 'sg-0daa54d041d1033',
        },
        Configurations=[{
            "Classification": "spark-env",
            "Properties": {},
            "Configurations": [{
                "Classification": "export",
                "Properties": {
                    "PYSPARK_PYTHON": "python36",
                    "PYSPARK_DRIVER_PYTHON": "python36"
                }
            }]
        }],
        Steps=[{
            'Name': 'mystep',
            'ActionOnFailure': 'TERMINATE_CLUSTER',
            'HadoopJarStep': {
                'Jar': 's3a://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
                'Args': [
                    '/home/hadoop/spark/bin/spark-submit', '--deploy-mode', 'cluster', '--master', 'yarn', 's3a://inscape-script/wordcount.py',
                ]
            }
        }]
    )
    return 'Started cluster {}'.format(cluster_id)
What am I missing in creating the cluster? Thanks in advance.
Can you try changing your 'Jar' argument to this instead:
'Jar': 's3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hadoop-script.html
You can also try using command-runner by changing that 'Jar' argument to
/var/lib/aws/emr/step-runner/hadoop-jars/command-runner.jar
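For reference, here is a hedged sketch of how the failing step from the question could look with command-runner.jar instead of the s3a:// script-runner path. The wordcount.py location is taken from the question; switching its scheme from s3a:// to s3:// is an assumption based on the "Only s3 + local files are supported" message.

Steps=[{
    'Name': 'mystep',
    'ActionOnFailure': 'TERMINATE_CLUSTER',
    'HadoopJarStep': {
        # command-runner.jar ships on the cluster, so nothing has to be fetched from S3
        'Jar': 'command-runner.jar',
        'Args': [
            'spark-submit', '--deploy-mode', 'cluster', '--master', 'yarn',
            's3://inscape-script/wordcount.py',
        ]
    }
}]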
I created a simple step function as follows :
Start -> Start EMR cluster & submit job -> End
I want a mechanism to identify whether my Spark step completed successfully or not.
I am able to start an EMR cluster and attach a Spark job to it, which completes successfully and terminates the cluster.
I followed the steps in this link:
Creating AWS EMR cluster with spark step using lambda function fails with "Local file does not exist"
Now I am looking to get the status; the job poller will tell me whether the EMR cluster was created successfully or not.
I am looking for ways to find out the Spark job status.
from botocore.vendored import requests
import boto3
import json


def lambda_handler(event, context):
    conn = boto3.client("emr")
    cluster_id = conn.run_job_flow(
        Name='xyz',
        ServiceRole='xyz',
        JobFlowRole='asd',
        VisibleToAllUsers=True,
        LogUri='<location>',
        ReleaseLabel='emr-5.16.0',
        Instances={
            'Ec2SubnetId': 'xyz',
            'InstanceGroups': [
                {
                    'Name': 'Master',
                    'Market': 'ON_DEMAND',
                    'InstanceRole': 'MASTER',
                    'InstanceType': 'm4.xlarge',
                    'InstanceCount': 1,
                }
            ],
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False,
        },
        Applications=[
            {
                'Name': 'Spark'
            },
            {
                'Name': 'Hadoop'
            }
        ],
        Steps=[{
            'Name': "mystep",
            'ActionOnFailure': 'TERMINATE_CLUSTER',
            'HadoopJarStep': {
                'Jar': 'jar',
                'Args': [
                    <insert args>, jar, mainclass
                ]
            }
        }]
    )
    return cluster_id
You can use the CLI or the SDK to list all steps for the cluster and then describe a particular step to get its status.
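As a concrete illustration (my own sketch, not from the original answer), the snippet below uses the standard boto3 EMR calls list_steps and describe_step to poll a step until it reaches a terminal state; the cluster_id value is assumed to be the JobFlowId returned by run_job_flow above.

import time

import boto3

emr = boto3.client("emr")


def wait_for_step(cluster_id, step_name, poll_seconds=30):
    """Poll EMR until the named step reaches a terminal state and return that state."""
    # Look up the step id by name (list_steps returns the most recent steps first).
    steps = emr.list_steps(ClusterId=cluster_id)["Steps"]
    step_id = next(s["Id"] for s in steps if s["Name"] == step_name)

    while True:
        status = emr.describe_step(ClusterId=cluster_id, StepId=step_id)["Step"]["Status"]
        if status["State"] in ("COMPLETED", "FAILED", "CANCELLED", "INTERRUPTED"):
            return status["State"]
        time.sleep(poll_seconds)


# Example usage with the cluster started above:
# state = wait_for_step(cluster_id["JobFlowId"], "mystep")

Note that a single Lambda invocation is capped at 15 minutes, so for long-running steps it is usually better to poll from a Step Functions wait loop (or react to EMR step state-change events via CloudWatch Events/EventBridge) than to sleep inside the Lambda itself.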
I'm trying to spin up an EMR cluster with a Spark step using a Lambda function.
Here is my lambda function (python 2.7):
import boto3


def lambda_handler(event, context):
    conn = boto3.client("emr")
    cluster_id = conn.run_job_flow(
        Name='LSR Batch Testrun',
        ServiceRole='EMR_DefaultRole',
        JobFlowRole='EMR_EC2_DefaultRole',
        VisibleToAllUsers=True,
        LogUri='s3n://aws-logs-171256445476-ap-southeast-2/elasticmapreduce/',
        ReleaseLabel='emr-5.16.0',
        Instances={
            "Ec2SubnetId": "<my-subnet>",
            'InstanceGroups': [
                {
                    'Name': 'Master nodes',
                    'Market': 'ON_DEMAND',
                    'InstanceRole': 'MASTER',
                    'InstanceType': 'm3.xlarge',
                    'InstanceCount': 1,
                },
                {
                    'Name': 'Slave nodes',
                    'Market': 'ON_DEMAND',
                    'InstanceRole': 'CORE',
                    'InstanceType': 'm3.xlarge',
                    'InstanceCount': 2,
                }
            ],
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False
        },
        Applications=[{
            'Name': 'Spark',
            'Name': 'Hive'
        }],
        Configurations=[
            {
                "Classification": "hive-site",
                "Properties": {
                    "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
                }
            },
            {
                "Classification": "spark-hive-site",
                "Properties": {
                    "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
                }
            }
        ],
        Steps=[{
            'Name': 'mystep',
            'ActionOnFailure': 'TERMINATE_CLUSTER',
            'HadoopJarStep': {
                'Jar': 's3://elasticmapreduce/libs/script-runner/script-runner.jar',
                'Args': [
                    "/home/hadoop/spark/bin/spark-submit", "--deploy-mode", "cluster",
                    "--master", "yarn-cluster", "--class", "org.apache.spark.examples.SparkPi",
                    "s3://support.elasticmapreduce/spark/1.2.0/spark-examples-1.2.0-hadoop2.4.0.jar", "10"
                ]
            }
        }],
    )
    return "Started cluster {}".format(cluster_id)
The cluster is starting up, but when it tries to execute the step it fails. The error log contains the following exception:
Exception in thread "main" java.lang.RuntimeException: Local file does not exist.
at com.amazon.elasticmapreduce.scriptrunner.ScriptRunner.fetchFile(ScriptRunner.java:30)
at com.amazon.elasticmapreduce.scriptrunner.ScriptRunner.main(ScriptRunner.java:56)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.util.RunJar.run(RunJar.java:234)
at org.apache.hadoop.util.RunJar.main(RunJar.java:148)
So it seems like the script-runner does not pick up the .jar file from S3?
Any help appreciated...
I was eventually able to solve the problem. The main issue was the broken "Applications" configuration, which has to look like the following instead:
Applications=[
    {'Name': 'Spark'},
    {'Name': 'Hive'}
],
The final Steps element:
Steps=[{
    'Name': 'lsr-step1',
    'ActionOnFailure': 'TERMINATE_CLUSTER',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': [
            "spark-submit", "--class", "org.apache.spark.examples.SparkPi",
            "s3://support.elasticmapreduce/spark/1.2.0/spark-examples-1.2.0-hadoop2.4.0.jar", "10"
        ]
    }
}]
Not every EMR release comes pre-built with the ability to copy your jar or script from S3, so you must do that in a bootstrap action:
BootstrapActions=[
    {
        'Name': 'Install additional components',
        'ScriptBootstrapAction': {
            'Path': code_dir + '/scripts' + '/emr_bootstrap.sh'
        }
    }
],
And here is what my bootstrap script does:
#!/bin/bash
HADOOP="/home/hadoop"
BUCKET="s3://<yourbucket>/<path>"
# Sync jars libraries
aws s3 sync ${BUCKET}/jars/ ${HADOOP}/
aws s3 sync ${BUCKET}/scripts/ ${HADOOP}/
# Install python packages
sudo pip install --upgrade pip
sudo ln -s /usr/local/bin/pip /usr/bin/pip
sudo pip install psycopg2 numpy boto3 pythonds
Then you can call your script and jar like this:
{
    'Name': 'START YOUR STEP',
    'ActionOnFailure': 'TERMINATE_CLUSTER',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': [
            "spark-submit", "--jars", ADDITIONAL_JARS,
            "--py-files", "/home/hadoop/modules.zip",
            "/home/hadoop/<your code>.py"
        ]
    }
},
I want to execute a spark-submit job on an AWS EMR cluster based on a file-upload event on S3. I am using an AWS Lambda function to capture the event, but I have no idea how to submit a spark-submit job to the EMR cluster from the Lambda function.
Most of the answers that I found talk about adding a step to the EMR cluster, but I do not know whether a step added that way can fire "spark-submit --with args".
You can, I had to do the same thing last week!
Using boto3 for Python (other languages definitely have a similar solution), you can either start a cluster with the step already defined, or attach a step to an already-running cluster.
Defining the cluster with the step
import boto3


def lambda_handler(event, context):
    conn = boto3.client("emr")
    cluster_id = conn.run_job_flow(
        Name='ClusterName',
        ServiceRole='EMR_DefaultRole',
        JobFlowRole='EMR_EC2_DefaultRole',
        VisibleToAllUsers=True,
        LogUri='s3n://some-log-uri/elasticmapreduce/',
        ReleaseLabel='emr-5.8.0',
        Instances={
            'InstanceGroups': [
                {
                    'Name': 'Master nodes',
                    'Market': 'ON_DEMAND',
                    'InstanceRole': 'MASTER',
                    'InstanceType': 'm3.xlarge',
                    'InstanceCount': 1,
                },
                {
                    'Name': 'Slave nodes',
                    'Market': 'ON_DEMAND',
                    'InstanceRole': 'CORE',
                    'InstanceType': 'm3.xlarge',
                    'InstanceCount': 2,
                }
            ],
            'Ec2KeyName': 'key-name',
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False
        },
        Applications=[{
            'Name': 'Spark'
        }],
        Configurations=[{
            "Classification": "spark-env",
            "Properties": {},
            "Configurations": [{
                "Classification": "export",
                "Properties": {
                    "PYSPARK_PYTHON": "python35",
                    "PYSPARK_DRIVER_PYTHON": "python35"
                }
            }]
        }],
        BootstrapActions=[{
            'Name': 'Install',
            'ScriptBootstrapAction': {
                'Path': 's3://path/to/bootstrap.script'
            }
        }],
        Steps=[{
            'Name': 'StepName',
            'ActionOnFailure': 'TERMINATE_CLUSTER',
            'HadoopJarStep': {
                'Jar': 's3n://elasticmapreduce/libs/script-runner/script-runner.jar',
                'Args': [
                    "/usr/bin/spark-submit", "--deploy-mode", "cluster",
                    's3://path/to/code.file', '-i', 'input_arg',
                    '-o', 'output_arg'
                ]
            }
        }],
    )
    return "Started cluster {}".format(cluster_id)
Attaching a step to an already running cluster
As per here
import sys
import time

import boto3


def lambda_handler(event, context):
    conn = boto3.client("emr")
    # chooses the first cluster which is Running or Waiting
    # possibly can also choose by name or already have the cluster id
    clusters = conn.list_clusters()
    # choose the correct cluster
    clusters = [c["Id"] for c in clusters["Clusters"]
                if c["Status"]["State"] in ["RUNNING", "WAITING"]]
    if not clusters:
        sys.stderr.write("No valid clusters\n")
        sys.exit(1)
    # take the first relevant cluster
    cluster_id = clusters[0]
    # code location on your emr master node
    CODE_DIR = "/home/hadoop/code/"
    # spark configuration example (spark-submit expects --conf key=value)
    step_args = ["/usr/bin/spark-submit", "--conf", "your-configuration",
                 CODE_DIR + "your_file.py", '--your-parameters', 'parameters']
    step = {
        "Name": "what_you_do-" + time.strftime("%Y%m%d-%H:%M"),
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 's3n://elasticmapreduce/libs/script-runner/script-runner.jar',
            'Args': step_args
        }
    }
    action = conn.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step])
    return "Added step: %s" % (action)
AWS Lambda function Python code if you want to execute a Spark jar via a spark-submit command: this posts a batch to the Livy REST endpoint (default port 8998, /batches) on the EMR master node, so the Lambda needs network access to the master (for example, by running in the same VPC).
import json

import requests  # bundle with the deployment package; botocore.vendored.requests is deprecated


def lambda_handler(event, context):
    headers = {"content-type": "application/json"}
    # Livy REST endpoint on the EMR master node
    url = 'http://ip-address.ec2.internal:8998/batches'
    payload = {
        # Assumption: SparkCode.jar is the main application jar;
        # the JDBC drivers are dependency jars and go in 'jars'.
        'file': 's3://Bucket/Orchestration/SparkCode.jar',
        'jars': [
            's3://Bucket/Orchestration/RedshiftJDBC41.jar',
            's3://Bucket/Orchestration/mysql-connector-java-8.0.12.jar'
        ],
        'className': 'Main Class Name',
        'args': [event.get('rootPath')]
    }
    res = requests.post(url, data=json.dumps(payload), headers=headers, verify=False)
    json_data = json.loads(res.text)
    return json_data.get('id')