AWS Batch: customize event - amazon-web-services

I am running an AWS Batch array job with a few thousand children. When the parent job is complete, I'd like to trigger a lambda to do some cleanup on the output.
I can currently trigger an EventBridge rule when each child job finishes. How can we distinguish when the parent job completes?
EDIT: You can identify the parent job via
if ":" not in event["detail"]["jobId"]:
I'd also like to change the data included in the event signal, so I can tell my lambda what bucket the output went to, the name of the output file produced, etc. Can we customize the fields in an event created by Batch?
Currently, the events I'm getting look like this (account number removed):
{
"version": "0",
"id": "f6c74fc4-a943-9592-2be0-99df7b67f79f",
"detail-type": "Batch Job State Change",
"source": "aws.batch",
"account": "",
"time": "2022-12-07T22:51:38Z",
"region": "us-east-1",
"resources": [
"arn:aws:batch:us-east-1::job/138677dc-d684-4e39-b42f-6626515be7c7:2"
],
"detail": {
"jobArn": "arn:aws:batch:us-east-1::job/138677dc-d684-4e39-b42f-6626515be7c7:2",
"jobName": "event_test",
"jobId": "138677dc-d684-4e39-b42f-6626515be7c7:2",
"jobQueue": "arn:aws:batch:us-east-1::job-queue/HLM2",
"status": "SUCCEEDED",
"attempts": [
{
"container": {
"containerInstanceArn": "arn:aws:ecs:us-east-1::container-instance/AWSBatch-HLM2-c929f2e7-5cd9-3f96-8ad3-a2717fb4d2ec/6f1037214ed247aebe5ead4bf19e5929",
"taskArn": "arn:aws:ecs:us-east-1::task/AWSBatch-HLM2-c929f2e7-5cd9-3f96-8ad3-a2717fb4d2ec/5cb0c860da6a484ca6ba23a468a7f76c",
"exitCode": 0,
"logStreamName": "HLM2/default/5cb0c860da6a484ca6ba23a468a7f76c",
"networkInterfaces": []
},
"startedAt": 1670453438605,
"stoppedAt": 1670453497525,
"statusReason": "Essential container in task exited"
}
],
"statusReason": "Essential container in task exited",
"createdAt": 1670453294805,
"startedAt": 1670453438605,
"stoppedAt": 1670453497525,
"dependsOn": [],
"jobDefinition": "arn:aws:batch:us-east-1::job-definition/HLM2:8",
"parameters": {},
"container": {
"image": ".dkr.ecr.us-east-1.amazonaws.com/hlm:latest",
"command": [],
"jobRoleArn": "arn:aws:iam:::role/hlm-ec2-access-role",
"executionRoleArn": "arn:aws:iam:::role/hlm-ec2-access-role",
"volumes": [],
"environment": [],
"mountPoints": [],
"ulimits": [],
"exitCode": 0,
"containerInstanceArn": "arn:aws:ecs:us-east-1::container-instance/AWSBatch-HLM2-c929f2e7-5cd9-3f96-8ad3-a2717fb4d2ec/6f1037214ed247aebe5ead4bf19e5929",
"taskArn": "arn:aws:ecs:us-east-1::task/AWSBatch-HLM2-c929f2e7-5cd9-3f96-8ad3-a2717fb4d2ec/5cb0c860da6a484ca6ba23a468a7f76c",
"logStreamName": "HLM2/default/5cb0c860da6a484ca6ba23a468a7f76c",
"networkInterfaces": [],
"resourceRequirements": [
{
"value": "2",
"type": "VCPU"
},
{
"value": "8192",
"type": "MEMORY"
}
],
"secrets": []
},
"arrayProperties": {
"statusSummary": {},
"index": 2
},
"timeout": {
"attemptDurationSeconds": 1800
},
"tags": {
"resourceArn": "arn:aws:batch:us-east-1::job/138677dc-d684-4e39-b42f-6626515be7c7"
},
"platformCapabilities": [
"EC2"
],
"eksAttempts": []
}
}

Related

Task in ECS not being called by Step Functions

I have a DAG in Step Functions which is executed by an app within an ECS container. This DAG is scheduled to run every day. I'm seeing the following behavior:
Some random tasks fail with a States.Timeout error (the timeout is set to 30 min)
I couldn't find the CloudWatch logs for these failed tasks. This indicates that the tasks themselves are never started.
Why does this happen? Is there any setting I can use to prevent this behavior? Below follows the task code:
Input:
.
{
"version": "0",
"id": "ff4a2a37-7024-a213-70f0-11df9104484a",
"detail-type": "Scheduled Event",
"source": "aws.events",
"account": "",
"time": "2022-11-22T08:30:00Z",
"region": "us-east-1",
"resources": [
"arn:aws:events:rule/bi-datalake-hml"
],
"detail": {}
}
Output:
.
{
"Failures": [],
"SdkHttpMetadata": {
"AllHttpHeaders": {
"x-amzn-RequestId": [
"50737371-c24f-4901-98a5-bdef19278cf8"
],
"Content-Length": [
"2224"
],
"Date": [
"Tue, 22 Nov 2022 08:30:46 GMT"
],
"Content-Type": [
"application/x-amz-json-1.1"
]
},
"HttpHeaders": {
"Content-Length": "2224",
"Content-Type": "application/x-amz-json-1.1",
"Date": "Tue, 22 Nov 2022 08:30:46 GMT",
"x-amzn-RequestId": "50737371-c24f-4901-98a5-bdef19278cf8"
},
"HttpStatusCode": 200
},
"SdkResponseMetadata": {
"RequestId": "50737371-c24f-4901-98a5-bdef19278cf8"
},
"Tasks": [
{
"Attachments": [
{
"Details": [
{
"Name": "subnetId",
"Value": "subnet-62d7664e"
}
],
"Id": "eacd4cdf-3ce8-4927-a869-52eb71f553b3",
"Status": "PRECREATED",
"Type": "ElasticNetworkInterface"
}
],
"Attributes": [
{
"Name": "ecs.cpu-architecture",
"Value": "x86_64"
}
],
"AvailabilityZone": "us-east-1c",
"ClusterArn": "arn:aws::cluster/bi-datalake-hml",
"Containers": [
{
"ContainerArn": "arn::container/bi-datalake-hml/ad12e4d9017443d889055024d1932ddf/eb1c44d5-01b8-4c52-8950-8676b6ecd949",
"Cpu": "0",
"GpuIds": [],
"Image": ".dkr.ecr.us-east-1.amazonaws.com/bi_dbtoncloud_hml:latest",
"LastStatus": "PENDING",
"ManagedAgents": [],
"Name": "bi_dbtoncloud_hml",
"NetworkBindings": [],
"NetworkInterfaces": [],
"TaskArn": "arn:aws:ecs:us-east-1::task/bi-datalake-hml/ad12e4d9017443d889055024d1932ddf"
}
],
"Cpu": "256",
"CreatedAt": 1669105846495,
"DesiredStatus": "RUNNING",
"EnableExecuteCommand": false,
"EphemeralStorage": {
"SizeInGiB": 20
},
"Group": "family:bi_dbtoncloud_hml",
"InferenceAccelerators": [],
"LastStatus": "PROVISIONING",
"LaunchType": "FARGATE",
"Memory": "1024",
"Overrides": {
"ContainerOverrides": [
{
"Command": [],
"Environment": [
{
"Name": "DBT_MODEL_TO_RUN",
"Value": "alpha_order_address_shipping"
},
{
"Name": "TASK_TOKEN",
"Value": "AQCIAAAAKgAAAAMAAAAAAAAAAaP+z4q5FgxDHKZhHvzNF0PDV8l/5AkxTlorAGbQfnjdDJE1P1NWf+Jj1OINDelJ0RLrsAtdwIDJcAFmehGFj9mGJ905+T9sdWmKbsSjHuR0fCksAw==vIiI8wfR+LDUo1zPl03VOvkCHUeD5mzrDBoyjVRpA7QuqJ8ocA5OmVSN6MEGg3eS24H/3m3/MZRbmRNydbvI5DIB9PDD5seYIJamDTlfqEtYESgxWBoPrlVmvuphEnw5orSIeh5sZpsKm3/AlzB4OsoZaJleWBd+1WQbWclKEpV9bG3aKCsJO5rYyVaI7Ik09lTrogpL0VeulC2q/rY4cXR/r3lPA9ZL9YalUXgqij2ZuZIRby63hEjzTcoIkieMQMRcRd7XmKb1p8LsE2nSfSRnRotn9JeKVn7/4UBmWx0iQUd/14Dw1TXUSDwUx0sAPN7dtv2RdnqdLfJI6LoJnuPagkySFTaPOsJ0jpgLi9cjsJMVlWboCXlH57fj0JNosDLisxYlf5R3lB4paspZ8DHfgfAW5Saywc/KCCby7wfyYvVKpXvsJFIoBdtXaXs9tlbyTmlg9Dy9Oaol33ZFhVrOuDTzgDf4x6Mguxz1cQEaze8Ui0G2NDCoNxPU+WffSkrsb0Pflf5LsPfHAuI2"
},
{
"Name": "DBT_ENV",
"Value": "hml"
}
],
"EnvironmentFiles": [],
"Name": "bi_dbtoncloud_hml",
"ResourceRequirements": []
}
],
"InferenceAcceleratorOverrides": []
},
"PlatformFamily": "Linux",
"PlatformVersion": "1.4.0",
"Tags": [],
"TaskArn": "arn:aws:ecs:us-east-1::task/bi-datalake-hml/ad12e4d9017443d889055024d1932ddf",
"TaskDefinitionArn": "arn:aws:ecs:us-east-1::task-definition/bi_dbtoncloud_hml:2",
"Version": 1
}
]
}
If you need more information, please let me know.

How to run an ArangoDB container in ECS?

I'm trying to create an ArangoDB cluster in ECS using the default arangodb/arangodb-starter container but when I start my ECS Task, I'm getting an error saying that /usr/sbin/arangod was not found.
I pulled the arangodb/arangodb-starter image locally using docker pull and then I tagged it according to the push commands from ECR, I pushed it to ECR and I created an ECS Task (Fargate) for it. I created a service in ECS to start that task and the container starts, but the ECS Service logs show this error:
|INFO| Starting arangodb version 0.15.5, build 7832707 component=arangodb
|ERROR| Cannot find arangod (expected at /usr/sbin/arangod). component=arangodb
How to solve this:
1 - Install ArangoDB locally or run the ArangoDB starter in docker. (see README for details).
I started the exact same container by tag locally and it works. Why doesn't it work in ECS?
edit The ECS Task definition is in the snippet below:
{
"taskDefinitionArn": "arn:aws:ecs:eu-west-1:123456789:task-definition/dev-arangodb-server:1",
"containerDefinitions": [
{
"name": "dev-arangodb-server",
"image": "123456789.dkr.ecr.eu-west-1.amazonaws.com/arangodb:latest",
"cpu": 0,
"links": [],
"portMappings": [
{
"containerPort": 8529,
"hostPort": 8529,
"protocol": "tcp"
}
],
"essential": true,
"entryPoint": [],
"command": [],
"environment": [
{
"name": "ARANGO_ROOT_PASSWORD",
"value": "password"
}
],
"environmentFiles": [],
"mountPoints": [
{
"sourceVolume": "storage",
"containerPath": "/mnt/storage",
"readOnly": false
}
],
"volumesFrom": [],
"secrets": [],
"dnsServers": [],
"dnsSearchDomains": [],
"extraHosts": [],
"dockerSecurityOptions": [],
"dockerLabels": {},
"ulimits": [],
"logConfiguration": {
"logDriver": "awslogs",
"options": {
"awslogs-create-group": "true",
"awslogs-group": "/ecs/dev-arangodb-server",
"awslogs-region": "eu-west-1",
"awslogs-stream-prefix": "ecs"
},
"secretOptions": []
},
"systemControls": []
}
],
"family": "dev-arangodb-server",
"taskRoleArn": "arn:aws:iam::123456789:role/dev-aws-ecs-ecr-power-user",
"executionRoleArn": "arn:aws:iam::123456789:role/ecsTaskExecutionRole",
"networkMode": "awsvpc",
"revision": 1,
"volumes": [
{
"name": "storage",
"host": {}
}
],
"status": "ACTIVE",
"requiresAttributes": [
{
"name": "com.amazonaws.ecs.capability.logging-driver.awslogs"
},
{
"name": "ecs.capability.execution-role-awslogs"
},
{
"name": "com.amazonaws.ecs.capability.ecr-auth"
},
{
"name": "com.amazonaws.ecs.capability.docker-remote-api.1.19"
},
{
"name": "com.amazonaws.ecs.capability.docker-remote-api.1.17"
},
{
"name": "com.amazonaws.ecs.capability.task-iam-role"
},
{
"name": "ecs.capability.execution-role-ecr-pull"
},
{
"name": "com.amazonaws.ecs.capability.docker-remote-api.1.18"
},
{
"name": "ecs.capability.task-eni"
},
{
"name": "com.amazonaws.ecs.capability.docker-remote-api.1.29"
}
],
"placementConstraints": [],
"compatibilities": [
"EC2",
"FARGATE"
],
"requiresCompatibilities": [
"FARGATE"
],
"cpu": "1024",
"memory": "3072",
"runtimePlatform": {
"cpuArchitecture": "X86_64",
"operatingSystemFamily": "LINUX"
},
"registeredAt": "2022-11-03T08:43:25.264Z",
"registeredBy": "arn:aws:iam::123456789:user/MY_USER",
"tags": [
{
"key": "ecs:taskDefinition:createdFrom",
"value": "ecs-console-v2"
},
{
"key": "ecs:taskDefinition:stackId",
"value": "arn:aws:cloudformation:eu-west-1:123456789:stack/ECS-Console-V2-TaskDefinition-e1519bf7-ff78-423a-951d-2bc8d79242ec/925d88d0-5b53-11ed-97a3-066ee48e3b9b"
}
]
}
I tested on my cluster, and it seems that image does not run with default options like your task definition uses. That image is not documented, so we don't know how to start it correctly.
Please try this official image and follow the same process. Remember to set the required environment variables, or you will face this issue:
error: database is uninitialized and password option is not specified
You need to specify one of ARANGO_ROOT_PASSWORD, ARANGO_ROOT_PASSWORD_FILE, ARANGO_NO_AUTH and ARANGO_RANDOM_ROOT_PASSWORD

AWS EVENTBRIDGE: Add content filtering to ECS task state changes

I am trying to create an eventbridge rule whenever ECS task is deleted abnormally.
Normally ECS sends all events, even for the CREATED or ATTACHED states, but I want to filter only the DELETED state.
I am using CDK to create my event rule. I am trying to implement content filtering based on the status field, which is inside the attachments field, which is in turn part of the detail field.
Sample event from ECS Task ->
{
"version": "0",
"id": "3317b2af-7005-947d-b652-f55e762e571a",
"detail-type": "ECS Task State Change",
"source": "aws.ecs",
"account": "111122223333",
"time": "2020-01-23T17:57:58Z",
"region": "us-west-2",
"resources": [
"arn:aws:ecs:us-west-2:111122223333:task/FargateCluster/c13b4cb40f1f4fe4a2971f76ae5a47ad"
],
"detail": {
"attachments": [
{
"id": "1789bcae-ddfb-4d10-8ebe-8ac87ddba5b8",
"type": "eni",
"status": "ATTACHED",
"details": [
{
"name": "subnetId",
"value": "subnet-abcd1234"
},
{
"name": "networkInterfaceId",
"value": "eni-abcd1234"
},
{
"name": "macAddress",
"value": "0a:98:eb:a7:29:ba"
},
{
"name": "privateIPv4Address",
"value": "10.0.0.139"
}
]
}
],
"availabilityZone": "us-west-2c",
"clusterArn": "arn:aws:ecs:us-west-2:111122223333:cluster/FargateCluster",
"containers": [
{
"containerArn": "arn:aws:ecs:us-west-2:111122223333:container/cf159fd6-3e3f-4a9e-84f9-66cbe726af01",
"lastStatus": "RUNNING",
"name": "FargateApp",
"image": "111122223333.dkr.ecr.us-west-2.amazonaws.com/hello-repository:latest",
"imageDigest": "sha256:74b2c688c700ec95a93e478cdb959737c148df3fbf5ea706abe0318726e885e6",
"runtimeId": "ad64cbc71c7fb31c55507ec24c9f77947132b03d48d9961115cf24f3b7307e1e",
"taskArn": "arn:aws:ecs:us-west-2:111122223333:task/FargateCluster/c13b4cb40f1f4fe4a2971f76ae5a47ad",
"networkInterfaces": [
{
"attachmentId": "1789bcae-ddfb-4d10-8ebe-8ac87ddba5b8",
"privateIpv4Address": "10.0.0.139"
}
],
"cpu": "0"
}
],
"createdAt": "2020-01-23T17:57:34.402Z",
"launchType": "FARGATE",
"cpu": "256",
"memory": "512",
"desiredStatus": "RUNNING",
"group": "family:sample-fargate",
"lastStatus": "RUNNING",
"overrides": {
"containerOverrides": [
{
"name": "FargateApp"
}
]
},
"connectivity": "CONNECTED",
"connectivityAt": "2020-01-23T17:57:38.453Z",
"pullStartedAt": "2020-01-23T17:57:52.103Z",
"startedAt": "2020-01-23T17:57:58.103Z",
"pullStoppedAt": "2020-01-23T17:57:55.103Z",
"updatedAt": "2020-01-23T17:57:58.103Z",
"taskArn": "arn:aws:ecs:us-west-2:111122223333:task/FargateCluster/c13b4cb40f1f4fe4a2971f76ae5a47ad",
"taskDefinitionArn": "arn:aws:ecs:us-west-2:111122223333:task-definition/sample-fargate:1",
"version": 4,
"platformVersion": "1.3.0"
}
}
cdk code
{
eventPattern: {
source: ['aws.ecs'],
detailType: ['ECS Task State Change'],
detail: {
clusterArn: [cluster.clusterArn],
attachments: [{ status: [{ prefix: 'DELETED' }] }] // this is not working
},
},
}
Edit: It *is* possible to filter on objects within arrays
detail: { "attachments": {"status": ["DELETED"] } }
EventBridge can match scalars in an array, but not arbitrary objects in an array:
docs: If the value in the event is an array, then the event pattern matches if the intersection of the event pattern array and the event array is non-empty.
That means EventBridge cannot match only "status": "DELETED". What are your options?
Base your pattern on a correlated non-array key-value pair, e.g. "lastStatus": "STOPPED".
Match all patterns. Add logic to the event target to ignore uninteresting patterns.
Note: because you say the array reliably has only one element, you can transform the event detail before it gets sent to the target. This does not help with the matching problem, but can make downstream filtering easier. Here is a CDK example for a Lambda target:
rule.addTarget(
new targets.LambdaFunction(func, {
event: events.RuleTargetInput.fromObject({
status: events.EventField.fromPath('$.detail.attachments[0].status'),
original: events.EventField.fromPath('$'),
}),
})
);
The Lambda receives the reshaped event detail:
{
"status": "ATTACHED",
"original": <the original event>
}

describe-task-definition not returning hostname value

I have the need to find the hostname of my ECS task via the CLI, which according to the Amazon documentation should be available via the CLI:
https://docs.aws.amazon.com/cli/latest/reference/ecs/describe-task-definition.html
However, when I run the describe-task-definition it is not returning the information:
> aws ecs describe-task-definition --task-definition my-test-task:1
{
"taskDefinition": {
"status": "ACTIVE",
"networkMode": "bridge",
"family": "my-test-task",
"placementConstraints": [],
"requiresAttributes": [
{
"name": "com.amazonaws.ecs.capability.ecr-auth"
},
{
"name": "com.amazonaws.ecs.capability.docker-remote-api.1.21"
}
],
"volumes": [
{
"host": {
"sourcePath": "/opt/cf/rails-app/public/"
},
"name": "ruby-on-rails-public-volume"
}
],
"taskDefinitionArn": "arn:aws:ecs:us-east-1:accountId:task-definition/my-test-task:1",
"containerDefinitions": [
{
"memoryReservation": 1024,
"environment": [
{
"name": "DATABASE_HOSTNAME",
"value": "hostname"
},
{
"name": "PUMA_WORKERS",
"value": "2"
},
{
"name": "RAILS_ENV",
"value": "staging"
},
{
"name": "DATABASE_NAME",
"value": "ruby-on-rails"
},
{
"name": "DEBIAN_FRONTEND",
"value": "noninteractive"
},
{
"name": "PORT",
"value": "8080"
},
{
"name": "LANG",
"value": "en_US.UTF-8"
},
{
"name": "DATABASE_PASSWORD",
"value": "cf"
},
{
"name": "DATABASE_USER",
"value": "cf"
},
{
"name": "PUMA_MAX_THREADS",
"value": "6"
}
],
"name": "my-test-task",
"mountPoints": [
{
"sourceVolume": "ruby-on-rails-public-volume",
"containerPath": "/opt/cf/rails-app/public/"
}
],
"image": "accountId.dkr.ecr.us-east-1.amazonaws.com/cf/rails:latest",
"cpu": 1024,
"portMappings": [
{
"protocol": "tcp",
"containerPort": 8080,
"hostPort": 8080
}
],
"command": [
"puma",
"-C",
"config/puma.rb"
],
"essential": true,
"volumesFrom": []
}
],
"revision": 1
}
}
I am not sure what I need to do to get that value included. I confirmed I am running the latest CLI.
Thanks!
First of all, the hostname is not defined in the task definition. It's defined in the container definition inside the task definition. Secondly, there is no default hostname; you have to explicitly define the hostname in the container definition when creating a task definition revision. By default, it uses the container ID as the hostname.

AWS Data Pipeline stuck on Waiting For Runner

My goal is to copy a table in a postgreSQL database running on AWS RDS to a .csv file on Amazone S3. For this I use AWS data pipeline and found the following tutorial however when I follow all steps my pipeline is stuck at: "WAITING FOR RUNNER" see screenshot. The AWS documentation states:
ensure that you set a valid value for either the runsOn or workerGroup
fields for those tasks
however the field "runs on" is set. Any idea why this pipeline is stuck?
and my definition file:
{
"objects": [
{
"output": {
"ref": "DataNodeId_Z8iDO"
},
"input": {
"ref": "DataNodeId_hEUzs"
},
"name": "DefaultCopyActivity01",
"runsOn": {
"ref": "ResourceId_oR8hY"
},
"id": "CopyActivityId_8zaDw",
"type": "CopyActivity"
},
{
"resourceRole": "DataPipelineDefaultResourceRole",
"role": "DataPipelineDefaultRole",
"name": "DefaultResource1",
"id": "ResourceId_oR8hY",
"type": "Ec2Resource",
"terminateAfter": "1 Hour"
},
{
"*password": "xxxxxxxxx",
"name": "DefaultDatabase1",
"id": "DatabaseId_BWxRr",
"type": "RdsDatabase",
"region": "eu-central-1",
"rdsInstanceId": "aqueduct30v05.cgpnumwmfcqc.eu-central-1.rds.amazonaws.com",
"username": "xxxx"
},
{
"name": "DefaultDataFormat1",
"id": "DataFormatId_wORsu",
"type": "CSV"
},
{
"database": {
"ref": "DatabaseId_BWxRr"
},
"name": "DefaultDataNode2",
"id": "DataNodeId_hEUzs",
"type": "SqlDataNode",
"table": "y2018m07d12_rh_ws_categorization_label_postgis_v01_v04",
"selectQuery": "SELECT * FROM y2018m07d12_rh_ws_categorization_label_postgis_v01_v04 LIMIT 100"
},
{
"failureAndRerunMode": "CASCADE",
"resourceRole": "DataPipelineDefaultResourceRole",
"role": "DataPipelineDefaultRole",
"pipelineLogUri": "s3://rutgerhofste-data-pipeline/logs",
"scheduleType": "ONDEMAND",
"name": "Default",
"id": "Default"
},
{
"dataFormat": {
"ref": "DataFormatId_wORsu"
},
"filePath": "s3://rutgerhofste-data-pipeline/test",
"name": "DefaultDataNode1",
"id": "DataNodeId_Z8iDO",
"type": "S3DataNode"
}
],
"parameters": []
}
Usually the "WAITING FOR RUNNER" state implies that the task is waiting for a resource (such as an EMR cluster). You seem to have not set the 'workerGroup' field either. It means that you have specified "what" to do, but have not specified "who" should do it.