Allowing ECS Task to read from Kinesis data stream

I'm deploying an app through ECS (with FARGATE as the capacity provider). My app needs to access a Kinesis stream (already existing and running). I can't figure out the exact IAM assume role policy I need to provide. I have the configuration below in Terraform (tags, log configuration, and proprietary names removed). Every time I deploy the task I receive an error that the task couldn't assume the role.
What am I missing?
resource "aws_ecs_cluster" "cluster" {
name = var.cluster_name
}
resource "aws_ecs_service" "service" {
name = var.service_name
cluster = aws_ecs_cluster.cluster.id
task_definition = aws_ecs_task_definition.task.arn
desired_count = var.task_count
launch_type = var.task_launch_type
load_balancer {
target_group_arn = var.alb_target
container_name = "container"
container_port = 3000
}
network_configuration {
subnets = [for subnet in var.subnets : "${subnet}"]
assign_public_ip = true
security_groups = [var.sg_id]
}
}
resource "aws_ecs_task_definition" "task" {
family = "task_family"
container_definitions = file( var.container_definitions_json )
requires_compatibilities = ["FARGATE"]
network_mode = "awsvpc"
memory = 1024
cpu = 512
execution_role_arn = "${aws_iam_role.ecsTaskExecutionRole.arn}"
task_role_arn = "${aws_iam_role.ecsTaskRole.arn}"
}
resource "aws_iam_role" "ecsTaskRole" {
name = "ecsTaskRole"
assume_role_policy = "${data.aws_iam_policy_document.ecsTaskRole.json}"
}
data "aws_caller_identity" "current" {}
data "aws_partition" "current" {}
data "aws_region" "current" {}
data "aws_iam_policy_document" "ecsTaskRole" {
statement {
effect = "Allow"
actions = ["sts:AssumeRole"]
principals {
type = "AWS"
identifiers = [
format("arn:%s:iam::%s:root", data.aws_partition.current.partition, data.aws_caller_identity.current.account_id)
]
}
}
}
resource "aws_iam_role" "ecsTaskExecutionRole" {
name = "ecsTaskExecutionRole"
assume_role_policy = "${data.aws_iam_policy_document.assume_role_policy.json}"
}
data "aws_iam_policy_document" "assume_role_policy" {
statement {
actions = ["sts:AssumeRole"]
principals {
type = "Service"
identifiers = ["ecs-tasks.amazonaws.com"]
}
}
}
resource "aws_iam_role_policy_attachment" "ecsTaskExecutionRole_policy" {
role = "${aws_iam_role.ecsTaskExecutionRole.name}"
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}

Both roles must have a trust policy that allows ecs-tasks.amazonaws.com to assume them.
See the ECS documentation on the task IAM role and on the task execution IAM role.
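For example, a trust policy the ECS tasks service can actually assume, plus read access to the stream for the task role, could look like the sketch below (var.kinesis_stream_arn and the policy names are placeholders, not part of the original configuration):

data "aws_iam_policy_document" "ecs_tasks_trust" {
  statement {
    effect  = "Allow"
    actions = ["sts:AssumeRole"]
    principals {
      type        = "Service"
      identifiers = ["ecs-tasks.amazonaws.com"]
    }
  }
}
resource "aws_iam_role" "ecsTaskRole" {
  name               = "ecsTaskRole"
  assume_role_policy = data.aws_iam_policy_document.ecs_tasks_trust.json
}
# Runtime permissions for the task itself: read from the Kinesis stream
data "aws_iam_policy_document" "kinesis_read" {
  statement {
    effect = "Allow"
    actions = [
      "kinesis:DescribeStream",
      "kinesis:DescribeStreamSummary",
      "kinesis:GetShardIterator",
      "kinesis:GetRecords",
      "kinesis:ListShards",
    ]
    resources = [var.kinesis_stream_arn] # placeholder for the existing stream's ARN
  }
}
resource "aws_iam_role_policy" "ecsTaskRole_kinesis_read" {
  name   = "kinesis-read"
  role   = aws_iam_role.ecsTaskRole.id
  policy = data.aws_iam_policy_document.kinesis_read.json
}

With the trust policy fixed, the execution role definition from the question (which already trusts ecs-tasks.amazonaws.com) can stay as it is.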

Related

How to figure out why health checks aren't passing in ECS Fargate with an ALB?

I'm quite new to DevOps and have been struggling to set up a test project for a couple of weeks now.
I have made a Terraform file which is supposed to set up most of the project:
# Get subnets
data "aws_subnets" "subnets" {
filter {
name = "vpc-id"
values = [var.vpc_id]
}
}
# Get security groups
data "aws_security_groups" "security_groups" {
filter {
name = "vpc-id"
values = [var.vpc_id]
}
}
resource "aws_s3_bucket" "lb_logs" {
bucket = "${var.app_name}-load-balancer-${var.env}-logs"
}
resource "aws_s3_bucket_server_side_encryption_configuration" "encryption" {
bucket = aws_s3_bucket.lb_logs.bucket
rule {
apply_server_side_encryption_by_default {
sse_algorithm = "AES256"
}
}
}
resource "aws_s3_bucket_versioning" "versioning" {
bucket = aws_s3_bucket.lb_logs.bucket
versioning_configuration {
status = "Enabled"
}
}
resource "aws_s3_bucket_acl" "acl" {
bucket = aws_s3_bucket.lb_logs.bucket
acl = "private"
}
data "aws_iam_policy_document" "lb_logs_s3_put_object" {
statement {
effect = "Allow"
principals {
type = "AWS"
identifiers = ["arn:aws:iam::156460612806:root"]
}
actions = ["s3:PutObject"]
resources = ["${aws_s3_bucket.lb_logs.arn}/*"]
}
}
resource "aws_s3_bucket_policy" "lb_logs_s3_put_object" {
bucket = aws_s3_bucket.lb_logs.id
policy = data.aws_iam_policy_document.lb_logs_s3_put_object.json
}
# Create load balancer
resource "aws_lb" "load_balancer" {
name = "${var.app_name}-load-balancer-${var.env}"
subnets = data.aws_subnets.subnets.ids
security_groups = data.aws_security_groups.security_groups.ids
load_balancer_type = "application"
access_logs {
bucket = aws_s3_bucket.lb_logs.bucket
enabled = true
}
tags = {
Environment = "${var.env}"
}
}
resource "aws_lb_target_group" "blue_target" {
name = "${var.app_name}-blue-target-${var.env}"
protocol = "HTTPS"
port = var.port
target_type = "ip"
vpc_id = var.vpc_id
health_check {
healthy_threshold = 5
interval = 30
matcher = 200
path = "${var.health_check_path}"
protocol = "HTTPS"
timeout = 10
unhealthy_threshold = 2
}
}
resource "aws_lb_target_group" "green_target" {
name = "${var.app_name}-green-target-${var.env}"
protocol = "HTTPS"
port = var.port
target_type = "ip"
vpc_id = var.vpc_id
health_check {
healthy_threshold = 5
interval = 30
matcher = 200
path = "${var.health_check_path}"
protocol = "HTTPS"
timeout = 10
unhealthy_threshold = 2
}
}
data "aws_acm_certificate" "cert" {
domain = var.domain
statuses = ["ISSUED"]
most_recent = true
}
resource "aws_lb_listener" "listener" {
load_balancer_arn = aws_lb.load_balancer.arn
port = var.port
protocol = "HTTPS"
ssl_policy = "ELBSecurityPolicy-2016-08"
certificate_arn = data.aws_acm_certificate.cert.arn
default_action {
type = "forward"
target_group_arn = aws_lb_target_group.blue_target.arn
}
}
# ECS
resource "aws_ecs_cluster" "cluster" {
name = "${var.app_name}-cluster-${var.env}"
}
data "aws_ecr_repository" "ecr_repository" {
name = var.image_repo_name
}
resource "aws_iam_role" "ecs_task_role" {
name = "EcsTaskRole"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = "sts:AssumeRole"
Effect = "Allow"
Sid = ""
Principal = {
Service = "ecs-tasks.amazonaws.com"
}
},
]
})
}
resource "aws_iam_policy" "secrets_manager_read_policy" {
name = "SecretsManagerRead"
description = "Read only access to secrets manager"
policy = jsonencode({
Version = "2012-10-17",
Statement = [
{
Sid = "",
Effect = "Allow",
Action = [
"secretsmanager:GetRandomPassword",
"secretsmanager:GetResourcePolicy",
"secretsmanager:GetSecretValue",
"secretsmanager:DescribeSecret",
"secretsmanager:ListSecretVersionIds",
"secretsmanager:ListSecrets"
],
Resource = "*"
}
]
})
}
resource "aws_iam_role_policy_attachment" "attach_secrets_manager_read_to_task_role" {
role = aws_iam_role.ecs_task_role.name
policy_arn = aws_iam_policy.secrets_manager_read_policy.arn
}
resource "aws_iam_role_policy_attachment" "attach_s3_read_to_task_role" {
role = aws_iam_role.ecs_task_role.name
policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
}
resource "aws_iam_role_policy_attachment" "attach_ses_to_task_role" {
role = aws_iam_role.ecs_task_role.name
policy_arn = "arn:aws:iam::aws:policy/AmazonSESFullAccess"
}
resource "aws_iam_role" "ecs_exec_role" {
name = "EcsExecRole"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = "sts:AssumeRole"
Effect = "Allow"
Sid = ""
Principal = {
Service = "ecs-tasks.amazonaws.com"
}
},
]
})
}
resource "aws_iam_policy" "log_groups_write_policy" {
name = "LogGroupsWrite"
description = "Read only access to secrets manager"
policy = jsonencode({
Version = "2012-10-17",
Statement = [
{
Sid = "",
Effect = "Allow",
Action = [
"logs:CreateLogGroup"
],
Resource = "*"
}
]
})
}
resource "aws_iam_role_policy_attachment" "attach_secrets_manager_read_to_exec_role" {
role = aws_iam_role.ecs_exec_role.name
policy_arn = aws_iam_policy.log_groups_write_policy.arn
}
resource "aws_iam_role_policy_attachment" "attach_ecs_task_exec_to_exec_role" {
role = aws_iam_role.ecs_exec_role.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}
resource "aws_iam_role_policy_attachment" "attach_fault_injection_simulator_to_exec_role" {
role = aws_iam_role.ecs_exec_role.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorECSAccess"
}
resource "aws_ecs_task_definition" "task_def" {
family = "${var.app_name}-task-def-${var.env}"
network_mode = "awsvpc"
task_role_arn = aws_iam_role.ecs_task_role.arn
execution_role_arn = aws_iam_role.ecs_exec_role.arn
requires_compatibilities = ["FARGATE"]
cpu = "256"
memory = "512"
runtime_platform {
cpu_architecture = "X86_64"
operating_system_family = "LINUX"
}
container_definitions = jsonencode([
{
name = "${var.app_name}-container-${var.env}"
image = "${data.aws_ecr_repository.ecr_repository.repository_url}:latest"
cpu = 0
essential = true
portMappings = [
{
containerPort = var.port
hostPort = var.port
},
]
environment = [
{
name = "PORT",
value = tostring("${var.port}")
},
{
name = "NODE_ENV",
value = var.env
}
]
logConfiguration = {
logDriver = "awslogs"
options = {
"awslogs-create-group" = "true"
"awslogs-group" = "${var.app_name}-task-def-${var.env}"
"awslogs-region" = "${var.region}"
"awslogs-stream-prefix" = "ecs"
}
}
},
])
}
resource "aws_ecs_service" "service" {
lifecycle {
ignore_changes = [
task_definition,
load_balancer,
]
}
cluster = aws_ecs_cluster.cluster.arn
name = "${var.app_name}-service-${var.env}"
task_definition = aws_ecs_task_definition.task_def.arn
load_balancer {
target_group_arn = aws_lb_target_group.blue_target.arn
container_name = "${var.app_name}-container-${var.env}"
container_port = var.port
}
capacity_provider_strategy {
capacity_provider = "FARGATE"
base = 0
weight = 1
}
scheduling_strategy = "REPLICA"
deployment_controller {
type = "CODE_DEPLOY"
}
platform_version = "1.4.0"
network_configuration {
assign_public_ip = true
subnets = data.aws_subnets.subnets.ids
security_groups = data.aws_security_groups.security_groups.ids
}
desired_count = 1
}
# DEPLOYMENT
resource "aws_codedeploy_app" "codedeploy_app" {
name = "${var.app_name}-application-${var.env}"
compute_platform = "ECS"
}
resource "aws_iam_role" "codedeploy_role" {
name = "CodedeployRole"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = "sts:AssumeRole"
Effect = "Allow"
Sid = ""
Principal = {
Service = "codedeploy.amazonaws.com"
}
},
]
})
}
resource "aws_iam_role_policy_attachment" "attach_codedeploy_role" {
role = aws_iam_role.codedeploy_role.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AWSCodeDeployRole"
}
resource "aws_iam_role_policy_attachment" "attach_codedeploy_role_for_ecs" {
role = aws_iam_role.codedeploy_role.name
policy_arn = "arn:aws:iam::aws:policy/AWSCodeDeployRoleForECS"
}
resource "aws_codedeploy_deployment_group" "deployment_group" {
app_name = aws_codedeploy_app.codedeploy_app.name
deployment_config_name = "CodeDeployDefault.ECSAllAtOnce"
auto_rollback_configuration {
enabled = true
events = ["DEPLOYMENT_FAILURE"]
}
blue_green_deployment_config {
deployment_ready_option {
action_on_timeout = "CONTINUE_DEPLOYMENT"
wait_time_in_minutes = 0
}
terminate_blue_instances_on_deployment_success {
action = "TERMINATE"
termination_wait_time_in_minutes = 5
}
}
deployment_group_name = "${var.app_name}-deployment-group-${var.env}"
deployment_style {
deployment_option = "WITH_TRAFFIC_CONTROL"
deployment_type = "BLUE_GREEN"
}
load_balancer_info {
target_group_pair_info {
prod_traffic_route {
listener_arns = [aws_lb_listener.listener.arn]
}
target_group {
name = aws_lb_target_group.blue_target.name
}
target_group {
name = aws_lb_target_group.green_target.name
}
}
}
service_role_arn = aws_iam_role.codedeploy_role.arn
ecs_service {
service_name = aws_ecs_service.service.name
cluster_name = aws_ecs_cluster.cluster.name
}
}
resource "aws_appautoscaling_target" "scalable_target" {
service_namespace = "ecs"
resource_id = "service/${aws_ecs_cluster.cluster.name}/${aws_ecs_service.service.name}"
scalable_dimension = "ecs:service:DesiredCount"
min_capacity = 1
max_capacity = 5
}
resource "aws_appautoscaling_policy" "cpu_scaling_policy" {
name = "${var.app_name}-cpu-scaling-policy-${var.env}"
service_namespace = "ecs"
resource_id = "service/${aws_ecs_cluster.cluster.name}/${aws_ecs_service.service.name}"
scalable_dimension = "ecs:service:DesiredCount"
policy_type = "TargetTrackingScaling"
target_tracking_scaling_policy_configuration {
target_value = 70
predefined_metric_specification {
predefined_metric_type = "ECSServiceAverageCPUUtilization"
}
scale_out_cooldown = 300
scale_in_cooldown = 300
disable_scale_in = false
}
}
resource "aws_appautoscaling_policy" "memory_scaling_policy" {
name = "${var.app_name}-memory-scaling-policy-${var.env}"
service_namespace = "ecs"
resource_id = "service/${aws_ecs_cluster.cluster.name}/${aws_ecs_service.service.name}"
scalable_dimension = "ecs:service:DesiredCount"
policy_type = "TargetTrackingScaling"
target_tracking_scaling_policy_configuration {
target_value = 70
predefined_metric_specification {
predefined_metric_type = "ECSServiceAverageMemoryUtilization"
}
scale_out_cooldown = 300
scale_in_cooldown = 300
disable_scale_in = false
}
}
I've created a project without HTTPS and a custom domain (I started small and built it up step by step, at first without auto-scaling, logging, and other fancy stuff). It works fine: health checks pass, I can connect, etc.
I've decided to create the exact same thing, just with HTTPS, and to assign a custom domain instead of calling the API through the ALB's DNS name.
The load balancer is constantly creating/destroying instances because health checks are failing.
I did some research and couldn't find a way to debug why this is happening. All I know from the container logs is that the containers start up fine, with no errors, but they are being terminated because health checks fail. I can't access any logs explaining why; I can only see that there are unhealthy targets.
Because everything is in a VPC, the containers don't have static IP addresses, and the setup uses HTTPS, everything from the load balancer down to the containers feels like a black box that's impossible to debug.
Unable to think of anything else, I set my security group to allow requests from all ports to check whether I could call the health check endpoint.
It turns out I can, but it returns 502. More detailed logs from the load balancer:
type: https
time: 2023-02-10T14:37:00.099726Z
elb: app/myapp-load-balancer-staging/c6aabdb240600ca8
client:port: myip:38255
target:port: targetip:3000
request_processing_time: -1
target_processing_time: -1
response_processing_time: -1
elb_status_code: 502
target_status_code: -
received_bytes: 360
sent_bytes: 277
request: "GET https://api.myapp.com:3000/rest/health HTTP/1.1"
user_agent: "PostmanRuntime/7.29.0"
ssl_cipher: <some text>-SHA256
ssl_protocol: TLSv1.2
target_group_arn: arn:aws:elasticloadbalancing:eu-west-1:myaccountnumber:targetgroup/myapp-blue-target-staging/id
trace_id: "Root=1-63e6568c-7c78be0f1e967e59370fbb80"
domain_name: "api.myapp.com"
chosen_cert_arn: "arn:aws:acm:eu-west-1:myaccountnumber:certificate/certid"
matched_rule_priority: 0
request_creation_time: 2023-02-10T14:37:00.096000Z
actions_executed: "forward"
redirect_url: "-"
error_reason: "-"
target:port_list: "172.31.2.112:3000"
target_status_code_list: "-"
classification: "Ambiguous"
classification_reason: "UndefinedContentLengthSemantics"
All I could find is this guide on the topic, but it just explains the problem and doesn't show a solution.
Help spotting what I'm doing wrong would be great, but I'd really appreciate a guide on how to debug these things between the load balancer and the containers, since they are locked down so tightly with VPCs and everything that even admins cannot access them.
This is because you are using var.port for all of the port settings: the load balancer listener, the target group traffic port, and the container port. You have also configured the target groups to use the HTTPS protocol. However, SSL traffic is terminated at the load balancer. Only the load balancer has an SSL certificate, so only the load balancer can handle HTTPS traffic; the traffic from the load balancer to the container is still plain HTTP.
You need to separate your port and protocol settings so that only the load balancer listener uses port 443/HTTPS. The target groups and container should keep using HTTP, just as they did before you enabled SSL.
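A rough sketch of that change, reusing the names from the question (the green target group would get the same HTTP settings):

resource "aws_lb_target_group" "blue_target" {
  name        = "${var.app_name}-blue-target-${var.env}"
  protocol    = "HTTP" # plain HTTP between the ALB and the tasks
  port        = var.port
  target_type = "ip"
  vpc_id      = var.vpc_id
  health_check {
    healthy_threshold   = 5
    interval            = 30
    matcher             = "200"
    path                = var.health_check_path
    protocol            = "HTTP" # health checks also go over HTTP
    timeout             = 10
    unhealthy_threshold = 2
  }
}
resource "aws_lb_listener" "listener" {
  load_balancer_arn = aws_lb.load_balancer.arn
  port              = 443 # TLS terminates at the listener
  protocol          = "HTTPS"
  ssl_policy        = "ELBSecurityPolicy-2016-08"
  certificate_arn   = data.aws_acm_certificate.cert.arn
  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.blue_target.arn
  }
}

The containers keep listening on var.port over plain HTTP, so the container port mappings don't need to change.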

How to solve the error in my code? (Terraform ECS EC2 launch type)

=task-definition=
resource "aws_ecs_task_definition" "task_definition" {
family = "task-defi"
container_definitions = data.template_file.task_definition_json.rendered
requires_compatibilities = ["EC2"]
}
=task_definition_json=
=ecs-service=
resource "aws_ecs_service" "service" {
cluster = aws_ecs_cluster.cluster.id
desired_count = 1
launch_type = "EC2"
name = "service"
task_definition = aws_ecs_task_definition.task_definition.arn
load_balancer {
target_group_arn = module.alb.default_alb_target_group
container_name = "task-defi"
container_port = 8880
}
depends_on = [module.alb.alb_listner]
}
=auto scaling=
resource "aws_appautoscaling_target" "ecs_target" {
max_capacity = 10
min_capacity = 2
resource_id = "service/${aws_ecs_cluster.cluster.name}/${aws_ecs_service.service.name}"
scalable_dimension = "ecs:service:DesiredCount"
service_namespace = "ecs"
}
Error message:
=> service service was unable to place a task because no container instance met all of its requirements. Reason: No Container Instances were found in your cluster. For more information, see the Troubleshooting section of the Amazon ECS Developer Guide.
I can't figure out how to solve this problem.
Is there any Terraform resource code I need to add?
Please tell me how I can solve this problem.
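The error means the cluster has no EC2 container instances registered, so the EC2 launch type has nothing to place the task on. A minimal sketch of one way to add that capacity (the instance type and the instance-profile and subnet variables below are assumptions, not taken from the question):

# ECS-optimized Amazon Linux 2 AMI for the container instances
data "aws_ssm_parameter" "ecs_ami" {
  name = "/aws/service/ecs/optimized-ami/amazon-linux-2/recommended/image_id"
}
resource "aws_launch_template" "ecs_nodes" {
  name_prefix   = "ecs-nodes-"
  image_id      = data.aws_ssm_parameter.ecs_ami.value
  instance_type = "t3.small" # assumption
  iam_instance_profile {
    # the profile's role needs the AmazonEC2ContainerServiceforEC2Role managed policy
    name = var.ecs_instance_profile_name # assumption
  }
  # register the instance with the cluster on boot
  user_data = base64encode(<<-EOF
    #!/bin/bash
    echo ECS_CLUSTER=${aws_ecs_cluster.cluster.name} >> /etc/ecs/ecs.config
  EOF
  )
}
resource "aws_autoscaling_group" "ecs_nodes" {
  name_prefix         = "ecs-nodes-"
  min_size            = 1
  max_size            = 2
  desired_capacity    = 1
  vpc_zone_identifier = var.subnet_ids # assumption
  launch_template {
    id      = aws_launch_template.ecs_nodes.id
    version = "$Latest"
  }
}

Once instances from this group register with the cluster, the service should be able to place its task.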

Node Status "NotReady" After "terraform apply" command

I created an AWS EKS cluster and a managed node group with Terraform, and it works fine. But whenever I create a namespace and run "terraform apply", my nodes' status becomes "NotReady". If I delete the node groups and run "terraform apply" again, it succeeds.
Does anyone have an idea about this problem?
Thanks.
resource "aws_iam_role" "nodes_general" {
name = "eks-node-group-general"
assume_role_policy = file("./policies/node-groups/node-group-policy.json")
}
resource "aws_iam_role_policy_attachment" "amazon_eks_cni_policy_general" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
role = aws_iam_role.nodes_general.name
}
resource "aws_iam_role_policy_attachment" "amazon_eks_cluster_policy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
role = aws_iam_role.nodes_general.name
}
resource "aws_iam_role_policy_attachment" "amazon_eks_worker_node_policy_general" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
role = aws_iam_role.nodes_general.name
}
resource "aws_iam_role_policy_attachment" "amazon_ec2_container_registry_read_only" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
role = aws_iam_role.nodes_general.name
}
resource "aws_eks_node_group" "nodes_general" {
cluster_name = var.cluster_name
node_group_name = "nodes-general"
node_role_arn = aws_iam_role.nodes_general.arn
subnet_ids = ["subnet_id_1", "subnet_id_2", "subnet_id_3"]
lifecycle {
create_before_destroy = true
}
scaling_config {
desired_size = 2
max_size = 5
min_size = 1
}
update_config {
max_unavailable = 1
}
ami_type = "AL2_x86_64"
capacity_type = "ON_DEMAND"
disk_size = 50 # should be 100GB
force_update_version = false
instance_types = ["t3.small"]
labels = {
role = "nodes-general"
}
version = "1.20"
depends_on = [
aws_iam_role_policy_attachment.amazon_eks_worker_node_policy_general,
aws_iam_role_policy_attachment.amazon_eks_cni_policy_general,
aws_iam_role_policy_attachment.amazon_eks_cluster_policy,
aws_iam_role_policy_attachment.amazon_ec2_container_registry_read_only
]
}

Call an SSM document from terraform

I was wondering if anyone could help with this issue? I'm trying to call an SSM document using Terraform to stop an EC2 instance, but it doesn't seem to work. I keep getting the error:
Automation Step Execution fails when it is changing the state of each instance. Get Exception from StopInstances API of ec2 Service. Exception Message from StopInstances API: [You are not authorized to perform this operation.
Any suggestions here?
As you can see, the right roles are there; I pass the role in as a parameter.
provider "aws" {
profile = "profile"
region = "eu-west-1"
}
data "aws_ssm_document" "stop_ec2_doc" {
name = "AWS-StopEC2Instance"
document_format = "JSON"
}
data "aws_iam_policy_document" "assume_role" {
version = "2012-10-17"
statement {
sid = "EC2AssumeRole"
effect = "Allow"
actions = ["sts:AssumeRole"]
principals {
identifiers = ["ec2.amazonaws.com"]
type = "Service"
}
principals {
identifiers = ["ssm.amazonaws.com"]
type = "Service"
}
}
}
data "aws_ami" "latest_amazon_2" {
most_recent = true
owners = ["amazon"]
name_regex = "^amzn2-ami-hvm-.*x86_64-gp2"
}
#
resource "aws_iam_role" "iam_assume_role" {
name = "iam_assume_role"
assume_role_policy = data.aws_iam_policy_document.assume_role.json
}
#
resource "aws_iam_role_policy_attachment" "role_1" {
role = aws_iam_role.iam_assume_role.name
policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
}
# the instance profile
resource "aws_iam_instance_profile" "iam_instance_profile" {
name = "iam_instance_profile"
role = aws_iam_role.iam_assume_role.name
}
# amazon ec2 instances
resource "aws_instance" "ec2_instances" {
count = 2
ami = data.aws_ami.latest_amazon_2.id
instance_type = "t2.micro"
subnet_id = "subnet-12345678901"
iam_instance_profile = aws_iam_instance_profile.iam_instance_profile.name
root_block_device {
volume_size = 8
volume_type = "gp2"
delete_on_termination = true
}
}
resource "aws_ssm_association" "example" {
name = data.aws_ssm_document.stop_ec2_doc.name
parameters = {
AutomationAssumeRole = "arn:aws:iam::12345678901:role/aws-service-role/ssm.amazonaws.com/AWSServiceRoleForAmazonSSM"
InstanceId = aws_instance.ec2_instances[0].id
}
}
Any suggestion is welcome. I tried to write a simple Terraform example to illustrate what I'm trying to do, and to me it should be straightforward.
I create the role, I create the instance profile, and I create the association, passing the proper role and the instance ID.
The AWSServiceRoleForAmazonSSM role does not have permission to stop instances. Instead, you should create a new role for SSM with those permissions. The simplest way is as follows:
resource "aws_iam_role" "ssm_role" {
name = "ssm_role"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = "sts:AssumeRole"
Effect = "Allow"
Sid = ""
Principal = {
Service = "ssm.amazonaws.com"
}
},
]
})
}
resource "aws_iam_role_policy_attachment" "ec2-attach" {
role = aws_iam_role.ssm_role.name
policy_arn = "arn:aws:iam::aws:policy/AmazonEC2FullAccess"
}
resource "aws_ssm_association" "example" {
name = data.aws_ssm_document.stop_ec2_doc.name
parameters = {
AutomationAssumeRole = aws_iam_role.ssm_role.arn
InstanceId = aws_instance.ec2_instances[0].id
}
}
AmazonEC2FullAccess is far too permissive if all you need is to stop instances, but I use it here as a working example.
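If you want to scope that down, a sketch of an inline policy limited to what the stop-instances automation needs (the policy name is arbitrary):

resource "aws_iam_role_policy" "ssm_stop_instances" {
  name = "ssm-stop-instances"
  role = aws_iam_role.ssm_role.id
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect   = "Allow"
        Action   = ["ec2:StopInstances", "ec2:DescribeInstances"]
        Resource = "*" # ec2:StopInstances could be limited to specific instance ARNs in its own statement
      }
    ]
  })
}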

AWS - FailedInvocations with EC2 Instance

I'm currently trying to create a cron job using AWS Fargate and CloudWatch Events, but it results in FailedInvocations and I don't know why.
First, I've set up the variables:
# The AWS region
variable "region" {
type = string
default = "eu-central-1"
}
# The application's name
variable "app" {
type = string
default = "fargate"
}
# The environment that is being built
variable "environment" {
type = string
default = "dev"
}
# The expression for the CloudWatch event
variable "schedule_expression" {
type = string
default = "rate(1 minute)"
}
# The tag mutability setting for the repository (defaults to MUTABLE)
variable "image_tag_mutability" {
type = string
default = "MUTABLE"
}
variable "availability_zones" {
type = list(string)
default = [
"eu-central-1a",
"eu-central-1b"
]
}
variable "task_cpu" {
type = string
default = "256"
}
variable "task_memory" {
type = string
default = "512"
}
variable "saml_users" {
type = list(string)
default = []
}
# locals
locals {
namespace = "${var.app}-${var.environment}"
log_group = "/fargate/task/${local.namespace}"
}
Then I've created a user role that works with my SAML account:
# The user role policy document with SAML identification
data "aws_iam_policy_document" "developers" {
statement {
effect = "Allow"
actions = ["sts:AssumeRoleWithSAML"]
principals {
type = "Federated"
identifiers = ["arn:aws:iam::${data.aws_caller_identity.current.account_id}:saml-provider/simplesaml"]
}
condition {
test = "StringEquals"
values = ["https://signin.aws.amazon.com/saml"]
variable = "SAML:aud"
}
}
}
# Create a user role
resource "aws_iam_role" "developers" {
name = "developers"
description = "The user role with SAML identification"
max_session_duration = "43200"
assume_role_policy = data.aws_iam_policy_document.developers.json
}
data "aws_iam_policy" "ReadOnlyAccess" {
arn = "arn:aws:iam::aws:policy/ReadOnlyAccess"
}
resource "aws_iam_role_policy_attachment" "developers_ReadOnlyAccess" {
policy_arn = data.aws_iam_policy.ReadOnlyAccess.arn
role = aws_iam_role.developers.name
}
Also an ECR repository:
# Create an ECR repo at the app/image level
resource "aws_ecr_repository" "app" {
name = var.app
image_tag_mutability = var.image_tag_mutability
}
# Grant access to saml users
resource "aws_ecr_repository_policy" "app" {
repository = aws_ecr_repository.app.name
policy = data.aws_iam_policy_document.ecr.json
}
# The ECR policies for saml users
data "aws_iam_policy_document" "ecr" {
statement {
actions = [
"ecr:GetDownloadUrlForLayer",
"ecr:BatchGetImage",
"ecr:BatchCheckLayerAvailability",
"ecr:PutImage",
"ecr:InitiateLayerUpload",
"ecr:UploadLayerPart",
"ecr:CompleteLayerUpload",
"ecr:DescribeRepositories",
"ecr:GetRepositoryPolicy",
"ecr:ListImages",
"ecr:DescribeImages",
"ecr:DeleteRepository",
"ecr:BatchDeleteImage",
"ecr:SetRepositoryPolicy",
"ecr:DeleteRepositoryPolicy",
"ecr:GetLifecyclePolicy",
"ecr:PutLifecyclePolicy",
"ecr:DeleteLifecyclePolicy",
"ecr:GetLifecyclePolicyPreview",
"ecr:StartLifecyclePolicyPreview",
]
principals {
type = "AWS"
# Add permission for every saml user since assumed roles can't be wildcard
identifiers = [
for saml_user in var.saml_users:
"arn:aws:sts::${data.aws_caller_identity.current.account_id}:assumed-role/${aws_iam_role.developers.name}/${saml_user}"
]
}
}
}
# Returns the URL of the ECR repository; this will be used later in various scripts
output "docker_registry" {
value = aws_ecr_repository.app.repository_url
}
Now the VPC, which is required:
resource "aws_vpc" "main" {
cidr_block = "10.10.0.0/16"
}
# Create private subnets, each in a given AZ
resource "aws_subnet" "private" {
count = length(var.availability_zones)
cidr_block = cidrsubnet(aws_vpc.main.cidr_block, 8, count.index)
availability_zone = var.availability_zones[count.index]
vpc_id = aws_vpc.main.id
}
# Create public subnets, each in a given AZ
resource "aws_subnet" "public" {
count = length(var.availability_zones)
cidr_block = cidrsubnet(aws_vpc.main.cidr_block, 8, length(var.availability_zones) + count.index)
availability_zone = var.availability_zones[count.index]
vpc_id = aws_vpc.main.id
map_public_ip_on_launch = true
}
# IGW for the public subnet
resource "aws_internet_gateway" "gw" {
vpc_id = aws_vpc.main.id
}
# Route the public subnet traffic through the IGW
resource "aws_route" "internet_access" {
route_table_id = aws_vpc.main.main_route_table_id
destination_cidr_block = "0.0.0.0/0"
gateway_id = aws_internet_gateway.gw.id
}
# Create a NAT gateway with an EIP for each private subnet to get internet connectivity
resource "aws_eip" "gw" {
count = length(var.availability_zones)
vpc = true
depends_on = [aws_internet_gateway.gw]
}
resource "aws_nat_gateway" "gw" {
count = length(var.availability_zones)
subnet_id = element(aws_subnet.public.*.id, count.index)
allocation_id = element(aws_eip.gw.*.id, count.index)
}
# Create a new route table for the private subnets
# And make it route non-local traffic through the NAT gateway to the internet
resource "aws_route_table" "private" {
count = length(var.availability_zones)
vpc_id = aws_vpc.main.id
route {
cidr_block = "0.0.0.0/0"
nat_gateway_id = element(aws_nat_gateway.gw.*.id, count.index)
}
}
# Explicitly associate the newly created route tables with the private subnets (so they don't default to the main route table)
resource "aws_route_table_association" "private" {
count = length(var.availability_zones)
subnet_id = element(aws_subnet.private.*.id, count.index)
route_table_id = element(aws_route_table.private.*.id, count.index)
}
resource "aws_security_group" "sg" {
name = local.namespace
description = "Default security group"
vpc_id = aws_vpc.main.id
}
# Allows task to establish connections to all resources
resource "aws_security_group_rule" "ecs_task_egress_rule" {
description = "Allows task to establish connections to all resources"
type = "egress"
from_port = "0"
to_port = "0"
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
security_group_id = aws_security_group.sg.id
}
Finally, the ECS cluster and the CloudWatch Events resources:
resource "aws_ecs_cluster" "cluster" {
name = "cluster"
}
resource "aws_ecs_task_definition" "cron" {
family = "cron"
network_mode = "awsvpc"
requires_compatibilities = ["FARGATE"]
execution_role_arn = aws_iam_role.ecs-tasks.arn
cpu = var.task_cpu
memory = var.task_memory
container_definitions = <<DEFINITION
[
{
"image": "${aws_ecr_repository.app.repository_url}",
"name": "app",
"cpu": ${var.task_cpu},
"memory": ${var.task_memory},
"networkMode": "awsvpc",
"portMappings": []
}
]
DEFINITION
}
resource "aws_ecs_service" "service" {
name = "service"
cluster = aws_ecs_cluster.cluster.id
task_definition = aws_ecs_task_definition.cron.arn
desired_count = 0
launch_type = "FARGATE"
network_configuration {
security_groups = [aws_security_group.sg.id]
subnets = [
for subnet in aws_subnet.private:
subnet.id
]
}
}
# Allow task execution role to be assumed by ecs
data "aws_iam_policy_document" "ecs-tasks_assume_role_policy" {
statement {
actions = ["sts:AssumeRole"]
principals {
type = "Service"
identifiers = ["ecs-tasks.amazonaws.com"]
}
}
}
# ECS Tasks role
resource "aws_iam_role" "ecs-tasks" {
name = "${local.namespace}-ecs"
assume_role_policy = data.aws_iam_policy_document.ecs-tasks_assume_role_policy.json
}
# Allow task execution role to work with ecr and cw logs
resource "aws_iam_role_policy_attachment" "ecs-tasks_AmazonECSTaskExecutionRolePolicy" {
role = aws_iam_role.ecs-tasks.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}
# Events execution role
resource "aws_iam_role" "events" {
name = "${local.namespace}-events"
assume_role_policy = data.aws_iam_policy_document.events_assume_role_policy.json
}
# Allow events role to be assumed by events service
data "aws_iam_policy_document" "events_assume_role_policy" {
statement {
actions = ["sts:AssumeRole"]
principals {
type = "Service"
identifiers = ["events.amazonaws.com"]
}
}
}
# Setup a scheduled task
resource "aws_cloudwatch_event_rule" "scheduled_task" {
is_enabled = true
name = local.namespace
description = "Runs fargate task ${local.namespace}: ${var.schedule_expression}"
schedule_expression = var.schedule_expression
}
# Setup the target for the scheduled task
resource "aws_cloudwatch_event_target" "scheduled_task" {
rule = aws_cloudwatch_event_rule.scheduled_task.name
target_id = local.namespace
arn = aws_ecs_cluster.cluster.arn
role_arn = aws_iam_role.events.arn
ecs_target {
task_count = 1
task_definition_arn = aws_ecs_task_definition.cron.arn
}
}
The only way I've found to launch the service every minute is to set desired_count to 1, but I want to set up a 5-minute cron.
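Scheduled Fargate tasks also generally need the event target to declare the Fargate launch type and a network configuration, and the events role needs permission to run the task and pass its execution role; neither appears above. A sketch of both, reusing the resource names from the question (the policy name and the wildcard resource for ecs:RunTask are assumptions):

resource "aws_cloudwatch_event_target" "scheduled_task" {
  rule      = aws_cloudwatch_event_rule.scheduled_task.name
  target_id = local.namespace
  arn       = aws_ecs_cluster.cluster.arn
  role_arn  = aws_iam_role.events.arn
  ecs_target {
    task_count          = 1
    task_definition_arn = aws_ecs_task_definition.cron.arn
    launch_type         = "FARGATE"
    platform_version    = "LATEST"
    network_configuration {
      subnets         = aws_subnet.private[*].id
      security_groups = [aws_security_group.sg.id]
    }
  }
}
# Let the events role actually start the task
resource "aws_iam_role_policy" "events_run_task" {
  name = "${local.namespace}-events-run-task"
  role = aws_iam_role.events.id
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect   = "Allow"
        Action   = "ecs:RunTask"
        Resource = "*" # could be narrowed to the task definition ARN
      },
      {
        Effect   = "Allow"
        Action   = "iam:PassRole"
        Resource = aws_iam_role.ecs-tasks.arn
      }
    ]
  })
}

With that in place, a 5-minute schedule is just schedule_expression = "rate(5 minutes)" on the event rule, and the service's desired_count can stay at 0.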