AWS - FailedInvocations with EC2 Instance

I'm currently trying to create a cron job using AWS Fargate and CloudWatch Events, but it results in FailedInvocations and I don't know why.
First, I've set up the variables:
# The AWS region
variable "region" {
type = string
default = "eu-central-1"
}
# The application's name
variable "app" {
type = string
default = "fargate"
}
# The environment that is being built
variable "environment" {
type = string
default = "dev"
}
# The expression for the CloudWatch event
variable "schedule_expression" {
type = string
default = "rate(1 minute)"
}
# The tag mutability setting for the repository (defaults to MUTABLE)
variable "image_tag_mutability" {
type = string
default = "MUTABLE"
}
variable "availability_zones" {
type = list(string)
default = [
"eu-central-1a",
"eu-central-1b"
]
}
variable "task_cpu" {
type = string
default = "256"
}
variable "task_memory" {
type = string
default = "512"
}
variable "saml_users" {
type = list(string)
default = []
}
# locals
locals {
namespace = "${var.app}-${var.environment}"
log_group = "/fargate/task/${local.namespace}"
}
Then, I've created a user role that works with my SAML account:
# The user role policy document with SAML identification
data "aws_iam_policy_document" "developers" {
statement {
effect = "Allow"
actions = ["sts:AssumeRoleWithSAML"]
principals {
type = "Federated"
identifiers = ["arn:aws:iam::${data.aws_caller_identity.current.account_id}:saml-provider/simplesaml"]
}
condition {
test = "StringEquals"
values = ["https://signin.aws.amazon.com/saml"]
variable = "SAML:aud"
}
}
}
# Create a user role
resource "aws_iam_role" "developers" {
name = "developers"
description = "The user role with SAML identification"
max_session_duration = "43200"
assume_role_policy = data.aws_iam_policy_document.developers.json
}
data "aws_iam_policy" "ReadOnlyAccess" {
arn = "arn:aws:iam::aws:policy/ReadOnlyAccess"
}
resource "aws_iam_role_policy_attachment" "developers_ReadOnlyAccess" {
policy_arn = data.aws_iam_policy.ReadOnlyAccess.arn
role = aws_iam_role.developers.name
}
Also an ECR repository:
# Create an ECR repo at the app/image level
resource "aws_ecr_repository" "app" {
name = var.app
image_tag_mutability = var.image_tag_mutability
}
# Grant access to saml users
resource "aws_ecr_repository_policy" "app" {
repository = aws_ecr_repository.app.name
policy = data.aws_iam_policy_document.ecr.json
}
# The ECR policies for saml users
data "aws_iam_policy_document" "ecr" {
statement {
actions = [
"ecr:GetDownloadUrlForLayer",
"ecr:BatchGetImage",
"ecr:BatchCheckLayerAvailability",
"ecr:PutImage",
"ecr:InitiateLayerUpload",
"ecr:UploadLayerPart",
"ecr:CompleteLayerUpload",
"ecr:DescribeRepositories",
"ecr:GetRepositoryPolicy",
"ecr:ListImages",
"ecr:DescribeImages",
"ecr:DeleteRepository",
"ecr:BatchDeleteImage",
"ecr:SetRepositoryPolicy",
"ecr:DeleteRepositoryPolicy",
"ecr:GetLifecyclePolicy",
"ecr:PutLifecyclePolicy",
"ecr:DeleteLifecyclePolicy",
"ecr:GetLifecyclePolicyPreview",
"ecr:StartLifecyclePolicyPreview",
]
principals {
type = "AWS"
# Add permission for every saml user since assumed roles can't be wildcard
identifiers = [
for saml_user in var.saml_users:
"arn:aws:sts::${data.aws_caller_identity.current.account_id}:assumed-role/${aws_iam_role.developers.name}/${saml_user}"
]
}
}
}
# Returns the URL of the ECR repository; this will be used later in various scripts
output "docker_registry" {
value = aws_ecr_repository.app.repository_url
}
Now the VPC, which is required:
resource "aws_vpc" "main" {
cidr_block = "10.10.0.0/16"
}
# Create private subnets, each in a given AZ
resource "aws_subnet" "private" {
count = length(var.availability_zones)
cidr_block = cidrsubnet(aws_vpc.main.cidr_block, 8, count.index)
availability_zone = var.availability_zones[count.index]
vpc_id = aws_vpc.main.id
}
# Create public subnets, each in a given AZ
resource "aws_subnet" "public" {
count = length(var.availability_zones)
cidr_block = cidrsubnet(aws_vpc.main.cidr_block, 8, length(var.availability_zones) + count.index)
availability_zone = var.availability_zones[count.index]
vpc_id = aws_vpc.main.id
map_public_ip_on_launch = true
}
# IGW for the public subnet
resource "aws_internet_gateway" "gw" {
vpc_id = aws_vpc.main.id
}
# Route the public subnet traffic through the IGW
resource "aws_route" "internet_access" {
route_table_id = aws_vpc.main.main_route_table_id
destination_cidr_block = "0.0.0.0/0"
gateway_id = aws_internet_gateway.gw.id
}
# Create a NAT gateway with an EIP for each private subnet to get internet connectivity
resource "aws_eip" "gw" {
count = length(var.availability_zones)
vpc = true
depends_on = [aws_internet_gateway.gw]
}
resource "aws_nat_gateway" "gw" {
count = length(var.availability_zones)
subnet_id = element(aws_subnet.public.*.id, count.index)
allocation_id = element(aws_eip.gw.*.id, count.index)
}
# Create a new route table for the private subnets
# And make it route non-local traffic through the NAT gateway to the internet
resource "aws_route_table" "private" {
count = length(var.availability_zones)
vpc_id = aws_vpc.main.id
route {
cidr_block = "0.0.0.0/0"
nat_gateway_id = element(aws_nat_gateway.gw.*.id, count.index)
}
}
# Explicitly associate the newly created route tables to the private subnets (so they don't default to the main route table)
resource "aws_route_table_association" "private" {
count = length(var.availability_zones)
subnet_id = element(aws_subnet.private.*.id, count.index)
route_table_id = element(aws_route_table.private.*.id, count.index)
}
resource "aws_security_group" "sg" {
name = local.namespace
description = "Default security group"
vpc_id = aws_vpc.main.id
}
# Allows task to establish connections to all resources
resource "aws_security_group_rule" "ecs_task_egress_rule" {
description = "Allows task to establish connections to all resources"
type = "egress"
from_port = "0"
to_port = "0"
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
security_group_id = aws_security_group.sg.id
}
Finally, the ECS cluster and the CloudWatch Events rule:
resource "aws_ecs_cluster" "cluster" {
name = "cluster"
}
resource "aws_ecs_task_definition" "cron" {
family = "cron"
network_mode = "awsvpc"
requires_compatibilities = ["FARGATE"]
execution_role_arn = aws_iam_role.ecs-tasks.arn
cpu = var.task_cpu
memory = var.task_memory
container_definitions = <<DEFINITION
[
{
"image": "${aws_ecr_repository.app.repository_url}",
"name": "app",
"cpu": ${var.task_cpu},
"memory": ${var.task_memory},
"networkMode": "awsvpc",
"portMappings": []
}
]
DEFINITION
}
resource "aws_ecs_service" "service" {
name = "service"
cluster = aws_ecs_cluster.cluster.id
task_definition = aws_ecs_task_definition.cron.arn
desired_count = 0
launch_type = "FARGATE"
network_configuration {
security_groups = [aws_security_group.sg.id]
subnets = [
for subnet in aws_subnet.private:
subnet.id
]
}
}
# Allow task execution role to be assumed by ecs
data "aws_iam_policy_document" "ecs-tasks_assume_role_policy" {
statement {
actions = ["sts:AssumeRole"]
principals {
type = "Service"
identifiers = ["ecs-tasks.amazonaws.com"]
}
}
}
# ECS Tasks role
resource "aws_iam_role" "ecs-tasks" {
name = "${local.namespace}-ecs"
assume_role_policy = data.aws_iam_policy_document.ecs-tasks_assume_role_policy.json
}
# Allow task execution role to work with ecr and cw logs
resource "aws_iam_role_policy_attachment" "ecs-tasks_AmazonECSTaskExecutionRolePolicy" {
role = aws_iam_role.ecs-tasks.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}
# Events execution role
resource "aws_iam_role" "events" {
name = "${local.namespace}-events"
assume_role_policy = data.aws_iam_policy_document.events_assume_role_policy.json
}
# Allow events role to be assumed by events service
data "aws_iam_policy_document" "events_assume_role_policy" {
statement {
actions = ["sts:AssumeRole"]
principals {
type = "Service"
identifiers = ["events.amazonaws.com"]
}
}
}
# Setup a scheduled task
resource "aws_cloudwatch_event_rule" "scheduled_task" {
is_enabled = true
name = local.namespace
description = "Runs fargate task ${local.namespace}: ${var.schedule_expression}"
schedule_expression = var.schedule_expression
}
# Setup the target for the scheduled task
resource "aws_cloudwatch_event_target" "scheduled_task" {
rule = aws_cloudwatch_event_rule.scheduled_task.name
target_id = local.namespace
arn = aws_ecs_cluster.cluster.arn
role_arn = aws_iam_role.events.arn
ecs_target {
task_count = 1
task_definition_arn = aws_ecs_task_definition.cron.arn
}
}
The only way I can get the task to launch every minute is to set desired_count to 1 on the service, but I want to set up a 5-minute cron.
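For what it's worth, a common cause of FailedInvocations with Fargate scheduled tasks is an event target that lacks Fargate launch settings and networking, combined with an events role that isn't allowed to run the task. A minimal sketch of those two pieces, reusing the names from the configuration above (not a confirmed fix, just the usual suspects):
resource "aws_cloudwatch_event_target" "scheduled_task" {
  rule      = aws_cloudwatch_event_rule.scheduled_task.name
  target_id = local.namespace
  arn       = aws_ecs_cluster.cluster.arn
  role_arn  = aws_iam_role.events.arn
  ecs_target {
    task_count          = 1
    task_definition_arn = aws_ecs_task_definition.cron.arn
    launch_type         = "FARGATE"   # without this the target tries to launch on EC2
    platform_version    = "LATEST"
    network_configuration {
      # awsvpc tasks need subnets and security groups at run time
      subnets         = [for subnet in aws_subnet.private : subnet.id]
      security_groups = [aws_security_group.sg.id]
    }
  }
}
# The events role also needs permission to run the task and to pass its roles
resource "aws_iam_role_policy" "events_run_task" {
  name = "${local.namespace}-events-run-task"
  role = aws_iam_role.events.id
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect   = "Allow"
        Action   = "ecs:RunTask"
        Resource = aws_ecs_task_definition.cron.arn
      },
      {
        Effect   = "Allow"
        Action   = "iam:PassRole"
        Resource = aws_iam_role.ecs-tasks.arn
      }
    ]
  })
}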

How to figure out why health checks aren't passing in ECS Fargate with ALB?

I'm quite new to DevOps and have been struggling to set up a test project for a couple of weeks now.
I have written a Terraform file that is supposed to set up most of the project:
# Get subnets
data "aws_subnets" "subnets" {
filter {
name = "vpc-id"
values = [var.vpc_id]
}
}
# Get security groups
data "aws_security_groups" "security_groups" {
filter {
name = "vpc-id"
values = [var.vpc_id]
}
}
resource "aws_s3_bucket" "lb_logs" {
bucket = "${var.app_name}-load-balancer-${var.env}-logs"
}
resource "aws_s3_bucket_server_side_encryption_configuration" "encryption" {
bucket = aws_s3_bucket.lb_logs.bucket
rule {
apply_server_side_encryption_by_default {
sse_algorithm = "AES256"
}
}
}
resource "aws_s3_bucket_versioning" "versioning" {
bucket = aws_s3_bucket.lb_logs.bucket
versioning_configuration {
status = "Enabled"
}
}
resource "aws_s3_bucket_acl" "acl" {
bucket = aws_s3_bucket.lb_logs.bucket
acl = "private"
}
data "aws_iam_policy_document" "lb_logs_s3_put_object" {
statement {
effect = "Allow"
principals {
type = "AWS"
identifiers = ["arn:aws:iam::156460612806:root"]
}
actions = ["s3:PutObject"]
resources = ["${aws_s3_bucket.lb_logs.arn}/*"]
}
}
resource "aws_s3_bucket_policy" "lb_logs_s3_put_object" {
bucket = aws_s3_bucket.lb_logs.id
policy = data.aws_iam_policy_document.lb_logs_s3_put_object.json
}
# Create load balancer
resource "aws_lb" "load_balancer" {
name = "${var.app_name}-load-balancer-${var.env}"
subnets = data.aws_subnets.subnets.ids
security_groups = data.aws_security_groups.security_groups.ids
load_balancer_type = "application"
access_logs {
bucket = aws_s3_bucket.lb_logs.bucket
enabled = true
}
tags = {
Environment = "${var.env}"
}
}
resource "aws_lb_target_group" "blue_target" {
name = "${var.app_name}-blue-target-${var.env}"
protocol = "HTTPS"
port = var.port
target_type = "ip"
vpc_id = var.vpc_id
health_check {
healthy_threshold = 5
interval = 30
matcher = 200
path = "${var.health_check_path}"
protocol = "HTTPS"
timeout = 10
unhealthy_threshold = 2
}
}
resource "aws_lb_target_group" "green_target" {
name = "${var.app_name}-green-target-${var.env}"
protocol = "HTTPS"
port = var.port
target_type = "ip"
vpc_id = var.vpc_id
health_check {
healthy_threshold = 5
interval = 30
matcher = 200
path = "${var.health_check_path}"
protocol = "HTTPS"
timeout = 10
unhealthy_threshold = 2
}
}
data "aws_acm_certificate" "cert" {
domain = var.domain
statuses = ["ISSUED"]
most_recent = true
}
resource "aws_lb_listener" "listener" {
load_balancer_arn = aws_lb.load_balancer.arn
port = var.port
protocol = "HTTPS"
ssl_policy = "ELBSecurityPolicy-2016-08"
certificate_arn = data.aws_acm_certificate.cert.arn
default_action {
type = "forward"
target_group_arn = aws_lb_target_group.blue_target.arn
}
}
# ECS
resource "aws_ecs_cluster" "cluster" {
name = "${var.app_name}-cluster-${var.env}"
}
data "aws_ecr_repository" "ecr_repository" {
name = var.image_repo_name
}
resource "aws_iam_role" "ecs_task_role" {
name = "EcsTaskRole"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = "sts:AssumeRole"
Effect = "Allow"
Sid = ""
Principal = {
Service = "ecs-tasks.amazonaws.com"
}
},
]
})
}
resource "aws_iam_policy" "secrets_manager_read_policy" {
name = "SecretsManagerRead"
description = "Read only access to secrets manager"
policy = jsonencode({
Version = "2012-10-17",
Statement = [
{
Sid = "",
Effect = "Allow",
Action = [
"secretsmanager:GetRandomPassword",
"secretsmanager:GetResourcePolicy",
"secretsmanager:GetSecretValue",
"secretsmanager:DescribeSecret",
"secretsmanager:ListSecretVersionIds",
"secretsmanager:ListSecrets"
],
Resource = "*"
}
]
})
}
resource "aws_iam_role_policy_attachment" "attach_secrets_manager_read_to_task_role" {
role = aws_iam_role.ecs_task_role.name
policy_arn = aws_iam_policy.secrets_manager_read_policy.arn
}
resource "aws_iam_role_policy_attachment" "attach_s3_read_to_task_role" {
role = aws_iam_role.ecs_task_role.name
policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
}
resource "aws_iam_role_policy_attachment" "attach_ses_to_task_role" {
role = aws_iam_role.ecs_task_role.name
policy_arn = "arn:aws:iam::aws:policy/AmazonSESFullAccess"
}
resource "aws_iam_role" "ecs_exec_role" {
name = "EcsExecRole"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = "sts:AssumeRole"
Effect = "Allow"
Sid = ""
Principal = {
Service = "ecs-tasks.amazonaws.com"
}
},
]
})
}
resource "aws_iam_policy" "log_groups_write_policy" {
name = "LogGroupsWrite"
description = "Read only access to secrets manager"
policy = jsonencode({
Version = "2012-10-17",
Statement = [
{
Sid = "",
Effect = "Allow",
Action = [
"logs:CreateLogGroup"
],
Resource = "*"
}
]
})
}
resource "aws_iam_role_policy_attachment" "attach_secrets_manager_read_to_exec_role" {
role = aws_iam_role.ecs_exec_role.name
policy_arn = aws_iam_policy.log_groups_write_policy.arn
}
resource "aws_iam_role_policy_attachment" "attach_ecs_task_exec_to_exec_role" {
role = aws_iam_role.ecs_exec_role.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}
resource "aws_iam_role_policy_attachment" "attach_fault_injection_simulator_to_exec_role" {
role = aws_iam_role.ecs_exec_role.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorECSAccess"
}
resource "aws_ecs_task_definition" "task_def" {
family = "${var.app_name}-task-def-${var.env}"
network_mode = "awsvpc"
task_role_arn = aws_iam_role.ecs_task_role.arn
execution_role_arn = aws_iam_role.ecs_exec_role.arn
requires_compatibilities = ["FARGATE"]
cpu = "256"
memory = "512"
runtime_platform {
cpu_architecture = "X86_64"
operating_system_family = "LINUX"
}
container_definitions = jsonencode([
{
name = "${var.app_name}-container-${var.env}"
image = "${data.aws_ecr_repository.ecr_repository.repository_url}:latest"
cpu = 0
essential = true
portMappings = [
{
containerPort = var.port
hostPort = var.port
},
]
environment = [
{
name = "PORT",
value = tostring("${var.port}")
},
{
name = "NODE_ENV",
value = var.env
}
]
logConfiguration = {
logDriver = "awslogs"
options = {
"awslogs-create-group" = "true"
"awslogs-group" = "${var.app_name}-task-def-${var.env}"
"awslogs-region" = "${var.region}"
"awslogs-stream-prefix" = "ecs"
}
}
},
])
}
resource "aws_ecs_service" "service" {
lifecycle {
ignore_changes = [
task_definition,
load_balancer,
]
}
cluster = aws_ecs_cluster.cluster.arn
name = "${var.app_name}-service-${var.env}"
task_definition = aws_ecs_task_definition.task_def.arn
load_balancer {
target_group_arn = aws_lb_target_group.blue_target.arn
container_name = "${var.app_name}-container-${var.env}"
container_port = var.port
}
capacity_provider_strategy {
capacity_provider = "FARGATE"
base = 0
weight = 1
}
scheduling_strategy = "REPLICA"
deployment_controller {
type = "CODE_DEPLOY"
}
platform_version = "1.4.0"
network_configuration {
assign_public_ip = true
subnets = data.aws_subnets.subnets.ids
security_groups = data.aws_security_groups.security_groups.ids
}
desired_count = 1
}
# DEPLOYMENT
resource "aws_codedeploy_app" "codedeploy_app" {
name = "${var.app_name}-application-${var.env}"
compute_platform = "ECS"
}
resource "aws_iam_role" "codedeploy_role" {
name = "CodedeployRole"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = "sts:AssumeRole"
Effect = "Allow"
Sid = ""
Principal = {
Service = "codedeploy.amazonaws.com"
}
},
]
})
}
resource "aws_iam_role_policy_attachment" "attach_codedeploy_role" {
role = aws_iam_role.codedeploy_role.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AWSCodeDeployRole"
}
resource "aws_iam_role_policy_attachment" "attach_codedeploy_role_for_ecs" {
role = aws_iam_role.codedeploy_role.name
policy_arn = "arn:aws:iam::aws:policy/AWSCodeDeployRoleForECS"
}
resource "aws_codedeploy_deployment_group" "deployment_group" {
app_name = aws_codedeploy_app.codedeploy_app.name
deployment_config_name = "CodeDeployDefault.ECSAllAtOnce"
auto_rollback_configuration {
enabled = true
events = ["DEPLOYMENT_FAILURE"]
}
blue_green_deployment_config {
deployment_ready_option {
action_on_timeout = "CONTINUE_DEPLOYMENT"
wait_time_in_minutes = 0
}
terminate_blue_instances_on_deployment_success {
action = "TERMINATE"
termination_wait_time_in_minutes = 5
}
}
deployment_group_name = "${var.app_name}-deployment-group-${var.env}"
deployment_style {
deployment_option = "WITH_TRAFFIC_CONTROL"
deployment_type = "BLUE_GREEN"
}
load_balancer_info {
target_group_pair_info {
prod_traffic_route {
listener_arns = [aws_lb_listener.listener.arn]
}
target_group {
name = aws_lb_target_group.blue_target.name
}
target_group {
name = aws_lb_target_group.green_target.name
}
}
}
service_role_arn = aws_iam_role.codedeploy_role.arn
ecs_service {
service_name = aws_ecs_service.service.name
cluster_name = aws_ecs_cluster.cluster.name
}
}
resource "aws_appautoscaling_target" "scalable_target" {
service_namespace = "ecs"
resource_id = "service/${aws_ecs_cluster.cluster.name}/${aws_ecs_service.service.name}"
scalable_dimension = "ecs:service:DesiredCount"
min_capacity = 1
max_capacity = 5
}
resource "aws_appautoscaling_policy" "cpu_scaling_policy" {
name = "${var.app_name}-cpu-scaling-policy-${var.env}"
service_namespace = "ecs"
resource_id = "service/${aws_ecs_cluster.cluster.name}/${aws_ecs_service.service.name}"
scalable_dimension = "ecs:service:DesiredCount"
policy_type = "TargetTrackingScaling"
target_tracking_scaling_policy_configuration {
target_value = 70
predefined_metric_specification {
predefined_metric_type = "ECSServiceAverageCPUUtilization"
}
scale_out_cooldown = 300
scale_in_cooldown = 300
disable_scale_in = false
}
}
resource "aws_appautoscaling_policy" "memory_scaling_policy" {
name = "${var.app_name}-memory-scaling-policy-${var.env}"
service_namespace = "ecs"
resource_id = "service/${aws_ecs_cluster.cluster.name}/${aws_ecs_service.service.name}"
scalable_dimension = "ecs:service:DesiredCount"
policy_type = "TargetTrackingScaling"
target_tracking_scaling_policy_configuration {
target_value = 70
predefined_metric_specification {
predefined_metric_type = "ECSServiceAverageMemoryUtilization"
}
scale_out_cooldown = 300
scale_in_cooldown = 300
disable_scale_in = false
}
}
I first created a project without HTTPS or a custom domain (I started small and built it step by step, initially without auto-scaling, logging, and other fancy stuff). It works fine: health checks pass, I can connect, etc.
I then decided to create the exact same thing with HTTPS, and instead of using the ALB's DNS name to call the API, assign a custom domain.
Now tasks are constantly being created and destroyed because the health checks are failing.
I did some research but couldn't find a way to debug why this is happening. All I can tell from the container logs is that the app starts cleanly with no errors, yet the tasks are terminated because the health checks fail. I can't find any logs explaining why; I can only see that the targets are unhealthy.
Because everything is inside a VPC, the tasks have no static IP address, and the traffic is set to HTTPS, the path from the load balancer down to the containers feels like a black box that is impossible to debug.
Unable to think of anything else, I opened my security group to allow all requests on all ports to check whether I could call the health check endpoint myself.
It turns out I can, but it returns a 502. Here is a more detailed log entry from the load balancer:
type https
time 2023-02-10T14:37:00.099726Z
elb app/myapp-load-balancer-staging/c6aabdb240600ca8
client:port myip:38255
target:port targetip:3000
request_processing_time -1
target_processing_time -1
response_processing_time -1
elb_status_code 502
target_status_code -
received_bytes 360
sent_bytes 277
"request" "GET https://api.myapp.com:3000/rest/health HTTP/1.1"
"user_agent" "PostmanRuntime/7.29.0"
ssl_cipher <some text>-SHA256
ssl_protocol TLSv1.2
target_group_arn arn:aws:elasticloadbalancing:eu-west-1:myaccountnumber:targetgroup/myapp-blue-target-staging/id
"trace_id" "Root=1-63e6568c-7c78be0f1e967e59370fbb80"
"domain_name" "api.myapp.com"
"chosen_cert_arn" "arn:aws:acm:eu-west-1:myaccountnumber:certificate/certid"
matched_rule_priority 0
request_creation_time 2023-02-10T14:37:00.096000Z
"actions_executed" "forward"
"redirect_url" "-"
"error_reason" "-"
"target:port_list" "172.31.2.112:3000"
"target_status_code_list" "-"
"classification" "Ambiguous"
"classification_reason" "UndefinedContentLengthSemantics"
All I could find is this guide on the topic, but it only explained the problem and didn't show a solution.
Help spotting what I'm doing wrong would be much appreciated, but I'd also really like a guide on how to debug these things between the load balancer and the containers, since they are locked down so tightly inside the VPC that even admins can't access them.
This is because you are using var.port for all the port settings: the load balancer listener, the target group traffic port, and the container port. You have also configured the target group to use the HTTPS protocol. However, SSL traffic is terminated at the load balancer. Only the load balancer has an SSL certificate, so only the load balancer can handle HTTPS traffic; the traffic from the load balancer to the container is still plain HTTP.
You need to separate out your port and protocol settings so that only the load balancer listener uses port 443/HTTPS. The other ports should use plain HTTP, just as they did before you enabled SSL, when everything was working.
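For example, under that change the listener keeps terminating TLS on 443 while the target groups and their health checks drop back to HTTP on the application port. A minimal sketch, reusing the names from the question and assuming var.port is the container's plain-HTTP port (the green target group would change the same way):
resource "aws_lb_target_group" "blue_target" {
  name        = "${var.app_name}-blue-target-${var.env}"
  protocol    = "HTTP"              # plain HTTP from the ALB to the task
  port        = var.port
  target_type = "ip"
  vpc_id      = var.vpc_id
  health_check {
    healthy_threshold   = 5
    interval            = 30
    matcher             = 200
    path                = var.health_check_path
    protocol            = "HTTP"    # health checks also use plain HTTP
    timeout             = 10
    unhealthy_threshold = 2
  }
}
resource "aws_lb_listener" "listener" {
  load_balancer_arn = aws_lb.load_balancer.arn
  port              = 443           # only the listener terminates TLS
  protocol          = "HTTPS"
  ssl_policy        = "ELBSecurityPolicy-2016-08"
  certificate_arn   = data.aws_acm_certificate.cert.arn
  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.blue_target.arn
  }
}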

Choosing CIDR blocks

I am struggling to understand CIDR blocks in the way I am using them. My understanding (probably wrong) is that they are a way of reserving a range of IP addresses for your environment, which you can then apportion across applications. But I can't get it working in my case. I am using Terraform to manage a simple environment: a VPC containing a Lambda and an RDS instance. The RDS instance will not be publicly accessible, and the Lambda will be invoked by an HTTP trigger. The Lambda and the RDS instance each need their own subnets, and RDS needs two. I have this configuration in Terraform, which keeps failing with this and similar errors:
The new Subnets are not in the same Vpc as the existing subnet group
The terraform set up is:
resource "aws_vpc" "main" {
cidr_block = "10.0.0.0/16"
tags = {
Name = "vpc"
}
}
resource "aws_subnet" "rds_subnet_1a" {
vpc_id = aws_vpc.main.id
cidr_block = "10.0.1.0/24"
availability_zone = "eu-west-1a"
tags = {
Name = "rds_subnet_1a"
}
}
resource "aws_subnet" "rds_subnet_1b" {
vpc_id = aws_vpc.main.id
cidr_block = "10.0.2.0/24"
availability_zone = "eu-west-1b"
tags = {
Name = "rds_subnet_1b"
}
}
resource "aws_subnet" "lambda_subnet_1a" {
vpc_id = aws_vpc.main.id
cidr_block = "10.0.3.0/24"
availability_zone = "eu-west-1a"
tags = {
Name = "lambda_subnet_1a"
}
}
resource "aws_db_subnet_group" "default" {
name = "main"
subnet_ids = [aws_subnet.rds_subnet_1a.id, aws_subnet.rds_subnet_1b.id]
tags = {
Name = "My DB subnet group"
}
}
resource "aws_security_group" "rds" {
name = "rds-sg"
vpc_id = aws_vpc.main.id
ingress {
from_port = 5432
to_port = 5432
protocol = "tcp"
cidr_blocks = ["10.0.0.0/16"]
}
egress {
from_port = 5432
to_port = 5432
protocol = "tcp"
cidr_blocks = ["10.0.0.0/16"]
}
tags = {
Name = "rds-sg"
}
}
resource "aws_security_group" "lambda" {
name = "lambda_sg"
vpc_id = aws_vpc.main.id
ingress {
protocol = -1
self = true
from_port = 0
to_port = 0
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["10.0.0.0/16"]
}
tags = {
Name = "lambda_sg"
}
}
I know this is basic, but I think getting some answers for my specific situation may help me understand the concepts better.
EDIT - lambda config:
resource "aws_lambda_function" "api_uprn" {
function_name = "api-uprn"
s3_bucket = aws_s3_bucket.lambdas_bucket.id
s3_key = "api-uprn/function_0.0.8.zip"
runtime = "python3.9"
handler = "app.main.handler"
role = aws_iam_role.lambda_exec.arn
vpc_config {
subnet_ids = [aws_subnet.subnet_1a.id]
security_group_ids = [aws_security_group.lambda.id]
}
}
resource "aws_cloudwatch_log_group" "api_uprn" {
name = "/aws/lambda/${aws_lambda_function.api_uprn.function_name}"
retention_in_days = 30
}
resource "aws_iam_role" "lambda_exec" {
name = "api_uprn"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [{
Action = "sts:AssumeRole"
Effect = "Allow"
Sid = ""
Principal = {
Service = "lambda.amazonaws.com"
}
}
]
})
}
resource "aws_iam_role_policy_attachment" "lambda_policy" {
role = aws_iam_role.lambda_exec.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
}
resource "aws_iam_role_policy_attachment" "rds_read" {
role = aws_iam_role.lambda_exec.name
policy_arn = "arn:aws:iam::aws:policy/AmazonRDSReadOnlyAccess"
}
resource "aws_iam_role_policy_attachment" "lambda_vpc_access" {
role = aws_iam_role.lambda_exec.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole"
}
Can you please post the full error here? It will be easier to understand which resource is throwing the error.
My tip is that you need to change the subnet_ids in your Lambda configuration. From what I understand, your Lambda configuration should look like this:
resource "aws_lambda_function" "api_uprn" {
function_name = "api-uprn"
s3_bucket = aws_s3_bucket.lambdas_bucket.id
s3_key = "api-uprn/function_0.0.8.zip"
runtime = "python3.9"
handler = "app.main.handler"
role = aws_iam_role.lambda_exec.arn
vpc_config {
subnet_ids = [aws_subnet.lambda_subnet_1a.id]
security_group_ids = [aws_security_group.lambda.id]
}
}
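As an aside on the original CIDR question: inside a 10.0.0.0/16 VPC you can let Terraform carve out non-overlapping /24 blocks with the built-in cidrsubnet() function instead of hand-picking them. A small illustrative sketch (the output names here are made up for the example):
locals {
  vpc_cidr = "10.0.0.0/16"
}
# cidrsubnet(prefix, newbits, netnum): adding 8 bits to a /16 yields a /24,
# and netnum selects which of the 256 possible /24s to take.
output "rds_subnet_1a_cidr" {
  value = cidrsubnet(local.vpc_cidr, 8, 1) # "10.0.1.0/24"
}
output "rds_subnet_1b_cidr" {
  value = cidrsubnet(local.vpc_cidr, 8, 2) # "10.0.2.0/24"
}
output "lambda_subnet_1a_cidr" {
  value = cidrsubnet(local.vpc_cidr, 8, 3) # "10.0.3.0/24"
}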

Allowing ECS Task to read from Kinesis data stream

I'm deploying an app through ECS (with FARGATE as the capacity provider). My app needs to access a Kinesis stream that already exists and is running. I can't figure out the exact IAM assume-role policy I need to provide. I have the configuration below in Terraform (tags, log configuration, and proprietary names removed). Every time I deploy the task, I receive an error that the task couldn't assume the role.
What am I missing?
resource "aws_ecs_cluster" "cluster" {
name = var.cluster_name
}
resource "aws_ecs_service" "service" {
name = var.service_name
cluster = aws_ecs_cluster.cluster.id
task_definition = aws_ecs_task_definition.task.arn
desired_count = var.task_count
launch_type = var.task_launch_type
load_balancer {
target_group_arn = var.alb_target
container_name = "container"
container_port = 3000
}
network_configuration {
subnets = [for subnet in var.subnets : "${subnet}"]
assign_public_ip = true
security_groups = [var.sg_id]
}
}
resource "aws_ecs_task_definition" "task" {
family = "task_family"
container_definitions = file( var.container_definitions_json )
requires_compatibilities = ["FARGATE"]
network_mode = "awsvpc"
memory = 1024
cpu = 512
execution_role_arn = "${aws_iam_role.ecsTaskExecutionRole.arn}"
task_role_arn = "${aws_iam_role.ecsTaskRole.arn}"
}
resource "aws_iam_role" "ecsTaskRole" {
name = "ecsTaskRole"
assume_role_policy = "${data.aws_iam_policy_document.ecsTaskRole.json}"
}
data "aws_caller_identity" "current" {}
data "aws_partition" "current" {}
data "aws_region" "current" {}
data "aws_iam_policy_document" "ecsTaskRole" {
statement {
effect = "Allow"
actions = ["sts:AssumeRole"]
principals {
type = "AWS"
identifiers = [
format("arn:%s:iam::%s:root", data.aws_partition.current.partition, data.aws_caller_identity.current.account_id)
]
}
}
}
resource "aws_iam_role" "ecsTaskExecutionRole" {
name = "ecsTaskExecutionRole"
assume_role_policy = "${data.aws_iam_policy_document.assume_role_policy.json}"
}
data "aws_iam_policy_document" "assume_role_policy" {
statement {
actions = ["sts:AssumeRole"]
principals {
type = "Service"
identifiers = ["ecs-tasks.amazonaws.com"]
}
}
}
resource "aws_iam_role_policy_attachment" "ecsTaskExecutionRole_policy" {
role = "${aws_iam_role.ecsTaskExecutionRole.name}"
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}
Both roles must have a trust policy that allows ecs-tasks.amazonaws.com.
See this document for the task role, and this document for the execution role.
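For illustration, a minimal sketch of that change: the task role trusts ecs-tasks.amazonaws.com (like the execution role already does) and gets read access to the stream. The var.kinesis_stream_arn variable is hypothetical and not part of the original configuration:
data "aws_iam_policy_document" "ecsTaskRole" {
  statement {
    effect  = "Allow"
    actions = ["sts:AssumeRole"]
    principals {
      type        = "Service"
      identifiers = ["ecs-tasks.amazonaws.com"] # trust the ECS tasks service, not the account root
    }
  }
}
resource "aws_iam_role_policy" "kinesis_read" {
  name = "kinesis-read"
  role = aws_iam_role.ecsTaskRole.id
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect = "Allow"
      Action = [
        "kinesis:DescribeStream",
        "kinesis:DescribeStreamSummary",
        "kinesis:GetShardIterator",
        "kinesis:GetRecords",
        "kinesis:ListShards"
      ]
      Resource = var.kinesis_stream_arn # hypothetical variable holding the existing stream's ARN
    }]
  })
}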

Terraform dial tcp dns error while creating AWS ALB ingress with EKS cluster

I am trying to use Terraform to create an AWS EKS cluster with an ALB load balancer and a Kubernetes ingress.
I have been using this git repo and this blog to guide me.
The deploy fails with the following errors immediately after the cluster has been created.
Error: Post "https://E8475B1B3693C979073BF0D721D876A7.sk1.ap-southeast-1.eks.amazonaws.com/api/v1/namespaces/kube-system/configmaps": dial tcp: lookup E8475B1B3693C979073BF0D721D876A7.sk1.ap-southeast-1.eks.amazonaws.com on 8.8.8.8:53: no such host
on modules/alb/alb_ingress_controller.tf line 1, in resource "kubernetes_config_map" "aws_auth":
1: resource "kubernetes_config_map" "aws_auth" {
Error: Post "https://E8475B1B3693C979073BF0D721D876A7.sk1.ap-southeast-1.eks.amazonaws.com/apis/rbac.authorization.k8s.io/v1/clusterroles": dial tcp: lookup E8475B1B3693C979073BF0D721D876A7.sk1.ap-southeast-1.eks.amazonaws.com on 8.8.8.8:53: no such host
on modules/alb/alb_ingress_controller.tf line 20, in resource "kubernetes_cluster_role" "alb-ingress":
20: resource "kubernetes_cluster_role" "alb-ingress" {
Error: Post "https://E8475B1B3693C979073BF0D721D876A7.sk1.ap-southeast-1.eks.amazonaws.com/apis/rbac.authorization.k8s.io/v1/clusterrolebindings": dial tcp: lookup E8475B1B3693C979073BF0D721D876A7.sk1.ap-southeast-1.eks.amazonaws.com on 8.8.8.8:53: no such host
on modules/alb/alb_ingress_controller.tf line 41, in resource "kubernetes_cluster_role_binding" "alb-ingress":
41: resource "kubernetes_cluster_role_binding" "alb-ingress" {
Error: Post "https://E8475B1B3693C979073BF0D721D876A7.sk1.ap-southeast-1.eks.amazonaws.com/api/v1/namespaces/kube-system/serviceaccounts": dial tcp: lookup E8475B1B3693C979073BF0D721D876A7.sk1.ap-southeast-1.eks.amazonaws.com on 8.8.8.8:53: no such host
on modules/alb/alb_ingress_controller.tf line 62, in resource "kubernetes_service_account" "alb-ingress":
62: resource "kubernetes_service_account" "alb-ingress" {
Error: Failed to create Ingress 'default/main-ingress' because: Post "https://E8475B1B3693C979073BF0D721D876A7.sk1.ap-southeast-1.eks.amazonaws.com/apis/extensions/v1beta1/namespaces/default/ingresses": dial tcp: lookup E8475B1B3693C979073BF0D721D876A7.sk1.ap-southeast-1.eks.amazonaws.com on 8.8.8.8:53: no such host
on modules/alb/kubernetes_ingress.tf line 1, in resource "kubernetes_ingress" "main":
1: resource "kubernetes_ingress" "main" {
Error: Post "https://641480DEC80EB445C6CBBEDC9D1F0234.yl4.ap-southeast-1.eks.amazonaws.com/api/v1/namespaces/kube-system/configmaps": dial tcp 10.0.21.192:443: connect: no route to host
on modules/eks/allow_nodes.tf line 22, in resource "kubernetes_config_map" "aws_auth":
22: resource "kubernetes_config_map" "aws_auth" {
Here is my Terraform code:
provider "aws" {
region = var.aws_region
version = "~> 2.65.0"
ignore_tags {
keys = ["kubernetes.io/role/internal-elb", "app.kubernetes.io/name"]
key_prefixes = ["kubernetes.io/cluster/", "alb.ingress.kubernetes.io/"]
}
}
resource "kubernetes_config_map" "aws_auth" {
metadata {
name = "aws-auth"
namespace = "kube-system"
}
data = {
mapRoles = <<EOF
- rolearn: ${var.iam_role_node}
username: system:node:{{EC2PrivateDNSName}}
groups:
- system:bootstrappers
- system:nodes
EOF
}
depends_on = [
var.eks_cluster_name
]
}
resource "kubernetes_cluster_role" "alb-ingress" {
metadata {
name = "alb-ingress-controller"
labels = {
"app.kubernetes.io/name" = "alb-ingress-controller"
}
}
rule {
api_groups = ["", "extensions"]
resources = ["configmaps", "endpoints", "events", "ingresses", "ingresses/status", "services"]
verbs = ["create", "get", "list", "update", "watch", "patch"]
}
rule {
api_groups = ["", "extensions"]
resources = ["nodes", "pods", "secrets", "services", "namespaces"]
verbs = ["get", "list", "watch"]
}
}
resource "kubernetes_cluster_role_binding" "alb-ingress" {
metadata {
name = "alb-ingress-controller"
labels = {
"app.kubernetes.io/name" = "alb-ingress-controller"
}
}
role_ref {
api_group = "rbac.authorization.k8s.io"
kind = "ClusterRole"
name = "alb-ingress-controller"
}
subject {
kind = "ServiceAccount"
name = "alb-ingress-controller"
namespace = "kube-system"
}
}
resource "kubernetes_service_account" "alb-ingress" {
metadata {
name = "alb-ingress-controller"
namespace = "kube-system"
labels = {
"app.kubernetes.io/name" = "alb-ingress-controller"
}
}
automount_service_account_token = true
}
resource "kubernetes_deployment" "alb-ingress" {
metadata {
name = "alb-ingress-controller"
labels = {
"app.kubernetes.io/name" = "alb-ingress-controller"
}
namespace = "kube-system"
}
spec {
selector {
match_labels = {
"app.kubernetes.io/name" = "alb-ingress-controller"
}
}
template {
metadata {
labels = {
"app.kubernetes.io/name" = "alb-ingress-controller"
}
}
spec {
volume {
name = kubernetes_service_account.alb-ingress.default_secret_name
secret {
secret_name = kubernetes_service_account.alb-ingress.default_secret_name
}
}
container {
# This is where you change the version when Amazon comes out with a new version of the ingress controller
image = "docker.io/amazon/aws-alb-ingress-controller:v1.1.7"
name = "alb-ingress-controller"
args = [
"--ingress-class=alb",
"--cluster-name=${var.eks_cluster_name}",
"--aws-vpc-id=${var.vpc_id}",
"--aws-region=${var.aws_region}"]
}
service_account_name = "alb-ingress-controller"
}
}
}
}
########################################################################################
# setup provider for kubernetes
//data "external" "aws_iam_authenticator" {
// program = ["sh", "-c", "aws-iam-authenticator token -i ${var.cluster_name} | jq -r -c .status"]
//}
data "aws_eks_cluster_auth" "tf_eks_cluster" {
name = aws_eks_cluster.tf_eks_cluster.name
}
provider "kubernetes" {
host = aws_eks_cluster.tf_eks_cluster.endpoint
cluster_ca_certificate = base64decode(aws_eks_cluster.tf_eks_cluster.certificate_authority.0.data)
//token = data.external.aws_iam_authenticator.result.token
token = data.aws_eks_cluster_auth.tf_eks_cluster.token
load_config_file = false
version = "~> 1.9"
}
# Allow worker nodes to join cluster via config map
resource "kubernetes_config_map" "aws_auth" {
metadata {
name = "aws-auth"
namespace = "kube-system"
}
data = {
mapRoles = <<EOF
- rolearn: ${aws_iam_role.tf-eks-node.arn}
username: system:node:{{EC2PrivateDNSName}}
groups:
- system:bootstrappers
- system:nodes
EOF
}
depends_on = [aws_eks_cluster.tf_eks_cluster, aws_autoscaling_group.tf_eks_cluster]
}
resource "kubernetes_ingress" "main" {
metadata {
name = "main-ingress"
annotations = {
"alb.ingress.kubernetes.io/scheme" = "internet-facing"
"kubernetes.io/ingress.class" = "alb"
"alb.ingress.kubernetes.io/subnets" = var.app_subnet_stringlist
"alb.ingress.kubernetes.io/certificate-arn" = "${data.aws_acm_certificate.api.arn}, ${data.aws_acm_certificate.gitea.arn}"
"alb.ingress.kubernetes.io/listen-ports" = <<JSON
[
{"HTTP": 80},
{"HTTPS": 443}
]
JSON
"alb.ingress.kubernetes.io/actions.ssl-redirect" = <<JSON
{
"Type": "redirect",
"RedirectConfig": {
"Protocol": "HTTPS",
"Port": "443",
"StatusCode": "HTTP_301"
}
}
JSON
}
}
spec {
rule {
host = "api.xactpos.com"
http {
path {
backend {
service_name = "ssl-redirect"
service_port = "use-annotation"
}
path = "/*"
}
path {
backend {
service_name = "app-service1"
service_port = 80
}
path = "/service1"
}
path {
backend {
service_name = "app-service2"
service_port = 80
}
path = "/service2"
}
}
}
rule {
host = "gitea.xactpos.com"
http {
path {
backend {
service_name = "ssl-redirect"
service_port = "use-annotation"
}
path = "/*"
}
path {
backend {
service_name = "api-service1"
service_port = 80
}
path = "/service3"
}
path {
backend {
service_name = "api-service2"
service_port = 80
}
path = "/graphq4"
}
}
}
}
}
resource "aws_security_group" "eks-alb" {
name = "eks-alb-public"
description = "Security group allowing public traffic for the eks load balancer."
vpc_id = var.vpc_id
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
tags = map(
"Name", "terraform-eks-alb",
"kubernetes.io/cluster/tf-eks-cluster", "owned"
)
}
resource "aws_security_group_rule" "eks-alb-public-https" {
description = "Allow eks load balancer to communicate with public traffic securely."
cidr_blocks = ["0.0.0.0/0"]
from_port = 443
protocol = "tcp"
security_group_id = aws_security_group.eks-alb.id
to_port = 443
type = "ingress"
}
resource "aws_security_group_rule" "eks-alb-public-http" {
description = "Allow eks load balancer to communicate with public traffic."
cidr_blocks = ["0.0.0.0/0"]
from_port = 80
protocol = "tcp"
security_group_id = aws_security_group.eks-alb.id
to_port = 80
type = "ingress"
}
resource "aws_eks_cluster" "tf_eks_cluster" {
name = var.cluster_name
role_arn = aws_iam_role.tf-eks-cluster.arn
vpc_config {
security_group_ids = [aws_security_group.tf-eks-cluster.id]
subnet_ids = var.app_subnet_ids
endpoint_private_access = true
endpoint_public_access = false
}
depends_on = [
aws_iam_role_policy_attachment.tf-eks-cluster-AmazonEKSClusterPolicy,
aws_iam_role_policy_attachment.tf-eks-cluster-AmazonEKSServicePolicy,
]
}
# Setup for IAM role needed to setup an EKS cluster
resource "aws_iam_role" "tf-eks-cluster" {
name = "tf-eks-cluster"
assume_role_policy = <<POLICY
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "eks.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
POLICY
}
resource "aws_iam_role_policy_attachment" "tf-eks-cluster-AmazonEKSClusterPolicy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
role = aws_iam_role.tf-eks-cluster.name
}
resource "aws_iam_role_policy_attachment" "tf-eks-cluster-AmazonEKSServicePolicy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSServicePolicy"
role = aws_iam_role.tf-eks-cluster.name
}
########################################################################################
# Setup IAM role & instance profile for worker nodes
resource "aws_iam_role" "tf-eks-node" {
name = "tf-eks-node"
assume_role_policy = <<POLICY
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "ec2.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
POLICY
}
resource "aws_iam_instance_profile" "tf-eks-node" {
name = "tf-eks-node"
role = aws_iam_role.tf-eks-node.name
}
resource "aws_iam_role_policy_attachment" "tf-eks-node-AmazonEKSWorkerNodePolicy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
role = aws_iam_role.tf-eks-node.name
}
resource "aws_iam_role_policy_attachment" "tf-eks-node-AmazonEKS_CNI_Policy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
role = aws_iam_role.tf-eks-node.name
}
resource "aws_iam_role_policy_attachment" "tf-eks-node-AmazonEC2ContainerRegistryReadOnly" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
role = aws_iam_role.tf-eks-node.name
}
resource "aws_iam_role_policy_attachment" "tf-eks-node-AmazonEC2FullAccess" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEC2FullAccess"
role = aws_iam_role.tf-eks-node.name
}
resource "aws_iam_role_policy_attachment" "tf-eks-node-alb-ingress_policy" {
policy_arn = aws_iam_policy.alb-ingress.arn
role = aws_iam_role.tf-eks-node.name
}
resource "aws_iam_policy" "alb-ingress" {
name = "alb-ingress-policy"
policy = file("${path.module}/alb_ingress_policy.json")
}
# generate KUBECONFIG as output to save in ~/.kube/config locally
# save the 'terraform output eks_kubeconfig > config', run 'mv config ~/.kube/config' to use it for kubectl
locals {
kubeconfig = <<KUBECONFIG
apiVersion: v1
clusters:
- cluster:
server: ${aws_eks_cluster.tf_eks_cluster.endpoint}
certificate-authority-data: ${aws_eks_cluster.tf_eks_cluster.certificate_authority.0.data}
name: kubernetes
contexts:
- context:
cluster: kubernetes
user: aws
name: aws
current-context: aws
kind: Config
preferences: {}
users:
- name: aws
user:
exec:
apiVersion: client.authentication.k8s.io/v1alpha1
command: aws-iam-authenticator
args:
- "token"
- "-i"
- "${var.cluster_name}"
KUBECONFIG
}
########################################################################################
# Setup AutoScaling Group for worker nodes
# Setup data source to get amazon-provided AMI for EKS nodes
data "aws_ami" "eks-worker" {
filter {
name = "name"
values = ["amazon-eks-node-v*"]
}
most_recent = true
owners = ["602401143452"] # Amazon EKS AMI Account ID
}
# Is provided in demo code, no idea what it's used for though! TODO: DELETE
# data "aws_region" "current" {}
# EKS currently documents this required userdata for EKS worker nodes to
# properly configure Kubernetes applications on the EC2 instance.
# We utilize a Terraform local here to simplify Base64 encode this
# information and write it into the AutoScaling Launch Configuration.
# More information: https://docs.aws.amazon.com/eks/latest/userguide/launch-workers.html
locals {
tf-eks-node-userdata = <<USERDATA
#!/bin/bash
set -o xtrace
/etc/eks/bootstrap.sh --apiserver-endpoint '${aws_eks_cluster.tf_eks_cluster.endpoint}' --b64-cluster-ca '${aws_eks_cluster.tf_eks_cluster.certificate_authority.0.data}' '${var.cluster_name}'
USERDATA
}
resource "aws_launch_configuration" "tf_eks_cluster" {
associate_public_ip_address = true
iam_instance_profile = aws_iam_instance_profile.tf-eks-node.name
image_id = data.aws_ami.eks-worker.id
instance_type = var.instance_type
name_prefix = "tf-eks-spot"
security_groups = [aws_security_group.tf-eks-node.id]
user_data_base64 = base64encode(local.tf-eks-node-userdata)
lifecycle {
create_before_destroy = true
}
}
resource "aws_lb_target_group" "tf_eks_cluster" {
name = "tf-eks-cluster"
port = 31742
protocol = "HTTP"
vpc_id = var.vpc_id
target_type = "instance"
}
resource "aws_autoscaling_group" "tf_eks_cluster" {
desired_capacity = "2"
launch_configuration = aws_launch_configuration.tf_eks_cluster.id
max_size = "3"
min_size = 1
name = "tf-eks-cluster"
vpc_zone_identifier = var.app_subnet_ids
target_group_arns = [aws_lb_target_group.tf_eks_cluster.arn]
tag {
key = "Name"
value = "tf-eks-cluster"
propagate_at_launch = true
}
tag {
key = "kubernetes.io/cluster/${var.cluster_name}"
value = "owned"
propagate_at_launch = true
}
}
resource "aws_security_group" "tf-eks-cluster" {
name = "terraform-eks-cluster"
description = "Cluster communication with worker nodes"
vpc_id = var.vpc_id
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
tags = {
Name = "terraform-eks"
}
}
resource "aws_security_group" "tf-eks-node" {
name = "terraform-eks-node"
description = "Security group for all nodes in the cluster"
vpc_id = var.vpc_id
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
tags = {
Name = "terraform-eks"
}
}
# Allow inbound traffic from your local workstation external IP
# to the Kubernetes. You will need to replace A.B.C.D below with
# your real IP. Services like icanhazip.com can help you find this.
resource "aws_security_group_rule" "tf-eks-cluster-ingress-workstation-https" {
cidr_blocks = [var.accessing_computer_ip]
description = "Allow workstation to communicate with the cluster API Server"
from_port = 443
protocol = "tcp"
security_group_id = aws_security_group.tf-eks-cluster.id
to_port = 443
type = "ingress"
}
########################################################################################
# Setup worker node security group
resource "aws_security_group_rule" "tf-eks-node-ingress-self" {
description = "Allow node to communicate with each other"
from_port = 0
protocol = "-1"
security_group_id = aws_security_group.tf-eks-node.id
source_security_group_id = aws_security_group.tf-eks-node.id
to_port = 65535
type = "ingress"
}
resource "aws_security_group_rule" "tf-eks-node-ingress-cluster" {
description = "Allow worker Kubelets and pods to receive communication from the cluster control plane"
from_port = 1025
protocol = "tcp"
security_group_id = aws_security_group.tf-eks-node.id
source_security_group_id = aws_security_group.tf-eks-cluster.id
to_port = 65535
type = "ingress"
}
# allow worker nodes to access EKS master
resource "aws_security_group_rule" "tf-eks-cluster-ingress-node-https" {
description = "Allow pods to communicate with the cluster API Server"
from_port = 443
protocol = "tcp"
security_group_id = aws_security_group.tf-eks-node.id
source_security_group_id = aws_security_group.tf-eks-cluster.id
to_port = 443
type = "ingress"
}
resource "aws_security_group_rule" "tf-eks-node-ingress-master" {
description = "Allow cluster control to receive communication from the worker Kubelets"
from_port = 443
protocol = "tcp"
security_group_id = aws_security_group.tf-eks-cluster.id
source_security_group_id = aws_security_group.tf-eks-node.id
to_port = 443
type = "ingress"
}
resource "aws_internet_gateway" "eks" {
vpc_id = aws_vpc.eks.id
tags = {
Name = "internet_gateway"
}
}
resource "aws_eip" "nat_gateway" {
count = var.subnet_count
vpc = true
}
resource "aws_nat_gateway" "eks" {
count = var.subnet_count
allocation_id = aws_eip.nat_gateway.*.id[count.index]
subnet_id = aws_subnet.gateway.*.id[count.index]
tags = {
Name = "nat_gateway"
}
depends_on = [aws_internet_gateway.eks]
}
resource "aws_route_table" "application" {
count = var.subnet_count
vpc_id = aws_vpc.eks.id
route {
cidr_block = "0.0.0.0/0"
nat_gateway_id = aws_nat_gateway.eks.*.id[count.index]
}
tags = {
Name = "eks_application"
}
}
resource "aws_route_table" "vpn" {
vpc_id = aws_vpc.eks.id
tags = {
Name = "eks_vpn"
}
}
resource "aws_route_table" "gateway" {
vpc_id = aws_vpc.eks.id
route {
cidr_block = "0.0.0.0/0"
gateway_id = aws_internet_gateway.eks.id
}
tags = {
Name = "eks_gateway"
}
}
resource "aws_route_table_association" "application" {
count = var.subnet_count
subnet_id = aws_subnet.application.*.id[count.index]
route_table_id = aws_route_table.application.*.id[count.index]
}
resource "aws_route_table_association" "vpn" {
count = var.subnet_count
subnet_id = aws_subnet.vpn.*.id[count.index]
route_table_id = aws_route_table.vpn.id
}
resource "aws_route_table_association" "gateway" {
count = var.subnet_count
subnet_id = aws_subnet.gateway.*.id[count.index]
route_table_id = aws_route_table.gateway.id
}
data "aws_availability_zones" "available" {}
resource "aws_subnet" "gateway" {
count = var.subnet_count
availability_zone = data.aws_availability_zones.available.names[count.index]
cidr_block = "10.0.1${count.index}.0/24"
vpc_id = aws_vpc.eks.id
map_public_ip_on_launch = true
tags = {
Name = "eks_gateway"
}
}
resource "aws_subnet" "application" {
count = var.subnet_count
availability_zone = data.aws_availability_zones.available.names[count.index]
cidr_block = "10.0.2${count.index}.0/24"
vpc_id = aws_vpc.eks.id
map_public_ip_on_launch = true
tags = map(
"Name", "eks_application",
"kubernetes.io/cluster/${var.cluster_name}", "shared"
)
}
resource "aws_subnet" "vpn" {
count = var.subnet_count
availability_zone = data.aws_availability_zones.available.names[count.index]
cidr_block = "10.0.3${count.index}.0/24"
vpc_id = aws_vpc.eks.id
tags = {
Name = "eks_vpn"
}
}
resource "aws_vpc" "eks" {
cidr_block = "10.0.0.0/16"
enable_dns_hostnames = true
enable_dns_support = true
tags = map(
"Name", "eks-vpc",
"kubernetes.io/cluster/${var.cluster_name}", "shared"
)
}
I had tried to create the Kubernetes deployment in a single, massive Terraform manifest. I needed to split the Kubernetes resources into a separate Terraform manifest, which I applied after updating the ~/.kube/config file.
The DNS errors were due to that file not being current for the new cluster.
Additionally, I needed to ensure that endpoint_private_access = true is set in the EKS cluster resource.
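For reference, a minimal sketch of how the separate Kubernetes-side configuration can read the already-created cluster through data sources instead of relying on a stale kubeconfig (var.cluster_name is assumed to be passed in; this mirrors the provider block from the question):
data "aws_eks_cluster" "cluster" {
  name = var.cluster_name
}
data "aws_eks_cluster_auth" "cluster" {
  name = var.cluster_name
}
provider "kubernetes" {
  host                   = data.aws_eks_cluster.cluster.endpoint
  cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
  token                  = data.aws_eks_cluster_auth.cluster.token
  load_config_file       = false  # valid for the 1.x kubernetes provider used in the question
}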

AWS ECS Terraform: The requested configuration is currently not supported. Launching EC2 instance failed

I have been trying to spin up ECS using Terraform. About two days ago it was working as expected; however, today I tried to run terraform apply and I keep getting an error saying
"The requested configuration is currently not supported. Launching EC2 instance failed"
I have researched this issue a lot. I tried hardcoding the VPC tenancy to default, and I've tried changing the region and the instance type, but nothing seems to fix it.
This is my Terraform config:
provider "aws" {
region = var.region
}
data "aws_availability_zones" "available" {}
# Define a vpc
resource "aws_vpc" "motivy_vpc" {
cidr_block = var.motivy_network_cidr
tags = {
Name = var.motivy_vpc
}
enable_dns_support = "true"
instance_tenancy = "default"
enable_dns_hostnames = "true"
}
# Internet gateway for the public subnet
resource "aws_internet_gateway" "motivy_ig" {
vpc_id = aws_vpc.motivy_vpc.id
tags = {
Name = "motivy_ig"
}
}
# Public subnet 1
resource "aws_subnet" "motivy_public_sn_01" {
vpc_id = aws_vpc.motivy_vpc.id
cidr_block = var.motivy_public_01_cidr
availability_zone = data.aws_availability_zones.available.names[0]
tags = {
Name = "motivy_public_sn_01"
}
}
# Public subnet 2
resource "aws_subnet" "motivy_public_sn_02" {
vpc_id = aws_vpc.motivy_vpc.id
cidr_block = var.motivy_public_02_cidr
availability_zone = data.aws_availability_zones.available.names[1]
tags = {
Name = "motivy_public_sn_02"
}
}
# Routing table for public subnet 1
resource "aws_route_table" "motivy_public_sn_rt_01" {
vpc_id = aws_vpc.motivy_vpc.id
route {
cidr_block = "0.0.0.0/0"
gateway_id = aws_internet_gateway.motivy_ig.id
}
tags = {
Name = "motivy_public_sn_rt_01"
}
}
# Routing table for public subnet 2
resource "aws_route_table" "motivy_public_sn_rt_02" {
vpc_id = aws_vpc.motivy_vpc.id
route {
cidr_block = "0.0.0.0/0"
gateway_id = aws_internet_gateway.motivy_ig.id
}
tags = {
Name = "motivy_public_sn_rt_02"
}
}
# Associate the routing table to public subnet 1
resource "aws_route_table_association" "motivy_public_sn_rt_01_assn" {
subnet_id = aws_subnet.motivy_public_sn_01.id
route_table_id = aws_route_table.motivy_public_sn_rt_01.id
}
# Associate the routing table to public subnet 2
resource "aws_route_table_association" "motivy_public_sn_rt_02_assn" {
subnet_id = aws_subnet.motivy_public_sn_02.id
route_table_id = aws_route_table.motivy_public_sn_rt_02.id
}
# ECS Instance Security group
resource "aws_security_group" "motivy_public_sg" {
name = "motivys_public_sg"
description = "Test public access security group"
vpc_id = aws_vpc.motivy_vpc.id
ingress {
from_port = 22
to_port = 22
protocol = "tcp"
cidr_blocks = [
"0.0.0.0/0"]
}
ingress {
from_port = 80
to_port = 80
protocol = "tcp"
cidr_blocks = [
"0.0.0.0/0"]
}
ingress {
from_port = 5000
to_port = 5000
protocol = "tcp"
cidr_blocks = [
"0.0.0.0/0"]
}
ingress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = [
var.motivy_public_01_cidr,
var.motivy_public_02_cidr
]
}
egress {
# allow all traffic to private SN
from_port = "0"
to_port = "0"
protocol = "-1"
cidr_blocks = [
"0.0.0.0/0"]
}
tags = {
Name = "motivy_public_sg"
}
}
data "aws_ecs_task_definition" "motivy_server" {
task_definition = aws_ecs_task_definition.motivy_server.family
}
resource "aws_ecs_task_definition" "motivy_server" {
family = "motivy_server"
container_definitions = file("task-definitions/service.json")
}
data "aws_ami" "latest_ecs" {
most_recent = true # get the latest version
filter {
name = "name"
values = [
"amzn2-ami-ecs-*"] # ECS optimized image
}
owners = [
"amazon" # Only official images
]
}
resource "aws_launch_configuration" "ecs-launch-configuration" {
name = "ecs-launch-configuration"
image_id = data.aws_ami.latest_ecs.id
instance_type = "t2.micro"
iam_instance_profile = aws_iam_instance_profile.ecs-instance-profile.id
root_block_device {
volume_type = "standard"
volume_size = 100
delete_on_termination = true
}
enable_monitoring = true
lifecycle {
create_before_destroy = true
}
security_groups = [aws_security_group.motivy_public_sg.id]
associate_public_ip_address = "true"
key_name = var.ecs_key_pair_name
user_data = <<EOF
#!/bin/bash
echo ECS_CLUSTER=${var.ecs_cluster} >> /etc/ecs/ecs.config
EOF
}
resource "aws_appautoscaling_target" "ecs_motivy_server_target" {
max_capacity = 2
min_capacity = 1
resource_id = "service/${aws_ecs_cluster.motivy_ecs_cluster.name}/${aws_ecs_service.motivy_server_service.name}"
scalable_dimension = "ecs:service:DesiredCount"
service_namespace = "ecs"
depends_on = [ aws_ecs_service.motivy_server_service ]
}
resource "aws_iam_role" "ecs-instance-role" {
name = "ecs-instance-role"
path = "/"
assume_role_policy = data.aws_iam_policy_document.ecs-instance-policy.json
}
data "aws_iam_policy_document" "ecs-instance-policy" {
statement {
actions = ["sts:AssumeRole"]
principals {
type = "Service"
identifiers = ["ec2.amazonaws.com"]
}
}
}
resource "aws_iam_role_policy_attachment" "ecs-instance-role-attachment" {
role = aws_iam_role.ecs-instance-role.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role"
}
resource "aws_iam_instance_profile" "ecs-instance-profile" {
name = "ecs-instance-profile"
path = "/"
role = aws_iam_role.ecs-instance-role.id
provisioner "local-exec" {
command = "sleep 10"
}
}
resource "aws_autoscaling_group" "motivy-server-autoscaling-group" {
name = "motivy-server-autoscaling-group"
termination_policies = [
"OldestInstance" # When a “scale down” event occurs, which instances to kill first?
]
default_cooldown = 30
health_check_grace_period = 30
max_size = var.max_instance_size
min_size = var.min_instance_size
desired_capacity = var.desired_capacity
# Use this launch configuration to define “how” the EC2 instances are to be launched
launch_configuration = aws_launch_configuration.ecs-launch-configuration.name
lifecycle {
create_before_destroy = true
}
# Refer to vpc.tf for more information
# You could use the private subnets here instead,
# if you want the EC2 instances to be hidden from the internet
vpc_zone_identifier = [aws_subnet.motivy_public_sn_01.id, aws_subnet.motivy_public_sn_02.id]
tags = [{
key = "Name",
value = var.ecs_cluster,
# Make sure EC2 instances are tagged with this tag as well
propagate_at_launch = true
}]
}
resource "aws_alb" "motivy_server_alb_load_balancer" {
name = "motivy-alb-load-balancer"
security_groups = [aws_security_group.motivy_public_sg.id]
subnets = [aws_subnet.motivy_public_sn_01.id, aws_subnet.motivy_public_sn_02.id]
tags = {
Name = "motivy_server_alb_load_balancer"
}
}
resource "aws_alb_target_group" "motivy_server_target_group" {
name = "motivy-server-target-group"
port = 5000
protocol = "HTTP"
vpc_id = aws_vpc.motivy_vpc.id
deregistration_delay = "10"
health_check {
healthy_threshold = "2"
unhealthy_threshold = "6"
interval = "30"
matcher = "200,301,302"
path = "/"
protocol = "HTTP"
timeout = "5"
}
stickiness {
type = "lb_cookie"
}
tags = {
Name = "motivy-server-target-group"
}
}
resource "aws_alb_listener" "alb-listener" {
load_balancer_arn = aws_alb.motivy_server_alb_load_balancer.arn
port = "80"
protocol = "HTTP"
default_action {
target_group_arn = aws_alb_target_group.motivy_server_target_group.arn
type = "forward"
}
}
resource "aws_autoscaling_attachment" "asg_attachment_motivy_server" {
autoscaling_group_name = aws_autoscaling_group.motivy-server-autoscaling-group.id
alb_target_group_arn = aws_alb_target_group.motivy_server_target_group.arn
}
This is the exact error I get:
Error: "motivy-server-autoscaling-group": Waiting up to 10m0s: Need at least 2 healthy instances in ASG, have 0. Most recent activity: {
ActivityId: "a775c531-9496-fdf9-5157-ab2448626293",
AutoScalingGroupName: "motivy-server-autoscaling-group",
Cause: "At 2020-04-05T22:10:28Z an instance was started in response to a difference between desired and actual capacity, increasing the capacity from 0 to 2.",
Description: "Launching a new EC2 instance. Status Reason: The requested configuration is currently not supported. Please check the documentation for supported configurations. Launching EC2 instance failed.",
Details: "{\"Subnet ID\":\"subnet-05de5fc0e994d05fe\",\"Availability Zone\":\"us-east-1a\"}",
EndTime: 2020-04-05 22:10:29 +0000 UTC,
Progress: 100,
StartTime: 2020-04-05 22:10:29.439 +0000 UTC,
StatusCode: "Failed",
StatusMessage: "The requested configuration is currently not supported. Please check the documentation for supported configurations. Launching EC2 instance failed."
}
I'm not sure why it worked two days ago, but recent Amazon ECS-optimized AMIs use gp2 as the root volume type.
You should choose gp2 as root_block_device.volume_type.
resource "aws_launch_configuration" "ecs-launch-configuration" {
# ...
root_block_device {
volume_type = "gp2"
volume_size = 100
delete_on_termination = true
}
# ...
}
data "aws_ami" "latest_ecs" {
most_recent = true # get the latest version
filter {
name = "name"
values = ["amzn2-ami-ecs-hvm-*-x86_64-ebs"] # ECS optimized image
}
owners = [
"amazon" # Only official images
]
}
For me, it worked using a t3-generation instance instead of t2.
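For example, only the instance_type line in the launch configuration above would change; t3.micro is used here purely as an illustrative choice:
resource "aws_launch_configuration" "ecs-launch-configuration" {
  # ...
  instance_type = "t3.micro" # t3 generation instead of t2.micro
  # ...
}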