Terraform AWS LB healthcheck failed - amazon-web-services

I have the following Terraform code that configures a gateway service on AWS ECS Fargate. Services that are not behind the load balancer and run in the private network work as expected, but the gateway with the LB attached keeps failing its health check, so every 2-3 minutes the task is deprovisioned and a new one is provisioned. The Dockerfile exposes the service on port 3000.
Here's the Terraform configuration that is failing:
locals {
gateway_version = "1.0.0"
gateway_port = 3000
}
## VPC
module "vpc" {
source = "terraform-aws-modules/vpc/aws"
version = "3.11.0"
name = "${var.env}-vpc"
cidr = "20.0.0.0/16"
enable_ipv6 = true
azs = ["eu-central-1a", "eu-central-1b"]
public_subnets = ["20.0.1.0/24", "20.0.2.0/24"]
private_subnets = ["20.0.86.0/24", "20.0.172.0/24"]
elasticache_subnets = ["20.0.31.0/24", "20.0.32.0/24"]
enable_nat_gateway = true
single_nat_gateway = true
tags = {
Terraform = "true"
}
}
## Security Groups
module "sg" {
source = "terraform-aws-modules/security-group/aws"
version = "~> 4.0"
name = "${var.env}-sg-default"
description = "Default service security group"
vpc_id = module.vpc.vpc_id
ingress_cidr_blocks = ["0.0.0.0/0"]
ingress_rules = [
"all-icmp",
"http-80-tcp",
"https-443-tcp",
"mysql-tcp",
"rabbitmq-4369-tcp",
"rabbitmq-5671-tcp",
"rabbitmq-5672-tcp",
"rabbitmq-15672-tcp",
"rabbitmq-25672-tcp",
"redis-tcp"
]
egress_rules = ["all-all"]
}
module "security_group" {
source = "terraform-aws-modules/security-group/aws"
version = "~> 4.0"
name = "${var.env}-sg-lb"
description = "Security group for ALB"
vpc_id = module.vpc.vpc_id
ingress_cidr_blocks = ["0.0.0.0/0"]
ingress_rules = ["http-80-tcp", "all-icmp"]
egress_rules = ["all-all"]
}
resource "aws_security_group" "service_security_group" {
name = "${var.env}-lb-connection"
ingress {
from_port = 0
to_port = 0
protocol = "-1"
# Only allowing traffic in from the load balancer security group
security_groups = [module.security_group.security_group_id]
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
vpc_id = module.vpc.vpc_id
}
## ECS Cluster
resource "aws_ecs_cluster" "default" {
name = "${var.env}-cluster"
}
## ECR
data "aws_ecr_repository" "gateway_ecr" {
name = "gateway-${var.env}"
}
## ECS Task Definition
resource "aws_ecs_task_definition" "gateway_task" {
family = "${var.env}-gateway-task"
container_definitions = <<DEFINITION
[
{
"name": "${var.env}-gateway-task",
"image": "${data.aws_ecr_repository.gateway_ecr.repository_url}:${local.gateway_version}",
"networkMode": "awsvpc",
"essential": true,
"logConfiguration": {
"logDriver": "awslogs",
"options": {
"awslogs-group": "${aws_cloudwatch_log_group.gateway_logs.name}",
"awslogs-stream-prefix": "ecs",
"awslogs-region": "${var.aws-region}"
}
},
"portMappings": [
{
"containerPort": ${local.gateway_port},
"hostPort": ${local.gateway_port}
}
],
"environment": [
{
"name": "AWS_REGION",
"value": "${var.aws-region}"
},
{
"name": "PORT",
"value": "${local.gateway_port}"
},
{
"name": "STAGE",
"value": "${var.env}"
},
{
"name": "NODE_ENV",
"value": "development"
},
{
"name": "VERSION",
"value": "${local.gateway_version}"
}
],
"memory": 512,
"cpu": 256
}
]
DEFINITION
requires_compatibilities = ["FARGATE"]
network_mode = "awsvpc"
memory = 512
cpu = 256
task_role_arn = aws_iam_role.gateway_task_definition_role.arn
execution_role_arn = aws_iam_role.gateway_task_execution_role.arn
}
## ECS Service
resource "aws_ecs_service" "gateway_service" {
name = "${var.env}-gateway-service"
cluster = aws_ecs_cluster.default.id
task_definition = aws_ecs_task_definition.gateway_task.arn
launch_type = "FARGATE"
desired_count = 1
force_new_deployment = true
network_configuration {
subnets = concat(
module.vpc.public_subnets,
module.vpc.private_subnets,
)
security_groups = [
module.sg.security_group_id,
aws_security_group.service_security_group.id
]
assign_public_ip = true
}
lifecycle {
ignore_changes = [desired_count]
}
load_balancer {
target_group_arn = aws_lb_target_group.target_group.arn
container_name = aws_ecs_task_definition.gateway_task.family
container_port = local.gateway_port
}
}
## Cloudwatch Log Group
resource "aws_cloudwatch_log_group" "gateway_logs" {
name = "${var.env}-gateway-log-group"
tags = {
Name = "${var.env}-gateway-log-group"
}
}
## IAM Roles
resource "aws_iam_role" "gateway_task_definition_role" {
name = "${var.env}-gateway-task-definition-role"
assume_role_policy = data.aws_iam_policy_document.gateway_assume_role_policy.json
tags = {
Name = "${var.env}-gateway-task-definition-role"
}
}
resource "aws_iam_role" "gateway_task_execution_role" {
name = "${var.env}-gateway-task-execution-role"
assume_role_policy = data.aws_iam_policy_document.gateway_assume_role_policy.json
tags = {
Name = "${var.env}-gateway-task-execution-role"
}
}
data "aws_iam_policy_document" "gateway_assume_role_policy" {
statement {
effect = "Allow"
actions = ["sts:AssumeRole"]
principals {
type = "Service"
identifiers = ["ecs-tasks.amazonaws.com"]
}
}
}
resource "aws_iam_role_policy" "gateway_exec" {
name = "${var.env}-gateway-execution-role-policy"
role = aws_iam_role.gateway_task_execution_role.id
policy = data.aws_iam_policy_document.gateway_exec_policy.json
}
data "aws_iam_policy_document" "gateway_exec_policy" {
statement {
effect = "Allow"
resources = ["*"]
actions = [
"ecr:GetAuthorizationToken",
"ecr:BatchCheckLayerAvailability",
"ecr:GetDownloadUrlForLayer",
"ecr:BatchGetImage",
"logs:CreateLogStream",
"logs:PutLogEvents",
]
}
}
## ALB
resource "aws_lb" "alb" {
name = "${var.env}-lb"
load_balancer_type = "application"
subnets = module.vpc.public_subnets
security_groups = [module.security_group.security_group_id]
}
resource "aws_lb_target_group" "target_group" {
name = "target-group"
port = 80
protocol = "HTTP"
target_type = "ip"
vpc_id = module.vpc.vpc_id
health_check {
matcher = "200,301,302"
path = "/health"
interval = 120
timeout = 30
}
}
resource "aws_lb_listener" "listener" {
load_balancer_arn = aws_lb.alb.arn
port = 80
protocol = "HTTP"
default_action {
type = "forward"
target_group_arn = aws_lb_target_group.target_group.arn
}
}
This is the error:
Task failed ELB health checks in (target-group arn:aws:elasticloadbalancing:eu-central-1:129228585726:targetgroup/target-group/5853904c0d3ad322)
After it's deployed I can see that the ECS service has started and is running, but I don't see any requests hitting its health check endpoint.

Your target group uses port = 80, but your ECS task definition specifies port 3000. This is likely the reason why your ALB can't connect to your containers.

The load balancer checks whether it can reach the application on the specified target port, which in your case is 3000.
Replace your target group resource so it uses the application port, and the LB health checks should pass:
resource "aws_lb_target_group" "target_group" {
name = "target-group"
port = 3000
protocol = "HTTP"
target_type = "ip"
vpc_id = module.vpc.vpc_id
health_check {
matcher = "200,301,302"
path = "/health"
interval = 120
timeout = 30
}
}

Update from the asker: the target group was not the issue. The actual problem was a wrong security group that didn't allow traffic to reach port 3000.
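For anyone hitting the same thing, here is a minimal sketch of the kind of rule that lets the ALB reach the container port, assuming the service's security group is the one that was missing it (the rule name is illustrative):

resource "aws_security_group_rule" "alb_to_gateway" {
  type                     = "ingress"
  from_port                = local.gateway_port # 3000, the port the container listens on
  to_port                  = local.gateway_port
  protocol                 = "tcp"
  # attach the rule to the security group used by the ECS service
  security_group_id        = aws_security_group.service_security_group.id
  # only allow traffic coming from the ALB's security group
  source_security_group_id = module.security_group.security_group_id
}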

Related

AWS ECS Fargate Container Failure

Issue:
I am deploying ECS Fargate using Terraform. When I deploy everything goes great, but then the task fails saying "Essential container in task exited". I expanded the task and it says exit code 134. I have tried upping the CPU and Memory, and triple checking the files, but I can't figure out what is wrong. Anyone have any advice?
Files:
fargate.tf
resource "aws_ecs_task_definition" "backend_task" {
family = "backend_example_app_family"
// Fargate is a type of ECS that requires awsvpc network_mode
requires_compatibilities = ["FARGATE"]
network_mode = "awsvpc"
// Valid sizes are shown here: https://aws.amazon.com/fargate/pricing/
memory = "1024"
cpu = "512"
// Fargate requires task definitions to have an execution role ARN to support ECR images
execution_role_arn = "${aws_iam_role.ecs_role.arn}"
container_definitions = <<EOT
[
{
"name": "example_app_container",
"image": "250531645249.dkr.ecr.us-east-1.amazonaws.com/ecr_example_repo:latest",
"memory": 512,
"essential": true,
"portMappings": [
{
"containerPort": 3000,
"hostPort": 3000
}
]
}
]
EOT
}
resource "aws_ecs_cluster" "backend_cluster" {
name = "backend_cluster_example_app"
}
resource "aws_ecs_service" "backend_service" {
name = "backend_service"
cluster = "${aws_ecs_cluster.backend_cluster.id}"
task_definition = "${aws_ecs_task_definition.backend_task.arn}"
launch_type = "FARGATE"
desired_count = 1
network_configuration {
subnets = ["${aws_subnet.public_a.id}", "${aws_subnet.public_b.id}"]
security_groups = ["${aws_security_group.security_group_example_app.id}"]
assign_public_ip = true
}
}
iam.tf
resource "aws_iam_role" "ecs_role" {
name = "ecs_role_example_app"
assume_role_policy = <<POLICY
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "",
"Effect": "Allow",
"Principal": {
"Service": "ecs-tasks.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
POLICY
}
resource "aws_iam_role_policy_attachment" "ecs_policy_attachment" {
role = "${aws_iam_role.ecs_role.name}"
// This policy adds logging + ecr permissions
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}
network.tf
resource "aws_vpc" "vpc_example_app" {
cidr_block = "10.0.0.0/16"
enable_dns_hostnames = true
enable_dns_support = true
}
resource "aws_subnet" "public_a" {
vpc_id = "${aws_vpc.vpc_example_app.id}"
cidr_block = "10.0.1.0/24"
availability_zone = "us-east-1a"
}
resource "aws_subnet" "public_b" {
vpc_id = "${aws_vpc.vpc_example_app.id}"
cidr_block = "10.0.2.0/24"
availability_zone = "us-east-1b"
}
resource "aws_internet_gateway" "internet_gateway" {
vpc_id = "${aws_vpc.vpc_example_app.id}"
}
resource "aws_route" "internet_access" {
route_table_id = "${aws_vpc.vpc_example_app.main_route_table_id}"
destination_cidr_block = "0.0.0.0/0"
gateway_id = "${aws_internet_gateway.internet_gateway.id}"
}
resource "aws_security_group" "security_group_example_app" {
name = "security_group_example_app"
description = "Allow TLS inbound traffic on port 80 (http)"
vpc_id = "${aws_vpc.vpc_example_app.id}"
ingress {
from_port = 80
to_port = 4000
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
}

ECR VPC endpoint for fargate not working as expected

I am using Fargate for a task that runs every hour. As the Docker image is 1.5 GB, I want to use an ECR VPC endpoint to reduce the AWS data transfer fee.
The Fargate tasks run in a private subnet. The route table of one of the private subnets is the following (where eigw is an egress-only internet gateway and nat-01 is a NAT gateway in the public subnet):
Destination Target
10.50.0.0/16 local
0.0.0.0/0 nat-01ec80c2754229321
2a05:d012:43e:de00::/56 local
::/0 eigw-0a0c583a8390d5736
Expected behavior: Right now, the Fargate task takes around 1 minute to start, due to the time it takes to docker pull the image. I expect that with the ECR VPC endpoint, the time it takes to start would go down.
Actual behavior: There is not a single second difference, which means I probably did something wrong!
My terraform setup:
The VPC, subnets and route tables:
module "vpc" {
source = "terraform-aws-modules/vpc/aws"
version = "~> 2.21.0"
name = "dev-vpc"
cidr = "10.10.0.0/16"
azs = ["eu-west-3a", "eu-west-3b"]
private_subnets = ["10.10.0.0/20", "10.10.32.0/20"]
public_subnets = ["10.10.128.0/20", "10.10.160.0/20"]
enable_nat_gateway = true
single_nat_gateway = true
reuse_nat_ips = false
enable_vpn_gateway = false
enable_dns_hostnames = true
create_database_subnet_group = true
enable_ipv6 = true
assign_ipv6_address_on_creation = true
private_subnet_assign_ipv6_address_on_creation = false
public_subnet_ipv6_prefixes = [0, 1]
private_subnet_ipv6_prefixes = [2, 3]
database_subnet_ipv6_prefixes = [4, 5]
database_subnets = ["10.10.64.0/20", "10.10.80.0/20"]
tags = {
ManagedByTerraform = "true"
EnvironmentType = "dev"
}
}
# the SG for the VPC endpoints
resource "aws_security_group" "vpce" {
name = "dev-vpce-sg"
vpc_id = module.vpc.vpc_id
ingress {
from_port = 443
to_port = 443
protocol = "tcp"
cidr_blocks = [module.vpc.vpc_cidr_block]
}
tags = {
Environment = "dev"
}
}
# all the VPC endpoints needed (from AWS documentation)
resource "aws_vpc_endpoint" "ecr_endpoint" {
vpc_id = module.vpc.vpc_id
private_dns_enabled = true
service_name = "com.amazonaws.eu-west-3.ecr.dkr"
vpc_endpoint_type = "Interface"
security_group_ids = [
aws_security_group.vpce.id,
]
subnet_ids = module.vpc.private_subnets
tags = {
Name = "dkr-endpoint"
Environment = "dev"
}
}
resource "aws_vpc_endpoint" "ecr_api_endpoint" {
vpc_id = module.vpc.vpc_id
private_dns_enabled = true
service_name = "com.amazonaws.eu-west-3.ecr.api"
vpc_endpoint_type = "Interface"
security_group_ids = [
aws_security_group.vpce.id,
]
subnet_ids = module.vpc.private_subnets
tags = {
Name = "ecr-api-endpoint"
Environment = "dev"
}
}
resource "aws_vpc_endpoint" "s3" {
vpc_id = module.vpc.vpc_id
service_name = "com.amazonaws.eu-west-3.s3"
vpc_endpoint_type = "Gateway"
route_table_ids = module.vpc.private_route_table_ids
policy = <<-EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "",
"Effect": "Allow",
"Principal": "*",
"Action": [
"s3:PutObjectAcl",
"s3:PutObject",
"s3:ListBucket",
"s3:GetObject",
"s3:Delete*"
],
"Resource": [
"arn:aws:s3:::prod-eu-west-3-starport-layer-bucket",
"arn:aws:s3:::prod-eu-west-3-starport-layer-bucket/*"
]
}
]
}
EOF
tags = {
Name = "s3-endpoint"
Environment = "dev"
}
}
resource "aws_vpc_endpoint" "logs" {
vpc_id = module.vpc.vpc_id
private_dns_enabled = true
service_name = "com.amazonaws.eu-west-3.logs"
vpc_endpoint_type = "Interface"
security_group_ids = [
aws_security_group.vpce.id,
]
subnet_ids = module.vpc.private_subnets
tags = {
Name = "logs-endpoint"
Environment = "dev"
}
}
resource "aws_vpc_endpoint" "ecs_agent" {
vpc_id = module.vpc.vpc_id
private_dns_enabled = true
service_name = "com.amazonaws.eu-west-3.ecs-agent"
vpc_endpoint_type = "Interface"
security_group_ids = [
aws_security_group.vpce.id,
]
subnet_ids = module.vpc.private_subnets
tags = {
Name = "ecs-agent"
Environment = "dev"
}
}
resource "aws_vpc_endpoint" "ecs_telemetry" {
vpc_id = module.vpc.vpc_id
private_dns_enabled = true
service_name = "com.amazonaws.eu-west-3.ecs-telemetry"
vpc_endpoint_type = "Interface"
security_group_ids = [
aws_security_group.vpce.id,
]
subnet_ids = module.vpc.private_subnets
tags = {
Name = "telemetry"
Environment = "dev"
}
}
resource "aws_vpc_endpoint" "ecs_endpoint" {
vpc_id = module.vpc.vpc_id
private_dns_enabled = true
service_name = "com.amazonaws.eu-west-3.ecs"
vpc_endpoint_type = "Interface"
security_group_ids = [
aws_security_group.vpce.id,
]
subnet_ids = module.vpc.private_subnets
tags = {
Name = "ecs-endpoint"
Environment = "dev"
}
}
Can you let me know what can be wrong in my setup?
I have zero knowledge in network engineering, so please let me know if you need further information.

Impossible to SSH to EC2 instance and unable to place ECS task

Given the following terraform.tf file:
provider "aws" {
profile = "default"
region = "us-east-1"
}
locals {
vpc_name = "some-vpc-name"
dev_vpn_source = "*.*.*.*/32" # Instead of * I have the CIDR block of our VPN here
}
resource "aws_vpc" "vpc" {
cidr_block = "10.0.0.0/16"
enable_dns_hostnames = true
tags = {
Name: local.vpc_name
}
}
resource "aws_subnet" "a" {
cidr_block = "10.0.0.0/17"
vpc_id = aws_vpc.vpc.id
tags = {
Name: "${local.vpc_name}-a"
}
}
resource "aws_subnet" "b" {
cidr_block = "10.0.128.0/17"
vpc_id = aws_vpc.vpc.id
tags = {
Name: "${local.vpc_name}-b"
}
}
resource "aws_security_group" "ssh" {
name = "${local.vpc_name}-ssh"
vpc_id = aws_vpc.vpc.id
tags = {
Name: "${local.vpc_name}-ssh"
}
}
resource "aws_security_group_rule" "ingress-ssh" {
from_port = 22
protocol = "ssh"
security_group_id = aws_security_group.ssh.id
to_port = 22
type = "ingress"
cidr_blocks = [local.dev_vpn_source]
description = "SSH access for developer"
}
resource "aws_security_group" "outbound" {
name = "${local.vpc_name}-outbound"
vpc_id = aws_vpc.vpc.id
tags = {
Name: "${local.vpc_name}-outbound"
}
}
resource "aws_security_group_rule" "egress" {
from_port = 0
protocol = "all"
security_group_id = aws_security_group.outbound.id
to_port = 65535
type = "egress"
cidr_blocks = ["0.0.0.0/0"]
description = "All outbound allowed"
}
module "ecs-clusters" {
source = "./ecs-clusters/"
subnets = [aws_subnet.a, aws_subnet.b]
vpc_name = local.vpc_name
security_groups = [aws_security_group.ssh, aws_security_group.outbound]
}
And the following ecs-clusters/ecs-cluster.tf file:
variable "vpc_name" {
type = string
}
variable "subnets" {
type = list(object({
id: string
}))
}
variable "security_groups" {
type = list(object({
id: string
}))
}
data "aws_ami" "amazon_linux_ecs" {
most_recent = true
owners = ["amazon"]
filter {
name = "name"
values = ["amzn2-ami-ecs*"]
}
}
resource "aws_iam_instance_profile" "ecs-launch-profile" {
name = "${var.vpc_name}-ecs"
role = "ecsInstanceRole"
}
resource "aws_launch_template" "ecs" {
name = "${var.vpc_name}-ecs"
image_id = data.aws_ami.amazon_linux_ecs.id
instance_type = "r5.4xlarge"
key_name = "some-ssh-key-name"
iam_instance_profile {
name = "${var.vpc_name}-ecs"
}
block_device_mappings {
device_name = "/dev/xvda"
ebs {
volume_type = "gp3"
volume_size = 1024
delete_on_termination = false
}
}
network_interfaces {
associate_public_ip_address = true
subnet_id = var.subnets[0].id
security_groups = var.security_groups[*].id
}
update_default_version = true
}
resource "aws_autoscaling_group" "ecs-autoscaling_group" {
name = "${var.vpc_name}-ecs"
vpc_zone_identifier = [for subnet in var.subnets: subnet.id]
desired_capacity = 1
max_size = 1
min_size = 1
protect_from_scale_in = true
launch_template {
id = aws_launch_template.ecs.id
version = aws_launch_template.ecs.latest_version
}
tag {
key = "Name"
propagate_at_launch = true
value = "${var.vpc_name}-ecs"
}
depends_on = [aws_launch_template.ecs]
}
resource "aws_ecs_capacity_provider" "ecs-capacity-provider" {
name = var.vpc_name
auto_scaling_group_provider {
auto_scaling_group_arn = aws_autoscaling_group.ecs-autoscaling_group.arn
managed_termination_protection = "ENABLED"
managed_scaling {
maximum_scaling_step_size = 1
minimum_scaling_step_size = 1
status = "ENABLED"
target_capacity = 1
}
}
depends_on = [aws_autoscaling_group.ecs-autoscaling_group]
}
resource "aws_ecs_cluster" "ecs-cluster" {
name = var.vpc_name
capacity_providers = [aws_ecs_capacity_provider.ecs-capacity-provider.name]
depends_on = [aws_ecs_capacity_provider.ecs-capacity-provider]
}
resource "aws_iam_role" "ecs-execution" {
name = "${var.vpc_name}-ecs-execution"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": "sts:AssumeRole",
"Principal": {
"Service": "ecs-tasks.amazonaws.com"
},
"Effect": "Allow",
"Sid": ""
}
]
}
EOF
}
resource "aws_iam_role" "ecs" {
name = "${var.vpc_name}-ecs"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": "sts:AssumeRole",
"Principal": {
"Service": "ecs-tasks.amazonaws.com"
},
"Effect": "Allow",
"Sid": ""
}
]
}
EOF
}
resource "aws_iam_role_policy_attachment" "execution-role" {
role = aws_iam_role.ecs-execution.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}
resource "aws_iam_role_policy_attachment" "role" {
role = aws_iam_role.ecs.name
policy_arn = "arn:aws:iam::aws:policy/AmazonS3FullAccess"
}
I'm facing two problems:
I can't SSH into the EC2 instance created by the autoscaling group, despite using the same SSH key and VPN that I use to access other EC2 instances. My VPN client config includes a route to the target machine via the VPN gateway.
I can't run a task on the ECS cluster. The task gets stuck in the provisioning status and then fails with "Unable to run task". The task is configured to use 1 GB of RAM and 1 vCPU.
What am I doing wrong?
Based on the comments.
There were two issues with the original setup:
Lack of connectivity to the ECS and ECR services, which was solved by enabling internet access in the VPC. It is also possible to use VPC interface endpoints for ECS, ECR and S3 if internet access is not desired.
Container instances did not register with ECS. This was fixed by using user_data to bootstrap the ECS instances so that they register with the ECS cluster, as sketched below.
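As a sketch, that bootstrap could be wired into the existing aws_launch_template roughly like this; the cluster name is var.vpc_name, matching the aws_ecs_cluster resource above, and launch template user_data has to be base64-encoded:

resource "aws_launch_template" "ecs" {
  # ... existing arguments from ecs-clusters/ecs-cluster.tf ...

  # Tell the ECS agent which cluster to join so the instance registers itself.
  user_data = base64encode(<<-EOT
    #!/bin/bash
    echo "ECS_CLUSTER=${var.vpc_name}" >> /etc/ecs/ecs.config
  EOT
  )
}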

Health Checks Fail - Terraform - ECS Cluster - Dynamic Port Mapping Health Checks fail on 1 of 2 Ports?

I have an environment set up with Terraform. One of our SecOps teams needs SonarQube CE installed for automatic scans, smells, and vulnerability checks. So I have it running in AWS over our VPN: DNS resolves to an internal-facing ALB that points traffic to a target group of instances that make up the ECS cluster. There is a launch config and auto scaling group as well. I'm only running one container per host currently, but I would like to raise this to 2-4 at some point.
The problem I am having is that the instances register to the target group on both the container port (9000) and the dynamic ephemeral port (32768). The health checks on the dynamic port work fine; however, the health checks on port 9000 are failing. This causes the instances to cycle between initial, unhealthy, and terminating repeatedly. Other than this annoying issue the application runs fine: RDS connects, and we can use SonarQube just fine.
I've tried removing the references to the container port in Terraform. I'll also state that this is an extremely secure environment: all egress traffic from any VPC is filtered through a McAfee Cloud Proxy appliance. When I first stood this up in a sandbox account with egress to 0.0.0.0/0, everything worked fine. I've spent a few hours on this now and am at the point of scratching my head.
Hopefully someone else has been here and will share their insight. Tomorrow is a new day after all. HELP!
ERROR Message when I remove the port from the target group
aws_lb_target_group.ecs: port should be set when target type is instance
ERROR Message when I set the port to 0
aws_ecs_service.ecs: InvalidParameterException: The container sonarqube did not have a container port 0 defined.
ERROR Message when I set the container port to 0 in the taskdef.
aws_ecs_task_definition.task: ClientException: Invalid 'containerPort' setting for container 'sonarqube'
ecs-taskdef.tf
resource "aws_ecs_task_definition" "task" {
family = "${var.name}-${var.env}"
network_mode = "bridge"
cpu = 8192
memory = 16384
execution_role_arn = "${var.ecs-exec-role}"
container_definitions = <<DEFINITION
[
{
"name": "${var.name}",
"image":"${var.image}",
"logConfiguration": {
"logDriver": "awslogs",
"options": {
"awslogs-group": "/ecs/${var.cluster_name}-${var.name}",
"awslogs-region": "${var.region}",
"awslogs-stream-prefix": "ecs"
}
},
"portMappings": [
{
"containerPort": 9000
}
],
"environment": [
{
"name": "sonar.jdbc.password",
"value": "${var.password}"
},
{
"name": "sonar.jdbc.url",
"value": "jdbc:mysql://${var.rds_url}:${var.port}/sonar?useUnicode=true&characterEncoding=utf8&rewriteBatchedStatements=true&useConfigs=maxPerformance"
},
{
"name": "sonar.jdbc.username",
"value": "${var.username}"
}
]
}
]
DEFINITION
}
resource "aws_ecs_service" "ecs" {
name = "${var.name}-${var.env}"
cluster = "${var.cluster_name}"
task_definition = "${aws_ecs_task_definition.task.arn}"
scheduling_strategy = "DAEMON"
lifecycle {
ignore_changes = ["desired_count"]
}
load_balancer {
target_group_arn = "${aws_lb_target_group.ecs.arn}"
container_name = "${var.name}"
container_port = 9000 #Removed & Terraform complains with an error.
}
}
elb.tf
resource "aws_lb" "ecs" {
name = "${var.name_prefix}-${var.name}-tf"
internal = true
load_balancer_type = "application"
security_groups = ["${var.security_groups}"]
subnets = ["${var.subnets}"]
enable_deletion_protection = false
tags = "${merge(var.tags, map("Name", "${var.name_prefix}-${var.name}-elb"))}"
}
resource "aws_lb_listener" "ecs" {
load_balancer_arn = "${aws_lb.ecs.arn}"
port = 80
protocol = "HTTP"
default_action {
type = "redirect"
redirect {
port = "443"
protocol = "HTTPS"
status_code = "HTTP_301"
}
}
}
resource "aws_lb_listener" "ssl" {
load_balancer_arn = "${aws_lb.ecs.arn}"
port = 443
protocol = "HTTPS"
lifecycle {
create_before_destroy = true
}
ssl_policy = "ELBSecurityPolicy-2016-08"
certificate_arn = "arn:aws:acm:REDACTED"
default_action {
type = "forward"
target_group_arn = "${aws_lb_target_group.ecs.arn}"
}
}
resource "aws_lb_target_group" "ecs" {
name = "${var.cluster_name}"
protocol = "HTTP"
port = 9000 #must be here or TF errors instance type must have port
vpc_id = "${var.vpc_id}"
lifecycle {
create_before_destroy = true
}
}
ec2.tf
resource "aws_autoscaling_group" "asg" {
availability_zones = ["${var.region}a", "${var.region}b", "${var.region}d"]
name = "${var.name}-${var.env}-asg"
max_size = "${var.min_size}"
min_size = "${var.max_size}"
health_check_grace_period = 300
health_check_type = "ELB"
desired_capacity = "${var.desired_size}"
launch_configuration = "${aws_launch_configuration.alc.name}"
vpc_zone_identifier = ["${var.subnet_ids}"]
target_group_arns = ["${var.target_arn}"]
lifecycle {
create_before_destroy = true
}
tag {
key = "Environment"
value = "${var.name}"
propagate_at_launch = true
}
tag {
key = "Name"
value = "${var.name_prefix}-${var.name}.ecs"
propagate_at_launch = true
}
}
resource "aws_launch_configuration" "alc" {
name_prefix = "${var.name_prefix}.ecs"
image_id = "${lookup(var.ecs-images, var.region)}"
instance_type = "${var.instance_type}"
iam_instance_profile = "${aws_iam_instance_profile.ecs-instance-profile.arn}"
user_data = "${data.template_file.userdata.rendered}"
key_name = "${var.key_name}"
security_groups = ["${var.security_groups}"]
lifecycle {
create_before_destroy = true
}
root_block_device {
volume_type = "io1"
iops = "1000"
volume_size = "${var.volume_size}"
}
}
data "template_file" "userdata" {
template = "${file("${path.module}/userdata/ecs-instances.sh")}"
vars {
cluster-name = "${aws_ecs_cluster.cluster.name}"
}
}
resource "aws_security_group" "allow_all_from_cluster" {
name = "${var.name_prefix}-${var.name}-ecs-cluster"
description = "Allow traffic from cluster"
vpc_id = "${var.vpc_id}"
tags = "${merge(var.tags, map("Name", "${var.name_prefix}-${var.name}-sg"))}"
lifecycle {
create_before_destroy = true
}
ingress { #open to VPC IP's
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["172.27.160.0/22"]
}
ingress { #open to corp network redirected to 443
from_port = 80
to_port = 80
protocol = "tcp"
cidr_blocks = ["10.0.0.0/8"]
}
ingress { #http access for corp users
from_port = 443
to_port = 443
protocol = "tcp"
cidr_blocks = ["10.0.0.0/8"]
}
egress { #open to VPC IP's
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["172.27.160.0/22"]
}
egress { #ephemeral response to corp users
from_port = 32768
to_port = 65535
protocol = "tcp"
cidr_blocks = ["10.0.0.0/8"]
}
}
iam.tf
resource "aws_iam_role" "iam_role" {
name = "${var.name}-ecs-role"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "",
"Effect": "Allow",
"Principal": {
"Service": "ecs.amazonaws.com"
},
"Action": "sts:AssumeRole"
},
{
"Sid": "",
"Effect": "Allow",
"Principal": {
"Service": "ec2.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
EOF
}
resource "aws_iam_policy" "efs-policy" {
name = "${var.env}-efs-access-policy"
path = "/"
description = "Allow ${var.env} cluster access to EFS"
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": [
"elasticfilesystem:*"
],
"Effect": "Allow",
"Resource": "*"
}
]
}
EOF
}
resource "aws_iam_role_policy_attachment" "ecs-service-role" {
role = "${aws_iam_role.iam_role.name}"
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceRole"
}
resource "aws_iam_role_policy_attachment" "ecs-service-for-ec2-role" {
role = "${aws_iam_role.iam_role.name}"
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role"
}
resource "aws_iam_role_policy_attachment" "ssm-service-role" {
role = "${aws_iam_role.iam_role.name}"
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM"
}
resource "aws_iam_role_policy_attachment" "efs-for-ec2-role" {
role = "${aws_iam_role.iam_role.name}"
policy_arn = "${aws_iam_policy.efs-policy.arn}"
}
resource "aws_iam_instance_profile" "ecs-instance-profile" {
name = "${var.env}-ecs"
role = "${aws_iam_role.iam_role.name}"
}
I expected health checks to only take place on the dynamic port. I can remove the instance from the target group on port 9000: each instance shows up in the registered targets section twice, once for each port. If I remove the port 9000 entry, the instance stays in service.
If you are using ephemeral ports, then it doesn't really matter what you specify as the containerPort. In my Terraform I am using port 9000 as the defined containerPort just because it needs a value, and the hostPort is specified as 0. The security groups have been created to account for the ephemeral range, and the health check seems to work as expected using / and traffic-port.
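For reference, a sketch of what that looks like on the target group from the question, with the health check left on the traffic port (the default) and the / path, so checks go to the dynamically mapped port rather than 9000:

resource "aws_lb_target_group" "ecs" {
  name     = "${var.cluster_name}"
  protocol = "HTTP"
  port     = 9000 # required for target_type "instance"; ECS still registers the ephemeral host port
  vpc_id   = "${var.vpc_id}"
  health_check {
    path = "/"
    port = "traffic-port" # the default: health checks follow the registered (dynamic) port
  }
  lifecycle {
    create_before_destroy = true
  }
}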

Using Terraform to create AWS ECS with target group always timing out

Terraform Version
v0.11.3
Affected Resources
aws_ecs_service
aws_ecs_task_definition
aws_alb
aws_alb_target_group
aws_alb_listener
Error
I'm setting up an ECS cluster, currently with one service. I had several issues getting the service up without breaking, but now my service can't seem to keep a container running.
service phoenix-web (instance i-079707fc669361a81) (port 80) is unhealthy in target-group tgqaphoenix-web due to (reason Request timed out)
Possibly related?
Once my resources are up, I can't seem to find a public DNS name on any instance or on the VPC gateway.
main.tf for my ECS Service module:
data "template_file" "ecs_task_definition_config" {
template = "${file("config/ecs-task.json")}"
}
resource "aws_ecs_task_definition" "phoenix-web" {
lifecycle {
create_before_destroy = true
}
family = "nginx-phoenix-task"
container_definitions = "${data.template_file.ecs_task_definition_config.rendered}"
}
resource "aws_security_group" "main" {
vpc_id = "${var.vpc_id}"
tags {
Name = "sg${var.name}LoadBalancer"
Project = "${var.name}"
Environment = "${var.environment}"
}
}
resource "aws_security_group_rule" "app_lb_https_ingress" {
type = "ingress"
from_port = 80
to_port = 80
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
security_group_id = "${aws_security_group.main.id}"
}
resource "aws_alb" "main" {
security_groups = ["${aws_security_group.main.id}"]
subnets = ["${var.public_subnet_ids}"]
name = "alb-${var.environment}-${var.name}"
access_logs {
bucket = "${var.access_log_bucket}"
prefix = "${var.access_log_prefix}"
}
tags {
Name = "alb-${var.environment}-${var.name}"
Project = "${var.name}"
Environment = "${var.environment}"
}
}
resource "aws_alb_target_group" "main" {
name = "tg${var.environment}${var.name}"
health_check {
healthy_threshold = "3"
interval = "30"
protocol = "HTTP"
timeout = "3"
path = "/healthz"
unhealthy_threshold = "2"
}
port = "80"
protocol = "HTTP"
vpc_id = "${var.vpc_id}"
tags {
Name = "tg${var.environment}${var.name}"
Project = "${var.name}"
Environment = "${var.environment}"
}
depends_on = ["aws_alb.main"]
}
resource "aws_alb_listener" "https" {
load_balancer_arn = "${aws_alb.main.id}"
port = "80"
protocol = "HTTP"
default_action {
target_group_arn = "${aws_alb_target_group.main.id}"
type = "forward"
}
}
resource "aws_ecs_service" "service" {
lifecycle {
create_before_destroy = true
}
name = "${var.name}"
cluster = "${var.environment}"
task_definition = "${aws_ecs_task_definition.phoenix-web.id}"
desired_count = "${var.desired_count}"
deployment_minimum_healthy_percent = "${var.deployment_min_healthy_percent}"
deployment_maximum_percent = "${var.deployment_max_percent}"
iam_role = "${aws_iam_role.ecs-role.id}"
load_balancer {
target_group_arn = "${aws_alb_target_group.main.id}"
container_name = "phoenix-web"
container_port = "80"
}
depends_on = ["aws_iam_role.ecs-role", "null_resource.alb_exists"]
}
resource "aws_iam_role_policy" "ecs-policy" {
name = "ecs-policy"
role = "${aws_iam_role.ecs-role.id}"
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"ecs:CreateCluster",
"ecs:DeregisterContainerInstance",
"ecs:DiscoverPollEndpoint",
"ecs:Poll",
"ecs:RegisterContainerInstance",
"ecs:StartTelemetrySession",
"ecs:Submit*",
"ecr:GetAuthorizationToken",
"ecr:BatchCheckLayerAvailability",
"ecr:GetDownloadUrlForLayer",
"ecr:BatchGetImage",
"ec2:AuthorizeSecurityGroupIngress",
"ec2:Describe*",
"elasticloadbalancing:DeregisterInstancesFromLoadBalancer",
"elasticloadbalancing:Describe*",
"elasticloadbalancing:RegisterInstancesWithLoadBalancer",
"elasticloadbalancing:RegisterTargets",
"elasticloadbalancing:DeregisterTargets"
],
"Resource": "*"
}
]
}
EOF
depends_on = ["aws_iam_role.ecs-role"]
}
resource "aws_iam_role" "ecs-role" {
name = "ecs-role"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": "sts:AssumeRole",
"Principal": {
"Service": "ecs.amazonaws.com"
},
"Effect": "Allow",
"Sid": ""
}
]
}
EOF
}
resource "aws_appautoscaling_target" "main" {
service_namespace = "ecs"
resource_id = "service/${var.environment}/${var.name}"
scalable_dimension = "ecs:service:DesiredCount"
role_arn = "${aws_iam_role.ecs-role.arn}"
min_capacity = "${var.min_count}"
max_capacity = "${var.max_count}"
depends_on = [
"aws_ecs_service.service",
]
}
resource "null_resource" "alb_exists" {
triggers {
alb_name = "${aws_alb_target_group.main.id}"
}
}
main.tf for my ECS cluster module
module "s3-log-storage" {
source = "cloudposse/s3-log-storage/aws"
version = "0.1.3"
# insert the 3 required variables here
namespace = "mmt-ecs"
stage = "${var.environment}"
name = "logs-bucket"
policy = <<POLICY
{
"Id": "Policy1519319575520",
"Version": "2012-10-17",
"Statement": [
{
"Sid": "Stmt1519319570434",
"Action": [
"s3:PutObject",
"s3:PutObjectAcl",
"s3:PutObjectTagging",
"s3:PutObjectVersionAcl",
"s3:PutObjectVersionTagging"
],
"Effect": "Allow",
"Resource": "arn:aws:s3:::mmt-ecs-qa-logs-bucket/*",
"Principal": "*"
}
]
}
POLICY
}
module "network" {
source = "../network"
environment = "${var.environment}"
vpc_cidr = "${var.vpc_cidr}"
public_subnet_cidrs = "${var.public_subnet_cidrs}"
private_subnet_cidrs = "${var.private_subnet_cidrs}"
availability_zones = "${var.availability_zones}"
depends_id = ""
}
module "ecs_instances" {
source = "../ecs_instances"
environment = "${var.environment}"
cluster = "${var.cluster}"
instance_group = "${var.instance_group}"
private_subnet_ids = "${module.network.private_subnet_ids}"
aws_ami = "${var.ecs_aws_ami}"
instance_type = "${var.instance_type}"
max_size = "${var.max_size}"
min_size = "${var.min_size}"
desired_capacity = "${var.desired_capacity}"
vpc_id = "${module.network.vpc_id}"
iam_instance_profile_id = "${aws_iam_instance_profile.ecs.id}"
key_name = "${var.key_name}"
load_balancers = "${var.load_balancers}"
depends_id = "${module.network.depends_id}"
custom_userdata = "${var.custom_userdata}"
cloudwatch_prefix = "${var.cloudwatch_prefix}"
}
module "web-phoenix-service" {
source = "../services/web-phoenix"
environment = "${var.environment}"
vpc_id = "${module.network.vpc_id}"
public_subnet_ids = "${module.network.public_subnet_ids}"
name = "phoenix-web"
deployment_max_percent = "200"
deployment_min_healthy_percent = "100"
max_count = "2"
min_count = "1"
desired_count = "1"
ecs_service_role_name = "${aws_iam_instance_profile.ecs.id}"
access_log_bucket = "${module.s3-log-storage.bucket_id}"
access_log_prefix = "ALB"
}
resource "aws_ecs_cluster" "cluster" {
name = "${var.cluster}"
}
It seems the application health check (i.e. /healthz) is failing. You can start debugging the issue like this:
1) Spin up the container locally and check whether it is working. Per your health check info above, you should be able to access the application at something like http://someip:port/healthz
If this works:
2) Are you exposing port 80 while building the Docker image? Check the Dockerfile.
3) If the above two steps seem okay, try accessing your application using the ECS instance IP as soon as the task is running:
http://ecsinstanceip:port/healthz
4) If 3 also works, try increasing the health check timeout period so that the application gets more time to pass its health check (see the sketch after these steps).
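For step 4, a sketch of a more forgiving health check on the existing target group; the interval and timeout values are illustrative:

resource "aws_alb_target_group" "main" {
  name     = "tg${var.environment}${var.name}"
  port     = "80"
  protocol = "HTTP"
  vpc_id   = "${var.vpc_id}"
  health_check {
    healthy_threshold   = "3"
    unhealthy_threshold = "2"
    protocol            = "HTTP"
    path                = "/healthz"
    interval            = "60" # up from 30
    timeout             = "30" # up from 3; must stay below the interval
  }
}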
Clue 1
Make sure that the ECS container instance's security group is able to accept ports 1024-65535 inside the VPN (don't open it for the outside world)
Clue 2
In the task definition, specify the portMappings like this:
"portMappings": [
{
"hostPort": 0,
"protocol": "tcp",
"containerPort": 80
}
],
Note here:
containerPort is the port you expose from your container, where your app is listening with its health check.
hostPort is the port you bind for forwarding on the host. Leave it at 0 and it will be automatically assigned by ECS; that's why you need to open 1024-65535 on the SG. This is needed so you can run the same task definition multiple times on the same instance (scale horizontally).
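As a sketch, the security group rule for that ephemeral range could look like the following; aws_security_group.ecs_instances is a hypothetical name for whatever security group is attached to the ECS container instances, and aws_security_group.main is the ALB security group defined in the service module above:

resource "aws_security_group_rule" "alb_to_ecs_ephemeral" {
  type                     = "ingress"
  from_port                = 1024
  to_port                  = 65535
  protocol                 = "tcp"
  # hypothetical: the security group attached to the ECS container instances
  security_group_id        = "${aws_security_group.ecs_instances.id}"
  # restrict the source to the ALB so the range is not open to the outside world
  source_security_group_id = "${aws_security_group.main.id}"
}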