Impossible to SSH to EC2 instance and unable to place ECS task - amazon-web-services

Given the following terraform.tf file:
provider "aws" {
profile = "default"
region = "us-east-1"
}
locals {
vpc_name = "some-vpc-name"
dev_vpn_source = "*.*.*.*/32" # Insted of * I have a CIDR block of our VPN here
}
resource "aws_vpc" "vpc" {
cidr_block = "10.0.0.0/16"
enable_dns_hostnames = true
tags = {
Name: local.vpc_name
}
}
resource "aws_subnet" "a" {
cidr_block = "10.0.0.0/17"
vpc_id = aws_vpc.vpc.id
tags = {
Name: "${local.vpc_name}-a"
}
}
resource "aws_subnet" "b" {
cidr_block = "10.0.128.0/17"
vpc_id = aws_vpc.vpc.id
tags = {
Name: "${local.vpc_name}-b"
}
}
resource "aws_security_group" "ssh" {
name = "${local.vpc_name}-ssh"
vpc_id = aws_vpc.vpc.id
tags = {
Name: "${local.vpc_name}-ssh"
}
}
resource "aws_security_group_rule" "ingress-ssh" {
from_port = 22
protocol = "ssh"
security_group_id = aws_security_group.ssh.id
to_port = 22
type = "ingress"
cidr_blocks = [local.dev_vpn_source]
description = "SSH access for developer"
}
resource "aws_security_group" "outbound" {
name = "${local.vpc_name}-outbound"
vpc_id = aws_vpc.vpc.id
tags = {
Name: "${local.vpc_name}-outbound"
}
}
resource "aws_security_group_rule" "egress" {
from_port = 0
protocol = "all"
security_group_id = aws_security_group.outbound.id
to_port = 65535
type = "egress"
cidr_blocks = ["0.0.0.0/0"]
description = "All outbound allowed"
}
module "ecs-clusters" {
source = "./ecs-clusters/"
subnets = [aws_subnet.a, aws_subnet.b]
vpc_name = local.vpc_name
security_groups = [aws_security_group.ssh, aws_security_group.outbound]
}
And the following ecs-clusters/ecs-cluster.tf file:
variable "vpc_name" {
type = string
}
variable "subnets" {
type = list(object({
id: string
}))
}
variable "security_groups" {
type = list(object({
id: string
}))
}
data "aws_ami" "amazon_linux_ecs" {
most_recent = true
owners = ["amazon"]
filter {
name = "name"
values = ["amzn2-ami-ecs*"]
}
}
resource "aws_iam_instance_profile" "ecs-launch-profile" {
name = "${var.vpc_name}-ecs"
role = "ecsInstanceRole"
}
resource "aws_launch_template" "ecs" {
name = "${var.vpc_name}-ecs"
image_id = data.aws_ami.amazon_linux_ecs.id
instance_type = "r5.4xlarge"
key_name = "some-ssh-key-name"
iam_instance_profile {
name = "${var.vpc_name}-ecs"
}
block_device_mappings {
device_name = "/dev/xvda"
ebs {
volume_type = "gp3"
volume_size = 1024
delete_on_termination = false
}
}
network_interfaces {
associate_public_ip_address = true
subnet_id = var.subnets[0].id
security_groups = var.security_groups[*].id
}
update_default_version = true
}
resource "aws_autoscaling_group" "ecs-autoscaling_group" {
name = "${var.vpc_name}-ecs"
vpc_zone_identifier = [for subnet in var.subnets: subnet.id]
desired_capacity = 1
max_size = 1
min_size = 1
protect_from_scale_in = true
launch_template {
id = aws_launch_template.ecs.id
version = aws_launch_template.ecs.latest_version
}
tag {
key = "Name"
propagate_at_launch = true
value = "${var.vpc_name}-ecs"
}
depends_on = [aws_launch_template.ecs]
}
resource "aws_ecs_capacity_provider" "ecs-capacity-provider" {
name = var.vpc_name
auto_scaling_group_provider {
auto_scaling_group_arn = aws_autoscaling_group.ecs-autoscaling_group.arn
managed_termination_protection = "ENABLED"
managed_scaling {
maximum_scaling_step_size = 1
minimum_scaling_step_size = 1
status = "ENABLED"
target_capacity = 1
}
}
depends_on = [aws_autoscaling_group.ecs-autoscaling_group]
}
resource "aws_ecs_cluster" "ecs-cluster" {
name = var.vpc_name
capacity_providers = [aws_ecs_capacity_provider.ecs-capacity-provider.name]
depends_on = [aws_ecs_capacity_provider.ecs-capacity-provider]
}
resource "aws_iam_role" "ecs-execution" {
name = "${var.vpc_name}-ecs-execution"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": "sts:AssumeRole",
"Principal": {
"Service": "ecs-tasks.amazonaws.com"
},
"Effect": "Allow",
"Sid": ""
}
]
}
EOF
}
resource "aws_iam_role" "ecs" {
name = "${var.vpc_name}-ecs"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": "sts:AssumeRole",
"Principal": {
"Service": "ecs-tasks.amazonaws.com"
},
"Effect": "Allow",
"Sid": ""
}
]
}
EOF
}
resource "aws_iam_role_policy_attachment" "execution-role" {
role = aws_iam_role.ecs-execution.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}
resource "aws_iam_role_policy_attachment" "role" {
role = aws_iam_role.ecs.name
policy_arn = "arn:aws:iam::aws:policy/AmazonS3FullAccess"
}
I'm facing two problems:
I can't SSH into EC2 instance created by the autoscaling group, despite the fact that I'm using the same SSH key and VPN to access other EC2 instances. My VPN client config includes route to the target machine via VPN gateway.
I can't execute task on the ESC cluster. The task gets stuck in provisioning status and then fails with "Unable to run task". The task is configured to use 1 GB of RAM and 1 vCPU.
What am I doing wrong?

Based on the comments.
There were two issues with the original setup:
Lack of connectivity to ECS and ECR services, which was solved by enabling internet access in the VPC. It is also possible to use VPC interface endpoints for ECS, ECR and S3, if the internet access is not desired.
Container instances did not register with ECS. This was fixed by using user_data to bootstrap ECS instances so that they can register with the ECS cluster.

Related

AWS ECS Fargate Container Failure

Issue:
I am deploying ECS Fargate using Terraform. When I deploy everything goes great, but then the task fails saying "Essential container in task exited". I expanded the task and it says exit code 134. I have tried upping the CPU and Memory, and triple checking the files, but I can't figure out what is wrong. Anyone have any advice?
Files:
fargate.tf
resource "aws_ecs_task_definition" "backend_task" {
family = "backend_example_app_family"
// Fargate is a type of ECS that requires awsvpc network_mode
requires_compatibilities = ["FARGATE"]
network_mode = "awsvpc"
// Valid sizes are shown here: https://aws.amazon.com/fargate/pricing/
memory = "1024"
cpu = "512"
// Fargate requires task definitions to have an execution role ARN to support ECR images
execution_role_arn = "${aws_iam_role.ecs_role.arn}"
container_definitions = <<EOT
[
{
"name": "example_app_container",
"image": "250531645249.dkr.ecr.us-east-1.amazonaws.com/ecr_example_repo:latest",
"memory": 512,
"essential": true,
"portMappings": [
{
"containerPort": 3000,
"hostPort": 3000
}
]
}
]
EOT
}
resource "aws_ecs_cluster" "backend_cluster" {
name = "backend_cluster_example_app"
}
resource "aws_ecs_service" "backend_service" {
name = "backend_service"
cluster = "${aws_ecs_cluster.backend_cluster.id}"
task_definition = "${aws_ecs_task_definition.backend_task.arn}"
launch_type = "FARGATE"
desired_count = 1
network_configuration {
subnets = ["${aws_subnet.public_a.id}", "${aws_subnet.public_b.id}"]
security_groups = ["${aws_security_group.security_group_example_app.id}"]
assign_public_ip = true
}
}
iam.tf
resource "aws_iam_role" "ecs_role" {
name = "ecs_role_example_app"
assume_role_policy = <<POLICY
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "",
"Effect": "Allow",
"Principal": {
"Service": "ecs-tasks.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
POLICY
}
resource "aws_iam_role_policy_attachment" "ecs_policy_attachment" {
role = "${aws_iam_role.ecs_role.name}"
// This policy adds logging + ecr permissions
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}
network.tf
resource "aws_vpc" "vpc_example_app" {
cidr_block = "10.0.0.0/16"
enable_dns_hostnames = true
enable_dns_support = true
}
resource "aws_subnet" "public_a" {
vpc_id = "${aws_vpc.vpc_example_app.id}"
cidr_block = "10.0.1.0/24"
availability_zone = "us-east-1a"
}
resource "aws_subnet" "public_b" {
vpc_id = "${aws_vpc.vpc_example_app.id}"
cidr_block = "10.0.2.0/24"
availability_zone = "us-east-1b"
}
resource "aws_internet_gateway" "internet_gateway" {
vpc_id = "${aws_vpc.vpc_example_app.id}"
}
resource "aws_route" "internet_access" {
route_table_id = "${aws_vpc.vpc_example_app.main_route_table_id}"
destination_cidr_block = "0.0.0.0/0"
gateway_id = "${aws_internet_gateway.internet_gateway.id}"
}
resource "aws_security_group" "security_group_example_app" {
name = "security_group_example_app"
description = "Allow TLS inbound traffic on port 80 (http)"
vpc_id = "${aws_vpc.vpc_example_app.id}"
ingress {
from_port = 80
to_port = 4000
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
}

NodeCreationFailure: Instances failed to join the kubernetes cluster

I'm hoping this is my final error before this works, but I'm getting this error
Error: error waiting for EKS Node Group (mvp-eks:mvp-node-group) to create: unexpected state 'CREATE_FAILED', wanted target 'ACTIVE'. last error: 1 error occurred:
│ * i-012d9a73b270a9af9, i-0e4530288f0bd2023, i-0ecfed4fe95fa3e3c: NodeCreationFailure: Instances failed to
join the kubernetes cluster
I'm assuming it's either a security group or some sort of networking error, but I have no clue what it could be. Here's my networking related resources- I'm trying to deploy the kubernetes nodes to private subnets, while I have a bastion host and nat gateway in the public ones.
resource "aws_vpc" "vpc" {
cidr_block = "10.1.0.0/16"
tags = {
Name = "${var.name}-vpc"
}
}
resource "aws_subnet" "public_subnet" {
count = length(var.azs)
vpc_id = aws_vpc.vpc.id
cidr_block = var.public_cidrs[count.index]
availability_zone = var.azs[count.index]
map_public_ip_on_launch = true
tags = {
Name = "${var.name}-public-subnet-${count.index + 1}"
}
}
resource "aws_subnet" "private_subnet" {
count = length(var.azs)
vpc_id = aws_vpc.vpc.id
cidr_block = var.private_cidrs[count.index]
availability_zone = var.azs[count.index]
map_public_ip_on_launch = false
tags = {
"kubernetes.io/cluster/${var.cluster_name}" = "shared"
"kubernetes.io/role/internal-elb" = "1"
}
}
resource "aws_internet_gateway" "internet_gateway" {
vpc_id = aws_vpc.vpc.id
tags = {
Name = "${var.name}-internet-gateway"
}
}
resource "aws_route_table" "public_rt" {
vpc_id = aws_vpc.vpc.id
tags = {
Name = "${var.name}-public-rt"
}
}
resource "aws_route" "default_route" {
route_table_id = aws_route_table.public_rt.id
destination_cidr_block = "0.0.0.0/0"
gateway_id = aws_internet_gateway.internet_gateway.id
}
resource "aws_route_table_association" "public_assoc" {
count = length(var.public_cidrs)
subnet_id = aws_subnet.public_subnet[count.index].id
route_table_id = aws_route_table.public_rt.id
}
resource "aws_eip" "nat_eip" {
count = length(var.public_cidrs)
vpc = true
depends_on = [aws_internet_gateway.internet_gateway]
tags = {
Name = "${var.name}-nat-eip-${count.index + 1}"
}
}
resource "aws_nat_gateway" "nat_gateway" {
count = length(var.public_cidrs)
allocation_id = aws_eip.nat_eip[count.index].id
subnet_id = aws_subnet.public_subnet[count.index].id
depends_on = [aws_internet_gateway.internet_gateway]
tags = {
Name = "${var.name}-NAT-gateway-${count.index + 1}"
}
}
Here's my eks cluster and node-group resources
resource "aws_iam_role" "eks_cluster" {
name = "${var.name}-eks-cluster-role"
assume_role_policy = <<POLICY
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "eks.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
POLICY
}
resource "aws_iam_role_policy_attachment" "amazon_eks_cluster_policy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
role = aws_iam_role.eks_cluster.name
}
resource "aws_eks_cluster" "eks" {
name = var.cluster_name
role_arn = aws_iam_role.eks_cluster.arn
## k8s Version
version = var.k8s_version
vpc_config {
endpoint_private_access = true
endpoint_public_access = false
subnet_ids = [
aws_subnet.private_subnet[0].id,
aws_subnet.private_subnet[1].id,
aws_subnet.private_subnet[2].id,
]
}
depends_on = [
aws_iam_role_policy_attachment.amazon_eks_cluster_policy
]
}
resource "aws_iam_role" "nodes_eks" {
name = "role-node-group-eks"
assume_role_policy = <<POLICY
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "ec2.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
POLICY
}
resource "aws_iam_role_policy_attachment" "amazon_eks_worker_node_policy_eks" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
role = aws_iam_role.nodes_eks.name
}
resource "aws_iam_role_policy_attachment" "amazon_eks_cni_policy_eks" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
role = aws_iam_role.nodes_eks.name
}
resource "aws_iam_role_policy_attachment" "amazon_ec2_container_registry_read_only" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
role = aws_iam_role.nodes_eks.name
}
resource "aws_eks_node_group" "nodes_eks" {
cluster_name = aws_eks_cluster.eks.name
node_group_name = "${var.name}-node-group"
node_role_arn = aws_iam_role.nodes_eks.arn
subnet_ids = [
aws_subnet.private_subnet[0].id,
aws_subnet.private_subnet[1].id,
aws_subnet.private_subnet[2].id,
]
remote_access {
ec2_ssh_key = aws_key_pair.bastion_auth.id
}
scaling_config {
desired_size = 3
max_size = 6
min_size = 3
}
ami_type = "AL2_x86_64"
capacity_type = "ON_DEMAND"
disk_size = 20
instance_types = [var.instance_type]
labels = {
role = "nodes-group-1"
}
version = var.k8s_version
depends_on = [
aws_iam_role_policy_attachment.amazon_eks_worker_node_policy_eks,
aws_iam_role_policy_attachment.amazon_eks_cni_policy_eks,
aws_iam_role_policy_attachment.amazon_ec2_container_registry_read_only,
]
}
I followed multiple guides that used all of these iam policies so I think they have the right permissions, but who knows at this point. I can't tell if it's an IAM permission issue or a networking issue

I'm struggling to deploy my EKS node/node-group using terraform

I was getting this error first
NodeCreationFailure: Instances failed to
join the kubernetes cluster
and I didn't have my private subnets tagged right. I found examples online where they tagged their vpc and subnet a certain way, so I copied that and now I'm getting this error
Error: Cycle: aws_eks_cluster.eks, aws_subnet.private_subnet
This is frustrating, but here's my main.tf file condensed to all of the relevant resource blocks. This is my entire vpc section, since I feel like it could be anything in here based off other posts. Also for context, I'm trying to deploy the cluster inside private subnets.
resource "aws_vpc" "vpc" {
cidr_block = "10.1.0.0/16"
tags = {
"kubernetes.io/cluster/${var.cluster_name}" = "shared"
}
}
resource "aws_subnet" "public_subnet" {
count = length(var.azs)
vpc_id = aws_vpc.vpc.id
cidr_block = var.public_cidrs[count.index]
availability_zone = var.azs[count.index]
map_public_ip_on_launch = true
tags = {
Name = "${var.name}-public-subnet-${count.index + 1}"
}
}
resource "aws_subnet" "private_subnet" {
count = length(var.azs)
vpc_id = aws_vpc.vpc.id
cidr_block = var.private_cidrs[count.index]
availability_zone = var.azs[count.index]
map_public_ip_on_launch = false
tags = {
"kubernetes.io/cluster/${aws_eks_cluster.eks.name}" = "shared"
"kubernetes.io/role/internal-elb" = "1"
}
}
resource "aws_internet_gateway" "internet_gateway" {
vpc_id = aws_vpc.vpc.id
tags = {
Name = "${var.name}-internet-gateway"
}
}
resource "aws_route_table" "public_rt" {
vpc_id = aws_vpc.vpc.id
tags = {
Name = "${var.name}-public-rt"
}
}
resource "aws_route" "default_route" {
route_table_id = aws_route_table.public_rt.id
destination_cidr_block = "0.0.0.0/0"
gateway_id = aws_internet_gateway.internet_gateway.id
}
resource "aws_route_table_association" "public_assoc" {
count = length(var.public_cidrs)
subnet_id = aws_subnet.public_subnet[count.index].id
route_table_id = aws_route_table.public_rt.id
}
resource "aws_eip" "nat_eip" {
count = length(var.public_cidrs)
vpc = true
depends_on = [aws_internet_gateway.internet_gateway]
tags = {
Name = "${var.name}-nat-eip-${count.index + 1}"
}
}
resource "aws_nat_gateway" "nat_gateway" {
count = length(var.public_cidrs)
allocation_id = aws_eip.nat_eip[count.index].id
subnet_id = aws_subnet.public_subnet[count.index].id
depends_on = [aws_internet_gateway.internet_gateway]
tags = {
Name = "${var.name}-NAT-gateway-${count.index + 1}"
}
}
Here's all of my source blocks related to my cluster and nodes
resource "aws_iam_role" "eks_cluster" {
name = "${var.name}-eks-cluster-role"
assume_role_policy = <<POLICY
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "eks.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
POLICY
}
resource "aws_iam_role_policy_attachment" "amazon_eks_cluster_policy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
role = aws_iam_role.eks_cluster.name
}
resource "aws_eks_cluster" "eks" {
name = var.cluster_name
role_arn = aws_iam_role.eks_cluster.arn
## k8s Version
version = var.k8s_version
vpc_config {
endpoint_private_access = true
endpoint_public_access = false
subnet_ids = [
aws_subnet.private_subnet[0].id,
aws_subnet.private_subnet[1].id,
aws_subnet.private_subnet[2].id,
]
}
depends_on = [
aws_iam_role_policy_attachment.amazon_eks_cluster_policy
]
}
resource "aws_iam_role" "nodes_eks" {
name = "role-node-group-eks"
assume_role_policy = <<POLICY
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "ec2.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
POLICY
}
resource "aws_iam_role_policy_attachment" "amazon_eks_worker_node_policy_eks" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
role = aws_iam_role.nodes_eks.name
}
resource "aws_iam_role_policy_attachment" "amazon_eks_cni_policy_eks" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
role = aws_iam_role.nodes_eks.name
}
resource "aws_iam_role_policy_attachment" "amazon_ec2_container_registry_read_only" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
role = aws_iam_role.nodes_eks.name
}
resource "aws_eks_node_group" "nodes_eks" {
cluster_name = aws_eks_cluster.eks.name
node_group_name = "${var.name}-node-group"
node_role_arn = aws_iam_role.nodes_eks.arn
subnet_ids = [
aws_subnet.private_subnet[0].id,
aws_subnet.private_subnet[1].id,
aws_subnet.private_subnet[2].id,
]
remote_access {
ec2_ssh_key = aws_key_pair.bastion_auth.id
}
scaling_config {
desired_size = 3
max_size = 6
min_size = 3
}
ami_type = "AL2_x86_64"
capacity_type = "ON_DEMAND"
disk_size = 20
instance_types = [var.instance_type]
labels = {
role = "nodes-group-1"
}
version = var.k8s_version
depends_on = [
aws_iam_role_policy_attachment.amazon_eks_worker_node_policy_eks,
aws_iam_role_policy_attachment.amazon_eks_cni_policy_eks,
aws_iam_role_policy_attachment.amazon_ec2_container_registry_read_only,
]
}
In the private-subnet resource, you are referencing your EKS cluster in the tag: ${aws_eks_cluster.eks.name}, which creates a dependency for this resource on EKS cluster.
resource "aws_subnet" "private_subnet" {
count = length(var.azs)
vpc_id = aws_vpc.vpc.id
cidr_block = var.private_cidrs[count.index]
availability_zone = var.azs[count.index]
map_public_ip_on_launch = false
tags = {
"kubernetes.io/cluster/${aws_eks_cluster.eks.name}" = "shared" <- this creates dependency
"kubernetes.io/role/internal-elb" = "1"
}
}
On the other side, you are referencing the same private subnet, when you create the EKS cluster, which now creates a dependency for this resource on the private subnet.
resource "aws_eks_cluster" "eks" {
name = var.cluster_name
role_arn = aws_iam_role.eks_cluster.arn
## k8s Version
version = var.k8s_version
vpc_config {
endpoint_private_access = true
endpoint_public_access = false
subnet_ids = [
aws_subnet.private_subnet[0].id, <- this creates dependency
aws_subnet.private_subnet[1].id, <- this creates dependency
aws_subnet.private_subnet[2].id, <- this creates dependency
]
}
depends_on = [
aws_iam_role_policy_attachment.amazon_eks_cluster_policy
]
}
And in a result, you get a dependency cycle that causes your error.
To solve it, update the tag for the private subnet to:
tags = {
"kubernetes.io/cluster/${var.cluster_name}" = "shared"
"kubernetes.io/role/internal-elb" = "1"
}

Error creating EKS node-group with terraform

While I am trying to deploy EKS via Terraform, I am facing an error with node-group creation.
I am getting the following error:
Error: error waiting for EKS Node Group (Self-Hosted-Runner:Self-Hosted-Runner-default-node-group) to create:
unexpected state 'CREATE_FAILED', wanted target 'ACTIVE'.
last error: 1 error occurred:i-04db15f25be4212fb, i-07bd88adabaa103c0, i-0915982ac0f217fe4:
NodeCreationFailure: Instances failed to join the kubernetes cluster.
with module.eks.aws_eks_node_group.eks-node-group,
│ on ../../modules/aws/eks/eks-node-group.tf line 1, in resource "aws_eks_node_group" "eks-node-group":
│ 1: resource "aws_eks_node_group" "eks-node-group" {
EKS
# EKS Cluster Resources
resource "aws_eks_cluster" "eks" {
name = var.cluster-name
version = var.k8s-version
role_arn = aws_iam_role.cluster.arn
vpc_config {
security_group_ids = [var.security_group]
subnet_ids = var.private_subnets
}
enabled_cluster_log_types = var.eks-cw-logging
depends_on = [
aws_iam_role_policy_attachment.cluster-AmazonEKSClusterPolicy,
aws_iam_role_policy_attachment.cluster-AmazonEKSServicePolicy,
]
}
EKS-NODE-GROUP
resource "aws_eks_node_group" "eks-node-group" {
cluster_name = var.cluster-name
node_group_name = "${var.cluster-name}-default-node-group"
node_role_arn = aws_iam_role.node.arn
subnet_ids = var.private_subnets
capacity_type = "SPOT"
node_group_name_prefix = null #"Creates a unique name beginning with the specified prefix. Conflicts with node_group_name"
scaling_config {
desired_size = var.desired-capacity
max_size = var.max-size
min_size = var.min-size
}
update_config {
max_unavailable = 1
}
instance_types = [var.node-instance-type]
# Ensure that IAM Role permissions are created before and deleted after EKS Node Group handling.
# Otherwise, EKS will not be able to properly delete EC2 Instances and Elastic Network Interfaces.
depends_on = [
aws_eks_cluster.eks,
aws_iam_role_policy_attachment.node-AmazonEKSWorkerNodePolicy,
aws_iam_role_policy_attachment.node-AmazonEKS_CNI_Policy
]
tags = {
Name = "${var.cluster-name}-default-node-group"
}
}
IAM
# IAM
# CLUSTER
resource "aws_iam_role" "cluster" {
name = "${var.cluster-name}-eks-cluster-role"
assume_role_policy = <<POLICY
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "eks.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
POLICY
}
resource "aws_iam_role_policy_attachment" "cluster-AmazonEKSClusterPolicy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
role = aws_iam_role.cluster.name
}
resource "aws_iam_role_policy_attachment" "cluster-AmazonEKSServicePolicy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSServicePolicy"
role = aws_iam_role.cluster.name
}
# NODES
resource "aws_iam_role" "node" {
name = "${var.cluster-name}-eks-node-role"
assume_role_policy = <<POLICY
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "ec2.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
POLICY
}
resource "aws_iam_role_policy_attachment" "node-AmazonEKSWorkerNodePolicy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
role = aws_iam_role.node.name
}
resource "aws_iam_role_policy_attachment" "node-AmazonEKS_CNI_Policy" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
role = aws_iam_role.node.name
}
resource "aws_iam_role_policy_attachment" "node-AmazonEC2ContainerRegistryReadOnly" {
policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
role = aws_iam_role.node.name
}
resource "aws_iam_instance_profile" "node" {
name = "${var.cluster-name}-eks-node-instance-profile"
role = aws_iam_role.node.name
}
Security Group
# Create Security Group
resource "aws_security_group" "cluster" {
name = "terraform_cluster"
description = "AWS security group for terraform"
vpc_id = aws_vpc.vpc1.id
# Input
ingress {
from_port = "1"
to_port = "65365"
protocol = "TCP"
cidr_blocks = [var.address_allowed, var.vpc1_cidr_block]
}
# Output
egress {
from_port = 0 # any port
to_port = 0 # any port
protocol = "-1" # any protocol
cidr_blocks = ["0.0.0.0/0"] # any destination
}
# ICMP Ping
ingress {
from_port = -1
to_port = -1
protocol = "icmp"
cidr_blocks = [var.address_allowed, var.vpc1_cidr_block]
}
tags = merge(
{
Name = "onboarding-sg",
},
var.tags,
)
}
VPC
# Create VPC
resource "aws_vpc" "vpc1" {
cidr_block = var.vpc1_cidr_block
instance_tenancy = "default"
enable_dns_support = true
enable_dns_hostnames = true
tags = merge(
{
Name = "onboarding-vpc",
},
var.tags,
)
}
# Subnet Public
resource "aws_subnet" "subnet_public1" {
vpc_id = aws_vpc.vpc1.id
cidr_block = var.subnet_public1_cidr_block[0]
map_public_ip_on_launch = "true" #it makes this a public subnet
availability_zone = data.aws_availability_zones.available.names[0]
tags = merge(
{
Name = "onboarding-public-sub",
"kubernetes.io/role/elb" = "1"
},
var.tags,
)
}
# Subnet Private
resource "aws_subnet" "subnet_private1" {
for_each = { for idx, cidr_block in var.subnet_private1_cidr_block: cidr_block => idx}
vpc_id = aws_vpc.vpc1.id
cidr_block = each.key
map_public_ip_on_launch = "false" //it makes this a public subnet
availability_zone = data.aws_availability_zones.available.names[each.value]
tags = merge(
{
Name = "onboarding-private-sub",
"kubernetes.io/role/internal-elb" = "1",
"kubernetes.io/cluster/${var.cluster-name}" = "owned"
},
var.tags,
)
}
tfvars
#General vars
region = "eu-west-1"
#Bucket vars
bucket = "tf-state"
tag_name = "test"
tag_environment = "Dev"
acl = "private"
versioning_enabled = "Enabled"
# Network EKS vars
aws_public_key_path = "~/.ssh/id_rsa.pub"
aws_key_name = "aws-k8s"
address_allowed = "/32" # Office public IP Address
vpc1_cidr_block = "10.0.0.0/16"
subnet_public1_cidr_block = ["10.0.128.0/20", "10.0.144.0/20", "10.0.160.0/20"]
subnet_private1_cidr_block = ["10.0.0.0/19", "10.0.32.0/19", "10.0.64.0/19"]
tags = {
Scost = "testing",
Terraform = "true",
Environment = "testing"
}
#EKS
cluster-name = "Self-Hosted-Runner"
k8s-version = "1.21"
node-instance-type = "t3.medium"
desired-capacity = "3"
max-size = "7"
min-size = "1"
# db-subnet-cidr = ["10.0.192.0/21", "10.0.200.0/21", "10.0.208.0/21"]
eks-cw-logging = ["api", "audit", "authenticator", "controllerManager", "scheduler"]
ec2-key-public-key = ""
"issues" : [ {
"code" : "NodeCreationFailure",
"message" : "Instances failed to join the kubernetes cluster",
What do you think I missed configured?

ECR VPC endpoint for fargate not working as expected

I am using Fargate for a task that runs every hour. As the docker image size is 1.5go, I want to use a ECR VPC endpoint to optimize the AWS data transfer fee.
The fargate tasks run in a private subnet. The route table of one of the private subnet is the following (where eigw is eigress only internet gateway and nat-01 is a nat gateway in the public subnet):
Destination Target
10.50.0.0/16 local
0.0.0.0/0 nat-01ec80c2754229321
2a05:d012:43e:de00::/56 local
::/0 eigw-0a0c583a8390d5736
Expected behavior: Right now, the Fargate task takes around 1 minute to start, due to the time it takes to docker pull the image. I expect that with the ECR VPC endpoint, the time it takes to start would go down.
Actual behavior: There is not a single second difference, which means I probably did something wrong!
My terraform setup:
The VPC, subnets and route tables:
module "vpc" {
source = "terraform-aws-modules/vpc/aws"
version = "~> 2.21.0"
name = "dev-vpc"
cidr = "10.10.0.0/16"
azs = ["eu-west-3a", "eu-west-3b"]
private_subnets = ["10.10.0.0/20", "10.10.32.0/20"]
public_subnets = ["10.10.128.0/20", "10.10.160.0/20"]
enable_nat_gateway = true
single_nat_gateway = true
reuse_nat_ips = false
enable_vpn_gateway = false
enable_dns_hostnames = true
create_database_subnet_group = true
enable_ipv6 = true
assign_ipv6_address_on_creation = true
private_subnet_assign_ipv6_address_on_creation = false
public_subnet_ipv6_prefixes = [0, 1]
private_subnet_ipv6_prefixes = [2, 3]
database_subnet_ipv6_prefixes = [4, 5]
database_subnets = ["10.10.64.0/20", "10.10.80.0/20"]
tags = {
ManagedByTerraform = "true"
EnvironmentType = "dev"
}
}
# the SG for the VPC endpoints
resource "aws_security_group" "vpce" {
name = "dev-vpce-sg"
vpc_id = module.vpc.vpc_id
ingress {
from_port = 443
to_port = 443
protocol = "tcp"
cidr_blocks = [module.vpc.vpc_cidr_block]
}
tags = {
Environment = "dev"
}
}
# all the VPC endpoints needed (from AWS documentation)
resource "aws_vpc_endpoint" "ecr_endpoint" {
vpc_id = module.vpc.vpc_id
private_dns_enabled = true
service_name = "com.amazonaws.eu-west-3.ecr.dkr"
vpc_endpoint_type = "Interface"
security_group_ids = [
aws_security_group.vpce.id,
]
subnet_ids = module.vpc.private_subnets
tags = {
Name = "dkr-endpoint"
Environment = "dev"
}
}
resource "aws_vpc_endpoint" "ecr_api_endpoint" {
vpc_id = module.vpc.vpc_id
private_dns_enabled = true
service_name = "com.amazonaws.eu-west-3.ecr.api"
vpc_endpoint_type = "Interface"
security_group_ids = [
aws_security_group.vpce.id,
]
subnet_ids = module.vpc.private_subnets
tags = {
Name = "ecr-api-endpoint"
Environment = "dev"
}
}
resource "aws_vpc_endpoint" "s3" {
vpc_id = module.vpc.vpc_id
service_name = "com.amazonaws.eu-west-3.s3"
vpc_endpoint_type = "Gateway"
route_table_ids = module.vpc.private_route_table_ids
policy = <<-EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "",
"Effect": "Allow",
"Principal": "*",
"Action": [
"s3:PutObjectAcl",
"s3:PutObject",
"s3:ListBucket",
"s3:GetObject",
"s3:Delete*"
],
"Resource": [
"arn:aws:s3:::prod-eu-west-3-starport-layer-bucket",
"arn:aws:s3:::prod-eu-west-3-starport-layer-bucket/*"
]
}
]
}
EOF
tags = {
Name = "s3-endpoint"
Environment = "dev"
}
}
resource "aws_vpc_endpoint" "logs" {
vpc_id = module.vpc.vpc_id
private_dns_enabled = true
service_name = "com.amazonaws.eu-west-3.logs"
vpc_endpoint_type = "Interface"
security_group_ids = [
aws_security_group.vpce.id,
]
subnet_ids = module.vpc.private_subnets
tags = {
Name = "logs-endpoint"
Environment = "dev"
}
}
resource "aws_vpc_endpoint" "ecs_agent" {
vpc_id = module.vpc.vpc_id
private_dns_enabled = true
service_name = "com.amazonaws.eu-west-3.ecs-agent"
vpc_endpoint_type = "Interface"
security_group_ids = [
aws_security_group.vpce.id,
]
subnet_ids = module.vpc.private_subnets
tags = {
Name = "ecs-agent"
Environment = "dev"
}
}
resource "aws_vpc_endpoint" "ecs_telemetry" {
vpc_id = module.vpc.vpc_id
private_dns_enabled = true
service_name = "com.amazonaws.eu-west-3.ecs-telemetry"
vpc_endpoint_type = "Interface"
security_group_ids = [
aws_security_group.vpce.id,
]
subnet_ids = module.vpc.private_subnets
tags = {
Name = "telemetry"
Environment = "dev"
}
}
resource "aws_vpc_endpoint" "ecs_endpoint" {
vpc_id = module.vpc.vpc_id
private_dns_enabled = true
service_name = "com.amazonaws.eu-west-3.ecs"
vpc_endpoint_type = "Interface"
security_group_ids = [
aws_security_group.vpce.id,
]
subnet_ids = module.vpc.private_subnets
tags = {
Name = "ecs-endpoint"
Environment = "dev"
}
}
Can you let me know what can be wrong in my setup?
I have zero knowledge in network engineering, so please let me know if you need further information.