From 58592538d9e56601665c8b5ffccba99e64f20826 Mon Sep 17 00:00:00 2001 From: Irving Popovetsky Date: Sun, 31 Aug 2025 11:54:49 -0700 Subject: [PATCH 01/11] bringing down the API Signed-off-by: Irving Popovetsky --- terraform/alb.tf | 5 ++ terraform/apps.tf | 89 ++++++++++++++++---------------- terraform/asg.tf | 2 +- terraform/pybot/main.tf | 8 +++ terraform/python_backend/main.tf | 7 +++ 5 files changed, 66 insertions(+), 45 deletions(-) diff --git a/terraform/alb.tf b/terraform/alb.tf index 8277560..bd553e6 100644 --- a/terraform/alb.tf +++ b/terraform/alb.tf @@ -39,6 +39,11 @@ resource "aws_security_group" "lb_security_group" { resource "aws_lb" "ecs" { name_prefix = "oc" security_groups = [aws_security_group.lb_security_group.id] + access_logs { + bucket = "oc-alb-logs" + enabled = true + prefix = "2025" + } load_balancer_type = "application" internal = false diff --git a/terraform/apps.tf b/terraform/apps.tf index c2a7955..6fb6ef7 100644 --- a/terraform/apps.tf +++ b/terraform/apps.tf @@ -19,58 +19,58 @@ resource "aws_iam_role_policy_attachment" "ecs_task_execution_role_attach" { ################################################################################ # Backend Prod -module "python_backend_prod" { - source = "./python_backend" +# module "python_backend_prod" { +# source = "./python_backend" - env = "prod" - vpc_id = data.aws_vpc.use2.id - logs_group = aws_cloudwatch_log_group.ecslogs.name - ecs_cluster_id = module.ecs.cluster_id - task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn - image_tag = "master" -} +# env = "prod" +# vpc_id = data.aws_vpc.use2.id +# logs_group = aws_cloudwatch_log_group.ecslogs.name +# ecs_cluster_id = module.ecs.cluster_id +# task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn +# image_tag = "master" +# } -resource "aws_lb_listener_rule" "python_backend_prod" { - listener_arn = aws_lb_listener.default_https.arn +# resource "aws_lb_listener_rule" "python_backend_prod" { +# listener_arn = 
aws_lb_listener.default_https.arn - action { - type = "forward" - target_group_arn = module.python_backend_prod.lb_tg_arn - } +# action { +# type = "forward" +# target_group_arn = module.python_backend_prod.lb_tg_arn +# } - condition { - host_header { - values = ["backend.operationcode.org", "api.operationcode.org"] - } - } -} +# condition { +# host_header { +# values = ["backend.operationcode.org", "api.operationcode.org"] +# } +# } +# } # Backend Staging -module "python_backend_staging" { - source = "./python_backend" +# module "python_backend_staging" { +# source = "./python_backend" - env = "staging" - vpc_id = data.aws_vpc.use2.id - logs_group = aws_cloudwatch_log_group.ecslogs.name - ecs_cluster_id = module.ecs.cluster_id - task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn - image_tag = "staging" -} +# env = "staging" +# vpc_id = data.aws_vpc.use2.id +# logs_group = aws_cloudwatch_log_group.ecslogs.name +# ecs_cluster_id = module.ecs.cluster_id +# task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn +# image_tag = "staging" +# } -resource "aws_lb_listener_rule" "python_backend_staging" { - listener_arn = aws_lb_listener.default_https.arn +# resource "aws_lb_listener_rule" "python_backend_staging" { +# listener_arn = aws_lb_listener.default_https.arn - action { - type = "forward" - target_group_arn = module.python_backend_staging.lb_tg_arn - } +# action { +# type = "forward" +# target_group_arn = module.python_backend_staging.lb_tg_arn +# } - condition { - host_header { - values = ["backend-staging.operationcode.org", "api.staging.operationcode.org"] - } - } -} +# condition { +# host_header { +# values = ["backend-staging.operationcode.org", "api.staging.operationcode.org"] +# } +# } +# } # Redirector for shut down sites resource "aws_lb_listener_rule" "shutdown_sites_redirector" { @@ -91,9 +91,10 @@ resource "aws_lb_listener_rule" "shutdown_sites_redirector" { host_header { values = [ "resources.operationcode.org", - 
"resources.staging.operationcode.org", "resources-staging.operationcode.org", - "pybot.staging.operationcode.org", + "api.operationcode.org", + "backend-staging.operationcode.org", + "api.staging.operationcode.org", ] } } diff --git a/terraform/asg.tf b/terraform/asg.tf index 8b7fce0..62e523c 100644 --- a/terraform/asg.tf +++ b/terraform/asg.tf @@ -37,7 +37,7 @@ module "autoscaling" { { delete_on_termination = true device_index = 0 - associate_public_ip_address = false + associate_public_ip_address = true security_groups = [module.autoscaling_sg.security_group_id] } ] diff --git a/terraform/pybot/main.tf b/terraform/pybot/main.tf index ce9f4f7..24fe40a 100644 --- a/terraform/pybot/main.tf +++ b/terraform/pybot/main.tf @@ -52,6 +52,14 @@ resource "aws_ecs_task_definition" "pybot" { } } + # healthCheck = { + # command = ["CMD-SHELL", "wget -q http://localhost:5000/health || exit 1"] + # interval = 30 + # timeout = 5 + # retries = 3 + # startPeriod = 60 + # } + secrets = local.secrets_env mountPoints = [] diff --git a/terraform/python_backend/main.tf b/terraform/python_backend/main.tf index 0d3f79d..df7d5c2 100644 --- a/terraform/python_backend/main.tf +++ b/terraform/python_backend/main.tf @@ -53,6 +53,13 @@ resource "aws_ecs_task_definition" "python_backend" { } } + # healthCheck = { + # command = ["CMD-SHELL", "wget -q http://localhost:8000/healthz || exit 1"] + # interval = 30 + # timeout = 5 + # retries = 3 + # startPeriod = 60 + # } environment = [ { From dec0ddcbafb13a7d02d793fba4f708dc52935b19 Mon Sep 17 00:00:00 2001 From: Irving Popovetsky Date: Sun, 31 Aug 2025 12:44:58 -0700 Subject: [PATCH 02/11] Try to reduce spot availability issues Signed-off-by: Irving Popovetsky --- terraform/apps.tf | 2 +- terraform/asg.tf | 37 ++++++++++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/terraform/apps.tf b/terraform/apps.tf index 6fb6ef7..6a80234 100644 --- a/terraform/apps.tf +++ b/terraform/apps.tf @@ -93,7 +93,7 @@ 
resource "aws_lb_listener_rule" "shutdown_sites_redirector" { "resources.operationcode.org", "resources-staging.operationcode.org", "api.operationcode.org", - "backend-staging.operationcode.org", + "backend.operationcode.org", "api.staging.operationcode.org", ] } diff --git a/terraform/asg.tf b/terraform/asg.tf index 62e523c..8993b71 100644 --- a/terraform/asg.tf +++ b/terraform/asg.tf @@ -10,12 +10,39 @@ module "autoscaling" { version = "~> 6.5" name = "${local.name}-spot" - instance_type = "t3.small" min_size = 1 - max_size = 2 - desired_capacity = 1 - instance_market_options = { - market_type = "spot" + max_size = 4 + desired_capacity = 2 + + # Enable mixed instances policy + use_mixed_instances_policy = true + + # Mixed Instances Policy for better availability + mixed_instances_policy = { + instances_distribution = { + on_demand_base_capacity = 0 + on_demand_percentage_above_base_capacity = 0 + spot_allocation_strategy = "capacity-optimized" + } + + override = [ + { + instance_type = "t3.small" + weighted_capacity = "2" + }, + { + instance_type = "t3a.small" + weighted_capacity = "2" + }, + { + instance_type = "t3.micro" + weighted_capacity = "1" + }, + { + instance_type = "t3a.micro" + weighted_capacity = "1" + } + ] } image_id = jsondecode(data.aws_ssm_parameter.ecs_optimized_ami.value)["image_id"] From a05b488c7217ac55279419c1b345a8dc0409475a Mon Sep 17 00:00:00 2001 From: Irving Popovetsky Date: Mon, 1 Sep 2025 12:37:34 -0700 Subject: [PATCH 03/11] Switch to arm due to amd64 spot unreliability Signed-off-by: Irving Popovetsky --- terraform/apps.tf | 2 +- terraform/asg.tf | 37 +++++++++++++++++++++++++------------ 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/terraform/apps.tf b/terraform/apps.tf index 6a80234..9ec62b1 100644 --- a/terraform/apps.tf +++ b/terraform/apps.tf @@ -200,7 +200,7 @@ module "pybot_prod" { logs_group = aws_cloudwatch_log_group.ecslogs.name ecs_cluster_id = module.ecs.cluster_id task_execution_role = 
data.aws_iam_role.ecs_task_execution_role.arn - image_tag = "master" + image_tag = "latest" } resource "aws_lb_listener_rule" "pybot_prod" { diff --git a/terraform/asg.tf b/terraform/asg.tf index 8993b71..0a32938 100644 --- a/terraform/asg.tf +++ b/terraform/asg.tf @@ -1,7 +1,8 @@ # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-optimized_AMI.html#ecs-optimized-ami-linux data "aws_ssm_parameter" "ecs_optimized_ami" { - name = "/aws/service/ecs/optimized-ami/amazon-linux-2023/recommended" + # name = "/aws/service/ecs/optimized-ami/amazon-linux-2023/recommended" + name = "/aws/service/ecs/optimized-ami/amazon-linux-2023/arm64/recommended" } # https://registry.terraform.io/modules/terraform-aws-modules/autoscaling/aws/latest @@ -10,7 +11,7 @@ module "autoscaling" { version = "~> 6.5" name = "${local.name}-spot" - min_size = 1 + min_size = 2 max_size = 4 desired_capacity = 2 @@ -27,22 +28,34 @@ module "autoscaling" { override = [ { - instance_type = "t3.small" - weighted_capacity = "2" - }, - { - instance_type = "t3a.small" - weighted_capacity = "2" - }, - { - instance_type = "t3.micro" + instance_type = "t4g.small" weighted_capacity = "1" }, { - instance_type = "t3a.micro" + instance_type = "t4g.micro" weighted_capacity = "1" } ] + + #amd64 options + # override = [ + # { + # instance_type = "t3.small" + # weighted_capacity = "2" + # }, + # { + # instance_type = "t3a.small" + # weighted_capacity = "2" + # }, + # { + # instance_type = "t3.micro" + # weighted_capacity = "1" + # }, + # { + # instance_type = "t3a.micro" + # weighted_capacity = "1" + # } + # ] } image_id = jsondecode(data.aws_ssm_parameter.ecs_optimized_ami.value)["image_id"] From 6719605abb8b69786023dca42e62cf6a33c936d3 Mon Sep 17 00:00:00 2001 From: Irving Popovetsky Date: Mon, 1 Sep 2025 16:04:41 -0700 Subject: [PATCH 04/11] Update back-end image to the latest one in ECR if needed Signed-off-by: Irving Popovetsky --- terraform/apps.tf | 4 ++-- terraform/python_backend/main.tf | 2 +- 2 
files changed, 3 insertions(+), 3 deletions(-) diff --git a/terraform/apps.tf b/terraform/apps.tf index 9ec62b1..5d905ed 100644 --- a/terraform/apps.tf +++ b/terraform/apps.tf @@ -27,7 +27,7 @@ resource "aws_iam_role_policy_attachment" "ecs_task_execution_role_attach" { # logs_group = aws_cloudwatch_log_group.ecslogs.name # ecs_cluster_id = module.ecs.cluster_id # task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn -# image_tag = "master" +# image_tag = "latest" # } # resource "aws_lb_listener_rule" "python_backend_prod" { @@ -54,7 +54,7 @@ resource "aws_iam_role_policy_attachment" "ecs_task_execution_role_attach" { # logs_group = aws_cloudwatch_log_group.ecslogs.name # ecs_cluster_id = module.ecs.cluster_id # task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn -# image_tag = "staging" +# image_tag = "latest" # } # resource "aws_lb_listener_rule" "python_backend_staging" { diff --git a/terraform/python_backend/main.tf b/terraform/python_backend/main.tf index df7d5c2..2409aef 100644 --- a/terraform/python_backend/main.tf +++ b/terraform/python_backend/main.tf @@ -33,7 +33,7 @@ resource "aws_ecs_task_definition" "python_backend" { container_definitions = jsonencode([ { name = "python_backend_${var.env}" - image = "operationcode/back-end:${var.image_tag}" + image = "633607774026.dkr.ecr.us-east-2.amazonaws.com/back-end:${var.image_tag}" essential = true portMappings = [ From db0899ba333180ea3c8da77fe591ae40a07c93aa Mon Sep 17 00:00:00 2001 From: Irving Popovetsky Date: Sat, 6 Sep 2025 08:10:12 -0700 Subject: [PATCH 05/11] API still up for now Signed-off-by: Irving Popovetsky --- terraform/apps.tf | 44 +++++++++++++++----------------- terraform/asg.tf | 15 +++++++++++ terraform/pybot/main.tf | 16 ++++++------ terraform/python_backend/main.tf | 14 +++++----- 4 files changed, 51 insertions(+), 38 deletions(-) diff --git a/terraform/apps.tf b/terraform/apps.tf index 5d905ed..1596c56 100644 --- a/terraform/apps.tf +++ b/terraform/apps.tf @@ 
-19,31 +19,31 @@ resource "aws_iam_role_policy_attachment" "ecs_task_execution_role_attach" { ################################################################################ # Backend Prod -# module "python_backend_prod" { -# source = "./python_backend" +module "python_backend_prod" { + source = "./python_backend" -# env = "prod" -# vpc_id = data.aws_vpc.use2.id -# logs_group = aws_cloudwatch_log_group.ecslogs.name -# ecs_cluster_id = module.ecs.cluster_id -# task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn -# image_tag = "latest" -# } + env = "prod" + vpc_id = data.aws_vpc.use2.id + logs_group = aws_cloudwatch_log_group.ecslogs.name + ecs_cluster_id = module.ecs.cluster_id + task_execution_role = data.aws_iam_role.ecs_task_execution_role.arn + image_tag = "latest" +} -# resource "aws_lb_listener_rule" "python_backend_prod" { -# listener_arn = aws_lb_listener.default_https.arn +resource "aws_lb_listener_rule" "python_backend_prod" { + listener_arn = aws_lb_listener.default_https.arn -# action { -# type = "forward" -# target_group_arn = module.python_backend_prod.lb_tg_arn -# } + action { + type = "forward" + target_group_arn = module.python_backend_prod.lb_tg_arn + } -# condition { -# host_header { -# values = ["backend.operationcode.org", "api.operationcode.org"] -# } -# } -# } + condition { + host_header { + values = ["backend.operationcode.org", "api.operationcode.org"] + } + } +} # Backend Staging # module "python_backend_staging" { @@ -92,8 +92,6 @@ resource "aws_lb_listener_rule" "shutdown_sites_redirector" { values = [ "resources.operationcode.org", "resources-staging.operationcode.org", - "api.operationcode.org", - "backend.operationcode.org", "api.staging.operationcode.org", ] } diff --git a/terraform/asg.tf b/terraform/asg.tf index 0a32938..e3412b6 100644 --- a/terraform/asg.tf +++ b/terraform/asg.tf @@ -107,6 +107,21 @@ module "autoscaling" { # reduce cloudwatch costs enable_monitoring = false + # Enable essential autoscaling metrics 
+ enabled_metrics = [ + "GroupDesiredCapacity", + "GroupInServiceCapacity", + "GroupInServiceInstances", + "GroupMaxSize", + "GroupMinSize", + "GroupPendingCapacity", + "GroupPendingInstances", + "GroupTerminatingCapacity", + "GroupTerminatingInstances", + "GroupTotalCapacity", + "GroupTotalInstances" + ] + tags = local.tags } diff --git a/terraform/pybot/main.tf b/terraform/pybot/main.tf index 24fe40a..7469130 100644 --- a/terraform/pybot/main.tf +++ b/terraform/pybot/main.tf @@ -11,7 +11,7 @@ locals { # CHANGEME once infra scales up cpu = var.env == "prod" ? 256 : 256 - memory = var.env == "prod" ? 512 : 256 + memory = var.env == "prod" ? 256 : 128 count = var.env == "prod" ? 1 : 1 @@ -52,13 +52,13 @@ resource "aws_ecs_task_definition" "pybot" { } } - # healthCheck = { - # command = ["CMD-SHELL", "wget -q http://localhost:5000/health || exit 1"] - # interval = 30 - # timeout = 5 - # retries = 3 - # startPeriod = 60 - # } + healthCheck = { + command = ["CMD-SHELL", "wget -q -O /dev/null http://localhost:5000/health"] + interval = 30 + timeout = 5 + retries = 3 + startPeriod = 60 + } secrets = local.secrets_env diff --git a/terraform/python_backend/main.tf b/terraform/python_backend/main.tf index 2409aef..b8c72f8 100644 --- a/terraform/python_backend/main.tf +++ b/terraform/python_backend/main.tf @@ -53,13 +53,13 @@ resource "aws_ecs_task_definition" "python_backend" { } } - # healthCheck = { - # command = ["CMD-SHELL", "wget -q http://localhost:8000/healthz || exit 1"] - # interval = 30 - # timeout = 5 - # retries = 3 - # startPeriod = 60 - # } + healthCheck = { + command = ["CMD-SHELL", "wget -q -O /dev/null http://localhost:8000/healthz"] + interval = 30 + timeout = 5 + retries = 3 + startPeriod = 60 + } environment = [ { From ef20ed6fa4e91eddb9dcc2996d1aa9eb05f7227c Mon Sep 17 00:00:00 2001 From: Irving Popovetsky Date: Sat, 25 Oct 2025 16:55:00 -0700 Subject: [PATCH 06/11] Fix an issue where the pg driver was confused about the current timezone Signed-off-by: 
Irving Popovetsky --- terraform/python_backend/main.tf | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/terraform/python_backend/main.tf b/terraform/python_backend/main.tf index b8c72f8..da3e905 100644 --- a/terraform/python_backend/main.tf +++ b/terraform/python_backend/main.tf @@ -94,6 +94,14 @@ resource "aws_ecs_task_definition" "python_backend" { "name" : "DB_ENGINE", "value" : "django.db.backends.postgresql" }, + { + "name" : "TZ", + "value" : "UTC" + }, + { + "name" : "PGTZ", + "value" : "UTC" + }, ] secrets = local.secrets_env From 0e20d7b1155f8dc4151237a8a9c48fad8950cbbf Mon Sep 17 00:00:00 2001 From: Irving Popovetsky Date: Sun, 26 Oct 2025 08:49:19 -0700 Subject: [PATCH 07/11] switch to GHA because circleci is clearly wonky Signed-off-by: Irving Popovetsky --- .circleci/config.yml | 16 ------ .github/workflows/terraform.yml | 42 ++++++++++++++ dns/README.md | 11 ---- dns/operationcode.net/provider.tf | 8 --- dns/operationcode.net/records.tf | 13 ----- dns/operationcode.net/terraform.tfvars | 7 --- dns/operationcode.net/variables.tf | 4 -- dns/operationcode.org/provider.tf | 8 --- dns/operationcode.org/records.tf | 78 -------------------------- dns/operationcode.org/terraform.tfvars | 9 --- dns/operationcode.org/variables.tf | 14 ----- dns/terraform.tfvars | 12 ---- 12 files changed, 42 insertions(+), 180 deletions(-) delete mode 100644 .circleci/config.yml create mode 100644 .github/workflows/terraform.yml delete mode 100644 dns/README.md delete mode 100644 dns/operationcode.net/provider.tf delete mode 100644 dns/operationcode.net/records.tf delete mode 100644 dns/operationcode.net/terraform.tfvars delete mode 100644 dns/operationcode.net/variables.tf delete mode 100644 dns/operationcode.org/provider.tf delete mode 100644 dns/operationcode.org/records.tf delete mode 100644 dns/operationcode.org/terraform.tfvars delete mode 100644 dns/operationcode.org/variables.tf delete mode 100644 dns/terraform.tfvars diff --git a/.circleci/config.yml 
b/.circleci/config.yml deleted file mode 100644 index 6e2a58f..0000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,16 +0,0 @@ -version: '2.1' -orbs: - terraform: circleci/terraform@3.1 -workflows: - deploy_infrastructure: - jobs: - - terraform/fmt: - checkout: true - context: terraform - path: terraform - - terraform/validate: - checkout: true - context: terraform - path: terraform - requires: - - terraform/fmt diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml new file mode 100644 index 0000000..d1a7eb5 --- /dev/null +++ b/.github/workflows/terraform.yml @@ -0,0 +1,42 @@ +name: Terraform validation + +on: + push: + pull_request: + +jobs: + terraform-fmt: + name: Terraform Format + runs-on: ubuntu-latest + defaults: + run: + working-directory: terraform + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Terraform + uses: hashicorp/setup-terraform@v3 + + - name: Terraform Format + run: terraform fmt -check -recursive + + terraform-validate: + name: Terraform Validate + runs-on: ubuntu-latest + needs: terraform-fmt + defaults: + run: + working-directory: terraform + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Terraform + uses: hashicorp/setup-terraform@v3 + + - name: Terraform Init + run: terraform init -backend=false + + - name: Terraform Validate + run: terraform validate diff --git a/dns/README.md b/dns/README.md deleted file mode 100644 index e111996..0000000 --- a/dns/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Configure DNS records for Operation Code with Terraform - -### warning - -Changes to this portion of the repository will have effects on real resources, including production. - - -### tools - -Terragrunt - we're using Terragrunt as a wrapper in order to simplify running Terraform in a CI/CD pipeline. -Terratest - a test framework for Terraform code. 
diff --git a/dns/operationcode.net/provider.tf b/dns/operationcode.net/provider.tf deleted file mode 100644 index 028214d..0000000 --- a/dns/operationcode.net/provider.tf +++ /dev/null @@ -1,8 +0,0 @@ -/* -credentials sourced from DNSIMPLE_TOKEN and DNSIMPLE_ACCOUNT vars -*/ -provider "dnsimple" {} - -terraform { - backend "s3" {} -} diff --git a/dns/operationcode.net/records.tf b/dns/operationcode.net/records.tf deleted file mode 100644 index 6107264..0000000 --- a/dns/operationcode.net/records.tf +++ /dev/null @@ -1,13 +0,0 @@ -resource "dnsimple_record" "root" { - domain = "${var.hosted-zone}" - name = "" - type = "URL" - value = "https://operationcode.org" -} - -resource "dnsimple_record" "www" { - domain = "${var.hosted-zone}" - name = "www" - type = "URL" - value = "https://operationcode.org" -} diff --git a/dns/operationcode.net/terraform.tfvars b/dns/operationcode.net/terraform.tfvars deleted file mode 100644 index d156ed5..0000000 --- a/dns/operationcode.net/terraform.tfvars +++ /dev/null @@ -1,7 +0,0 @@ -terragrunt = { - include { - path = "${find_in_parent_folders()}" - } -} - -hosted-zone = "operationcode.net" \ No newline at end of file diff --git a/dns/operationcode.net/variables.tf b/dns/operationcode.net/variables.tf deleted file mode 100644 index afb350d..0000000 --- a/dns/operationcode.net/variables.tf +++ /dev/null @@ -1,4 +0,0 @@ -variable "hosted-zone" { - description = "Hosted zone" - type = "string" -} diff --git a/dns/operationcode.org/provider.tf b/dns/operationcode.org/provider.tf deleted file mode 100644 index 028214d..0000000 --- a/dns/operationcode.org/provider.tf +++ /dev/null @@ -1,8 +0,0 @@ -/* -credentials sourced from DNSIMPLE_TOKEN and DNSIMPLE_ACCOUNT vars -*/ -provider "dnsimple" {} - -terraform { - backend "s3" {} -} diff --git a/dns/operationcode.org/records.tf b/dns/operationcode.org/records.tf deleted file mode 100644 index 68a0d2e..0000000 --- a/dns/operationcode.org/records.tf +++ /dev/null @@ -1,78 +0,0 @@ -resource 
"dnsimple_record" "root" { - domain = "${var.hosted-zone}" - name = "" - type = "ALIAS" - value = "alias.zeit.co" - ttl = 60 -} - -resource "dnsimple_record" "www" { - domain = "${var.hosted-zone}" - name = "www" - type = "ALIAS" - value = "alias.zeit.co" - ttl = 60 -} - -resource "dnsimple_record" "www_txt" { - domain = "${var.hosted-zone}" - name = "_now" - type = "TXT" - value = "Qmd8XawRvuECtFLQm8SytbgcW2PV2jthtfMy7ujLnTN2gL" -} - -resource "dnsimple_record" "api" { - domain = "${var.hosted-zone}" - name = "api" - type = "CNAME" - value = "backend.k8s.operationcode.org" -} - -resource "dnsimple_record" "staging_api" { - domain = "${var.hosted-zone}" - name = "api.staging" - type = "CNAME" - value = "backend-staging.k8s.operationcode.org" -} - -resource "dnsimple_record" "pybot" { - domain = "${var.hosted-zone}" - name = "pybot" - type = "CNAME" - value = "${var.pybot-lb-ingress}" -} - -resource "dnsimple_record" "staging_pybot" { - domain = "${var.hosted-zone}" - name = "pybot.staging" - type = "CNAME" - value = "${var.pybot-lb-ingress}" -} - -resource "dnsimple_record" "staging_pybot_cert_verification" { - domain = "${var.hosted-zone}" - name = "_69b5c7278c7c13092899e1b67e8de6c1.pybot.staging" - type = "CNAME" - value = "_9fdd906cbd86d545894523f8cd809812.ltfvzjuylp.acm-validations.aws" -} - -resource "dnsimple_record" "qa_pybot_cert_verification" { - domain = "${var.hosted-zone}" - name = "_037e6ed502a36d6a457bafe40ec0f7e4.pybot.qa.operationcode.org." - type = "CNAME" - value = "_25bac267a2f0d12eb4cb47a487e276c5.gwpjclltnz.acm-validations.aws." 
-} - -resource "dnsimple_record" "resources_api" { - domain = "${var.hosted-zone}" - name = "resources" - type = "CNAME" - value = "resources.k8s.operationcode.org" -} - -resource "dnsimple_record" "resources_staging_api" { - domain = "${var.hosted-zone}" - name = "resources.staging" - type = "CNAME" - value = "resources-staging.k8s.operationcode.org" -} diff --git a/dns/operationcode.org/terraform.tfvars b/dns/operationcode.org/terraform.tfvars deleted file mode 100644 index 9cce439..0000000 --- a/dns/operationcode.org/terraform.tfvars +++ /dev/null @@ -1,9 +0,0 @@ -terragrunt = { - include { - path = "${find_in_parent_folders()}" - } -} - -hosted-zone = "operationcode.org" -k8s-cluster-ingress = "ac206d147f3ed11e7a802062a4d50822-1344197385.us-east-2.elb.amazonaws.com" -pybot-lb-ingress = "pyback-lb-197482116.us-east-2.elb.amazonaws.com" \ No newline at end of file diff --git a/dns/operationcode.org/variables.tf b/dns/operationcode.org/variables.tf deleted file mode 100644 index 225a95b..0000000 --- a/dns/operationcode.org/variables.tf +++ /dev/null @@ -1,14 +0,0 @@ -variable "hosted-zone" { - description = "Hosted zone" - type = "string" -} - -variable "k8s-cluster-ingress" { - description = "Load balancer URL for the Kubernetes ingress" - type = "string" -} - -variable "pybot-lb-ingress" { - description = "Load balancer URL for pybot subdomain ingress" - type = "string" -} \ No newline at end of file diff --git a/dns/terraform.tfvars b/dns/terraform.tfvars deleted file mode 100644 index 5bb50e4..0000000 --- a/dns/terraform.tfvars +++ /dev/null @@ -1,12 +0,0 @@ -terragrunt = { - remote_state { - backend = "s3" - config { - bucket = "operationcode-infra-config" - key = "operationcode_infra/dns/${path_relative_to_include()}/terraform.tfstate" - region = "us-east-2" - encrypt = true - dynamodb_table = "opcode-terraform-lock" - } - } -} \ No newline at end of file From 6279dfbe49801d76b761ca603247610382816983 Mon Sep 17 00:00:00 2001 From: Irving Popovetsky Date: 
Sun, 26 Oct 2025 10:25:00 -0700 Subject: [PATCH 08/11] another ipv6 attempt, failed. --- IPv6_MIGRATION_NOTES.md | 200 ++++++++++++++++++++++++++++++++++ terraform/.terraform.lock.hcl | 34 +++--- terraform/asg.tf | 11 +- terraform/main.tf | 2 +- 4 files changed, 224 insertions(+), 23 deletions(-) create mode 100644 IPv6_MIGRATION_NOTES.md diff --git a/IPv6_MIGRATION_NOTES.md b/IPv6_MIGRATION_NOTES.md new file mode 100644 index 0000000..3f812e3 --- /dev/null +++ b/IPv6_MIGRATION_NOTES.md @@ -0,0 +1,200 @@ +# IPv6 Migration Attempt and Module Upgrade Notes + +## Executive Summary + +**Attempted:** IPv6-only EC2 instances to save ~$7-14/month on public IPv4 charges +**Result:** **REVERTED** - Not viable without a NAT64 path (on AWS that means a paid NAT Gateway) +**Successful:** Module version upgrades (ASG, Security Group, AWS Provider) + +## What We Tried (October 2025) + +### 1. IPv6-Only Configuration Attempt + +**Modified:** `terraform/asg.tf` +- Disabled public IPv4: `associate_public_ip_address = false` +- Enabled IPv6: `ipv6_address_count = 1` +- Configured ECS agent and Docker for IPv6 + +**Infrastructure verified working:** +- ✅ VPC has IPv6 CIDR: `2600:1f16:78e:d400::/56` +- ✅ Subnets have IPv6 CIDRs with auto-assign enabled +- ✅ Route table: `::/0` → Internet Gateway +- ✅ DNS64 enabled on all subnets +- ✅ AWS dual-stack endpoints available: + - `ecs.us-east-2.api.aws` → `2600:1f70:6000:c0:...` + - `ecr.us-east-2.api.aws` → `2600:1f70:6000:80:...` + - `logs.us-east-2.api.aws` → `2600:1f70:6000:200:...` + +### Why It Failed + +**Root cause:** DNS64 was enabled but there was **no NAT64 path** - on AWS, NAT64 is performed only by a NAT Gateway, and this VPC routes `::/0` straight to the Internet Gateway + +**What this means:** +- **DNS64** (✅ enabled): Synthesizes AAAA answers for IPv4-only names using the `64:ff9b::/96` prefix +- **NAT64** (❌ not deployed): Translates the actual IPv6 packets to IPv4 for IPv4-only services; on AWS this needs a NAT Gateway plus a `64:ff9b::/96` route to it +- Result: Instances can resolve IPv4-only services to IPv6 addresses, but packets time out with no NAT64 gateway + +**Services that broke:** +- ❌ AWS SSM Agent (IPv4-only): `dial tcp 
[64:ff9b::392:b12]:443: i/o timeout` +- ❌ ECS container health checks failed +- ❌ Any IPv4-only external dependencies + +**Services that worked:** +- ✅ ECS control plane (has dual-stack endpoint) +- ✅ ECR (has dual-stack endpoint) +- ✅ CloudWatch Logs (has dual-stack endpoint) + +### 2. Terraform Module Version Upgrades (SUCCESSFUL) + +**Successfully Updated Modules:** + +| Module | Old Version | New Version | Status | +|--------|-------------|-------------|--------| +| `terraform-aws-modules/autoscaling/aws` | ~> 6.5 | ~> 8.0 | ✅ Applied | +| `terraform-aws-modules/security-group/aws` | ~> 4.0 | ~> 5.3 | ✅ Applied | +| AWS Provider | >= 4.6 | >= 5.0 | ✅ Applied | +| `terraform-aws-modules/ecs/aws` | ~> 4.0 | ~> 4.1 | ✅ Applied (kept at v4 to avoid cluster recreation) | + +**Why we didn't go further:** +- ECS v6.x: Breaking API changes (cluster recreation required) +- ASG v9.x: Breaking changes in `mixed_instances_policy` structure + +**Installed Versions:** +- AWS Provider: v5.100.0 +- ECS Module: v4.1.3 +- Autoscaling Module: v8.3.1 +- Security Group Module: v5.3.1 + +## Current Configuration (Post-Revert) + +**Final State:** +- ✅ Instances have public IPv4 (reverted from IPv6-only) +- ✅ Instances have IPv6 addresses +- ✅ Dual-stack networking +- ✅ Module upgrades applied +- ❌ No cost savings (still paying for public IPv4) + +**Configuration:** +```hcl +# terraform/asg.tf +network_interfaces = [ + { + associate_public_ip_address = true # Reverted to true + ipv6_address_count = 1 # Still have IPv6 + # ... + } +] + +# terraform/ecs.tf - user_data +# Standard ECS config, no IPv6-specific settings +``` + +## What Would Need to Change for IPv6-Only to Work + +**Waiting for AWS to provide:** + +1. **NAT64 without a paid NAT Gateway** + - AWS NAT Gateways already perform NAT64 for `64:ff9b::/96`, but are billed hourly plus per-GB + - Would allow IPv6-only instances to reach IPv4-only services + - **This is the blocker - a NAT Gateway costs more than the $7-14/month it would save** + +2. 
**Alternative: All services support dual-stack** + - Every AWS service with IPv6 endpoints + - Particularly: SSM, EC2 Messages, SSM Messages + - Currently only ECS, ECR, CloudWatch Logs, S3 support dual-stack + +**Self-managed workarounds we rejected:** + +1. **Deploy NAT64 on EC2** (Jool/Tayga software) + - Cost: ~$3-5/month + maintenance burden + - Complexity: High (setup, monitoring, SPOF) + - Not worth $7-14/month savings + +2. **VPC Endpoints for IPv4-only services** + - Cost: ~$7-10/month + - Would eliminate savings + - Previous testing showed higher cost than benefit + +3. **Disable SSM entirely** + - Lose remote management capability + - Not acceptable for production + +## Lessons Learned + +### What We Discovered + +1. **DNS64 ≠ NAT64** + - DNS64 only translates DNS queries, not actual traffic + - Need both DNS64 + NAT64 for IPv6-only to work + - AWS enables DNS64 per subnet, but NAT64 only runs on a NAT Gateway (not deployed here) + +2. **Docker IPv6 Configuration Issues** + - Enabling Docker IPv6 (`"ipv6": true`) broke dual-stack networking + - Caused container health check failures + - Required instance refresh to fix + +3. **AWS Service IPv6 Support is Inconsistent** + - Some services have dual-stack: ECS, ECR, CloudWatch, S3 + - Some services are IPv4-only: SSM, EC2 Messages + - Use `.api.aws` suffix for dual-stack endpoints when available + +4. **Cost-Benefit Analysis** + - Potential savings: ~$7-14/month (public IPv4 charges) + - VPC endpoint costs: ~$7-10/month (negates savings) + - Self-managed NAT64: High complexity for minimal savings + - **Conclusion:** Not worth the effort at this scale + +### Technical Details Documented + +**VPC IPv6 Configuration:** +- VPC CIDR: `2600:1f16:78e:d400::/56` +- Subnets: `2600:1f16:78e:d400::/64`, `d401::/64`, `d402::/64` +- DNS64 prefix: `64:ff9b::/96` +- Route: `::/0` → `igw-e39ab08a` + +**Error signatures to watch for:** +``` +dial tcp [64:ff9b::xxx:xxx]:443: i/o timeout +``` +This indicates DNS64 translation with no NAT64 gateway routing the `64:ff9b::/96` prefix. 
+ +## Future Retry Conditions + +**Only attempt IPv6-only again when ONE of these is true:** + +1. ✅ **Managed NAT64 becomes cheaper than the public IPv4 addresses** + - NAT Gateway already provides NAT64, but its hourly charge exceeds the savings + - Monitor AWS pricing and announcements for a lighter-weight NAT64 option + +2. ✅ **All required AWS services support dual-stack** + - Specifically need: SSM, EC2 Messages, SSM Messages with IPv6 + - Check: https://docs.aws.amazon.com/general/latest/gr/aws-ipv6-support.html + +3. ✅ **Public IPv4 costs exceed $20-30/month** + - At current scale (2-4 instances), savings too small + - If scale increases significantly, complexity might be worth it + +4. ✅ **VPC Endpoint costs drop significantly** + - If AWS reduces endpoint pricing below ~$3/month per endpoint + - Would make endpoint solution viable + +**How to check service IPv6 support:** +```bash +dig service-name.region.api.aws AAAA +short +# If returns IPv6 address, service supports dual-stack +``` + +## Rollback Summary + +**What we reverted:** +1. Changed `associate_public_ip_address` back to `true` +2. Removed IPv6-specific ECS agent configuration +3. Removed Docker IPv6 configuration +4. Triggered instance refresh to replace broken instances + +**What we kept:** +- IPv6 addressing (instances have both IPv4 and IPv6) +- Module version upgrades +- Updated security group module + +**Recovery time:** ~5 minutes for instance refresh to complete diff --git a/terraform/.terraform.lock.hcl b/terraform/.terraform.lock.hcl index e81272e..8cc010f 100644 --- a/terraform/.terraform.lock.hcl +++ b/terraform/.terraform.lock.hcl @@ -2,24 +2,24 @@ # Manual edits may be lost in future updates. 
provider "registry.terraform.io/hashicorp/aws" { - version = "5.53.0" - constraints = ">= 3.29.0, >= 4.6.0, >= 4.57.0" + version = "5.100.0" + constraints = ">= 3.29.0, >= 4.66.1, >= 5.0.0, >= 5.85.0, < 6.0.0" hashes = [ - "h1:ucNFgeMRknvGjwQrVf6FzR9I5kYpFxEl3F0MeVgloBw=", - "zh:2adad39412111d19a5195474d6b95577fc25ccf06d88a90019bee0efba33a1e3", - "zh:51226453a14f95b0d1163cfecafc9cf1a92ce5f66e42e6b4065d83a813836a2c", - "zh:62450fadb56db9c18d50bb8b7728a3d009be608d7ee0d4fe95c85ccb521dff83", - "zh:6f3ad977a9cc4800847c136690b1c0a0fd8437705062163d29dc4e9429598950", - "zh:71ca0a16b735b8d34b7127dd7d1e1e5d1eaac9c9f792e08abde291b5beb947d5", - "zh:7ae9cf4838eea80288305be0a3e69b39ffff86ede7b4319be421f06d32d04fb6", - "zh:93abc2db5ad995cfee014eb7446abc7caedc427e141d375a11993e6e199076b5", - "zh:9560b3424d97da804e98ee86b474b7370afefa09baf350cae7f33afb3f1aa209", + "h1:Ijt7pOlB7Tr7maGQIqtsLFbl7pSMIj06TVdkoSBcYOw=", + "zh:054b8dd49f0549c9a7cc27d159e45327b7b65cf404da5e5a20da154b90b8a644", + "zh:0b97bf8d5e03d15d83cc40b0530a1f84b459354939ba6f135a0086c20ebbe6b2", + "zh:1589a2266af699cbd5d80737a0fe02e54ec9cf2ca54e7e00ac51c7359056f274", + "zh:6330766f1d85f01ae6ea90d1b214b8b74cc8c1badc4696b165b36ddd4cc15f7b", + "zh:7c8c2e30d8e55291b86fcb64bdf6c25489d538688545eb48fd74ad622e5d3862", + "zh:99b1003bd9bd32ee323544da897148f46a527f622dc3971af63ea3e251596342", "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", - "zh:9eb57a9b649c217ac4eeb27af2a1935c18bd9bc8fb1be07434e7de74729eff46", - "zh:b5f32dcbe71ea22c2090eeeaec9af3e098d7b8c3e4491f34ffdfdc6f1c1abf81", - "zh:c9fbd5417f266c773055178e87bb4091df7f0542b72bf5ad0a4ae27045a2b7ca", - "zh:d518b3c52c8a9f79769dbe1b3683d25b4cdc8bfc77a3b3cd9c85f74e6c7383e1", - "zh:db741be21f32404bb87d73d25b1b7fd9b813b00aeb20a130ed8806d44dc26680", - "zh:ed1a8bb4d08653d87265ae534d6fc33bbdabae1608692a1ee364fce03548d36c", + "zh:9f8b909d3ec50ade83c8062290378b1ec553edef6a447c56dadc01a99f4eaa93", + 
"zh:aaef921ff9aabaf8b1869a86d692ebd24fbd4e12c21205034bb679b9caf883a2", + "zh:ac882313207aba00dd5a76dbd572a0ddc818bb9cbf5c9d61b28fe30efaec951e", + "zh:bb64e8aff37becab373a1a0cc1080990785304141af42ed6aa3dd4913b000421", + "zh:dfe495f6621df5540d9c92ad40b8067376350b005c637ea6efac5dc15028add4", + "zh:f0ddf0eaf052766cfe09dea8200a946519f653c384ab4336e2a4a64fdd6310e9", + "zh:f1b7e684f4c7ae1eed272b6de7d2049bb87a0275cb04dbb7cda6636f600699c9", + "zh:ff461571e3f233699bf690db319dfe46aec75e58726636a0d97dd9ac6e32fb70", ] } diff --git a/terraform/asg.tf b/terraform/asg.tf index e3412b6..90f6690 100644 --- a/terraform/asg.tf +++ b/terraform/asg.tf @@ -8,7 +8,7 @@ data "aws_ssm_parameter" "ecs_optimized_ami" { # https://registry.terraform.io/modules/terraform-aws-modules/autoscaling/aws/latest module "autoscaling" { source = "terraform-aws-modules/autoscaling/aws" - version = "~> 6.5" + version = "~> 8.0" # v9+ has breaking API changes in mixed_instances_policy name = "${local.name}-spot" min_size = 2 @@ -77,7 +77,8 @@ module "autoscaling" { { delete_on_termination = true device_index = 0 - associate_public_ip_address = true + associate_public_ip_address = true # set to False to use IPv6 only - still doesn't fully work with SSM and ECS as of Oct 2025 + ipv6_address_count = 1 # Assign one IPv6 address security_groups = [module.autoscaling_sg.security_group_id] } ] @@ -128,7 +129,7 @@ module "autoscaling" { # https://registry.terraform.io/modules/terraform-aws-modules/security-group/aws/latest module "autoscaling_sg" { source = "terraform-aws-modules/security-group/aws" - version = "~> 4.0" + version = "~> 5.3" # Latest version name = local.name description = "Autoscaling group security group" @@ -142,7 +143,7 @@ module "autoscaling_sg" { } ] - # Inbound all high ports from the alb + # Inbound all high ports from the alb (IPv4 and IPv6) ingress_with_source_security_group_id = [ { source_security_group_id = aws_security_group.lb_security_group.id @@ -152,7 +153,7 @@ module 
"autoscaling_sg" { } ] - egress_rules = ["all-all"] + egress_rules = ["all-all"] # Already includes IPv4 and IPv6 egress tags = local.tags } diff --git a/terraform/main.tf b/terraform/main.tf index 59a5398..3a5133a 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -4,7 +4,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 4.6" + version = ">= 5.0" # Updated from 4.6 to 5.0 } } } From dd4b426581695e879ccc07f9cd9dfe2ab8a5136b Mon Sep 17 00:00:00 2001 From: Irving Popovetsky Date: Sun, 26 Oct 2025 10:27:14 -0700 Subject: [PATCH 09/11] don't double-run GHA actions --- .github/workflows/terraform.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml index d1a7eb5..e8b0818 100644 --- a/.github/workflows/terraform.yml +++ b/.github/workflows/terraform.yml @@ -2,6 +2,8 @@ name: Terraform validation on: push: + branches: + - main pull_request: jobs: From 09924b66165df4b867c0ae4128ae0f5bb6870be3 Mon Sep 17 00:00:00 2001 From: Irving Popovetsky Date: Sun, 26 Oct 2025 10:28:26 -0700 Subject: [PATCH 10/11] fix comments failing fmt check Signed-off-by: Irving Popovetsky --- terraform/asg.tf | 10 +++++----- terraform/main.tf | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/terraform/asg.tf b/terraform/asg.tf index 90f6690..1bae699 100644 --- a/terraform/asg.tf +++ b/terraform/asg.tf @@ -8,7 +8,7 @@ data "aws_ssm_parameter" "ecs_optimized_ami" { # https://registry.terraform.io/modules/terraform-aws-modules/autoscaling/aws/latest module "autoscaling" { source = "terraform-aws-modules/autoscaling/aws" - version = "~> 8.0" # v9+ has breaking API changes in mixed_instances_policy + version = "~> 8.0" # v9+ has breaking API changes in mixed_instances_policy name = "${local.name}-spot" min_size = 2 @@ -77,8 +77,8 @@ module "autoscaling" { { delete_on_termination = true device_index = 0 - associate_public_ip_address = true # set to False to use IPv6 only - 
still doesn't fully work with SSM and ECS as of Oct 2025 - ipv6_address_count = 1 # Assign one IPv6 address + associate_public_ip_address = true # set to False to use IPv6 only - still doesn't fully work with SSM and ECS as of Oct 2025 + ipv6_address_count = 1 # Assign one IPv6 address security_groups = [module.autoscaling_sg.security_group_id] } ] @@ -129,7 +129,7 @@ module "autoscaling" { # https://registry.terraform.io/modules/terraform-aws-modules/security-group/aws/latest module "autoscaling_sg" { source = "terraform-aws-modules/security-group/aws" - version = "~> 5.3" # Latest version + version = "~> 5.3" # Latest version name = local.name description = "Autoscaling group security group" @@ -153,7 +153,7 @@ module "autoscaling_sg" { } ] - egress_rules = ["all-all"] # Already includes IPv4 and IPv6 egress + egress_rules = ["all-all"] # Already includes IPv4 and IPv6 egress tags = local.tags } diff --git a/terraform/main.tf b/terraform/main.tf index 3a5133a..c6e898a 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -4,7 +4,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.0" # Updated from 4.6 to 5.0 + version = ">= 5.0" # Updated from 4.6 to 5.0 } } } From 5ef505818813adebcd4b90b38b68f589a1291698 Mon Sep 17 00:00:00 2001 From: Irving Popovetsky Date: Sun, 26 Oct 2025 10:32:13 -0700 Subject: [PATCH 11/11] make the action more efficient --- .github/workflows/terraform.yml | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml index e8b0818..76719d8 100644 --- a/.github/workflows/terraform.yml +++ b/.github/workflows/terraform.yml @@ -7,8 +7,8 @@ on: pull_request: jobs: - terraform-fmt: - name: Terraform Format + terraform: + name: Terraform format and validate runs-on: ubuntu-latest defaults: run: @@ -23,20 +23,6 @@ jobs: - name: Terraform Format run: terraform fmt -check -recursive - terraform-validate: - name: 
Terraform Validate - runs-on: ubuntu-latest - needs: terraform-fmt - defaults: - run: - working-directory: terraform - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup Terraform - uses: hashicorp/setup-terraform@v3 - - name: Terraform Init run: terraform init -backend=false