diff --git a/README.md b/README.md
index 6e5994a305..9d04e55425 100644
--- a/README.md
+++ b/README.md
@@ -214,6 +214,7 @@ Join our discord community via [this invite link](https://discord.gg/bxgXW8jJGh)
| [runners\_scale\_up\_lambda\_memory\_size](#input\_runners\_scale\_up\_lambda\_memory\_size) | Memory size limit in MB for scale-up lambda. | `number` | `512` | no |
| [runners\_scale\_up\_lambda\_timeout](#input\_runners\_scale\_up\_lambda\_timeout) | Time out for the scale up lambda in seconds. | `number` | `30` | no |
| [runners\_ssm\_housekeeper](#input\_runners\_ssm\_housekeeper) | Configuration for the SSM housekeeper lambda. This lambda deletes token / JIT config from SSM.
`schedule_expression`: is used to configure the schedule for the lambda.
`enabled`: enable or disable the lambda trigger via the EventBridge.
`lambda_memory_size`: lambda memery size limit.
`lambda_timeout`: timeout for the lambda in seconds.
`config`: configuration for the lambda function. Token path will be read by default from the module. |
object({
schedule_expression = optional(string, "rate(1 day)")
enabled = optional(bool, true)
lambda_memory_size = optional(number, 512)
lambda_timeout = optional(number, 60)
config = object({
tokenPath = optional(string)
minimumDaysOld = optional(number, 1)
dryRun = optional(bool, false)
})
}) | {
"config": {}
} | no |
+| [scale\_down\_parameter\_store\_tier](#input\_scale\_down\_parameter\_store\_tier) | SSM Parameter Store tier to use for scale-down configuration parameters. | `string` | `"Standard"` | no |
| [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. | `string` | `"cron(*/5 * * * ? *)"` | no |
| [scale\_up\_reserved\_concurrent\_executions](#input\_scale\_up\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no |
| [ssm\_paths](#input\_ssm\_paths) | The root path used in SSM to store configuration and secrets. | object({
root = optional(string, "github-action-runners")
app = optional(string, "app")
runners = optional(string, "runners")
webhook = optional(string, "webhook")
use_prefix = optional(bool, true)
}) | `{}` | no |
diff --git a/examples/multi-runner/main.tf b/examples/multi-runner/main.tf
index acbcdb8081..e1d83cd26e 100644
--- a/examples/multi-runner/main.tf
+++ b/examples/multi-runner/main.tf
@@ -98,7 +98,6 @@ module "runners" {
# runner_extra_labels = ["amazon"]
# runners_maximum_count = 1
# enable_ephemeral_runners = true
- # scale_down_schedule_expression = "cron(* * * * ? *)"
# }
# }
# }
@@ -107,6 +106,7 @@ module "runners" {
subnet_ids = module.base.vpc.private_subnets
runners_scale_up_lambda_timeout = 60
runners_scale_down_lambda_timeout = 60
+ scale_down_schedule_expression = "cron(* * * * ? *)"
prefix = local.environment
tags = {
Project = "ProjectX"
diff --git a/examples/multi-runner/templates/runner-configs/linux-arm64.yaml b/examples/multi-runner/templates/runner-configs/linux-arm64.yaml
index 0c6cae01b5..bb45f6df40 100644
--- a/examples/multi-runner/templates/runner-configs/linux-arm64.yaml
+++ b/examples/multi-runner/templates/runner-configs/linux-arm64.yaml
@@ -19,7 +19,6 @@ runner_config:
id_ssm_parameter_arn: ${ami_id_ssm_parameter_arn}
runners_maximum_count: 1
delay_webhook_event: 0
- scale_down_schedule_expression: cron(* * * * ? *)
runner_hook_job_started: |
echo "Running pre job hook as $(whoami)"
runner_hook_job_completed: |
diff --git a/examples/multi-runner/templates/runner-configs/linux-x64-ubuntu-2204.yaml b/examples/multi-runner/templates/runner-configs/linux-x64-ubuntu-2204.yaml
index 2b1ac15ee8..b1530bee84 100644
--- a/examples/multi-runner/templates/runner-configs/linux-x64-ubuntu-2204.yaml
+++ b/examples/multi-runner/templates/runner-configs/linux-x64-ubuntu-2204.yaml
@@ -19,7 +19,6 @@ runner_config:
- m5a.large
runners_maximum_count: 1
delay_webhook_event: 0
- scale_down_schedule_expression: cron(* * * * ? *)
userdata_template: ./templates/user-data.sh
ami:
owners:
diff --git a/examples/multi-runner/templates/runner-configs/linux-x64-ubuntu.yaml b/examples/multi-runner/templates/runner-configs/linux-x64-ubuntu.yaml
index 8ae700d570..dca632afbc 100644
--- a/examples/multi-runner/templates/runner-configs/linux-x64-ubuntu.yaml
+++ b/examples/multi-runner/templates/runner-configs/linux-x64-ubuntu.yaml
@@ -20,7 +20,6 @@ runner_config:
- m5a.large
runners_maximum_count: 1
delay_webhook_event: 0
- scale_down_schedule_expression: cron(* * * * ? *)
userdata_template: ./templates/user-data.sh
ami:
owners:
diff --git a/examples/multi-runner/templates/runner-configs/linux-x64.yaml b/examples/multi-runner/templates/runner-configs/linux-x64.yaml
index 146c340836..91c564a14c 100644
--- a/examples/multi-runner/templates/runner-configs/linux-x64.yaml
+++ b/examples/multi-runner/templates/runner-configs/linux-x64.yaml
@@ -21,7 +21,6 @@ runner_config:
enable_on_demand_failover_for_errors: ['InsufficientInstanceCapacity']
create_service_linked_role_spot: true
delay_webhook_event: 0
- scale_down_schedule_expression: cron(* * * * ? *)
runner_metadata_options:
instance_metadata_tags: disabled
http_endpoint: enabled
diff --git a/examples/multi-runner/templates/runner-configs/windows-x64.yaml b/examples/multi-runner/templates/runner-configs/windows-x64.yaml
index fdf8be6533..3d13dd612a 100644
--- a/examples/multi-runner/templates/runner-configs/windows-x64.yaml
+++ b/examples/multi-runner/templates/runner-configs/windows-x64.yaml
@@ -13,7 +13,6 @@ runner_config:
- c5.large
runners_maximum_count: 1
delay_webhook_event: 5
- scale_down_schedule_expression: cron(* * * * ? *)
runner_boot_time_in_minutes: 20
ami_filter:
name:
diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts
index 6779dd39d2..c5815eaff2 100644
--- a/lambdas/functions/control-plane/src/aws/runners.ts
+++ b/lambdas/functions/control-plane/src/aws/runners.ts
@@ -298,8 +298,7 @@ async function createInstances(
}
// If launchTime is undefined, this will return false
-export function bootTimeExceeded(ec2Runner: { launchTime?: Date }): boolean {
- const runnerBootTimeInMinutes = process.env.RUNNER_BOOT_TIME_IN_MINUTES;
+export function bootTimeExceeded(ec2Runner: { launchTime?: Date }, runnerBootTimeInMinutes: number): boolean {
const launchTimePlusBootTime = moment(ec2Runner.launchTime).utc().add(runnerBootTimeInMinutes, 'minutes');
return launchTimePlusBootTime < moment(new Date()).utc();
}
diff --git a/lambdas/functions/control-plane/src/modules.d.ts b/lambdas/functions/control-plane/src/modules.d.ts
index 7570f29035..ca5f166ec9 100644
--- a/lambdas/functions/control-plane/src/modules.d.ts
+++ b/lambdas/functions/control-plane/src/modules.d.ts
@@ -14,8 +14,9 @@ declare namespace NodeJS {
PARAMETER_GITHUB_APP_CLIENT_SECRET_NAME: string;
PARAMETER_GITHUB_APP_ID_NAME: string;
PARAMETER_GITHUB_APP_KEY_BASE64_NAME: string;
+ RUNNER_BOOT_TIME_IN_MINUTES: string;
RUNNER_OWNER: string;
- SCALE_DOWN_CONFIG: string;
+ SCALE_DOWN_CONFIG_SSM_PATH_PREFIX: string;
SSM_TOKEN_PATH: string;
SSM_CLEANUP_CONFIG: string;
SUBNET_IDS: string;
diff --git a/lambdas/functions/control-plane/src/pool/pool.ts b/lambdas/functions/control-plane/src/pool/pool.ts
index 162a7d0f6d..0bbc976ed6 100644
--- a/lambdas/functions/control-plane/src/pool/pool.ts
+++ b/lambdas/functions/control-plane/src/pool/pool.ts
@@ -37,6 +37,7 @@ export async function adjust(event: PoolEvent): Promiseobject({
enable = optional(bool, false)
namespace = optional(string, "GitHub Runners")
metric = optional(object({
enable_github_app_rate_limit = optional(bool, true)
enable_job_retry = optional(bool, true)
enable_spot_termination_warning = optional(bool, true)
}), {})
}) | `{}` | no |
-| [multi\_runner\_config](#input\_multi\_runner\_config) | multi\_runner\_config = {map(object({
runner_config = object({
runner_os = string
runner_architecture = string
runner_metadata_options = optional(map(any), {
instance_metadata_tags = "enabled"
http_endpoint = "enabled"
http_tokens = "required"
http_put_response_hop_limit = 1
})
ami = optional(object({
filter = optional(map(list(string)), { state = ["available"] })
owners = optional(list(string), ["amazon"])
id_ssm_parameter_arn = optional(string, null)
kms_key_arn = optional(string, null)
}), null) # Defaults to null, in which case the module falls back to individual AMI variables (deprecated)
# Deprecated: Use ami object instead
ami_filter = optional(map(list(string)), { state = ["available"] })
ami_owners = optional(list(string), ["amazon"])
ami_id_ssm_parameter_name = optional(string, null)
ami_kms_key_arn = optional(string, "")
create_service_linked_role_spot = optional(bool, false)
credit_specification = optional(string, null)
delay_webhook_event = optional(number, 30)
disable_runner_autoupdate = optional(bool, false)
ebs_optimized = optional(bool, false)
enable_ephemeral_runners = optional(bool, false)
enable_job_queued_check = optional(bool, null)
enable_on_demand_failover_for_errors = optional(list(string), [])
enable_organization_runners = optional(bool, false)
enable_runner_binaries_syncer = optional(bool, true)
enable_ssm_on_runners = optional(bool, false)
enable_userdata = optional(bool, true)
instance_allocation_strategy = optional(string, "lowest-price")
instance_max_spot_price = optional(string, null)
instance_target_capacity_type = optional(string, "spot")
instance_types = list(string)
job_queue_retention_in_seconds = optional(number, 86400)
minimum_running_time_in_minutes = optional(number, null)
pool_runner_owner = optional(string, null)
runner_as_root = optional(bool, false)
runner_boot_time_in_minutes = optional(number, 5)
runner_disable_default_labels = optional(bool, false)
runner_extra_labels = optional(list(string), [])
runner_group_name = optional(string, "Default")
runner_name_prefix = optional(string, "")
runner_run_as = optional(string, "ec2-user")
runners_maximum_count = number
runner_additional_security_group_ids = optional(list(string), [])
scale_down_schedule_expression = optional(string, "cron(*/5 * * * ? *)")
scale_up_reserved_concurrent_executions = optional(number, 1)
userdata_template = optional(string, null)
userdata_content = optional(string, null)
enable_jit_config = optional(bool, null)
enable_runner_detailed_monitoring = optional(bool, false)
enable_cloudwatch_agent = optional(bool, true)
cloudwatch_config = optional(string, null)
userdata_pre_install = optional(string, "")
userdata_post_install = optional(string, "")
runner_hook_job_started = optional(string, "")
runner_hook_job_completed = optional(string, "")
runner_ec2_tags = optional(map(string), {})
runner_iam_role_managed_policy_arns = optional(list(string), [])
vpc_id = optional(string, null)
subnet_ids = optional(list(string), null)
idle_config = optional(list(object({
cron = string
timeZone = string
idleCount = number
evictionStrategy = optional(string, "oldest_first")
})), [])
cpu_options = optional(object({
core_count = number
threads_per_core = number
}), null)
runner_log_files = optional(list(object({
log_group_name = string
prefix_log_group = bool
file_path = string
log_stream_name = string
})), null)
block_device_mappings = optional(list(object({
delete_on_termination = optional(bool, true)
device_name = optional(string, "/dev/xvda")
encrypted = optional(bool, true)
iops = optional(number)
kms_key_id = optional(string)
snapshot_id = optional(string)
throughput = optional(number)
volume_size = number
volume_type = optional(string, "gp3")
})), [{
volume_size = 30
}])
pool_config = optional(list(object({
schedule_expression = string
schedule_expression_timezone = optional(string)
size = number
})), [])
job_retry = optional(object({
enable = optional(bool, false)
delay_in_seconds = optional(number, 300)
delay_backoff = optional(number, 2)
lambda_memory_size = optional(number, 256)
lambda_timeout = optional(number, 30)
max_attempts = optional(number, 1)
}), {})
})
matcherConfig = object({
labelMatchers = list(list(string))
exactMatch = optional(bool, false)
priority = optional(number, 999)
})
redrive_build_queue = optional(object({
enabled = bool
maxReceiveCount = number
}), {
enabled = false
maxReceiveCount = null
})
})) | n/a | yes |
+| [multi\_runner\_config](#input\_multi\_runner\_config) | multi\_runner\_config = {map(object({
runner_config = object({
runner_os = string
runner_architecture = string
runner_metadata_options = optional(map(any), {
instance_metadata_tags = "enabled"
http_endpoint = "enabled"
http_tokens = "required"
http_put_response_hop_limit = 1
})
ami = optional(object({
filter = optional(map(list(string)), { state = ["available"] })
owners = optional(list(string), ["amazon"])
id_ssm_parameter_arn = optional(string, null)
kms_key_arn = optional(string, null)
}), null) # Defaults to null, in which case the module falls back to individual AMI variables (deprecated)
# Deprecated: Use ami object instead
ami_filter = optional(map(list(string)), { state = ["available"] })
ami_owners = optional(list(string), ["amazon"])
ami_id_ssm_parameter_name = optional(string, null)
ami_kms_key_arn = optional(string, "")
create_service_linked_role_spot = optional(bool, false)
credit_specification = optional(string, null)
delay_webhook_event = optional(number, 30)
disable_runner_autoupdate = optional(bool, false)
ebs_optimized = optional(bool, false)
enable_ephemeral_runners = optional(bool, false)
enable_job_queued_check = optional(bool, null)
enable_on_demand_failover_for_errors = optional(list(string), [])
enable_organization_runners = optional(bool, false)
enable_runner_binaries_syncer = optional(bool, true)
enable_ssm_on_runners = optional(bool, false)
enable_userdata = optional(bool, true)
instance_allocation_strategy = optional(string, "lowest-price")
instance_max_spot_price = optional(string, null)
instance_target_capacity_type = optional(string, "spot")
instance_types = list(string)
job_queue_retention_in_seconds = optional(number, 86400)
minimum_running_time_in_minutes = optional(number, null)
pool_runner_owner = optional(string, null)
runner_as_root = optional(bool, false)
runner_boot_time_in_minutes = optional(number, 5)
runner_disable_default_labels = optional(bool, false)
runner_extra_labels = optional(list(string), [])
runner_group_name = optional(string, "Default")
runner_name_prefix = optional(string, "")
runner_run_as = optional(string, "ec2-user")
runners_maximum_count = number
runner_additional_security_group_ids = optional(list(string), [])
scale_up_reserved_concurrent_executions = optional(number, 1)
userdata_template = optional(string, null)
userdata_content = optional(string, null)
enable_jit_config = optional(bool, null)
enable_runner_detailed_monitoring = optional(bool, false)
enable_cloudwatch_agent = optional(bool, true)
cloudwatch_config = optional(string, null)
userdata_pre_install = optional(string, "")
userdata_post_install = optional(string, "")
runner_hook_job_started = optional(string, "")
runner_hook_job_completed = optional(string, "")
runner_ec2_tags = optional(map(string), {})
runner_iam_role_managed_policy_arns = optional(list(string), [])
vpc_id = optional(string, null)
subnet_ids = optional(list(string), null)
idle_config = optional(list(object({
cron = string
timeZone = string
idleCount = number
evictionStrategy = optional(string, "oldest_first")
})), [])
cpu_options = optional(object({
core_count = number
threads_per_core = number
}), null)
runner_log_files = optional(list(object({
log_group_name = string
prefix_log_group = bool
file_path = string
log_stream_name = string
})), null)
block_device_mappings = optional(list(object({
delete_on_termination = optional(bool, true)
device_name = optional(string, "/dev/xvda")
encrypted = optional(bool, true)
iops = optional(number)
kms_key_id = optional(string)
snapshot_id = optional(string)
throughput = optional(number)
volume_size = number
volume_type = optional(string, "gp3")
})), [{
volume_size = 30
}])
pool_config = optional(list(object({
schedule_expression = string
schedule_expression_timezone = optional(string)
size = number
})), [])
job_retry = optional(object({
enable = optional(bool, false)
delay_in_seconds = optional(number, 300)
delay_backoff = optional(number, 2)
lambda_memory_size = optional(number, 256)
lambda_timeout = optional(number, 30)
max_attempts = optional(number, 1)
}), {})
})
matcherConfig = object({
labelMatchers = list(list(string))
exactMatch = optional(bool, false)
priority = optional(number, 999)
})
redrive_build_queue = optional(object({
enabled = bool
maxReceiveCount = number
}), {
enabled = false
maxReceiveCount = null
})
})) | n/a | yes |
| [pool\_lambda\_reserved\_concurrent\_executions](#input\_pool\_lambda\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no |
| [pool\_lambda\_timeout](#input\_pool\_lambda\_timeout) | Time out for the pool lambda in seconds. | `number` | `60` | no |
| [prefix](#input\_prefix) | The prefix used for naming resources | `string` | `"github-actions"` | no |
@@ -170,6 +171,8 @@ module "multi-runner" {
| [runners\_scale\_up\_lambda\_timeout](#input\_runners\_scale\_up\_lambda\_timeout) | Time out for the scale up lambda in seconds. | `number` | `30` | no |
| [runners\_ssm\_housekeeper](#input\_runners\_ssm\_housekeeper) | Configuration for the SSM housekeeper lambda. This lambda deletes token / JIT config from SSM.object({
schedule_expression = optional(string, "rate(1 day)")
enabled = optional(bool, true)
lambda_memory_size = optional(number, 512)
lambda_timeout = optional(number, 60)
config = object({
tokenPath = optional(string)
minimumDaysOld = optional(number, 1)
dryRun = optional(bool, false)
})
}) | {
"config": {}
} | no |
| [scale\_down\_lambda\_memory\_size](#input\_scale\_down\_lambda\_memory\_size) | Memory size limit in MB for scale down. | `number` | `512` | no |
+| [scale\_down\_parameter\_store\_tier](#input\_scale\_down\_parameter\_store\_tier) | SSM Parameter Store tier used to store consolidated scale-down configuration. | `string` | `"Standard"` | no |
+| [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. | `string` | `"cron(*/5 * * * ? *)"` | no |
| [scale\_up\_lambda\_memory\_size](#input\_scale\_up\_lambda\_memory\_size) | Memory size limit in MB for scale\_up lambda. | `number` | `512` | no |
| [ssm\_paths](#input\_ssm\_paths) | The root path used in SSM to store configuration and secreets. | object({
root = optional(string, "github-action-runners")
app = optional(string, "app")
runners = optional(string, "runners")
webhook = optional(string, "webhook")
}) | `{}` | no |
| [state\_event\_rule\_binaries\_syncer](#input\_state\_event\_rule\_binaries\_syncer) | Option to disable EventBridge Lambda trigger for the binary syncer, useful to stop automatic updates of binary distribution | `string` | `"ENABLED"` | no |
@@ -196,6 +199,7 @@ module "multi-runner" {
| [instance\_termination\_handler](#output\_instance\_termination\_handler) | n/a |
| [instance\_termination\_watcher](#output\_instance\_termination\_watcher) | n/a |
| [runners\_map](#output\_runners\_map) | n/a |
+| [scale\_down](#output\_scale\_down) | Lambda to scale-down runners |
| [ssm\_parameters](#output\_ssm\_parameters) | n/a |
| [webhook](#output\_webhook) | n/a |
diff --git a/modules/multi-runner/main.tf b/modules/multi-runner/main.tf
index 905cc7f793..38b9286af4 100644
--- a/modules/multi-runner/main.tf
+++ b/modules/multi-runner/main.tf
@@ -17,6 +17,8 @@ locals {
unique_os_and_arch = { for i, v in local.tmp_distinct_list_unique_os_and_arch : "${v.os_type}_${v.architecture}" => v }
ssm_root_path = "/${var.ssm_paths.root}/${var.prefix}"
+
+ scale_down_parameter_path_prefix = "${local.ssm_root_path}/scale-down"
}
resource "random_string" "random" {
@@ -24,3 +26,53 @@ resource "random_string" "random" {
special = false
upper = false
}
+
+locals { # per-runner settings handed to module.scale_down via its `environments` input
+ scale_down_environment_configs = [
+ for k, v in local.runner_config : {
+ environment = "${var.prefix}-${k}" # module prefix joined with the runner configuration key
+ idle_config = v.runner_config.idle_config
+ minimum_running_time_in_minutes = coalesce(
+ v.runner_config.minimum_running_time_in_minutes,
+ v.runner_config.runner_os == "windows" ? 15 : 5 # fallback when unset: 15 for windows runners, 5 otherwise
+ )
+ runner_boot_time_in_minutes = v.runner_config.runner_boot_time_in_minutes
+ }
+ ]
+}
+
+module "scale_down" { # scale-down lambda instantiated once by multi-runner for all runner configurations
+ source = "../runners/scale-down"
+
+ environments = local.scale_down_environment_configs
+ prefix = var.prefix
+ schedule_expression = var.scale_down_schedule_expression
+ ssm_parameter_path_prefix = local.scale_down_parameter_path_prefix
+ scale_down_parameter_store_tier = var.scale_down_parameter_store_tier
+
+ github_app_parameters = local.github_app_parameters
+ lambda_s3_bucket = var.lambda_s3_bucket
+ runners_lambda_s3_key = var.runners_lambda_s3_key
+ runners_lambda_s3_object_version = var.runners_lambda_s3_object_version
+ lambda_runtime = var.lambda_runtime
+ lambda_timeout = var.runners_scale_down_lambda_timeout
+ lambda_memory_size = var.scale_down_lambda_memory_size
+ lambda_architecture = var.lambda_architecture
+ lambda_zip = var.runners_lambda_zip
+ lambda_subnet_ids = var.lambda_subnet_ids
+ lambda_security_group_ids = var.lambda_security_group_ids
+ lambda_tags = var.lambda_tags
+ tracing_config = var.tracing_config
+ logging_retention_in_days = var.logging_retention_in_days
+ logging_kms_key_id = var.logging_kms_key_id
+ kms_key_arn = var.kms_key_arn != null ? var.kms_key_arn : "" # not coalesce(): it errors when every argument is null or ""
+ ghes_url = var.ghes_url
+ ghes_ssl_verify = var.ghes_ssl_verify
+ user_agent = var.user_agent
+ log_level = var.log_level
+ metrics = var.metrics
+ role_path = var.role_path
+ role_permissions_boundary = var.role_permissions_boundary
+ aws_partition = var.aws_partition
+ tags = local.tags
+}
diff --git a/modules/multi-runner/outputs.tf b/modules/multi-runner/outputs.tf
index 2f2b1d3458..137f3efcc2 100644
--- a/modules/multi-runner/outputs.tf
+++ b/modules/multi-runner/outputs.tf
@@ -7,13 +7,10 @@ output "runners_map" {
launch_template_ami_id = runner.launch_template.image_id
lambda_up = runner.lambda_scale_up
lambda_up_log_group = runner.lambda_scale_up_log_group
- lambda_down = runner.lambda_scale_down
- lambda_down_log_group = runner.lambda_scale_down_log_group
lambda_pool = runner.lambda_pool
lambda_pool_log_group = runner.lambda_pool_log_group
role_runner = runner.role_runner
role_scale_up = runner.role_scale_up
- role_scale_down = runner.role_scale_down
role_pool = runner.role_pool
runners_log_groups = runner.runners_log_groups
logfiles = runner.logfiles
@@ -21,6 +18,17 @@ output "runners_map" {
}
}
+output "scale_down" {
+ description = "Lambda to scale-down runners"
+ value = {
+ lambda = module.scale_down.lambda
+ lambda_log_group = module.scale_down.lambda_log_group
+ role = module.scale_down.role
+ event_rule = module.scale_down.cloudwatch_event_rule
+ ssm_parameters = module.scale_down.ssm_parameters
+ }
+}
+
output "binaries_syncer_map" {
value = { for runner_binary_key, runner_binary in module.runner_binaries : runner_binary_key => {
lambda = runner_binary.lambda
diff --git a/modules/multi-runner/runners.tf b/modules/multi-runner/runners.tf
index 811ab36260..4032b4e368 100644
--- a/modules/multi-runner/runners.tf
+++ b/modules/multi-runner/runners.tf
@@ -42,7 +42,7 @@ module "runners" {
disable_runner_autoupdate = each.value.runner_config.disable_runner_autoupdate
enable_managed_runner_security_group = var.enable_managed_runner_security_group
enable_runner_detailed_monitoring = each.value.runner_config.enable_runner_detailed_monitoring
- scale_down_schedule_expression = each.value.runner_config.scale_down_schedule_expression
+ scale_down_schedule_expression = null # multi-runner handles scaling down all runners
minimum_running_time_in_minutes = each.value.runner_config.minimum_running_time_in_minutes
runner_boot_time_in_minutes = each.value.runner_config.runner_boot_time_in_minutes
runner_disable_default_labels = each.value.runner_config.runner_disable_default_labels
@@ -69,6 +69,7 @@ module "runners" {
lambda_timeout_scale_up = var.runners_scale_up_lambda_timeout
lambda_scale_down_memory_size = var.scale_down_lambda_memory_size
lambda_timeout_scale_down = var.runners_scale_down_lambda_timeout
+ scale_down_parameter_store_tier = var.scale_down_parameter_store_tier
lambda_subnet_ids = var.lambda_subnet_ids
lambda_security_group_ids = var.lambda_security_group_ids
lambda_tags = var.lambda_tags
diff --git a/modules/multi-runner/variables.tf b/modules/multi-runner/variables.tf
index edbdb33059..cff6350f48 100644
--- a/modules/multi-runner/variables.tf
+++ b/modules/multi-runner/variables.tf
@@ -104,7 +104,6 @@ variable "multi_runner_config" {
runner_run_as = optional(string, "ec2-user")
runners_maximum_count = number
runner_additional_security_group_ids = optional(list(string), [])
- scale_down_schedule_expression = optional(string, "cron(*/5 * * * ? *)")
scale_up_reserved_concurrent_executions = optional(number, 1)
userdata_template = optional(string, null)
userdata_content = optional(string, null)
@@ -213,10 +212,9 @@ variable "multi_runner_config" {
runner_name_prefix: "Prefix for the GitHub runner name."
runner_run_as: "Run the GitHub actions agent as user."
runners_maximum_count: "The maximum number of runners that will be created. Setting the variable to `-1` desiables the maximum check."
- scale_down_schedule_expression: "Scheduler expression to check every x for scale down."
scale_up_reserved_concurrent_executions: "Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations."
userdata_template: "Alternative user-data template, replacing the default template. By providing your own user_data you have to take care of installing all required software, including the action runner. Variables userdata_pre/post_install are ignored."
- enable_jit_config "Overwrite the default behavior for JIT configuration. By default JIT configuration is enabled for ephemeral runners and disabled for non-ephemeral runners. In case of GHES check first if the JIT config API is avaialbe. In case you upgradeing from 3.x to 4.x you can set `enable_jit_config` to `false` to avoid a breaking change when having your own AMI."
+ enable_jit_config: "Overwrite the default behavior for JIT configuration. By default JIT configuration is enabled for ephemeral runners and disabled for non-ephemeral runners. In case of GHES check first if the JIT config API is available. In case you are upgrading from 3.x to 4.x you can set `enable_jit_config` to `false` to avoid a breaking change when having your own AMI."
enable_runner_detailed_monitoring: "Should detailed monitoring be enabled for the runner. Set this to true if you want to use detailed monitoring. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-cloudwatch-new.html for details."
enable_cloudwatch_agent: "Enabling the cloudwatch agent on the ec2 runner instances, the runner contains default config. Configuration can be overridden via `cloudwatch_config`."
cloudwatch_config: "(optional) Replaces the module default cloudwatch log config. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-Configuration-File-Details.html for details."
@@ -268,6 +266,22 @@ variable "runners_scale_down_lambda_timeout" {
default = 60
}
+variable "scale_down_schedule_expression" {
+ description = "Scheduler expression to check every x for scale down."
+ type = string
+ default = "cron(*/5 * * * ? *)"
+}
+
+variable "scale_down_parameter_store_tier" {
+ description = "SSM Parameter Store tier used to store consolidated scale-down configuration."
+ type = string
+ default = "Standard"
+ validation {
+ condition = contains(["Standard", "Advanced"], var.scale_down_parameter_store_tier)
+ error_message = "`scale_down_parameter_store_tier` must be either `Standard` or `Advanced`."
+ }
+}
+
variable "webhook_lambda_zip" {
description = "File location of the webhook lambda zip file."
type = string
diff --git a/modules/runners/README.md b/modules/runners/README.md
index cf62c2c96a..d0aa472979 100644
--- a/modules/runners/README.md
+++ b/modules/runners/README.md
@@ -14,12 +14,6 @@ The action runners are created via a launch template; in the launch template onl
The scale up lambda is triggered by events on a SQS queue. Events on this queue are delayed, which will give the workflow some time to start running on available runners. For each event the lambda will check if the workflow is still queued and no other limits are reached. In that case the lambda will create a new EC2 instance. The lambda only needs to know which launch template to use and which subnets are available. From the available subnets a random one will be chosen. Once the instance is created the event is assumed as handled, and we assume the workflow wil start at some moment once the created instance is ready.
-### Lambda scale down
-
-The scale down lambda is triggered via a CloudWatch event. The event is triggered by a cron expression defined in the variable `scale_down_schedule_expression` (https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/ScheduledEvents.html). For scaling down GitHub does not provide a good API yet, therefore we run the scaling down based on this event every x minutes. Each time the lambda is triggered it tries to remove all runners older than x minutes (configurable) managed in this deployment. In case the runner can be removed from GitHub, which means it is not executing a workflow, the lambda will terminate the EC2 instance.
-
---8<-- "modules/runners/scale-down-state-diagram.md:mkdocs_scale_down_state_diagram"
-
## Lambda Function
The Lambda function is written in [TypeScript](https://www.typescriptlang.org/) and requires Node 12.x and yarn. Sources are located in [./lambdas/runners]. Two lambda functions share the same sources, there is one entry point for `scaleDown` and another one for `scaleUp`.
@@ -67,23 +61,20 @@ yarn run dist
|------|--------|---------|
| [job\_retry](#module\_job\_retry) | ./job-retry | n/a |
| [pool](#module\_pool) | ./pool | n/a |
+| [scale\_down](#module\_scale\_down) | ./scale-down | n/a |
## Resources
| Name | Type |
|------|------|
-| [aws_cloudwatch_event_rule.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource |
| [aws_cloudwatch_event_rule.ssm_housekeeper](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource |
-| [aws_cloudwatch_event_target.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource |
| [aws_cloudwatch_event_target.ssm_housekeeper](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource |
| [aws_cloudwatch_log_group.gh_runners](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource |
-| [aws_cloudwatch_log_group.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource |
| [aws_cloudwatch_log_group.scale_up](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource |
| [aws_cloudwatch_log_group.ssm_housekeeper](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource |
| [aws_iam_instance_profile.runner](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_instance_profile) | resource |
| [aws_iam_policy.ami_id_ssm_parameter_read](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource |
| [aws_iam_role.runner](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
-| [aws_iam_role.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
| [aws_iam_role.scale_up](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
| [aws_iam_role.ssm_housekeeper](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource |
| [aws_iam_role_policy.cloudwatch](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource |
@@ -93,9 +84,6 @@ yarn run dist
| [aws_iam_role_policy.ec2](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource |
| [aws_iam_role_policy.job_retry_sqs_publish](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource |
| [aws_iam_role_policy.runner_session_manager_aws_managed](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource |
-| [aws_iam_role_policy.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource |
-| [aws_iam_role_policy.scale_down_logging](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource |
-| [aws_iam_role_policy.scale_down_xray](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource |
| [aws_iam_role_policy.scale_up](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource |
| [aws_iam_role_policy.scale_up_logging](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource |
| [aws_iam_role_policy.scale_up_xray](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource |
@@ -106,15 +94,12 @@ yarn run dist
| [aws_iam_role_policy.ssm_parameters](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource |
| [aws_iam_role_policy_attachment.ami_id_ssm_parameter_read](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
| [aws_iam_role_policy_attachment.managed_policies](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
-| [aws_iam_role_policy_attachment.scale_down_vpc_execution_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
| [aws_iam_role_policy_attachment.scale_up_vpc_execution_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
| [aws_iam_role_policy_attachment.ssm_housekeeper_vpc_execution_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
| [aws_iam_role_policy_attachment.xray_tracing](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource |
| [aws_lambda_event_source_mapping.scale_up](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_event_source_mapping) | resource |
-| [aws_lambda_function.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource |
| [aws_lambda_function.scale_up](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource |
| [aws_lambda_function.ssm_housekeeper](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource |
-| [aws_lambda_permission.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource |
| [aws_lambda_permission.scale_runners_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource |
| [aws_lambda_permission.ssm_housekeeper](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource |
| [aws_launch_template.runner](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/launch_template) | resource |
@@ -221,7 +206,8 @@ yarn run dist
| [runners\_lambda\_s3\_object\_version](#input\_runners\_lambda\_s3\_object\_version) | S3 object version for runners lambda function. Useful if S3 versioning is enabled on source bucket. | `string` | `null` | no |
| [runners\_maximum\_count](#input\_runners\_maximum\_count) | The maximum number of runners that will be created. Setting the variable to `-1` desiables the maximum check. | `number` | `3` | no |
| [s3\_runner\_binaries](#input\_s3\_runner\_binaries) | Bucket details for cached GitHub binary. | object({
arn = string
id = string
key = string
}) | n/a | yes |
-| [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. | `string` | `"cron(*/5 * * * ? *)"` | no |
+| [scale\_down\_parameter\_store\_tier](#input\_scale\_down\_parameter\_store\_tier) | SSM Parameter Store tier used to store scale-down configuration. | `string` | `"Standard"` | no |
+| [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. Set to null to disable scale-down Lambda creation. | `string` | `"cron(*/5 * * * ? *)"` | no |
| [scale\_up\_reserved\_concurrent\_executions](#input\_scale\_up\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no |
| [sqs\_build\_queue](#input\_sqs\_build\_queue) | SQS queue to consume accepted build events. | object({
arn = string
url = string
}) | n/a | yes |
| [ssm\_housekeeper](#input\_ssm\_housekeeper) | Configuration for the SSM housekeeper lambda. This lambda deletes token / JIT config from SSM.object({
schedule_expression = optional(string, "rate(1 day)")
state = optional(string, "ENABLED")
lambda_memory_size = optional(number, 512)
lambda_timeout = optional(number, 60)
config = object({
tokenPath = optional(string)
minimumDaysOld = optional(number, 1)
dryRun = optional(bool, false)
})
}) | {
"config": {}
} | no |
@@ -253,4 +239,5 @@ yarn run dist
| [role\_scale\_down](#output\_role\_scale\_down) | n/a |
| [role\_scale\_up](#output\_role\_scale\_up) | n/a |
| [runners\_log\_groups](#output\_runners\_log\_groups) | List of log groups from different log files of runner machine. |
+| [scale\_down\_ssm\_parameters](#output\_scale\_down\_ssm\_parameters) | n/a |
diff --git a/modules/runners/outputs.tf b/modules/runners/outputs.tf
index 8f366dce90..5a7fbef083 100644
--- a/modules/runners/outputs.tf
+++ b/modules/runners/outputs.tf
@@ -19,15 +19,19 @@ output "role_scale_up" {
}
output "lambda_scale_down" {
- value = aws_lambda_function.scale_down
+ value = try(module.scale_down[0].lambda, null) # null when module.scale_down has no instance at index 0
}
output "lambda_scale_down_log_group" {
- value = aws_cloudwatch_log_group.scale_down
+ value = try(module.scale_down[0].lambda_log_group, null) # null when module.scale_down has no instance at index 0
}
output "role_scale_down" {
- value = aws_iam_role.scale_down
+ value = try(module.scale_down[0].role, null) # null when module.scale_down has no instance at index 0
+}
+
+output "scale_down_ssm_parameters" {
+ value = try(module.scale_down[0].ssm_parameters, null) # null when module.scale_down has no instance at index 0
}
output "lambda_pool" {
diff --git a/modules/runners/scale-down-state-diagram.md b/modules/runners/scale-down-state-diagram.md
deleted file mode 100644
index b4f260eb2a..0000000000
--- a/modules/runners/scale-down-state-diagram.md
+++ /dev/null
@@ -1,150 +0,0 @@
-# GitHub Actions Runner Scale-Down State Diagram
-
-
-
-The scale-down Lambda function runs on a scheduled basis (every 5 minutes by default) to manage GitHub Actions runner instances. It performs a two-phase cleanup process: first terminating confirmed orphaned instances, then evaluating active runners to maintain the desired idle capacity while removing unnecessary instances.
-
-```mermaid
-stateDiagram-v2
- [*] --> ScheduledExecution : Cron Trigger every 5 min
-
- ScheduledExecution --> Phase1_OrphanTermination : Start Phase 1
-
- state Phase1_OrphanTermination {
- [*] --> ListOrphanInstances : Query EC2 for ghr orphan true
-
- ListOrphanInstances --> CheckOrphanType : For each orphan
-
- state CheckOrphanType <list(object({
environment = string
idle_config = list(object({
cron = string
timeZone = string
idleCount = number
evictionStrategy = optional(string, "oldest_first")
}))
minimum_running_time_in_minutes = number
runner_boot_time_in_minutes = number
})) | n/a | yes |
+| [ghes\_ssl\_verify](#input\_ghes\_ssl\_verify) | Verify GitHub Enterprise Server SSL certificate | `bool` | `true` | no |
+| [ghes\_url](#input\_ghes\_url) | GitHub Enterprise Server URL | `string` | `null` | no |
+| [github\_app\_parameters](#input\_github\_app\_parameters) | GitHub App SSM parameters | object({
id = object({
name = string
arn = string
})
key_base64 = object({
name = string
arn = string
})
}) | n/a | yes |
+| [kms\_key\_arn](#input\_kms\_key\_arn) | KMS key ARN for SSM parameter decryption | `string` | `""` | no |
+| [lambda\_architecture](#input\_lambda\_architecture) | Lambda architecture (x86\_64 or arm64) | `string` | n/a | yes |
+| [lambda\_memory\_size](#input\_lambda\_memory\_size) | Lambda memory size in MB | `number` | n/a | yes |
+| [lambda\_runtime](#input\_lambda\_runtime) | Lambda runtime | `string` | n/a | yes |
+| [lambda\_s3\_bucket](#input\_lambda\_s3\_bucket) | S3 bucket for Lambda deployment package | `string` | `null` | no |
+| [lambda\_security\_group\_ids](#input\_lambda\_security\_group\_ids) | List of security group IDs for Lambda VPC configuration | `list(string)` | `[]` | no |
+| [lambda\_subnet\_ids](#input\_lambda\_subnet\_ids) | List of subnet IDs for Lambda VPC configuration | `list(string)` | `[]` | no |
+| [lambda\_tags](#input\_lambda\_tags) | Tags for Lambda function | `map(string)` | `{}` | no |
+| [lambda\_timeout](#input\_lambda\_timeout) | Lambda timeout in seconds | `number` | n/a | yes |
+| [lambda\_zip](#input\_lambda\_zip) | Path to Lambda deployment package | `string` | n/a | yes |
+| [log\_level](#input\_log\_level) | Log level for Lambda function | `string` | `"info"` | no |
+| [logging\_kms\_key\_id](#input\_logging\_kms\_key\_id) | KMS key ID for CloudWatch log encryption | `string` | `null` | no |
+| [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | CloudWatch log retention in days | `number` | n/a | yes |
+| [metrics](#input\_metrics) | Metrics configuration | object({
enable = optional(bool, false)
namespace = optional(string, "GitHub Runners")
metric = optional(object({
enable_github_app_rate_limit = optional(bool, true)
}), {})
}) | `{}` | no |
+| [prefix](#input\_prefix) | Prefix for Lambda function name | `string` | n/a | yes |
+| [role\_path](#input\_role\_path) | IAM role path | `string` | n/a | yes |
+| [role\_permissions\_boundary](#input\_role\_permissions\_boundary) | IAM role permissions boundary ARN | `string` | `null` | no |
+| [runners\_lambda\_s3\_key](#input\_runners\_lambda\_s3\_key) | S3 key for Lambda deployment package | `string` | `null` | no |
+| [runners\_lambda\_s3\_object\_version](#input\_runners\_lambda\_s3\_object\_version) | S3 object version for Lambda deployment package | `string` | `null` | no |
+| [scale\_down\_parameter\_store\_tier](#input\_scale\_down\_parameter\_store\_tier) | SSM Parameter Store tier to use for persisted scale-down configuration. | `string` | `"Standard"` | no |
+| [schedule\_expression](#input\_schedule\_expression) | CloudWatch Event schedule expression | `string` | `"cron(*/5 * * * ? *)"` | no |
+| [ssm\_parameter\_path\_prefix](#input\_ssm\_parameter\_path\_prefix) | Base SSM parameter path prefix used to store scale-down configuration (without environment suffix). | `string` | n/a | yes |
+| [tags](#input\_tags) | Tags to apply to resources | `map(string)` | `{}` | no |
+| [tracing\_config](#input\_tracing\_config) | Lambda tracing configuration | object({
mode = optional(string, null)
capture_http_requests = optional(bool, false)
capture_error = optional(bool, false)
}) | `{}` | no |
+| [user\_agent](#input\_user\_agent) | User agent string for GitHub API requests | `string` | `null` | no |
+
+## Outputs
+
+| Name | Description |
+|------|-------------|
+| [cloudwatch\_event\_rule](#output\_cloudwatch\_event\_rule) | CloudWatch Event Rule for scale-down |
+| [lambda](#output\_lambda) | Scale-down Lambda function |
+| [lambda\_log\_group](#output\_lambda\_log\_group) | Scale-down Lambda log group |
+| [role](#output\_role) | Scale-down Lambda IAM role |
+| [ssm\_parameters](#output\_ssm\_parameters) | Scale-down configuration parameters stored in SSM |
+
diff --git a/modules/runners/scale-down/main.tf b/modules/runners/scale-down/main.tf
new file mode 100644
index 0000000000..e3c0a31391
--- /dev/null
+++ b/modules/runners/scale-down/main.tf
@@ -0,0 +1,178 @@
+locals {
+ managed_environments = [for e in var.environments : e.environment] # environment names only; passed JSON-encoded to the scale-down IAM policy template
+ environment_map = { for env in var.environments : env.environment => env } # keyed by environment name; drives for_each on the SSM parameters
+ normalized_ssm_parameter_prefix = "/${trim(var.ssm_parameter_path_prefix, "/")}" # strip leading/trailing slashes, then add a single leading slash
+ scale_down_parameter_name_prefix = local.normalized_ssm_parameter_prefix # alias used for parameter names, the IAM resource ARN and the Lambda environment
+}
+
+data "aws_caller_identity" "current" {} # supplies the account ID for the SSM parameter ARN below
+
+data "aws_region" "current" {} # supplies the region for the SSM parameter ARN below
+
+locals {
+ arn_ssm_parameters_path_scale_down_config = "arn:${var.aws_partition}:ssm:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:parameter${local.scale_down_parameter_name_prefix}" # ARN of the parameter path; the name prefix already begins with "/"
+}
+
+resource "aws_ssm_parameter" "scale_down_config" { # one JSON-encoded scale-down configuration per managed environment
+  for_each = local.environment_map # key = environment name, value = that environment's settings
+
+  name        = "${local.scale_down_parameter_name_prefix}/${each.key}"
+  description = "Scale-down configuration for environment ${each.key}"
+  type        = "String"
+  tier        = var.scale_down_parameter_store_tier
+  # `overwrite` is intentionally not set: the argument is deprecated in the AWS provider, and forcing it to true would silently clobber a same-named parameter that Terraform does not manage.
+  value = jsonencode({
+    environment                     = each.key
+    idle_config                     = each.value.idle_config
+    minimum_running_time_in_minutes = each.value.minimum_running_time_in_minutes
+    runner_boot_time_in_minutes     = each.value.runner_boot_time_in_minutes
+  })
+  tags = var.tags
+}
+
+# Trust policy for the scale-down role: allows the Lambda service (lambda.amazonaws.com) to call sts:AssumeRole.
+data "aws_iam_policy_document" "lambda_assume_role_policy" {
+ statement {
+ actions = ["sts:AssumeRole"]
+
+ principals {
+ type = "Service"
+ identifiers = ["lambda.amazonaws.com"]
+ }
+ }
+}
+
+# X-Ray tracing policy document; only rendered when var.tracing_config.mode is set (count is 0 otherwise).
+data "aws_iam_policy_document" "lambda_xray" {
+ count = var.tracing_config.mode != null ? 1 : 0
+ statement {
+ actions = [
+ "xray:BatchGetTraces",
+ "xray:GetTraceSummaries",
+ "xray:PutTelemetryRecords",
+ "xray:PutTraceSegments"
+ ]
+ effect = "Allow"
+ resources = [
+ "*"
+ ]
+ sid = "AllowXRay"
+ }
+}
+
+resource "aws_lambda_function" "scale_down" { # scheduled function that evaluates and removes runners
+  s3_bucket         = var.lambda_s3_bucket # package comes from S3 when a bucket is given ...
+  s3_key            = var.runners_lambda_s3_key
+  s3_object_version = var.runners_lambda_s3_object_version
+  filename          = var.lambda_s3_bucket == null ? var.lambda_zip : null # ... otherwise from the local zip
+  source_code_hash  = var.lambda_s3_bucket == null ? filebase64sha256(var.lambda_zip) : null
+  function_name     = "${var.prefix}-scale-down"
+  role              = aws_iam_role.scale_down.arn
+  handler           = "index.scaleDownHandler"
+  runtime           = var.lambda_runtime
+  timeout           = var.lambda_timeout
+  tags              = merge(var.tags, var.lambda_tags)
+  memory_size       = var.lambda_memory_size
+  architectures     = [var.lambda_architecture]
+
+  environment {
+    variables = {
+      ENABLE_METRIC_GITHUB_APP_RATE_LIMIT      = var.metrics.enable && var.metrics.metric.enable_github_app_rate_limit
+      GHES_URL                                 = var.ghes_url
+      LOG_LEVEL                                = var.log_level
+      NODE_TLS_REJECT_UNAUTHORIZED             = var.ghes_url != null && !var.ghes_ssl_verify ? 0 : 1 # 0 only for a GHES URL with SSL verification switched off
+      PARAMETER_GITHUB_APP_ID_NAME             = var.github_app_parameters.id.name
+      PARAMETER_GITHUB_APP_KEY_BASE64_NAME     = var.github_app_parameters.key_base64.name
+      POWERTOOLS_LOGGER_LOG_EVENT              = var.log_level == "debug" ? "true" : "false"
+      POWERTOOLS_METRICS_NAMESPACE             = var.metrics.namespace
+      POWERTOOLS_SERVICE_NAME                  = "runners-scale-down"
+      POWERTOOLS_TRACE_ENABLED                 = var.tracing_config.mode != null
+      POWERTOOLS_TRACER_CAPTURE_ERROR          = var.tracing_config.capture_error
+      POWERTOOLS_TRACER_CAPTURE_HTTPS_REQUESTS = var.tracing_config.capture_http_requests
+      SCALE_DOWN_CONFIG_SSM_PATH_PREFIX        = local.scale_down_parameter_name_prefix # path under which the per-environment configuration parameters are stored
+      USER_AGENT                               = var.user_agent
+    }
+  }
+
+  dynamic "vpc_config" { # emitted only when both subnets and security groups are supplied
+    for_each = length(var.lambda_subnet_ids) > 0 && length(var.lambda_security_group_ids) > 0 ? [true] : []
+    content {
+      security_group_ids = var.lambda_security_group_ids
+      subnet_ids         = var.lambda_subnet_ids
+    }
+  }
+
+  dynamic "tracing_config" { # emitted only when a tracing mode is configured
+    for_each = var.tracing_config.mode != null ? [true] : []
+    content {
+      mode = var.tracing_config.mode
+    }
+  }
+}
+
+resource "aws_cloudwatch_log_group" "scale_down" {
+ name = "/aws/lambda/${aws_lambda_function.scale_down.function_name}" # log group named after the scale-down function
+ retention_in_days = var.logging_retention_in_days
+ kms_key_id = var.logging_kms_key_id
+ tags = var.tags
+}
+
+resource "aws_cloudwatch_event_rule" "scale_down" {
+ name = "${var.prefix}-scale-down-rule"
+ schedule_expression = var.schedule_expression # schedule on which the rule fires
+ tags = var.tags
+}
+
+resource "aws_cloudwatch_event_target" "scale_down" {
+ rule = aws_cloudwatch_event_rule.scale_down.name
+ arn = aws_lambda_function.scale_down.arn # the rule's target is the scale-down Lambda
+}
+
+resource "aws_lambda_permission" "scale_down" {
+ statement_id = "AllowExecutionFromCloudWatch"
+ action = "lambda:InvokeFunction"
+ function_name = aws_lambda_function.scale_down.function_name
+ principal = "events.amazonaws.com" # the events service is the caller being granted invoke permission
+ source_arn = aws_cloudwatch_event_rule.scale_down.arn # restricted to invocations coming from the scale-down rule
+}
+
+resource "aws_iam_role" "scale_down" {
+ name = "${substr("${var.prefix}-scale-down-lambda", 0, 54)}-${substr(md5("${var.prefix}-scale-down-lambda"), 0, 8)}" # at most 54 chars of the readable name + "-" + 8 md5 hex chars (<= 63 chars total)
+ assume_role_policy = data.aws_iam_policy_document.lambda_assume_role_policy.json
+ path = var.role_path
+ permissions_boundary = var.role_permissions_boundary
+ tags = var.tags
+}
+
+resource "aws_iam_role_policy" "scale_down" {
+ name = "scale-down-policy"
+ role = aws_iam_role.scale_down.name
+ policy = templatefile("${path.module}/policies/lambda-scale-down.json", {
+ environments = jsonencode(local.managed_environments) # JSON array of environment names, inserted verbatim into the ghr:environment tag condition
+ github_app_id_arn = var.github_app_parameters.id.arn
+ github_app_key_base64_arn = var.github_app_parameters.key_base64.arn
+ kms_key_arn = var.kms_key_arn # an empty string makes the template omit the kms:Decrypt statement
+ arn_ssm_parameters_path_scale_down_config = local.arn_ssm_parameters_path_scale_down_config
+ })
+}
+
+resource "aws_iam_role_policy" "scale_down_logging" {
+ name = "logging-policy"
+ role = aws_iam_role.scale_down.name
+ policy = templatefile("${path.module}/policies/lambda-cloudwatch.json", {
+ log_group_arn = aws_cloudwatch_log_group.scale_down.arn # the template appends "*" to this ARN for the resource match
+ })
+}
+
+resource "aws_iam_role_policy_attachment" "scale_down_vpc_execution_role" { # VPC networking permissions for the scale-down role
+  count      = length(var.lambda_subnet_ids) > 0 && length(var.lambda_security_group_ids) > 0 ? 1 : 0 # same condition as the vpc_config block on aws_lambda_function.scale_down, so the policy is attached only when the function really runs in a VPC
+  role       = aws_iam_role.scale_down.name
+  policy_arn = "arn:${var.aws_partition}:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole"
+}
+
+resource "aws_iam_role_policy" "scale_down_xray" {
+ count = var.tracing_config.mode != null ? 1 : 0 # same condition as data.aws_iam_policy_document.lambda_xray, so index [0] exists whenever this resource does
+ name = "xray-policy"
+ policy = data.aws_iam_policy_document.lambda_xray[0].json
+ role = aws_iam_role.scale_down.name
+}
diff --git a/modules/runners/scale-down/outputs.tf b/modules/runners/scale-down/outputs.tf
new file mode 100644
index 0000000000..ff275fdcf7
--- /dev/null
+++ b/modules/runners/scale-down/outputs.tf
@@ -0,0 +1,24 @@
+output "lambda" {
+ description = "Scale-down Lambda function"
+ value = aws_lambda_function.scale_down # the whole resource object, not just an ARN or name
+}
+
+output "lambda_log_group" {
+ description = "Scale-down Lambda log group"
+ value = aws_cloudwatch_log_group.scale_down # the whole resource object
+}
+
+output "role" {
+ description = "Scale-down Lambda IAM role"
+ value = aws_iam_role.scale_down # the whole resource object
+}
+
+output "cloudwatch_event_rule" {
+ description = "CloudWatch Event Rule for scale-down"
+ value = aws_cloudwatch_event_rule.scale_down # the whole resource object
+}
+
+output "ssm_parameters" {
+ description = "Scale-down configuration parameters stored in SSM"
+ value = aws_ssm_parameter.scale_down_config # map of parameter resources keyed by environment name (the resource uses for_each)
+}
diff --git a/modules/runners/scale-down/policies/lambda-cloudwatch.json b/modules/runners/scale-down/policies/lambda-cloudwatch.json
new file mode 100644
index 0000000000..ad9246bcb3
--- /dev/null
+++ b/modules/runners/scale-down/policies/lambda-cloudwatch.json
@@ -0,0 +1,10 @@
+{
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Effect": "Allow",
+ "Action": ["logs:CreateLogStream", "logs:PutLogEvents"],
+ "Resource": "${log_group_arn}*"
+ }
+ ]
+}
diff --git a/modules/runners/scale-down/policies/lambda-scale-down.json b/modules/runners/scale-down/policies/lambda-scale-down.json
new file mode 100644
index 0000000000..9bb4582b6f
--- /dev/null
+++ b/modules/runners/scale-down/policies/lambda-scale-down.json
@@ -0,0 +1,76 @@
+{
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Effect": "Allow",
+ "Action": [
+ "ec2:DescribeInstances",
+ "ec2:DescribeTags"
+ ],
+ "Resource": [
+ "*"
+ ]
+ },
+ {
+ "Effect": "Allow",
+ "Action": [
+ "ec2:TerminateInstances",
+ "ec2:CreateTags",
+ "ec2:DeleteTags"
+ ],
+ "Resource": [
+ "*"
+ ],
+ "Condition": {
+ "StringEquals": {
+ "ec2:ResourceTag/ghr:Application": "github-action-runner"
+ }
+ }
+ },
+ {
+ "Effect": "Allow",
+ "Action": [
+ "ec2:TerminateInstances",
+ "ec2:CreateTags",
+ "ec2:DeleteTags"
+ ],
+ "Resource": [
+ "*"
+ ],
+ "Condition": {
+ "StringEquals": {
+ "ec2:ResourceTag/ghr:environment": ${environments}
+ }
+ }
+ },
+ {
+ "Effect": "Allow",
+ "Action": [
+ "ssm:GetParameter"
+ ],
+ "Resource": [
+ "${github_app_key_base64_arn}",
+ "${github_app_id_arn}"
+ ]
+ },
+ {
+ "Effect": "Allow",
+ "Action": [
+ "ssm:GetParametersByPath"
+ ],
+ "Resource": [
+ "${arn_ssm_parameters_path_scale_down_config}",
+ "${arn_ssm_parameters_path_scale_down_config}/*"
+ ]
+%{ if kms_key_arn != "" ~}
+ },
+ {
+ "Effect": "Allow",
+ "Action": [
+ "kms:Decrypt"
+ ],
+ "Resource": "${kms_key_arn}"
+%{ endif ~}
+ }
+ ]
+}
diff --git a/modules/runners/scale-down/scale-down-state-diagram.md b/modules/runners/scale-down/scale-down-state-diagram.md
new file mode 100644
index 0000000000..f346697de4
--- /dev/null
+++ b/modules/runners/scale-down/scale-down-state-diagram.md
@@ -0,0 +1,166 @@
+# GitHub Actions Runner Scale-Down State Diagram
+
+
+
+The scale-down Lambda function runs on a scheduled basis (every 5 minutes by default) to manage GitHub Actions runner instances. It processes each environment configuration sequentially, performing a two-phase cleanup process for each: first terminating confirmed orphaned instances, then evaluating active runners to maintain the desired idle capacity while removing unnecessary instances. GitHub API responses are cached across environments to optimize rate limit usage.
+
+```mermaid
+stateDiagram-v2
+ [*] --> ScheduledExecution : Cron Trigger every 5 min
+
+ ScheduledExecution --> ResetCache : Clear GitHub API cache
+
+ ResetCache --> ProcessEnvironments
+
+ state ProcessEnvironments {
+ [*] --> ProcessEnvironment
+
+ state ProcessEnvironment {
+ [*] --> Phase1_OrphanTermination : Start Phase 1
+
+ state Phase1_OrphanTermination {
+ [*] --> ListOrphanInstances : Query EC2 for ghr orphan true
+
+ ListOrphanInstances --> CheckOrphanType : For each orphan
+
+ state CheckOrphanType <