From a83ece81c887aaaf7012744e166945db9f214052 Mon Sep 17 00:00:00 2001 From: Iain Lane Date: Tue, 21 Oct 2025 11:14:44 +0100 Subject: [PATCH 1/3] feat(multi-runner)!: support running the `scale-down` lambda once for every runner group Iterating the list of active runners in the GitHub API can be slow and expensive in terms of rate limit consumption. It's a paginated API, returning up to 100 runners per page. With several thousand runners across many runner groups, running `scale-down` once per runner group can quickly eat up large portions of the rate limit. Here we break the Terraform `scale-down` module into its own sub-module, so that `multi-runner` can create one instance of the Lambda function instead of the `runner` module managing it. A flag is added to the `runner` module to disable the `scale-down` function creation in the `multi-runner` case. Then the Lambda's code is modified to accept a list of configurations, and process them all. With this, we only need to fetch the list of runners once for all runner groups. BREAKING CHANGE: When using the `multi-runner` module, the per-group `scale_down_schedule_expression` is no longer supported. One instance of `scale-down` will now handle all runner groups. Migration is only needed if you are using the `multi-runner` module: 1. Remove any `scale_down_schedule_expression` settings from your `multi_runner_config` runner configs. 2. To customise the frequency of the consolidated `scale-down` function, set the `scale_down_schedule_expression` variable on the `multi-runner` module itself. 
--- examples/multi-runner/main.tf | 2 +- .../templates/runner-configs/linux-arm64.yaml | 1 - .../runner-configs/linux-x64-ubuntu-2204.yaml | 1 - .../runner-configs/linux-x64-ubuntu.yaml | 1 - .../templates/runner-configs/linux-x64.yaml | 1 - .../templates/runner-configs/windows-x64.yaml | 1 - .../control-plane/src/aws/runners.ts | 3 +- .../functions/control-plane/src/modules.d.ts | 3 +- .../functions/control-plane/src/pool/pool.ts | 7 +- .../src/scale-runners/scale-down-config.ts | 7 + .../src/scale-runners/scale-down.test.ts | 151 +++++++++++++- .../src/scale-runners/scale-down.ts | 31 ++- modules/multi-runner/README.md | 5 +- modules/multi-runner/main.tf | 48 +++++ modules/multi-runner/outputs.tf | 13 +- modules/multi-runner/runners.tf | 2 +- modules/multi-runner/variables.tf | 10 +- modules/runners/README.md | 19 +- modules/runners/outputs.tf | 6 +- modules/runners/scale-down-state-diagram.md | 150 -------------- modules/runners/scale-down.tf | 146 ++++---------- modules/runners/scale-down/README.md | 84 ++++++++ modules/runners/scale-down/main.tf | 149 ++++++++++++++ modules/runners/scale-down/outputs.tf | 19 ++ .../policies/lambda-cloudwatch.json | 10 + .../policies/lambda-scale-down.json | 66 +++++++ .../scale-down/scale-down-state-diagram.md | 166 ++++++++++++++++ modules/runners/scale-down/variables.tf | 185 ++++++++++++++++++ modules/runners/variables.tf | 2 +- 29 files changed, 974 insertions(+), 315 deletions(-) delete mode 100644 modules/runners/scale-down-state-diagram.md create mode 100644 modules/runners/scale-down/README.md create mode 100644 modules/runners/scale-down/main.tf create mode 100644 modules/runners/scale-down/outputs.tf create mode 100644 modules/runners/scale-down/policies/lambda-cloudwatch.json create mode 100644 modules/runners/scale-down/policies/lambda-scale-down.json create mode 100644 modules/runners/scale-down/scale-down-state-diagram.md create mode 100644 modules/runners/scale-down/variables.tf diff --git 
a/examples/multi-runner/main.tf b/examples/multi-runner/main.tf index acbcdb8081..e1d83cd26e 100644 --- a/examples/multi-runner/main.tf +++ b/examples/multi-runner/main.tf @@ -98,7 +98,6 @@ module "runners" { # runner_extra_labels = ["amazon"] # runners_maximum_count = 1 # enable_ephemeral_runners = true - # scale_down_schedule_expression = "cron(* * * * ? *)" # } # } # } @@ -107,6 +106,7 @@ module "runners" { subnet_ids = module.base.vpc.private_subnets runners_scale_up_lambda_timeout = 60 runners_scale_down_lambda_timeout = 60 + scale_down_schedule_expression = "cron(* * * * ? *)" prefix = local.environment tags = { Project = "ProjectX" diff --git a/examples/multi-runner/templates/runner-configs/linux-arm64.yaml b/examples/multi-runner/templates/runner-configs/linux-arm64.yaml index 0c6cae01b5..bb45f6df40 100644 --- a/examples/multi-runner/templates/runner-configs/linux-arm64.yaml +++ b/examples/multi-runner/templates/runner-configs/linux-arm64.yaml @@ -19,7 +19,6 @@ runner_config: id_ssm_parameter_arn: ${ami_id_ssm_parameter_arn} runners_maximum_count: 1 delay_webhook_event: 0 - scale_down_schedule_expression: cron(* * * * ? *) runner_hook_job_started: | echo "Running pre job hook as $(whoami)" runner_hook_job_completed: | diff --git a/examples/multi-runner/templates/runner-configs/linux-x64-ubuntu-2204.yaml b/examples/multi-runner/templates/runner-configs/linux-x64-ubuntu-2204.yaml index 2b1ac15ee8..b1530bee84 100644 --- a/examples/multi-runner/templates/runner-configs/linux-x64-ubuntu-2204.yaml +++ b/examples/multi-runner/templates/runner-configs/linux-x64-ubuntu-2204.yaml @@ -19,7 +19,6 @@ runner_config: - m5a.large runners_maximum_count: 1 delay_webhook_event: 0 - scale_down_schedule_expression: cron(* * * * ? 
*) userdata_template: ./templates/user-data.sh ami: owners: diff --git a/examples/multi-runner/templates/runner-configs/linux-x64-ubuntu.yaml b/examples/multi-runner/templates/runner-configs/linux-x64-ubuntu.yaml index 8ae700d570..dca632afbc 100644 --- a/examples/multi-runner/templates/runner-configs/linux-x64-ubuntu.yaml +++ b/examples/multi-runner/templates/runner-configs/linux-x64-ubuntu.yaml @@ -20,7 +20,6 @@ runner_config: - m5a.large runners_maximum_count: 1 delay_webhook_event: 0 - scale_down_schedule_expression: cron(* * * * ? *) userdata_template: ./templates/user-data.sh ami: owners: diff --git a/examples/multi-runner/templates/runner-configs/linux-x64.yaml b/examples/multi-runner/templates/runner-configs/linux-x64.yaml index 146c340836..91c564a14c 100644 --- a/examples/multi-runner/templates/runner-configs/linux-x64.yaml +++ b/examples/multi-runner/templates/runner-configs/linux-x64.yaml @@ -21,7 +21,6 @@ runner_config: enable_on_demand_failover_for_errors: ['InsufficientInstanceCapacity'] create_service_linked_role_spot: true delay_webhook_event: 0 - scale_down_schedule_expression: cron(* * * * ? *) runner_metadata_options: instance_metadata_tags: disabled http_endpoint: enabled diff --git a/examples/multi-runner/templates/runner-configs/windows-x64.yaml b/examples/multi-runner/templates/runner-configs/windows-x64.yaml index fdf8be6533..3d13dd612a 100644 --- a/examples/multi-runner/templates/runner-configs/windows-x64.yaml +++ b/examples/multi-runner/templates/runner-configs/windows-x64.yaml @@ -13,7 +13,6 @@ runner_config: - c5.large runners_maximum_count: 1 delay_webhook_event: 5 - scale_down_schedule_expression: cron(* * * * ? 
*) runner_boot_time_in_minutes: 20 ami_filter: name: diff --git a/lambdas/functions/control-plane/src/aws/runners.ts b/lambdas/functions/control-plane/src/aws/runners.ts index 6779dd39d2..c5815eaff2 100644 --- a/lambdas/functions/control-plane/src/aws/runners.ts +++ b/lambdas/functions/control-plane/src/aws/runners.ts @@ -298,8 +298,7 @@ async function createInstances( } // If launchTime is undefined, this will return false -export function bootTimeExceeded(ec2Runner: { launchTime?: Date }): boolean { - const runnerBootTimeInMinutes = process.env.RUNNER_BOOT_TIME_IN_MINUTES; +export function bootTimeExceeded(ec2Runner: { launchTime?: Date }, runnerBootTimeInMinutes: number): boolean { const launchTimePlusBootTime = moment(ec2Runner.launchTime).utc().add(runnerBootTimeInMinutes, 'minutes'); return launchTimePlusBootTime < moment(new Date()).utc(); } diff --git a/lambdas/functions/control-plane/src/modules.d.ts b/lambdas/functions/control-plane/src/modules.d.ts index 7570f29035..4bab7e1e99 100644 --- a/lambdas/functions/control-plane/src/modules.d.ts +++ b/lambdas/functions/control-plane/src/modules.d.ts @@ -4,6 +4,7 @@ declare namespace NodeJS { ENABLE_METRIC_GITHUB_APP_RATE_LIMIT: string; ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS: string; ENVIRONMENT: string; + ENVIRONMENT_CONFIGS: string; GHES_URL: string; JOB_RETRY_CONFIG: string; LAUNCH_TEMPLATE_NAME: string; @@ -14,8 +15,8 @@ declare namespace NodeJS { PARAMETER_GITHUB_APP_CLIENT_SECRET_NAME: string; PARAMETER_GITHUB_APP_ID_NAME: string; PARAMETER_GITHUB_APP_KEY_BASE64_NAME: string; + RUNNER_BOOT_TIME_IN_MINUTES: string; RUNNER_OWNER: string; - SCALE_DOWN_CONFIG: string; SSM_TOKEN_PATH: string; SSM_CLEANUP_CONFIG: string; SUBNET_IDS: string; diff --git a/lambdas/functions/control-plane/src/pool/pool.ts b/lambdas/functions/control-plane/src/pool/pool.ts index 162a7d0f6d..0bbc976ed6 100644 --- a/lambdas/functions/control-plane/src/pool/pool.ts +++ b/lambdas/functions/control-plane/src/pool/pool.ts @@ -37,6 +37,7 @@ 
export async function adjust(event: PoolEvent): Promise { const instanceAllocationStrategy = process.env.INSTANCE_ALLOCATION_STRATEGY || 'lowest-price'; // same as AWS default const runnerOwner = process.env.RUNNER_OWNER; const amiIdSsmParameterName = process.env.AMI_ID_SSM_PARAMETER_NAME; + const runnerBootTimeInMinutes = parseInt(process.env.RUNNER_BOOT_TIME_IN_MINUTES || '5'); const tracingEnabled = yn(process.env.POWERTOOLS_TRACE_ENABLED, { default: false }); const onDemandFailoverOnError = process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS ? (JSON.parse(process.env.ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS) as [string]) @@ -63,7 +64,7 @@ export async function adjust(event: PoolEvent): Promise { statuses: ['running'], }); - const numberOfRunnersInPool = calculatePooSize(ec2runners, runnerStatusses); + const numberOfRunnersInPool = calculatePooSize(ec2runners, runnerStatusses, runnerBootTimeInMinutes); const topUp = event.poolSize - numberOfRunnersInPool; if (topUp > 0) { @@ -115,7 +116,7 @@ async function getInstallationId(ghesApiUrl: string, org: string): Promise): number { +function calculatePooSize(ec2runners: RunnerList[], runnerStatus: Map, runnerBootTimeInMinutes: number): number { // Runner should be considered idle if it is still booting, or is idle in GitHub let numberOfRunnersInPool = 0; for (const ec2Instance of ec2runners) { @@ -127,7 +128,7 @@ function calculatePooSize(ec2runners: RunnerList[], runnerStatus: Map { process.env.GITHUB_APP_CLIENT_ID = 'TEST_CLIENT_ID'; process.env.GITHUB_APP_CLIENT_SECRET = 'TEST_CLIENT_SECRET'; process.env.RUNNERS_MAXIMUM_COUNT = '3'; - process.env.SCALE_DOWN_CONFIG = '[]'; - process.env.ENVIRONMENT = ENVIRONMENT; - process.env.MINIMUM_RUNNING_TIME_IN_MINUTES = MINIMUM_TIME_RUNNING_IN_MINUTES.toString(); - process.env.RUNNER_BOOT_TIME_IN_MINUTES = MINIMUM_BOOT_TIME.toString(); + process.env.ENVIRONMENT_CONFIGS = JSON.stringify([{ + environment: ENVIRONMENT, + idle_config: [], + minimum_running_time_in_minutes: 
MINIMUM_TIME_RUNNING_IN_MINUTES, + runner_boot_time_in_minutes: MINIMUM_BOOT_TIME, + }]); nock.disableNetConnect(); vi.clearAllMocks(); @@ -620,7 +622,12 @@ describe('Scale down runners', () => { }; beforeEach(() => { - process.env.SCALE_DOWN_CONFIG = JSON.stringify([defaultConfig]); + process.env.ENVIRONMENT_CONFIGS = JSON.stringify([{ + environment: ENVIRONMENT, + idle_config: [defaultConfig], + minimum_running_time_in_minutes: MINIMUM_TIME_RUNNING_IN_MINUTES, + runner_boot_time_in_minutes: MINIMUM_BOOT_TIME, + }]); }); it(`Should terminate based on the the idle config with ${evictionStrategy} eviction strategy`, async () => { @@ -752,6 +759,140 @@ describe('Scale down runners', () => { expect(runnersTest[2].launchTime).not.toBeDefined(); }); }); + + describe('Multi-environment scale-down', () => { + it('Should process multiple environments independently', async () => { + // setup - two environments with different settings + const environment1 = 'env-1'; + const environment2 = 'env-2'; + const minTime1 = 10; + const minTime2 = 20; + + process.env.ENVIRONMENT_CONFIGS = JSON.stringify([ + { + environment: environment1, + idle_config: [], + minimum_running_time_in_minutes: minTime1, + runner_boot_time_in_minutes: MINIMUM_BOOT_TIME, + }, + { + environment: environment2, + idle_config: [], + minimum_running_time_in_minutes: minTime2, + runner_boot_time_in_minutes: MINIMUM_BOOT_TIME, + }, + ]); + + const runners1 = [ + createRunnerTestData('env1-runner-old', 'Org', minTime1 + 1, true, false, true, 'owner1'), + createRunnerTestData('env1-runner-new', 'Org', minTime1 - 1, true, false, false, 'owner1'), + ]; + + const runners2 = [ + createRunnerTestData('env2-runner-old', 'Org', minTime2 + 1, true, false, true, 'owner2'), + createRunnerTestData('env2-runner-new', 'Org', minTime2 - 1, true, false, false, 'owner2'), + ]; + + mockListRunners.mockImplementation(async (filter) => { + const allRunners = filter?.environment === environment1 ? 
runners1 : + filter?.environment === environment2 ? runners2 : []; + // Filter by orphan flag if specified + return allRunners.filter((r) => !filter?.orphan || r.orphan === filter.orphan); + }); + + // Mock GitHub API to return runners filtered by owner + mockOctokit.paginate.mockImplementation((fn, params: any) => { + const allRunners = [...runners1, ...runners2]; + return Promise.resolve( + allRunners + .filter((r) => r.owner === params.org) + .map((r) => ({ id: r.instanceId, name: r.instanceId })) + ); + }); + + // act + await scaleDown(); + + // assert - should have been called for both environments + expect(listEC2Runners).toHaveBeenCalledWith({ environment: environment1 }); + expect(listEC2Runners).toHaveBeenCalledWith({ environment: environment2 }); + + // env1 runner that exceeded minTime1 should be terminated + expect(terminateRunner).toHaveBeenCalledWith(runners1[0].instanceId); + // env1 runner that didn't exceed minTime1 should not be terminated + expect(terminateRunner).not.toHaveBeenCalledWith(runners1[1].instanceId); + + // env2 runner that exceeded minTime2 should be terminated + expect(terminateRunner).toHaveBeenCalledWith(runners2[0].instanceId); + // env2 runner that didn't exceed minTime2 should not be terminated + expect(terminateRunner).not.toHaveBeenCalledWith(runners2[1].instanceId); + }); + + it('Should use per-environment idle config', async () => { + // setup - two environments with different idle configs + const environment1 = 'env-1'; + const environment2 = 'env-2'; + + const idleConfig1 = { cron: '* * * * * *', idleCount: 2, timeZone: 'UTC' }; + const idleConfig2 = { cron: '* * * * * *', idleCount: 0, timeZone: 'UTC' }; + + process.env.ENVIRONMENT_CONFIGS = JSON.stringify([ + { + environment: environment1, + idle_config: [idleConfig1], + minimum_running_time_in_minutes: MINIMUM_TIME_RUNNING_IN_MINUTES, + runner_boot_time_in_minutes: MINIMUM_BOOT_TIME, + }, + { + environment: environment2, + idle_config: [idleConfig2], + 
minimum_running_time_in_minutes: MINIMUM_TIME_RUNNING_IN_MINUTES, + runner_boot_time_in_minutes: MINIMUM_BOOT_TIME, + }, + ]); + + const runners1 = [ + createRunnerTestData('env1-idle-1', 'Org', MINIMUM_TIME_RUNNING_IN_MINUTES + 5, true, false, true, 'owner1'), // oldest - should terminate + createRunnerTestData('env1-idle-2', 'Org', MINIMUM_TIME_RUNNING_IN_MINUTES + 4, true, false, false, 'owner1'), // middle - keep + createRunnerTestData('env1-idle-3', 'Org', MINIMUM_TIME_RUNNING_IN_MINUTES + 3, true, false, false, 'owner1'), // newest - keep + ]; + + const runners2 = [ + createRunnerTestData('env2-idle-1', 'Org', MINIMUM_TIME_RUNNING_IN_MINUTES + 5, true, false, true, 'owner2'), + createRunnerTestData('env2-idle-2', 'Org', MINIMUM_TIME_RUNNING_IN_MINUTES + 4, true, false, true, 'owner2'), + ]; + + mockListRunners.mockImplementation(async (filter) => { + const allRunners = filter?.environment === environment1 ? runners1 : + filter?.environment === environment2 ? runners2 : []; + // Filter by orphan flag if specified + return allRunners.filter((r) => !filter?.orphan || r.orphan === filter.orphan); + }); + + // Mock GitHub API to return runners filtered by owner + mockOctokit.paginate.mockImplementation((fn, params: any) => { + const allRunners = [...runners1, ...runners2]; + return Promise.resolve( + allRunners + .filter((r) => r.owner === params.org) + .map((r) => ({ id: r.instanceId, name: r.instanceId })) + ); + }); + + // act + await scaleDown(); + + // assert + // env1 has idleCount=2, so terminate oldest, keep 2 newest + expect(terminateRunner).toHaveBeenCalledWith(runners1[0].instanceId); // oldest - terminated + expect(terminateRunner).not.toHaveBeenCalledWith(runners1[1].instanceId); // middle - kept + expect(terminateRunner).not.toHaveBeenCalledWith(runners1[2].instanceId); // newest - kept + + // env2 has idleCount=0, so all idle runners should be terminated + expect(terminateRunner).toHaveBeenCalledWith(runners2[0].instanceId); + 
expect(terminateRunner).toHaveBeenCalledWith(runners2[1].instanceId); + }); + }); }); function mockAwsRunners(runners: RunnerTestItem[]) { diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 1e5e712a24..3c61c73890 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -8,7 +8,12 @@ import { createGithubAppAuth, createGithubInstallationAuth, createOctokitClient import { bootTimeExceeded, listEC2Runners, tag, untag, terminateRunner } from './../aws/runners'; import { RunnerInfo, RunnerList } from './../aws/runners.d'; import { GhRunners, githubCache } from './cache'; -import { ScalingDownConfig, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config'; +import { + ScalingDownConfig, + EnvironmentScaleDownConfig, + getEvictionStrategy, + getIdleRunnerCount, +} from './scale-down-config'; import { metricGitHubAppRateLimit } from '../github/rate-limit'; import { getGitHubEnterpriseApiUrl } from './scale-up'; @@ -120,8 +125,7 @@ async function listGitHubRunners(runner: RunnerInfo): Promise { return runners; } -function runnerMinimumTimeExceeded(runner: RunnerInfo): boolean { - const minimumRunningTimeInMinutes = process.env.MINIMUM_RUNNING_TIME_IN_MINUTES; +function runnerMinimumTimeExceeded(runner: RunnerInfo, minimumRunningTimeInMinutes: number): boolean { const launchTimePlusMinimum = moment(runner.launchTime).utc().add(minimumRunningTimeInMinutes, 'minutes'); const now = moment(new Date()).utc(); return launchTimePlusMinimum < now; @@ -174,6 +178,8 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi async function evaluateAndRemoveRunners( ec2Runners: RunnerInfo[], scaleDownConfigs: ScalingDownConfig[], + minimumRunningTimeInMinutes: number, + runnerBootTimeInMinutes: number, ): Promise { let idleCounter = 
getIdleRunnerCount(scaleDownConfigs); const evictionStrategy = getEvictionStrategy(scaleDownConfigs); @@ -197,7 +203,7 @@ async function evaluateAndRemoveRunners( `GitHub runners for AWS runner instance: '${ec2Runner.instanceId}': ${JSON.stringify(ghRunnersFiltered)}`, ); if (ghRunnersFiltered.length) { - if (runnerMinimumTimeExceeded(ec2Runner)) { + if (runnerMinimumTimeExceeded(ec2Runner, minimumRunningTimeInMinutes)) { if (idleCounter > 0) { idleCounter--; logger.info(`Runner '${ec2Runner.instanceId}' will be kept idle.`); @@ -209,7 +215,7 @@ async function evaluateAndRemoveRunners( ); } } - } else if (bootTimeExceeded(ec2Runner)) { + } else if (bootTimeExceeded(ec2Runner, runnerBootTimeInMinutes)) { await markOrphan(ec2Runner.instanceId); } else { logger.debug(`Runner ${ec2Runner.instanceId} has not yet booted.`); @@ -307,8 +313,17 @@ function filterRunners(ec2runners: RunnerList[]): RunnerInfo[] { export async function scaleDown(): Promise { githubCache.reset(); - const environment = process.env.ENVIRONMENT; - const scaleDownConfigs = JSON.parse(process.env.SCALE_DOWN_CONFIG) as [ScalingDownConfig]; + const environmentConfigs = JSON.parse(process.env.ENVIRONMENT_CONFIGS) as EnvironmentScaleDownConfig[]; + + for (const envConfig of environmentConfigs) { + await scaleDownEnvironment(envConfig); + } +} + +async function scaleDownEnvironment(envConfig: EnvironmentScaleDownConfig): Promise { + const { environment, idle_config, minimum_running_time_in_minutes, runner_boot_time_in_minutes } = envConfig; + + logger.info(`Processing scale-down for environment: ${environment}`); // first runners marked to be orphan. 
await terminateOrphan(environment); @@ -325,7 +340,7 @@ export async function scaleDown(): Promise { } const runners = filterRunners(ec2Runners); - await evaluateAndRemoveRunners(runners, scaleDownConfigs); + await evaluateAndRemoveRunners(runners, idle_config, minimum_running_time_in_minutes, runner_boot_time_in_minutes); const activeEc2RunnersCountAfter = (await listRunners(environment)).length; logger.info(`Found: '${activeEc2RunnersCountAfter}' active GitHub EC2 runners instances after clean-up.`); diff --git a/modules/multi-runner/README.md b/modules/multi-runner/README.md index 759cb61832..edd4a1980c 100644 --- a/modules/multi-runner/README.md +++ b/modules/multi-runner/README.md @@ -97,6 +97,7 @@ module "multi-runner" { | [instance\_termination\_watcher](#module\_instance\_termination\_watcher) | ../termination-watcher | n/a | | [runner\_binaries](#module\_runner\_binaries) | ../runner-binaries-syncer | n/a | | [runners](#module\_runners) | ../runners | n/a | +| [scale\_down](#module\_scale\_down) | ../runners/scale-down | n/a | | [ssm](#module\_ssm) | ../ssm | n/a | | [webhook](#module\_webhook) | ../webhook | n/a | @@ -148,7 +149,7 @@ module "multi-runner" { | [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | Specifies the number of days you want to retain log events for the lambda log group. Possible values are: 0, 1, 3, 5, 7, 14, 30, 60, 90, 120, 150, 180, 365, 400, 545, 731, 1827, and 3653. | `number` | `180` | no | | [matcher\_config\_parameter\_store\_tier](#input\_matcher\_config\_parameter\_store\_tier) | The tier of the parameter store for the matcher configuration. Valid values are `Standard`, and `Advanced`. | `string` | `"Standard"` | no | | [metrics](#input\_metrics) | Configuration for metrics created by the module, by default metrics are disabled to avoid additional costs. When metrics are enable all metrics are created unless explicit configured otherwise. |
object({
enable = optional(bool, false)
namespace = optional(string, "GitHub Runners")
metric = optional(object({
enable_github_app_rate_limit = optional(bool, true)
enable_job_retry = optional(bool, true)
enable_spot_termination_warning = optional(bool, true)
}), {})
})
| `{}` | no | -| [multi\_runner\_config](#input\_multi\_runner\_config) | multi\_runner\_config = {
runner\_config: {
runner\_os: "The EC2 Operating System type to use for action runner instances (linux,windows)."
runner\_architecture: "The platform architecture of the runner instance\_type."
runner\_metadata\_options: "(Optional) Metadata options for the ec2 runner instances."
ami: "(Optional) AMI configuration for the action runner instances. This object allows you to specify all AMI-related settings in one place."
ami\_filter: "(Optional) List of maps used to create the AMI filter for the action runner AMI. By default amazon linux 2 is used."
ami\_owners: "(Optional) The list of owners used to select the AMI of action runner instances."
create\_service\_linked\_role\_spot: (Optional) create the serviced linked role for spot instances that is required by the scale-up lambda.
credit\_specification: "(Optional) The credit specification of the runner instance\_type. Can be unset, `standard` or `unlimited`.
delay\_webhook\_event: "The number of seconds the event accepted by the webhook is invisible on the queue before the scale up lambda will receive the event."
disable\_runner\_autoupdate: "Disable the auto update of the github runner agent. Be aware there is a grace period of 30 days, see also the [GitHub article](https://github.blog/changelog/2022-02-01-github-actions-self-hosted-runners-can-now-disable-automatic-updates/)"
ebs\_optimized: "The EC2 EBS optimized configuration."
enable\_ephemeral\_runners: "Enable ephemeral runners, runners will only be used once."
enable\_job\_queued\_check: "Enables JIT configuration for creating runners instead of registration token based registraton. JIT configuration will only be applied for ephemeral runners. By default JIT confiugration is enabled for ephemeral runners an can be disabled via this override. When running on GHES without support for JIT configuration this variable should be set to true for ephemeral runners."
enable\_on\_demand\_failover\_for\_errors: "Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later."
enable\_organization\_runners: "Register runners to organization, instead of repo level"
enable\_runner\_binaries\_syncer: "Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI."
enable\_ssm\_on\_runners: "Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances."
enable\_userdata: "Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI."
instance\_allocation\_strategy: "The allocation strategy for spot instances. AWS recommends to use `capacity-optimized` however the AWS default is `lowest-price`."
instance\_max\_spot\_price: "Max price price for spot intances per hour. This variable will be passed to the create fleet as max spot price for the fleet."
instance\_target\_capacity\_type: "Default lifecycle used for runner instances, can be either `spot` or `on-demand`."
instance\_types: "List of instance types for the action runner. Defaults are based on runner\_os (al2023 for linux and Windows Server Core for win)."
job\_queue\_retention\_in\_seconds: "The number of seconds the job is held in the queue before it is purged"
minimum\_running\_time\_in\_minutes: "The time an ec2 action runner should be running at minimum before terminated if not busy."
pool\_runner\_owner: "The pool will deploy runners to the GitHub org ID, set this value to the org to which you want the runners deployed. Repo level is not supported."
runner\_additional\_security\_group\_ids: "List of additional security groups IDs to apply to the runner. If added outside the multi\_runner\_config block, the additional security group(s) will be applied to all runner configs. If added inside the multi\_runner\_config, the additional security group(s) will be applied to the individual runner."
runner\_as\_root: "Run the action runner under the root user. Variable `runner_run_as` will be ignored."
runner\_boot\_time\_in\_minutes: "The minimum time for an EC2 runner to boot and register as a runner."
runner\_disable\_default\_labels: "Disable default labels for the runners (os, architecture and `self-hosted`). If enabled, the runner will only have the extra labels provided in `runner_extra_labels`. In case you on own start script is used, this configuration parameter needs to be parsed via SSM."
runner\_extra\_labels: "Extra (custom) labels for the runners (GitHub). Separate each label by a comma. Labels checks on the webhook can be enforced by setting `multi_runner_config.matcherConfig.exactMatch`. GitHub read-only labels should not be provided."
runner\_group\_name: "Name of the runner group."
runner\_name\_prefix: "Prefix for the GitHub runner name."
runner\_run\_as: "Run the GitHub actions agent as user."
runners\_maximum\_count: "The maximum number of runners that will be created. Setting the variable to `-1` desiables the maximum check."
scale\_down\_schedule\_expression: "Scheduler expression to check every x for scale down."
scale\_up\_reserved\_concurrent\_executions: "Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations."
userdata\_template: "Alternative user-data template, replacing the default template. By providing your own user\_data you have to take care of installing all required software, including the action runner. Variables userdata\_pre/post\_install are ignored."
enable\_jit\_config "Overwrite the default behavior for JIT configuration. By default JIT configuration is enabled for ephemeral runners and disabled for non-ephemeral runners. In case of GHES check first if the JIT config API is avaialbe. In case you upgradeing from 3.x to 4.x you can set `enable_jit_config` to `false` to avoid a breaking change when having your own AMI."
enable\_runner\_detailed\_monitoring: "Should detailed monitoring be enabled for the runner. Set this to true if you want to use detailed monitoring. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-cloudwatch-new.html for details."
enable\_cloudwatch\_agent: "Enabling the cloudwatch agent on the ec2 runner instances, the runner contains default config. Configuration can be overridden via `cloudwatch_config`."
cloudwatch\_config: "(optional) Replaces the module default cloudwatch log config. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-Configuration-File-Details.html for details."
userdata\_pre\_install: "Script to be ran before the GitHub Actions runner is installed on the EC2 instances"
userdata\_post\_install: "Script to be ran after the GitHub Actions runner is installed on the EC2 instances"
runner\_hook\_job\_started: "Script to be ran in the runner environment at the beginning of every job"
runner\_hook\_job\_completed: "Script to be ran in the runner environment at the end of every job"
runner\_ec2\_tags: "Map of tags that will be added to the launch template instance tag specifications."
runner\_iam\_role\_managed\_policy\_arns: "Attach AWS or customer-managed IAM policies (by ARN) to the runner IAM role"
vpc\_id: "The VPC for security groups of the action runners. If not set uses the value of `var.vpc_id`."
subnet\_ids: "List of subnets in which the action runners will be launched, the subnets needs to be subnets in the `vpc_id`. If not set, uses the value of `var.subnet_ids`."
idle\_config: "List of time period that can be defined as cron expression to keep a minimum amount of runners active instead of scaling down to 0. By defining this list you can ensure that in time periods that match the cron expression within 5 seconds a runner is kept idle."
runner\_log\_files: "(optional) Replaces the module default cloudwatch log config. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-Configuration-File-Details.html for details."
block\_device\_mappings: "The EC2 instance block device configuration. Takes the following keys: `device_name`, `delete_on_termination`, `volume_type`, `volume_size`, `encrypted`, `iops`, `throughput`, `kms_key_id`, `snapshot_id`."
job\_retry: "Experimental! Can be removed / changed without trigger a major release. Configure job retries. The configuration enables job retries (for ephemeral runners). After creating the insances a message will be published to a job retry queue. The job retry check lambda is checking after a delay if the job is queued. If not the message will be published again on the scale-up (build queue). Using this feature can impact the reate limit of the GitHub app."
pool\_config: "The configuration for updating the pool. The `pool_size` to adjust to by the events triggered by the `schedule_expression`. For example you can configure a cron expression for week days to adjust the pool to 10 and another expression for the weekend to adjust the pool to 1. Use `schedule_expression_timezone` to override the schedule time zone (defaults to UTC)."
}
matcherConfig: {
labelMatchers: "The list of list of labels supported by the runner configuration. `[[self-hosted, linux, x64, example]]`"
exactMatch: "If set to true all labels in the workflow job must match the GitHub labels (os, architecture and `self-hosted`). When false if __any__ workflow label matches it will trigger the webhook."
priority: "If set it defines the priority of the matcher, the matcher with the lowest priority will be evaluated first. Default is 999, allowed values 0-999."
}
redrive\_build\_queue: "Set options to attach (optional) a dead letter queue to the build queue, the queue between the webhook and the scale up lambda. You have the following options. 1. Disable by setting `enabled` to false. 2. Enable by setting `enabled` to `true`, `maxReceiveCount` to a number of max retries."
} |
map(object({
runner_config = object({
runner_os = string
runner_architecture = string
runner_metadata_options = optional(map(any), {
instance_metadata_tags = "enabled"
http_endpoint = "enabled"
http_tokens = "required"
http_put_response_hop_limit = 1
})
ami = optional(object({
filter = optional(map(list(string)), { state = ["available"] })
owners = optional(list(string), ["amazon"])
id_ssm_parameter_arn = optional(string, null)
kms_key_arn = optional(string, null)
}), null) # Defaults to null, in which case the module falls back to individual AMI variables (deprecated)
# Deprecated: Use ami object instead
ami_filter = optional(map(list(string)), { state = ["available"] })
ami_owners = optional(list(string), ["amazon"])
ami_id_ssm_parameter_name = optional(string, null)
ami_kms_key_arn = optional(string, "")
create_service_linked_role_spot = optional(bool, false)
credit_specification = optional(string, null)
delay_webhook_event = optional(number, 30)
disable_runner_autoupdate = optional(bool, false)
ebs_optimized = optional(bool, false)
enable_ephemeral_runners = optional(bool, false)
enable_job_queued_check = optional(bool, null)
enable_on_demand_failover_for_errors = optional(list(string), [])
enable_organization_runners = optional(bool, false)
enable_runner_binaries_syncer = optional(bool, true)
enable_ssm_on_runners = optional(bool, false)
enable_userdata = optional(bool, true)
instance_allocation_strategy = optional(string, "lowest-price")
instance_max_spot_price = optional(string, null)
instance_target_capacity_type = optional(string, "spot")
instance_types = list(string)
job_queue_retention_in_seconds = optional(number, 86400)
minimum_running_time_in_minutes = optional(number, null)
pool_runner_owner = optional(string, null)
runner_as_root = optional(bool, false)
runner_boot_time_in_minutes = optional(number, 5)
runner_disable_default_labels = optional(bool, false)
runner_extra_labels = optional(list(string), [])
runner_group_name = optional(string, "Default")
runner_name_prefix = optional(string, "")
runner_run_as = optional(string, "ec2-user")
runners_maximum_count = number
runner_additional_security_group_ids = optional(list(string), [])
scale_down_schedule_expression = optional(string, "cron(*/5 * * * ? *)")
scale_up_reserved_concurrent_executions = optional(number, 1)
userdata_template = optional(string, null)
userdata_content = optional(string, null)
enable_jit_config = optional(bool, null)
enable_runner_detailed_monitoring = optional(bool, false)
enable_cloudwatch_agent = optional(bool, true)
cloudwatch_config = optional(string, null)
userdata_pre_install = optional(string, "")
userdata_post_install = optional(string, "")
runner_hook_job_started = optional(string, "")
runner_hook_job_completed = optional(string, "")
runner_ec2_tags = optional(map(string), {})
runner_iam_role_managed_policy_arns = optional(list(string), [])
vpc_id = optional(string, null)
subnet_ids = optional(list(string), null)
idle_config = optional(list(object({
cron = string
timeZone = string
idleCount = number
evictionStrategy = optional(string, "oldest_first")
})), [])
cpu_options = optional(object({
core_count = number
threads_per_core = number
}), null)
runner_log_files = optional(list(object({
log_group_name = string
prefix_log_group = bool
file_path = string
log_stream_name = string
})), null)
block_device_mappings = optional(list(object({
delete_on_termination = optional(bool, true)
device_name = optional(string, "/dev/xvda")
encrypted = optional(bool, true)
iops = optional(number)
kms_key_id = optional(string)
snapshot_id = optional(string)
throughput = optional(number)
volume_size = number
volume_type = optional(string, "gp3")
})), [{
volume_size = 30
}])
pool_config = optional(list(object({
schedule_expression = string
schedule_expression_timezone = optional(string)
size = number
})), [])
job_retry = optional(object({
enable = optional(bool, false)
delay_in_seconds = optional(number, 300)
delay_backoff = optional(number, 2)
lambda_memory_size = optional(number, 256)
lambda_timeout = optional(number, 30)
max_attempts = optional(number, 1)
}), {})
})
matcherConfig = object({
labelMatchers = list(list(string))
exactMatch = optional(bool, false)
priority = optional(number, 999)
})
redrive_build_queue = optional(object({
enabled = bool
maxReceiveCount = number
}), {
enabled = false
maxReceiveCount = null
})
}))
| n/a | yes | +| [multi\_runner\_config](#input\_multi\_runner\_config) | multi\_runner\_config = {
runner\_config: {
runner\_os: "The EC2 Operating System type to use for action runner instances (linux,windows)."
runner\_architecture: "The platform architecture of the runner instance\_type."
runner\_metadata\_options: "(Optional) Metadata options for the ec2 runner instances."
ami: "(Optional) AMI configuration for the action runner instances. This object allows you to specify all AMI-related settings in one place."
ami\_filter: "(Optional) List of maps used to create the AMI filter for the action runner AMI. By default amazon linux 2 is used."
ami\_owners: "(Optional) The list of owners used to select the AMI of action runner instances."
create\_service\_linked\_role\_spot: "(Optional) Create the service-linked role for spot instances that is required by the scale-up lambda."
credit\_specification: "(Optional) The credit specification of the runner instance\_type. Can be unset, `standard` or `unlimited`."
delay\_webhook\_event: "The number of seconds the event accepted by the webhook is invisible on the queue before the scale up lambda will receive the event."
disable\_runner\_autoupdate: "Disable the auto update of the github runner agent. Be aware there is a grace period of 30 days, see also the [GitHub article](https://github.blog/changelog/2022-02-01-github-actions-self-hosted-runners-can-now-disable-automatic-updates/)"
ebs\_optimized: "The EC2 EBS optimized configuration."
enable\_ephemeral\_runners: "Enable ephemeral runners, runners will only be used once."
enable\_job\_queued\_check: "Enables JIT configuration for creating runners instead of registration token based registration. JIT configuration will only be applied for ephemeral runners. By default JIT configuration is enabled for ephemeral runners and can be disabled via this override. When running on GHES without support for JIT configuration this variable should be set to true for ephemeral runners."
enable\_on\_demand\_failover\_for\_errors: "Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later."
enable\_organization\_runners: "Register runners to organization, instead of repo level"
enable\_runner\_binaries\_syncer: "Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI."
enable\_ssm\_on\_runners: "Enable to allow access the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances."
enable\_userdata: "Should the userdata script be enabled for the runner. Set this to false if you are using your own prebuilt AMI."
instance\_allocation\_strategy: "The allocation strategy for spot instances. AWS recommends to use `capacity-optimized` however the AWS default is `lowest-price`."
instance\_max\_spot\_price: "Max price for spot instances per hour. This variable will be passed to the create fleet as max spot price for the fleet."
instance\_target\_capacity\_type: "Default lifecycle used for runner instances, can be either `spot` or `on-demand`."
instance\_types: "List of instance types for the action runner. Defaults are based on runner\_os (al2023 for linux and Windows Server Core for win)."
job\_queue\_retention\_in\_seconds: "The number of seconds the job is held in the queue before it is purged"
minimum\_running\_time\_in\_minutes: "The time an ec2 action runner should be running at minimum before terminated if not busy."
pool\_runner\_owner: "The pool will deploy runners to the GitHub org ID, set this value to the org to which you want the runners deployed. Repo level is not supported."
runner\_additional\_security\_group\_ids: "List of additional security groups IDs to apply to the runner. If added outside the multi\_runner\_config block, the additional security group(s) will be applied to all runner configs. If added inside the multi\_runner\_config, the additional security group(s) will be applied to the individual runner."
runner\_as\_root: "Run the action runner under the root user. Variable `runner_run_as` will be ignored."
runner\_boot\_time\_in\_minutes: "The minimum time for an EC2 runner to boot and register as a runner."
runner\_disable\_default\_labels: "Disable default labels for the runners (os, architecture and `self-hosted`). If enabled, the runner will only have the extra labels provided in `runner_extra_labels`. In case your own start script is used, this configuration parameter needs to be parsed via SSM."
runner\_extra\_labels: "Extra (custom) labels for the runners (GitHub). Separate each label by a comma. Labels checks on the webhook can be enforced by setting `multi_runner_config.matcherConfig.exactMatch`. GitHub read-only labels should not be provided."
runner\_group\_name: "Name of the runner group."
runner\_name\_prefix: "Prefix for the GitHub runner name."
runner\_run\_as: "Run the GitHub actions agent as user."
runners\_maximum\_count: "The maximum number of runners that will be created. Setting the variable to `-1` disables the maximum check."
scale\_up\_reserved\_concurrent\_executions: "Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations."
userdata\_template: "Alternative user-data template, replacing the default template. By providing your own user\_data you have to take care of installing all required software, including the action runner. Variables userdata\_pre/post\_install are ignored."
enable\_jit\_config: "Overwrite the default behavior for JIT configuration. By default JIT configuration is enabled for ephemeral runners and disabled for non-ephemeral runners. In case of GHES check first if the JIT config API is available. In case you are upgrading from 3.x to 4.x you can set `enable_jit_config` to `false` to avoid a breaking change when having your own AMI."
enable\_runner\_detailed\_monitoring: "Should detailed monitoring be enabled for the runner. Set this to true if you want to use detailed monitoring. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-cloudwatch-new.html for details."
enable\_cloudwatch\_agent: "Enabling the cloudwatch agent on the ec2 runner instances, the runner contains default config. Configuration can be overridden via `cloudwatch_config`."
cloudwatch\_config: "(optional) Replaces the module default cloudwatch log config. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-Configuration-File-Details.html for details."
userdata\_pre\_install: "Script to be ran before the GitHub Actions runner is installed on the EC2 instances"
userdata\_post\_install: "Script to be ran after the GitHub Actions runner is installed on the EC2 instances"
runner\_hook\_job\_started: "Script to be ran in the runner environment at the beginning of every job"
runner\_hook\_job\_completed: "Script to be ran in the runner environment at the end of every job"
runner\_ec2\_tags: "Map of tags that will be added to the launch template instance tag specifications."
runner\_iam\_role\_managed\_policy\_arns: "Attach AWS or customer-managed IAM policies (by ARN) to the runner IAM role"
vpc\_id: "The VPC for security groups of the action runners. If not set uses the value of `var.vpc_id`."
subnet\_ids: "List of subnets in which the action runners will be launched, the subnets needs to be subnets in the `vpc_id`. If not set, uses the value of `var.subnet_ids`."
idle\_config: "List of time periods that can be defined as cron expressions to keep a minimum amount of runners active instead of scaling down to 0. By defining this list you can ensure that in time periods that match the cron expression within 5 seconds a runner is kept idle."
runner\_log\_files: "(optional) Replaces the module default cloudwatch log config. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-Configuration-File-Details.html for details."
block\_device\_mappings: "The EC2 instance block device configuration. Takes the following keys: `device_name`, `delete_on_termination`, `volume_type`, `volume_size`, `encrypted`, `iops`, `throughput`, `kms_key_id`, `snapshot_id`."
job\_retry: "Experimental! Can be removed / changed without triggering a major release. Configure job retries. The configuration enables job retries (for ephemeral runners). After creating the instances a message will be published to a job retry queue. The job retry check lambda is checking after a delay if the job is queued. If not the message will be published again on the scale-up (build queue). Using this feature can impact the rate limit of the GitHub app."
pool\_config: "The configuration for updating the pool. The `pool_size` to adjust to by the events triggered by the `schedule_expression`. For example you can configure a cron expression for week days to adjust the pool to 10 and another expression for the weekend to adjust the pool to 1. Use `schedule_expression_timezone` to override the schedule time zone (defaults to UTC)."
}
matcherConfig: {
labelMatchers: "The list of list of labels supported by the runner configuration. `[[self-hosted, linux, x64, example]]`"
exactMatch: "If set to true all labels in the workflow job must match the GitHub labels (os, architecture and `self-hosted`). When false if __any__ workflow label matches it will trigger the webhook."
priority: "If set it defines the priority of the matcher, the matcher with the lowest priority will be evaluated first. Default is 999, allowed values 0-999."
}
redrive\_build\_queue: "Set options to attach (optional) a dead letter queue to the build queue, the queue between the webhook and the scale up lambda. You have the following options. 1. Disable by setting `enabled` to false. 2. Enable by setting `enabled` to `true`, `maxReceiveCount` to a number of max retries."
} |
map(object({
runner_config = object({
runner_os = string
runner_architecture = string
runner_metadata_options = optional(map(any), {
instance_metadata_tags = "enabled"
http_endpoint = "enabled"
http_tokens = "required"
http_put_response_hop_limit = 1
})
ami = optional(object({
filter = optional(map(list(string)), { state = ["available"] })
owners = optional(list(string), ["amazon"])
id_ssm_parameter_arn = optional(string, null)
kms_key_arn = optional(string, null)
}), null) # Defaults to null, in which case the module falls back to individual AMI variables (deprecated)
# Deprecated: Use ami object instead
ami_filter = optional(map(list(string)), { state = ["available"] })
ami_owners = optional(list(string), ["amazon"])
ami_id_ssm_parameter_name = optional(string, null)
ami_kms_key_arn = optional(string, "")
create_service_linked_role_spot = optional(bool, false)
credit_specification = optional(string, null)
delay_webhook_event = optional(number, 30)
disable_runner_autoupdate = optional(bool, false)
ebs_optimized = optional(bool, false)
enable_ephemeral_runners = optional(bool, false)
enable_job_queued_check = optional(bool, null)
enable_on_demand_failover_for_errors = optional(list(string), [])
enable_organization_runners = optional(bool, false)
enable_runner_binaries_syncer = optional(bool, true)
enable_ssm_on_runners = optional(bool, false)
enable_userdata = optional(bool, true)
instance_allocation_strategy = optional(string, "lowest-price")
instance_max_spot_price = optional(string, null)
instance_target_capacity_type = optional(string, "spot")
instance_types = list(string)
job_queue_retention_in_seconds = optional(number, 86400)
minimum_running_time_in_minutes = optional(number, null)
pool_runner_owner = optional(string, null)
runner_as_root = optional(bool, false)
runner_boot_time_in_minutes = optional(number, 5)
runner_disable_default_labels = optional(bool, false)
runner_extra_labels = optional(list(string), [])
runner_group_name = optional(string, "Default")
runner_name_prefix = optional(string, "")
runner_run_as = optional(string, "ec2-user")
runners_maximum_count = number
runner_additional_security_group_ids = optional(list(string), [])
scale_up_reserved_concurrent_executions = optional(number, 1)
userdata_template = optional(string, null)
userdata_content = optional(string, null)
enable_jit_config = optional(bool, null)
enable_runner_detailed_monitoring = optional(bool, false)
enable_cloudwatch_agent = optional(bool, true)
cloudwatch_config = optional(string, null)
userdata_pre_install = optional(string, "")
userdata_post_install = optional(string, "")
runner_hook_job_started = optional(string, "")
runner_hook_job_completed = optional(string, "")
runner_ec2_tags = optional(map(string), {})
runner_iam_role_managed_policy_arns = optional(list(string), [])
vpc_id = optional(string, null)
subnet_ids = optional(list(string), null)
idle_config = optional(list(object({
cron = string
timeZone = string
idleCount = number
evictionStrategy = optional(string, "oldest_first")
})), [])
cpu_options = optional(object({
core_count = number
threads_per_core = number
}), null)
runner_log_files = optional(list(object({
log_group_name = string
prefix_log_group = bool
file_path = string
log_stream_name = string
})), null)
block_device_mappings = optional(list(object({
delete_on_termination = optional(bool, true)
device_name = optional(string, "/dev/xvda")
encrypted = optional(bool, true)
iops = optional(number)
kms_key_id = optional(string)
snapshot_id = optional(string)
throughput = optional(number)
volume_size = number
volume_type = optional(string, "gp3")
})), [{
volume_size = 30
}])
pool_config = optional(list(object({
schedule_expression = string
schedule_expression_timezone = optional(string)
size = number
})), [])
job_retry = optional(object({
enable = optional(bool, false)
delay_in_seconds = optional(number, 300)
delay_backoff = optional(number, 2)
lambda_memory_size = optional(number, 256)
lambda_timeout = optional(number, 30)
max_attempts = optional(number, 1)
}), {})
})
matcherConfig = object({
labelMatchers = list(list(string))
exactMatch = optional(bool, false)
priority = optional(number, 999)
})
redrive_build_queue = optional(object({
enabled = bool
maxReceiveCount = number
}), {
enabled = false
maxReceiveCount = null
})
}))
| n/a | yes | | [pool\_lambda\_reserved\_concurrent\_executions](#input\_pool\_lambda\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no | | [pool\_lambda\_timeout](#input\_pool\_lambda\_timeout) | Time out for the pool lambda in seconds. | `number` | `60` | no | | [prefix](#input\_prefix) | The prefix used for naming resources | `string` | `"github-actions"` | no | @@ -170,6 +171,7 @@ module "multi-runner" { | [runners\_scale\_up\_lambda\_timeout](#input\_runners\_scale\_up\_lambda\_timeout) | Time out for the scale up lambda in seconds. | `number` | `30` | no | | [runners\_ssm\_housekeeper](#input\_runners\_ssm\_housekeeper) | Configuration for the SSM housekeeper lambda. This lambda deletes token / JIT config from SSM.

`schedule_expression`: is used to configure the schedule for the lambda.
`enabled`: enable or disable the lambda trigger via the EventBridge.
`lambda_memory_size`: lambda memory size limit.
`lambda_timeout`: timeout for the lambda in seconds.
`config`: configuration for the lambda function. Token path will be read by default from the module. |
object({
schedule_expression = optional(string, "rate(1 day)")
enabled = optional(bool, true)
lambda_memory_size = optional(number, 512)
lambda_timeout = optional(number, 60)
config = object({
tokenPath = optional(string)
minimumDaysOld = optional(number, 1)
dryRun = optional(bool, false)
})
})
|
{
"config": {}
}
| no | | [scale\_down\_lambda\_memory\_size](#input\_scale\_down\_lambda\_memory\_size) | Memory size limit in MB for scale down. | `number` | `512` | no | +| [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. | `string` | `"cron(*/5 * * * ? *)"` | no | | [scale\_up\_lambda\_memory\_size](#input\_scale\_up\_lambda\_memory\_size) | Memory size limit in MB for scale\_up lambda. | `number` | `512` | no | | [ssm\_paths](#input\_ssm\_paths) | The root path used in SSM to store configuration and secrets. |
object({
root = optional(string, "github-action-runners")
app = optional(string, "app")
runners = optional(string, "runners")
webhook = optional(string, "webhook")
})
| `{}` | no | | [state\_event\_rule\_binaries\_syncer](#input\_state\_event\_rule\_binaries\_syncer) | Option to disable EventBridge Lambda trigger for the binary syncer, useful to stop automatic updates of binary distribution | `string` | `"ENABLED"` | no | @@ -196,6 +198,7 @@ module "multi-runner" { | [instance\_termination\_handler](#output\_instance\_termination\_handler) | n/a | | [instance\_termination\_watcher](#output\_instance\_termination\_watcher) | n/a | | [runners\_map](#output\_runners\_map) | n/a | +| [scale\_down](#output\_scale\_down) | Lambda to scale-down runners | | [ssm\_parameters](#output\_ssm\_parameters) | n/a | | [webhook](#output\_webhook) | n/a | diff --git a/modules/multi-runner/main.tf b/modules/multi-runner/main.tf index 905cc7f793..efafb629c4 100644 --- a/modules/multi-runner/main.tf +++ b/modules/multi-runner/main.tf @@ -24,3 +24,51 @@ resource "random_string" "random" { special = false upper = false } + +locals { + scale_down_environment_configs = [ + for k, v in local.runner_config : { + environment = "${var.prefix}-${k}" + idle_config = v.runner_config.idle_config + minimum_running_time_in_minutes = coalesce( + v.runner_config.minimum_running_time_in_minutes, + v.runner_config.runner_os == "windows" ? 
15 : 5 + ) + runner_boot_time_in_minutes = v.runner_config.runner_boot_time_in_minutes + } + ] +} + +module "scale_down" { + source = "../runners/scale-down" + + environments = local.scale_down_environment_configs + prefix = var.prefix + schedule_expression = var.scale_down_schedule_expression + + github_app_parameters = local.github_app_parameters + lambda_s3_bucket = var.lambda_s3_bucket + runners_lambda_s3_key = var.runners_lambda_s3_key + runners_lambda_s3_object_version = var.runners_lambda_s3_object_version + lambda_runtime = var.lambda_runtime + lambda_timeout = var.runners_scale_down_lambda_timeout + lambda_memory_size = var.scale_down_lambda_memory_size + lambda_architecture = var.lambda_architecture + lambda_zip = var.runners_lambda_zip + lambda_subnet_ids = var.lambda_subnet_ids + lambda_security_group_ids = var.lambda_security_group_ids + lambda_tags = var.lambda_tags + tracing_config = var.tracing_config + logging_retention_in_days = var.logging_retention_in_days + logging_kms_key_id = var.logging_kms_key_id + kms_key_arn = coalesce(var.kms_key_arn, "") + ghes_url = var.ghes_url + ghes_ssl_verify = var.ghes_ssl_verify + user_agent = var.user_agent + log_level = var.log_level + metrics = var.metrics + role_path = var.role_path + role_permissions_boundary = var.role_permissions_boundary + aws_partition = var.aws_partition + tags = local.tags +} diff --git a/modules/multi-runner/outputs.tf b/modules/multi-runner/outputs.tf index 2f2b1d3458..7d15c14ad8 100644 --- a/modules/multi-runner/outputs.tf +++ b/modules/multi-runner/outputs.tf @@ -7,13 +7,10 @@ output "runners_map" { launch_template_ami_id = runner.launch_template.image_id lambda_up = runner.lambda_scale_up lambda_up_log_group = runner.lambda_scale_up_log_group - lambda_down = runner.lambda_scale_down - lambda_down_log_group = runner.lambda_scale_down_log_group lambda_pool = runner.lambda_pool lambda_pool_log_group = runner.lambda_pool_log_group role_runner = runner.role_runner role_scale_up = 
runner.role_scale_up - role_scale_down = runner.role_scale_down role_pool = runner.role_pool runners_log_groups = runner.runners_log_groups logfiles = runner.logfiles @@ -21,6 +18,16 @@ output "runners_map" { } } +output "scale_down" { + description = "Lambda to scale-down runners" + value = { + lambda = module.scale_down.lambda + lambda_log_group = module.scale_down.lambda_log_group + role = module.scale_down.role + event_rule = module.scale_down.cloudwatch_event_rule + } +} + output "binaries_syncer_map" { value = { for runner_binary_key, runner_binary in module.runner_binaries : runner_binary_key => { lambda = runner_binary.lambda diff --git a/modules/multi-runner/runners.tf b/modules/multi-runner/runners.tf index 811ab36260..7a2c84c5a6 100644 --- a/modules/multi-runner/runners.tf +++ b/modules/multi-runner/runners.tf @@ -42,7 +42,7 @@ module "runners" { disable_runner_autoupdate = each.value.runner_config.disable_runner_autoupdate enable_managed_runner_security_group = var.enable_managed_runner_security_group enable_runner_detailed_monitoring = each.value.runner_config.enable_runner_detailed_monitoring - scale_down_schedule_expression = each.value.runner_config.scale_down_schedule_expression + scale_down_schedule_expression = null # multi-runner handles scaling down all runners minimum_running_time_in_minutes = each.value.runner_config.minimum_running_time_in_minutes runner_boot_time_in_minutes = each.value.runner_config.runner_boot_time_in_minutes runner_disable_default_labels = each.value.runner_config.runner_disable_default_labels diff --git a/modules/multi-runner/variables.tf b/modules/multi-runner/variables.tf index edbdb33059..1c40672eb7 100644 --- a/modules/multi-runner/variables.tf +++ b/modules/multi-runner/variables.tf @@ -104,7 +104,6 @@ variable "multi_runner_config" { runner_run_as = optional(string, "ec2-user") runners_maximum_count = number runner_additional_security_group_ids = optional(list(string), []) - scale_down_schedule_expression = 
optional(string, "cron(*/5 * * * ? *)") scale_up_reserved_concurrent_executions = optional(number, 1) userdata_template = optional(string, null) userdata_content = optional(string, null) @@ -213,10 +212,9 @@ variable "multi_runner_config" { runner_name_prefix: "Prefix for the GitHub runner name." runner_run_as: "Run the GitHub actions agent as user." runners_maximum_count: "The maximum number of runners that will be created. Setting the variable to `-1` desiables the maximum check." - scale_down_schedule_expression: "Scheduler expression to check every x for scale down." scale_up_reserved_concurrent_executions: "Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations." userdata_template: "Alternative user-data template, replacing the default template. By providing your own user_data you have to take care of installing all required software, including the action runner. Variables userdata_pre/post_install are ignored." - enable_jit_config "Overwrite the default behavior for JIT configuration. By default JIT configuration is enabled for ephemeral runners and disabled for non-ephemeral runners. In case of GHES check first if the JIT config API is avaialbe. In case you upgradeing from 3.x to 4.x you can set `enable_jit_config` to `false` to avoid a breaking change when having your own AMI." + enable_jit_config: "Overwrite the default behavior for JIT configuration. By default JIT configuration is enabled for ephemeral runners and disabled for non-ephemeral runners. In case of GHES check first if the JIT config API is avaialbe. In case you upgradeing from 3.x to 4.x you can set `enable_jit_config` to `false` to avoid a breaking change when having your own AMI." enable_runner_detailed_monitoring: "Should detailed monitoring be enabled for the runner. Set this to true if you want to use detailed monitoring. 
See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-cloudwatch-new.html for details." enable_cloudwatch_agent: "Enabling the cloudwatch agent on the ec2 runner instances, the runner contains default config. Configuration can be overridden via `cloudwatch_config`." cloudwatch_config: "(optional) Replaces the module default cloudwatch log config. See https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Agent-Configuration-File-Details.html for details." @@ -268,6 +266,12 @@ variable "runners_scale_down_lambda_timeout" { default = 60 } +variable "scale_down_schedule_expression" { + description = "Scheduler expression to check every x for scale down." + type = string + default = "cron(*/5 * * * ? *)" +} + variable "webhook_lambda_zip" { description = "File location of the webhook lambda zip file." type = string diff --git a/modules/runners/README.md b/modules/runners/README.md index cf62c2c96a..f1bf526255 100644 --- a/modules/runners/README.md +++ b/modules/runners/README.md @@ -14,12 +14,6 @@ The action runners are created via a launch template; in the launch template onl The scale up lambda is triggered by events on a SQS queue. Events on this queue are delayed, which will give the workflow some time to start running on available runners. For each event the lambda will check if the workflow is still queued and no other limits are reached. In that case the lambda will create a new EC2 instance. The lambda only needs to know which launch template to use and which subnets are available. From the available subnets a random one will be chosen. Once the instance is created the event is assumed as handled, and we assume the workflow wil start at some moment once the created instance is ready. -### Lambda scale down - -The scale down lambda is triggered via a CloudWatch event. 
The event is triggered by a cron expression defined in the variable `scale_down_schedule_expression` (https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/ScheduledEvents.html). For scaling down GitHub does not provide a good API yet, therefore we run the scaling down based on this event every x minutes. Each time the lambda is triggered it tries to remove all runners older than x minutes (configurable) managed in this deployment. In case the runner can be removed from GitHub, which means it is not executing a workflow, the lambda will terminate the EC2 instance. - ---8<-- "modules/runners/scale-down-state-diagram.md:mkdocs_scale_down_state_diagram" - ## Lambda Function The Lambda function is written in [TypeScript](https://www.typescriptlang.org/) and requires Node 12.x and yarn. Sources are located in [./lambdas/runners]. Two lambda functions share the same sources, there is one entry point for `scaleDown` and another one for `scaleUp`. @@ -67,23 +61,20 @@ yarn run dist |------|--------|---------| | [job\_retry](#module\_job\_retry) | ./job-retry | n/a | | [pool](#module\_pool) | ./pool | n/a | +| [scale\_down](#module\_scale\_down) | ./scale-down | n/a | ## Resources | Name | Type | |------|------| -| [aws_cloudwatch_event_rule.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | | [aws_cloudwatch_event_rule.ssm_housekeeper](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | -| [aws_cloudwatch_event_target.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | | [aws_cloudwatch_event_target.ssm_housekeeper](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | | [aws_cloudwatch_log_group.gh_runners](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | 
resource | -| [aws_cloudwatch_log_group.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | | [aws_cloudwatch_log_group.scale_up](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | | [aws_cloudwatch_log_group.ssm_housekeeper](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | | [aws_iam_instance_profile.runner](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_instance_profile) | resource | | [aws_iam_policy.ami_id_ssm_parameter_read](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_policy) | resource | | [aws_iam_role.runner](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | -| [aws_iam_role.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role.scale_up](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role.ssm_housekeeper](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | | [aws_iam_role_policy.cloudwatch](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | @@ -93,9 +84,6 @@ yarn run dist | [aws_iam_role_policy.ec2](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.job_retry_sqs_publish](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.runner_session_manager_aws_managed](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | -| 
[aws_iam_role_policy.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | -| [aws_iam_role_policy.scale_down_logging](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | -| [aws_iam_role_policy.scale_down_xray](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.scale_up](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.scale_up_logging](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy.scale_up_xray](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | @@ -106,15 +94,12 @@ yarn run dist | [aws_iam_role_policy.ssm_parameters](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | | [aws_iam_role_policy_attachment.ami_id_ssm_parameter_read](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_iam_role_policy_attachment.managed_policies](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | -| [aws_iam_role_policy_attachment.scale_down_vpc_execution_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_iam_role_policy_attachment.scale_up_vpc_execution_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_iam_role_policy_attachment.ssm_housekeeper_vpc_execution_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | 
[aws_iam_role_policy_attachment.xray_tracing](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_lambda_event_source_mapping.scale_up](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_event_source_mapping) | resource | -| [aws_lambda_function.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | | [aws_lambda_function.scale_up](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | | [aws_lambda_function.ssm_housekeeper](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | -| [aws_lambda_permission.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | | [aws_lambda_permission.scale_runners_lambda](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | | [aws_lambda_permission.ssm_housekeeper](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | | [aws_launch_template.runner](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/launch_template) | resource | @@ -221,7 +206,7 @@ yarn run dist | [runners\_lambda\_s3\_object\_version](#input\_runners\_lambda\_s3\_object\_version) | S3 object version for runners lambda function. Useful if S3 versioning is enabled on source bucket. | `string` | `null` | no | | [runners\_maximum\_count](#input\_runners\_maximum\_count) | The maximum number of runners that will be created. Setting the variable to `-1` disables the maximum check. | `number` | `3` | no | | [s3\_runner\_binaries](#input\_s3\_runner\_binaries) | Bucket details for cached GitHub binary. |
object({
arn = string
id = string
key = string
})
| n/a | yes | -| [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. | `string` | `"cron(*/5 * * * ? *)"` | no | +| [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. Set to null to disable scale-down Lambda creation. | `string` | `"cron(*/5 * * * ? *)"` | no | | [scale\_up\_reserved\_concurrent\_executions](#input\_scale\_up\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no | | [sqs\_build\_queue](#input\_sqs\_build\_queue) | SQS queue to consume accepted build events. |
object({
arn = string
url = string
})
| n/a | yes | | [ssm\_housekeeper](#input\_ssm\_housekeeper) | Configuration for the SSM housekeeper lambda. This lambda deletes token / JIT config from SSM.

`schedule_expression`: is used to configure the schedule for the lambda.
`state`: state of the cloudwatch event rule. Valid values are `DISABLED`, `ENABLED`, and `ENABLED_WITH_ALL_CLOUDTRAIL_MANAGEMENT_EVENTS`.
`lambda_memory_size`: lambda memory size limit.
`lambda_timeout`: timeout for the lambda in seconds.
`config`: configuration for the lambda function. Token path will be read by default from the module. |
object({
schedule_expression = optional(string, "rate(1 day)")
state = optional(string, "ENABLED")
lambda_memory_size = optional(number, 512)
lambda_timeout = optional(number, 60)
config = object({
tokenPath = optional(string)
minimumDaysOld = optional(number, 1)
dryRun = optional(bool, false)
})
})
|
{
"config": {}
}
| no | diff --git a/modules/runners/outputs.tf b/modules/runners/outputs.tf index 8f366dce90..407cfbe0ca 100644 --- a/modules/runners/outputs.tf +++ b/modules/runners/outputs.tf @@ -19,15 +19,15 @@ output "role_scale_up" { } output "lambda_scale_down" { - value = aws_lambda_function.scale_down + value = try(module.scale_down[0].lambda, null) } output "lambda_scale_down_log_group" { - value = aws_cloudwatch_log_group.scale_down + value = try(module.scale_down[0].lambda_log_group, null) } output "role_scale_down" { - value = aws_iam_role.scale_down + value = try(module.scale_down[0].role, null) } output "lambda_pool" { diff --git a/modules/runners/scale-down-state-diagram.md b/modules/runners/scale-down-state-diagram.md deleted file mode 100644 index b4f260eb2a..0000000000 --- a/modules/runners/scale-down-state-diagram.md +++ /dev/null @@ -1,150 +0,0 @@ -# GitHub Actions Runner Scale-Down State Diagram - - - -The scale-down Lambda function runs on a scheduled basis (every 5 minutes by default) to manage GitHub Actions runner instances. It performs a two-phase cleanup process: first terminating confirmed orphaned instances, then evaluating active runners to maintain the desired idle capacity while removing unnecessary instances. 
- -```mermaid -stateDiagram-v2 - [*] --> ScheduledExecution : Cron Trigger every 5 min - - ScheduledExecution --> Phase1_OrphanTermination : Start Phase 1 - - state Phase1_OrphanTermination { - [*] --> ListOrphanInstances : Query EC2 for ghr orphan true - - ListOrphanInstances --> CheckOrphanType : For each orphan - - state CheckOrphanType <> - CheckOrphanType --> HasRunnerIdTag : Has ghr github runner id - CheckOrphanType --> TerminateOrphan : No runner ID tag - - HasRunnerIdTag --> LastChanceCheck : Query GitHub API - - state LastChanceCheck <> - LastChanceCheck --> ConfirmedOrphan : Offline and busy - LastChanceCheck --> FalsePositive : Exists and not problematic - - ConfirmedOrphan --> TerminateOrphan - FalsePositive --> RemoveOrphanTag - - TerminateOrphan --> NextOrphan : Continue processing - RemoveOrphanTag --> NextOrphan - - NextOrphan --> CheckOrphanType : More orphans? - NextOrphan --> Phase2_ActiveRunners : All processed - } - - Phase1_OrphanTermination --> Phase2_ActiveRunners : Phase 1 Complete - - state Phase2_ActiveRunners { - [*] --> ListActiveRunners : Query non-orphan EC2 instances - - ListActiveRunners --> GroupByOwner : Sort by owner and repo - - GroupByOwner --> ProcessOwnerGroup : For each owner - - state ProcessOwnerGroup { - [*] --> SortByStrategy : Apply eviction strategy - SortByStrategy --> ProcessRunner : Oldest first or newest first - - ProcessRunner --> QueryGitHub : Get GitHub runners for owner - - QueryGitHub --> MatchRunner : Find runner by instance ID suffix - - state MatchRunner <> - MatchRunner --> FoundInGitHub : Runner exists in GitHub - MatchRunner --> NotFoundInGitHub : Runner not in GitHub - - state FoundInGitHub { - [*] --> CheckMinimumTime : Has minimum runtime passed? 
- - state CheckMinimumTime <> - CheckMinimumTime --> TooYoung : Runtime less than minimum - CheckMinimumTime --> CheckIdleQuota : Runtime greater than or equal to minimum - - TooYoung --> NextRunner - - state CheckIdleQuota <> - CheckIdleQuota --> KeepIdle : Idle quota available - CheckIdleQuota --> CheckBusyState : Quota full - - KeepIdle --> NextRunner - - state CheckBusyState <> - CheckBusyState --> KeepBusy : Runner busy - CheckBusyState --> TerminateIdle : Runner idle - - KeepBusy --> NextRunner - TerminateIdle --> DeregisterFromGitHub - DeregisterFromGitHub --> TerminateInstance - TerminateInstance --> NextRunner - } - - state NotFoundInGitHub { - [*] --> CheckBootTime : Has boot time exceeded? - - state CheckBootTime <> - CheckBootTime --> StillBooting : Boot time less than threshold - CheckBootTime --> MarkOrphan : Boot time greater than or equal to threshold - - StillBooting --> NextRunner - MarkOrphan --> TagAsOrphan : Set ghr orphan true - TagAsOrphan --> NextRunner - } - - NextRunner --> ProcessRunner : More runners in group? - NextRunner --> NextOwnerGroup : Group complete - } - - NextOwnerGroup --> ProcessOwnerGroup : More owner groups? 
- NextOwnerGroup --> ExecutionComplete : All groups processed - } - - Phase2_ActiveRunners --> ExecutionComplete : Phase 2 Complete - - ExecutionComplete --> [*] : Wait for next cron trigger - - note right of LastChanceCheck - Uses ghr github runner id tag - for precise GitHub API lookup - end note - - note right of MatchRunner - Matches GitHub runner name - ending with EC2 instance ID - end note - - note right of CheckMinimumTime - Minimum running time in minutes - (Linux: 5min, Windows: 15min) - end note - - note right of CheckBootTime - Runner boot time in minutes - Default configuration value - end note -``` - - - -## Key Decision Points - -| State | Condition | Action | -|-------|-----------|--------| -| **Orphan w/ Runner ID** | GitHub: offline + busy | Terminate (confirmed orphan) | -| **Orphan w/ Runner ID** | GitHub: exists + healthy | Remove orphan tag (false positive) | -| **Orphan w/o Runner ID** | Always | Terminate (no way to verify) | -| **Active Runner Found** | Runtime < minimum | Keep (too young) | -| **Active Runner Found** | Idle quota available | Keep as idle | -| **Active Runner Found** | Quota full + idle | Terminate + deregister | -| **Active Runner Found** | Quota full + busy | Keep running | -| **Active Runner Missing** | Boot time exceeded | Mark as orphan | -| **Active Runner Missing** | Still booting | Wait | - -## Configuration Parameters - -- **Cron Schedule**: `cron(*/5 * * * ? *)` (every 5 minutes) -- **Minimum Runtime**: Linux 5min, Windows 15min -- **Boot Timeout**: Configurable via `runner_boot_time_in_minutes` -- **Idle Config**: Per-environment configuration for desired idle runners diff --git a/modules/runners/scale-down.tf b/modules/runners/scale-down.tf index d274e3d4f1..cd72483a60 100644 --- a/modules/runners/scale-down.tf +++ b/modules/runners/scale-down.tf @@ -4,122 +4,46 @@ locals { "windows" = 15 "linux" = 5 } -} -resource "aws_lambda_function" "scale_down" { - s3_bucket = var.lambda_s3_bucket != null ? 
var.lambda_s3_bucket : null - s3_key = var.runners_lambda_s3_key != null ? var.runners_lambda_s3_key : null - s3_object_version = var.runners_lambda_s3_object_version != null ? var.runners_lambda_s3_object_version : null - filename = var.lambda_s3_bucket == null ? local.lambda_zip : null - source_code_hash = var.lambda_s3_bucket == null ? filebase64sha256(local.lambda_zip) : null - function_name = "${var.prefix}-scale-down" - role = aws_iam_role.scale_down.arn - handler = "index.scaleDownHandler" - runtime = var.lambda_runtime - timeout = var.lambda_timeout_scale_down - tags = merge(local.tags, var.lambda_tags) - memory_size = var.lambda_scale_down_memory_size - architectures = [var.lambda_architecture] - - environment { - variables = { - ENVIRONMENT = var.prefix - ENABLE_METRIC_GITHUB_APP_RATE_LIMIT = var.metrics.enable && var.metrics.metric.enable_github_app_rate_limit - GHES_URL = var.ghes_url - USER_AGENT = var.user_agent - LOG_LEVEL = var.log_level - MINIMUM_RUNNING_TIME_IN_MINUTES = coalesce(var.minimum_running_time_in_minutes, local.min_runtime_defaults[var.runner_os]) - NODE_TLS_REJECT_UNAUTHORIZED = var.ghes_url != null && !var.ghes_ssl_verify ? 0 : 1 - PARAMETER_GITHUB_APP_ID_NAME = var.github_app_parameters.id.name - PARAMETER_GITHUB_APP_KEY_BASE64_NAME = var.github_app_parameters.key_base64.name - POWERTOOLS_LOGGER_LOG_EVENT = var.log_level == "debug" ? "true" : "false" - RUNNER_BOOT_TIME_IN_MINUTES = var.runner_boot_time_in_minutes - SCALE_DOWN_CONFIG = jsonencode(var.idle_config) - POWERTOOLS_SERVICE_NAME = "runners-scale-down" - POWERTOOLS_METRICS_NAMESPACE = var.metrics.namespace - POWERTOOLS_TRACE_ENABLED = var.tracing_config.mode != null ? true : false - POWERTOOLS_TRACER_CAPTURE_HTTPS_REQUESTS = var.tracing_config.capture_http_requests - POWERTOOLS_TRACER_CAPTURE_ERROR = var.tracing_config.capture_error - } - } - - dynamic "vpc_config" { - for_each = var.lambda_subnet_ids != null && var.lambda_security_group_ids != null ? 
[true] : [] - content { - security_group_ids = var.lambda_security_group_ids - subnet_ids = var.lambda_subnet_ids - } - } - dynamic "tracing_config" { - for_each = var.tracing_config.mode != null ? [true] : [] - content { - mode = var.tracing_config.mode - } + scale_down_environment_config = { + environment = var.prefix + idle_config = var.idle_config + minimum_running_time_in_minutes = coalesce(var.minimum_running_time_in_minutes, local.min_runtime_defaults[var.runner_os]) + runner_boot_time_in_minutes = var.runner_boot_time_in_minutes } } -resource "aws_cloudwatch_log_group" "scale_down" { - name = "/aws/lambda/${aws_lambda_function.scale_down.function_name}" - retention_in_days = var.logging_retention_in_days - kms_key_id = var.logging_kms_key_id - tags = var.tags -} +module "scale_down" { + count = var.scale_down_schedule_expression != null ? 1 : 0 + source = "./scale-down" -resource "aws_cloudwatch_event_rule" "scale_down" { - name = "${var.prefix}-scale-down-rule" + environments = [local.scale_down_environment_config] + prefix = var.prefix schedule_expression = var.scale_down_schedule_expression - tags = var.tags -} - -resource "aws_cloudwatch_event_target" "scale_down" { - rule = aws_cloudwatch_event_rule.scale_down.name - arn = aws_lambda_function.scale_down.arn -} - -resource "aws_lambda_permission" "scale_down" { - statement_id = "AllowExecutionFromCloudWatch" - action = "lambda:InvokeFunction" - function_name = aws_lambda_function.scale_down.function_name - principal = "events.amazonaws.com" - source_arn = aws_cloudwatch_event_rule.scale_down.arn -} - -resource "aws_iam_role" "scale_down" { - name = "${substr("${var.prefix}-scale-down-lambda", 0, 54)}-${substr(md5("${var.prefix}-scale-down-lambda"), 0, 8)}" - assume_role_policy = data.aws_iam_policy_document.lambda_assume_role_policy.json - path = local.role_path - permissions_boundary = var.role_permissions_boundary - tags = local.tags -} - -resource "aws_iam_role_policy" "scale_down" { - name = 
"scale-down-policy" - role = aws_iam_role.scale_down.name - policy = templatefile("${path.module}/policies/lambda-scale-down.json", { - environment = var.prefix - github_app_id_arn = var.github_app_parameters.id.arn - github_app_key_base64_arn = var.github_app_parameters.key_base64.arn - kms_key_arn = local.kms_key_arn - }) -} - -resource "aws_iam_role_policy" "scale_down_logging" { - name = "logging-policy" - role = aws_iam_role.scale_down.name - policy = templatefile("${path.module}/policies/lambda-cloudwatch.json", { - log_group_arn = aws_cloudwatch_log_group.scale_down.arn - }) -} - -resource "aws_iam_role_policy_attachment" "scale_down_vpc_execution_role" { - count = length(var.lambda_subnet_ids) > 0 ? 1 : 0 - role = aws_iam_role.scale_down.name - policy_arn = "arn:${var.aws_partition}:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole" -} -resource "aws_iam_role_policy" "scale_down_xray" { - count = var.tracing_config.mode != null ? 1 : 0 - name = "xray-policy" - policy = data.aws_iam_policy_document.lambda_xray[0].json - role = aws_iam_role.scale_down.name + github_app_parameters = var.github_app_parameters + lambda_s3_bucket = var.lambda_s3_bucket + runners_lambda_s3_key = var.runners_lambda_s3_key + runners_lambda_s3_object_version = var.runners_lambda_s3_object_version + lambda_runtime = var.lambda_runtime + lambda_timeout = var.lambda_timeout_scale_down + lambda_memory_size = var.lambda_scale_down_memory_size + lambda_architecture = var.lambda_architecture + lambda_zip = local.lambda_zip + lambda_subnet_ids = var.lambda_subnet_ids + lambda_security_group_ids = var.lambda_security_group_ids + lambda_tags = var.lambda_tags + tracing_config = var.tracing_config + logging_retention_in_days = var.logging_retention_in_days + logging_kms_key_id = var.logging_kms_key_id + kms_key_arn = local.kms_key_arn + ghes_url = var.ghes_url + ghes_ssl_verify = var.ghes_ssl_verify + user_agent = var.user_agent + log_level = var.log_level + metrics = var.metrics + 
role_path = local.role_path + role_permissions_boundary = var.role_permissions_boundary + aws_partition = var.aws_partition + tags = local.tags } diff --git a/modules/runners/scale-down/README.md b/modules/runners/scale-down/README.md new file mode 100644 index 0000000000..c2906801b3 --- /dev/null +++ b/modules/runners/scale-down/README.md @@ -0,0 +1,84 @@ +# Module - Scale Down + +The scale down lambda is triggered via a CloudWatch event. The event is triggered by a cron expression defined in the variable `scale_down_schedule_expression` (https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/ScheduledEvents.html). For scaling down GitHub does not provide a good API yet, therefore we run the scaling down based on this event every x minutes. Each time the lambda is triggered it tries to remove all runners older than x minutes (configurable) managed in this deployment. In case the runner can be removed from GitHub, which means it is not executing a workflow, the lambda will terminate the EC2 instance. + +## Multi-Environment Support + +This module supports managing multiple runner environments (configurations) from a single Lambda function. When multiple environments are configured, the Lambda processes all environments sequentially in a single invocation. GitHub API calls are cached by organization/repository owner during the Lambda execution. When multiple environments share the same GitHub organization or repository, the list of runners is fetched from the GitHub API once and reused across those environments. + +--8<-- "modules/runners/scale-down/scale-down-state-diagram.md:mkdocs_scale_down_state_diagram" + + +## Requirements + +No requirements. + +## Providers + +| Name | Version | +|------|---------| +| [aws](#provider\_aws) | n/a | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [aws_cloudwatch_event_rule.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_rule) | resource | +| [aws_cloudwatch_event_target.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_event_target) | resource | +| [aws_cloudwatch_log_group.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/cloudwatch_log_group) | resource | +| [aws_iam_role.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role) | resource | +| [aws_iam_role_policy.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_iam_role_policy.scale_down_logging](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_iam_role_policy.scale_down_xray](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy) | resource | +| [aws_iam_role_policy_attachment.scale_down_vpc_execution_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | +| [aws_lambda_function.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | +| [aws_lambda_permission.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | +| [aws_iam_policy_document.lambda_assume_role_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_iam_policy_document.lambda_xray](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | 
+|------|-------------|------|---------|:--------:| +| [aws\_partition](#input\_aws\_partition) | AWS partition | `string` | n/a | yes | +| [environments](#input\_environments) | List of environment configurations for scale-down |
list(object({
environment = string
idle_config = list(object({
cron = string
timeZone = string
idleCount = number
evictionStrategy = optional(string, "oldest_first")
}))
minimum_running_time_in_minutes = number
runner_boot_time_in_minutes = number
}))
| n/a | yes | +| [ghes\_ssl\_verify](#input\_ghes\_ssl\_verify) | Verify GitHub Enterprise Server SSL certificate | `bool` | `true` | no | +| [ghes\_url](#input\_ghes\_url) | GitHub Enterprise Server URL | `string` | `null` | no | +| [github\_app\_parameters](#input\_github\_app\_parameters) | GitHub App SSM parameters |
object({
id = object({
name = string
arn = string
})
key_base64 = object({
name = string
arn = string
})
})
| n/a | yes | +| [kms\_key\_arn](#input\_kms\_key\_arn) | KMS key ARN for SSM parameter decryption | `string` | `""` | no | +| [lambda\_architecture](#input\_lambda\_architecture) | Lambda architecture (x86\_64 or arm64) | `string` | n/a | yes | +| [lambda\_memory\_size](#input\_lambda\_memory\_size) | Lambda memory size in MB | `number` | n/a | yes | +| [lambda\_runtime](#input\_lambda\_runtime) | Lambda runtime | `string` | n/a | yes | +| [lambda\_s3\_bucket](#input\_lambda\_s3\_bucket) | S3 bucket for Lambda deployment package | `string` | `null` | no | +| [lambda\_security\_group\_ids](#input\_lambda\_security\_group\_ids) | List of security group IDs for Lambda VPC configuration | `list(string)` | `[]` | no | +| [lambda\_subnet\_ids](#input\_lambda\_subnet\_ids) | List of subnet IDs for Lambda VPC configuration | `list(string)` | `[]` | no | +| [lambda\_tags](#input\_lambda\_tags) | Tags for Lambda function | `map(string)` | `{}` | no | +| [lambda\_timeout](#input\_lambda\_timeout) | Lambda timeout in seconds | `number` | n/a | yes | +| [lambda\_zip](#input\_lambda\_zip) | Path to Lambda deployment package | `string` | n/a | yes | +| [log\_level](#input\_log\_level) | Log level for Lambda function | `string` | `"info"` | no | +| [logging\_kms\_key\_id](#input\_logging\_kms\_key\_id) | KMS key ID for CloudWatch log encryption | `string` | `null` | no | +| [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | CloudWatch log retention in days | `number` | n/a | yes | +| [metrics](#input\_metrics) | Metrics configuration |
object({
enable = bool
namespace = string
metric = object({
enable_github_app_rate_limit = bool
})
})
| n/a | yes | +| [prefix](#input\_prefix) | Prefix for Lambda function name | `string` | n/a | yes | +| [role\_path](#input\_role\_path) | IAM role path | `string` | n/a | yes | +| [role\_permissions\_boundary](#input\_role\_permissions\_boundary) | IAM role permissions boundary ARN | `string` | `null` | no | +| [runners\_lambda\_s3\_key](#input\_runners\_lambda\_s3\_key) | S3 key for Lambda deployment package | `string` | `null` | no | +| [runners\_lambda\_s3\_object\_version](#input\_runners\_lambda\_s3\_object\_version) | S3 object version for Lambda deployment package | `string` | `null` | no | +| [schedule\_expression](#input\_schedule\_expression) | CloudWatch Event schedule expression | `string` | `"cron(*/5 * * * ? *)"` | no | +| [tags](#input\_tags) | Tags to apply to resources | `map(string)` | `{}` | no | +| [tracing\_config](#input\_tracing\_config) | Lambda tracing configuration |
object({
mode = optional(string, null)
capture_http_requests = optional(string, "false")
capture_error = optional(string, "false")
})
| n/a | yes | +| [user\_agent](#input\_user\_agent) | User agent string for GitHub API requests | `string` | `null` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [cloudwatch\_event\_rule](#output\_cloudwatch\_event\_rule) | CloudWatch Event Rule for scale-down | +| [lambda](#output\_lambda) | Scale-down Lambda function | +| [lambda\_log\_group](#output\_lambda\_log\_group) | Scale-down Lambda log group | +| [role](#output\_role) | Scale-down Lambda IAM role | + \ No newline at end of file diff --git a/modules/runners/scale-down/main.tf b/modules/runners/scale-down/main.tf new file mode 100644 index 0000000000..0ae49ea71a --- /dev/null +++ b/modules/runners/scale-down/main.tf @@ -0,0 +1,149 @@ +locals { + managed_environments = [for e in var.environments : e.environment] +} + +# IAM assume role policy for Lambda +data "aws_iam_policy_document" "lambda_assume_role_policy" { + statement { + actions = ["sts:AssumeRole"] + + principals { + type = "Service" + identifiers = ["lambda.amazonaws.com"] + } + } +} + +# X-Ray tracing policy +data "aws_iam_policy_document" "lambda_xray" { + count = var.tracing_config.mode != null ? 1 : 0 + statement { + actions = [ + "xray:BatchGetTraces", + "xray:GetTraceSummaries", + "xray:PutTelemetryRecords", + "xray:PutTraceSegments" + ] + effect = "Allow" + resources = [ + "*" + ] + sid = "AllowXRay" + } +} + +resource "aws_lambda_function" "scale_down" { + s3_bucket = var.lambda_s3_bucket != null ? var.lambda_s3_bucket : null + s3_key = var.runners_lambda_s3_key != null ? var.runners_lambda_s3_key : null + s3_object_version = var.runners_lambda_s3_object_version != null ? var.runners_lambda_s3_object_version : null + filename = var.lambda_s3_bucket == null ? var.lambda_zip : null + source_code_hash = var.lambda_s3_bucket == null ? 
filebase64sha256(var.lambda_zip) : null + function_name = "${var.prefix}-scale-down" + role = aws_iam_role.scale_down.arn + handler = "index.scaleDownHandler" + runtime = var.lambda_runtime + timeout = var.lambda_timeout + tags = merge(var.tags, var.lambda_tags) + memory_size = var.lambda_memory_size + architectures = [var.lambda_architecture] + + environment { + variables = { + ENVIRONMENT_CONFIGS = jsonencode(var.environments) + ENABLE_METRIC_GITHUB_APP_RATE_LIMIT = var.metrics.enable && var.metrics.metric.enable_github_app_rate_limit + GHES_URL = var.ghes_url + USER_AGENT = var.user_agent + LOG_LEVEL = var.log_level + NODE_TLS_REJECT_UNAUTHORIZED = var.ghes_url != null && !var.ghes_ssl_verify ? 0 : 1 + PARAMETER_GITHUB_APP_ID_NAME = var.github_app_parameters.id.name + PARAMETER_GITHUB_APP_KEY_BASE64_NAME = var.github_app_parameters.key_base64.name + POWERTOOLS_LOGGER_LOG_EVENT = var.log_level == "debug" ? "true" : "false" + POWERTOOLS_SERVICE_NAME = "runners-scale-down" + POWERTOOLS_METRICS_NAMESPACE = var.metrics.namespace + POWERTOOLS_TRACE_ENABLED = var.tracing_config.mode != null ? true : false + POWERTOOLS_TRACER_CAPTURE_HTTPS_REQUESTS = var.tracing_config.capture_http_requests + POWERTOOLS_TRACER_CAPTURE_ERROR = var.tracing_config.capture_error + } + } + + dynamic "vpc_config" { + for_each = length(var.lambda_subnet_ids) > 0 && length(var.lambda_security_group_ids) > 0 ? [true] : [] + content { + security_group_ids = var.lambda_security_group_ids + subnet_ids = var.lambda_subnet_ids + } + } + + dynamic "tracing_config" { + for_each = var.tracing_config.mode != null ? 
[true] : [] + content { + mode = var.tracing_config.mode + } + } +} + +resource "aws_cloudwatch_log_group" "scale_down" { + name = "/aws/lambda/${aws_lambda_function.scale_down.function_name}" + retention_in_days = var.logging_retention_in_days + kms_key_id = var.logging_kms_key_id + tags = var.tags +} + +resource "aws_cloudwatch_event_rule" "scale_down" { + name = "${var.prefix}-scale-down-rule" + schedule_expression = var.schedule_expression + tags = var.tags +} + +resource "aws_cloudwatch_event_target" "scale_down" { + rule = aws_cloudwatch_event_rule.scale_down.name + arn = aws_lambda_function.scale_down.arn +} + +resource "aws_lambda_permission" "scale_down" { + statement_id = "AllowExecutionFromCloudWatch" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.scale_down.function_name + principal = "events.amazonaws.com" + source_arn = aws_cloudwatch_event_rule.scale_down.arn +} + +resource "aws_iam_role" "scale_down" { + name = "${substr("${var.prefix}-scale-down-lambda", 0, 54)}-${substr(md5("${var.prefix}-scale-down-lambda"), 0, 8)}" + assume_role_policy = data.aws_iam_policy_document.lambda_assume_role_policy.json + path = var.role_path + permissions_boundary = var.role_permissions_boundary + tags = var.tags +} + +resource "aws_iam_role_policy" "scale_down" { + name = "scale-down-policy" + role = aws_iam_role.scale_down.name + policy = templatefile("${path.module}/policies/lambda-scale-down.json", { + environments = jsonencode(local.managed_environments) + github_app_id_arn = var.github_app_parameters.id.arn + github_app_key_base64_arn = var.github_app_parameters.key_base64.arn + kms_key_arn = var.kms_key_arn + }) +} + +resource "aws_iam_role_policy" "scale_down_logging" { + name = "logging-policy" + role = aws_iam_role.scale_down.name + policy = templatefile("${path.module}/policies/lambda-cloudwatch.json", { + log_group_arn = aws_cloudwatch_log_group.scale_down.arn + }) +} + +resource "aws_iam_role_policy_attachment" 
"scale_down_vpc_execution_role" { + count = length(var.lambda_subnet_ids) > 0 ? 1 : 0 + role = aws_iam_role.scale_down.name + policy_arn = "arn:${var.aws_partition}:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole" +} + +resource "aws_iam_role_policy" "scale_down_xray" { + count = var.tracing_config.mode != null ? 1 : 0 + name = "xray-policy" + policy = data.aws_iam_policy_document.lambda_xray[0].json + role = aws_iam_role.scale_down.name +} diff --git a/modules/runners/scale-down/outputs.tf b/modules/runners/scale-down/outputs.tf new file mode 100644 index 0000000000..293314f76a --- /dev/null +++ b/modules/runners/scale-down/outputs.tf @@ -0,0 +1,19 @@ +output "lambda" { + description = "Scale-down Lambda function" + value = aws_lambda_function.scale_down +} + +output "lambda_log_group" { + description = "Scale-down Lambda log group" + value = aws_cloudwatch_log_group.scale_down +} + +output "role" { + description = "Scale-down Lambda IAM role" + value = aws_iam_role.scale_down +} + +output "cloudwatch_event_rule" { + description = "CloudWatch Event Rule for scale-down" + value = aws_cloudwatch_event_rule.scale_down +} diff --git a/modules/runners/scale-down/policies/lambda-cloudwatch.json b/modules/runners/scale-down/policies/lambda-cloudwatch.json new file mode 100644 index 0000000000..ad9246bcb3 --- /dev/null +++ b/modules/runners/scale-down/policies/lambda-cloudwatch.json @@ -0,0 +1,10 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": ["logs:CreateLogStream", "logs:PutLogEvents"], + "Resource": "${log_group_arn}*" + } + ] +} diff --git a/modules/runners/scale-down/policies/lambda-scale-down.json b/modules/runners/scale-down/policies/lambda-scale-down.json new file mode 100644 index 0000000000..c5137f6ab7 --- /dev/null +++ b/modules/runners/scale-down/policies/lambda-scale-down.json @@ -0,0 +1,66 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "ec2:DescribeInstances", + 
"ec2:DescribeTags" + ], + "Resource": [ + "*" + ] + }, + { + "Effect": "Allow", + "Action": [ + "ec2:TerminateInstances", + "ec2:CreateTags", + "ec2:DeleteTags" + ], + "Resource": [ + "*" + ], + "Condition": { + "StringEquals": { + "ec2:ResourceTag/ghr:Application": "github-action-runner" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "ec2:TerminateInstances", + "ec2:CreateTags", + "ec2:DeleteTags" + ], + "Resource": [ + "*" + ], + "Condition": { + "StringEquals": { + "ec2:ResourceTag/ghr:environment": ${environments} + } + } + }, + { + "Effect": "Allow", + "Action": [ + "ssm:GetParameter" + ], + "Resource": [ + "${github_app_key_base64_arn}", + "${github_app_id_arn}" + ] +%{ if kms_key_arn != "" ~} + }, + { + "Effect": "Allow", + "Action": [ + "kms:Decrypt" + ], + "Resource": "${kms_key_arn}" +%{ endif ~} + } + ] +} diff --git a/modules/runners/scale-down/scale-down-state-diagram.md b/modules/runners/scale-down/scale-down-state-diagram.md new file mode 100644 index 0000000000..f346697de4 --- /dev/null +++ b/modules/runners/scale-down/scale-down-state-diagram.md @@ -0,0 +1,166 @@ +# GitHub Actions Runner Scale-Down State Diagram + + + +The scale-down Lambda function runs on a scheduled basis (every 5 minutes by default) to manage GitHub Actions runner instances. It processes each environment configuration sequentially, performing a two-phase cleanup process for each: first terminating confirmed orphaned instances, then evaluating active runners to maintain the desired idle capacity while removing unnecessary instances. GitHub API responses are cached across environments to optimize rate limit usage. 
+ +```mermaid +stateDiagram-v2 + [*] --> ScheduledExecution : Cron Trigger every 5 min + + ScheduledExecution --> ResetCache : Clear GitHub API cache + + ResetCache --> ProcessEnvironments + + state ProcessEnvironments { + [*] --> ProcessEnvironment + + state ProcessEnvironment { + [*] --> Phase1_OrphanTermination : Start Phase 1 + + state Phase1_OrphanTermination { + [*] --> ListOrphanInstances : Query EC2 for ghr orphan true + + ListOrphanInstances --> CheckOrphanType : For each orphan + + state CheckOrphanType <> + CheckOrphanType --> HasRunnerIdTag : Has ghr github runner id + CheckOrphanType --> TerminateOrphan : No runner ID tag + + HasRunnerIdTag --> LastChanceCheck : Query GitHub API + + state LastChanceCheck <> + LastChanceCheck --> ConfirmedOrphan : Offline and busy + LastChanceCheck --> FalsePositive : Exists and not problematic + + ConfirmedOrphan --> TerminateOrphan + FalsePositive --> RemoveOrphanTag + + TerminateOrphan --> NextOrphan : Continue processing + RemoveOrphanTag --> NextOrphan + + NextOrphan --> CheckOrphanType : More orphans? + NextOrphan --> Phase2_ActiveRunners : All processed + } + + Phase1_OrphanTermination --> Phase2_ActiveRunners : Phase 1 Complete + + state Phase2_ActiveRunners { + [*] --> ListActiveRunners : Query non-orphan EC2 instances + + ListActiveRunners --> GroupByOwner : Sort by owner and repo + + GroupByOwner --> ProcessOwnerGroup : For each owner + + state ProcessOwnerGroup { + [*] --> SortByStrategy : Apply eviction strategy + SortByStrategy --> ProcessRunner : Oldest first or newest first + + ProcessRunner --> QueryGitHub : Get GitHub runners for owner + + QueryGitHub --> MatchRunner : Find runner by instance ID suffix + + state MatchRunner <> + MatchRunner --> FoundInGitHub : Runner exists in GitHub + MatchRunner --> NotFoundInGitHub : Runner not in GitHub + + state FoundInGitHub { + [*] --> CheckMinimumTime : Has minimum runtime passed? 
+ + state CheckMinimumTime <> + CheckMinimumTime --> TooYoung : Runtime less than minimum + CheckMinimumTime --> CheckIdleQuota : Runtime greater than or equal to minimum + + TooYoung --> NextRunner + + state CheckIdleQuota <> + CheckIdleQuota --> KeepIdle : Idle quota available + CheckIdleQuota --> CheckBusyState : Quota full + + KeepIdle --> NextRunner + + state CheckBusyState <> + CheckBusyState --> KeepBusy : Runner busy + CheckBusyState --> TerminateIdle : Runner idle + + KeepBusy --> NextRunner + TerminateIdle --> DeregisterFromGitHub + DeregisterFromGitHub --> TerminateInstance + TerminateInstance --> NextRunner + } + + state NotFoundInGitHub { + [*] --> CheckBootTime : Has boot time exceeded? + + state CheckBootTime <> + CheckBootTime --> StillBooting : Boot time less than threshold + CheckBootTime --> MarkOrphan : Boot time greater than or equal to threshold + + StillBooting --> NextRunner + MarkOrphan --> TagAsOrphan : Set ghr orphan true + TagAsOrphan --> NextRunner + } + + NextRunner --> ProcessRunner : More runners in group? + NextRunner --> MoreOwnerGroups : Group complete + } + + state MoreOwnerGroups <> + MoreOwnerGroups --> ProcessOwnerGroup : More owner groups + MoreOwnerGroups --> [*] : All groups processed + } + + Phase2_ActiveRunners --> [*] : Phase 2 Complete + } + + ProcessEnvironment --> CheckMoreEnvironments + + state CheckMoreEnvironments <> + CheckMoreEnvironments --> ProcessEnvironment : More environments? + CheckMoreEnvironments --> [*] : All environments processed + } + + + note left of ResetCache + Cache persists across all environmens in a single invocation + end note + + note right of LastChanceCheck + Uses ghr github runner id tag for precise GitHub API lookup + end note + + note right of MatchRunner + Matches GitHub runner name ending with EC2 instance ID. GitHub API response cached by owner. 
+ end note + + note right of CheckMinimumTime + Minimum running time in minutes (Linux: 5min, Windows: 15min) + end note + + note right of CheckBootTime + Runner boot time in minutes Default configuration value + end note +``` + + + +## Key Decision Points + +| State | Condition | Action | +| ------------------------- | ------------------------ | ---------------------------------- | +| **Orphan w/ Runner ID** | GitHub: offline + busy | Terminate (confirmed orphan) | +| **Orphan w/ Runner ID** | GitHub: exists + healthy | Remove orphan tag (false positive) | +| **Orphan w/o Runner ID** | Always | Terminate (no way to verify) | +| **Active Runner Found** | Runtime < minimum | Keep (too young) | +| **Active Runner Found** | Idle quota available | Keep as idle | +| **Active Runner Found** | Quota full + idle | Terminate + deregister | +| **Active Runner Found** | Quota full + busy | Keep running | +| **Active Runner Missing** | Boot time exceeded | Mark as orphan | +| **Active Runner Missing** | Still booting | Wait | + +## Configuration Parameters + +- **Cron Schedule**: `cron(*/5 * * * ? 
*)` (every 5 minutes) +- **Minimum Runtime**: Linux 5min, Windows 15min +- **Boot Timeout**: Configurable via `runner_boot_time_in_minutes` +- **Idle Config**: Per-environment configuration for desired idle runners diff --git a/modules/runners/scale-down/variables.tf b/modules/runners/scale-down/variables.tf new file mode 100644 index 0000000000..76bb66a134 --- /dev/null +++ b/modules/runners/scale-down/variables.tf @@ -0,0 +1,185 @@ +variable "environments" { + description = "List of environment configurations for scale-down" + type = list(object({ + environment = string + idle_config = list(object({ + cron = string + timeZone = string + idleCount = number + evictionStrategy = optional(string, "oldest_first") + })) + minimum_running_time_in_minutes = number + runner_boot_time_in_minutes = number + })) +} + +variable "prefix" { + description = "Prefix for Lambda function name" + type = string +} + +variable "schedule_expression" { + description = "CloudWatch Event schedule expression" + type = string + default = "cron(*/5 * * * ? 
*)" +} + +variable "github_app_parameters" { + description = "GitHub App SSM parameters" + type = object({ + id = object({ + name = string + arn = string + }) + key_base64 = object({ + name = string + arn = string + }) + }) +} + +variable "lambda_s3_bucket" { + description = "S3 bucket for Lambda deployment package" + type = string + default = null +} + +variable "runners_lambda_s3_key" { + description = "S3 key for Lambda deployment package" + type = string + default = null +} + +variable "runners_lambda_s3_object_version" { + description = "S3 object version for Lambda deployment package" + type = string + default = null +} + +variable "lambda_runtime" { + description = "Lambda runtime" + type = string +} + +variable "lambda_timeout" { + description = "Lambda timeout in seconds" + type = number +} + +variable "lambda_memory_size" { + description = "Lambda memory size in MB" + type = number +} + +variable "lambda_architecture" { + description = "Lambda architecture (x86_64 or arm64)" + type = string +} + +variable "lambda_zip" { + description = "Path to Lambda deployment package" + type = string +} + +variable "lambda_subnet_ids" { + description = "List of subnet IDs for Lambda VPC configuration" + type = list(string) + default = [] +} + +variable "lambda_security_group_ids" { + description = "List of security group IDs for Lambda VPC configuration" + type = list(string) + default = [] +} + +variable "lambda_tags" { + description = "Tags for Lambda function" + type = map(string) + default = {} +} + +variable "tracing_config" { + description = "Lambda tracing configuration" + type = object({ + mode = optional(string, null) + capture_http_requests = optional(bool, false) + capture_error = optional(bool, false) + }) + default = {} +} + +variable "logging_retention_in_days" { + description = "CloudWatch log retention in days" + type = number +} + +variable "logging_kms_key_id" { + description = "KMS key ID for CloudWatch log encryption" + type = string + default = 
null +} + +variable "kms_key_arn" { + description = "KMS key ARN for SSM parameter decryption" + type = string + default = "" +} + +variable "ghes_url" { + description = "GitHub Enterprise Server URL" + type = string + default = null +} + +variable "ghes_ssl_verify" { + description = "Verify GitHub Enterprise Server SSL certificate" + type = bool + default = true +} + +variable "user_agent" { + description = "User agent string for GitHub API requests" + type = string + default = null +} + +variable "log_level" { + description = "Log level for Lambda function" + type = string + default = "info" +} + +variable "metrics" { + description = "Metrics configuration" + type = object({ + enable = optional(bool, false) + namespace = optional(string, "GitHub Runners") + metric = optional(object({ + enable_github_app_rate_limit = optional(bool, true) + }), {}) + }) + default = {} +} + +variable "role_path" { + description = "IAM role path" + type = string +} + +variable "role_permissions_boundary" { + description = "IAM role permissions boundary ARN" + type = string + default = null +} + +variable "aws_partition" { + description = "AWS partition" + type = string +} + +variable "tags" { + description = "Tags to apply to resources" + type = map(string) + default = {} +} diff --git a/modules/runners/variables.tf b/modules/runners/variables.tf index a78231e7da..b1d7037bb6 100644 --- a/modules/runners/variables.tf +++ b/modules/runners/variables.tf @@ -234,7 +234,7 @@ variable "lambda_scale_down_memory_size" { } variable "scale_down_schedule_expression" { - description = "Scheduler expression to check every x for scale down." + description = "Scheduler expression to check every x for scale down. Set to null to disable scale-down Lambda creation." type = string default = "cron(*/5 * * * ? 
*)" } From bdda79dc399be78d88245f906b3169eb4198af74 Mon Sep 17 00:00:00 2001 From: Iain Lane Date: Tue, 21 Oct 2025 18:07:48 +0100 Subject: [PATCH 2/3] feat(scale-down): use SSM to store scale-down config Now we're potentially running multiple configurations in one scale-down invocation, if we continue to use the environment we could start to hit size limits: on Lambda, environment variables are limited to 4K. Adopt the approach we use elsewhere and switch to SSM parameter store for config. Here we add all the necessary IAM permissions, arrange to store the config in the store and then read it back in `scale-down`. A more strict parser is also introduced, ensuring that we detect more invalid configurations and reject them with clear error messages. --- README.md | 1 + .../functions/control-plane/src/modules.d.ts | 2 +- .../scale-runners/scale-down-config.test.ts | 383 ++++++++++++++++-- .../src/scale-runners/scale-down-config.ts | 196 +++++++++ .../src/scale-runners/scale-down.test.ts | 137 +++++-- .../src/scale-runners/scale-down.ts | 33 +- lambdas/libs/aws-ssm-util/src/index.test.ts | 128 +++++- lambdas/libs/aws-ssm-util/src/index.ts | 49 ++- main.tf | 1 + modules/multi-runner/README.md | 1 + modules/multi-runner/main.tf | 4 + modules/multi-runner/outputs.tf | 1 + modules/multi-runner/runners.tf | 1 + modules/multi-runner/variables.tf | 10 + modules/runners/README.md | 2 + modules/runners/outputs.tf | 4 + modules/runners/scale-down.tf | 4 + modules/runners/scale-down/README.md | 5 +- modules/runners/scale-down/main.tf | 47 ++- modules/runners/scale-down/outputs.tf | 5 + .../policies/lambda-scale-down.json | 10 + modules/runners/scale-down/variables.tf | 22 + modules/runners/variables.tf | 10 + variables.tf | 10 + 24 files changed, 956 insertions(+), 110 deletions(-) diff --git a/README.md b/README.md index 6e5994a305..9d04e55425 100644 --- a/README.md +++ b/README.md @@ -214,6 +214,7 @@ Join our discord community via [this invite 
link](https://discord.gg/bxgXW8jJGh) | [runners\_scale\_up\_lambda\_memory\_size](#input\_runners\_scale\_up\_lambda\_memory\_size) | Memory size limit in MB for scale-up lambda. | `number` | `512` | no | | [runners\_scale\_up\_lambda\_timeout](#input\_runners\_scale\_up\_lambda\_timeout) | Time out for the scale up lambda in seconds. | `number` | `30` | no | | [runners\_ssm\_housekeeper](#input\_runners\_ssm\_housekeeper) | Configuration for the SSM housekeeper lambda. This lambda deletes token / JIT config from SSM.

`schedule_expression`: is used to configure the schedule for the lambda.
`enabled`: enable or disable the lambda trigger via the EventBridge.
`lambda_memory_size`: lambda memory size limit.
`lambda_timeout`: timeout for the lambda in seconds.
`config`: configuration for the lambda function. Token path will be read by default from the module. |
object({
schedule_expression = optional(string, "rate(1 day)")
enabled = optional(bool, true)
lambda_memory_size = optional(number, 512)
lambda_timeout = optional(number, 60)
config = object({
tokenPath = optional(string)
minimumDaysOld = optional(number, 1)
dryRun = optional(bool, false)
})
})
|
{
"config": {}
}
| no | +| [scale\_down\_parameter\_store\_tier](#input\_scale\_down\_parameter\_store\_tier) | SSM Parameter Store tier to use for scale-down configuration parameters. | `string` | `"Standard"` | no | | [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. | `string` | `"cron(*/5 * * * ? *)"` | no | | [scale\_up\_reserved\_concurrent\_executions](#input\_scale\_up\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no | | [ssm\_paths](#input\_ssm\_paths) | The root path used in SSM to store configuration and secrets. |
object({
root = optional(string, "github-action-runners")
app = optional(string, "app")
runners = optional(string, "runners")
webhook = optional(string, "webhook")
use_prefix = optional(bool, true)
})
| `{}` | no | diff --git a/lambdas/functions/control-plane/src/modules.d.ts b/lambdas/functions/control-plane/src/modules.d.ts index 4bab7e1e99..ca5f166ec9 100644 --- a/lambdas/functions/control-plane/src/modules.d.ts +++ b/lambdas/functions/control-plane/src/modules.d.ts @@ -4,7 +4,6 @@ declare namespace NodeJS { ENABLE_METRIC_GITHUB_APP_RATE_LIMIT: string; ENABLE_ON_DEMAND_FAILOVER_FOR_ERRORS: string; ENVIRONMENT: string; - ENVIRONMENT_CONFIGS: string; GHES_URL: string; JOB_RETRY_CONFIG: string; LAUNCH_TEMPLATE_NAME: string; @@ -17,6 +16,7 @@ declare namespace NodeJS { PARAMETER_GITHUB_APP_KEY_BASE64_NAME: string; RUNNER_BOOT_TIME_IN_MINUTES: string; RUNNER_OWNER: string; + SCALE_DOWN_CONFIG_SSM_PATH_PREFIX: string; SSM_TOKEN_PATH: string; SSM_CLEANUP_CONFIG: string; SUBNET_IDS: string; diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down-config.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down-config.test.ts index ff2325128a..6f38629fa7 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down-config.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down-config.test.ts @@ -1,57 +1,358 @@ +import { GetParametersByPathCommand, SSMClient } from '@aws-sdk/client-ssm'; +import 'aws-sdk-client-mock-jest/vitest'; +import { mockClient } from 'aws-sdk-client-mock'; +import { describe, it, expect, beforeEach, vi } from 'vitest'; + import moment from 'moment-timezone'; -import { EvictionStrategy, ScalingDownConfigList, getEvictionStrategy, getIdleRunnerCount } from './scale-down-config'; -import { describe, it, expect } from 'vitest'; - -const DEFAULT_TIMEZONE = 'America/Los_Angeles'; -const DEFAULT_IDLE_COUNT = 1; -const DEFAULT_EVICTION_STRATEGY: EvictionStrategy = 'oldest_first'; -const now = moment.tz(new Date(), 'America/Los_Angeles'); - -function getConfig( - cronTabs: string[], - evictionStrategy: EvictionStrategy | undefined = undefined, -): ScalingDownConfigList { - return 
cronTabs.map((cron) => ({ - cron: cron, - idleCount: DEFAULT_IDLE_COUNT, - timeZone: DEFAULT_TIMEZONE, - evictionStrategy, - })); -} - -describe('scaleDownConfig', () => { - describe('Check runners that should be kept idle based on config.', () => { - it('One active cron configuration', async () => { - const scaleDownConfig = getConfig(['* * * * * *']); - expect(getIdleRunnerCount(scaleDownConfig)).toEqual(DEFAULT_IDLE_COUNT); +import { + InvalidEvictionStrategyError, + InvalidIdleConfigEntryError, + InvalidIdleConfigError, + InvalidJsonError, + MissingMinimumRunningTimeError, + MissingRunnerBootTimeError, + ScaleDownConfigurationsNotFoundError, + getEvictionStrategy, + getIdleRunnerCount, + loadEnvironmentScaleDownConfigFromSsm, +} from './scale-down-config'; + +const mockSSMClient = mockClient(SSMClient); + +describe('loadEnvironmentScaleDownConfigFromSsm', () => { + beforeEach(() => { + vi.clearAllMocks(); + mockSSMClient.reset(); + }); + + it('loads and parses configurations for multiple environments', async () => { + mockSSMClient + .on(GetParametersByPathCommand, { + Path: '/configs', + Recursive: false, + WithDecryption: true, + }) + .resolves({ + Parameters: [ + { + Name: '/configs/env-a', + Value: JSON.stringify({ + environment: 'env-a', + idle_config: [ + { + cron: '* * * * * *', + timeZone: 'UTC', + idleCount: 1, + evictionStrategy: 'oldest_first', + }, + ], + minimum_running_time_in_minutes: 10, + runner_boot_time_in_minutes: 5, + }), + }, + { + Name: '/configs/env-b', + Value: JSON.stringify({ + idle_config: [], + minimum_running_time_in_minutes: 20, + runner_boot_time_in_minutes: 8, + }), + }, + ], + }); + + const configs = await loadEnvironmentScaleDownConfigFromSsm('/configs/'); + + expect(mockSSMClient).toHaveReceivedCommandWith(GetParametersByPathCommand, { + Path: '/configs', + Recursive: false, + WithDecryption: true, }); + expect(configs).toEqual([ + { + environment: 'env-a', + idle_config: [ + { + cron: '* * * * * *', + timeZone: 'UTC', + 
idleCount: 1, + evictionStrategy: 'oldest_first', + }, + ], + minimum_running_time_in_minutes: 10, + runner_boot_time_in_minutes: 5, + }, + { + environment: 'env-b', + idle_config: [], + minimum_running_time_in_minutes: 20, + runner_boot_time_in_minutes: 8, + }, + ]); + }); - it('No active cron configuration', async () => { - const scaleDownConfig = getConfig(['* * * * * ' + ((now.day() + 1) % 7)]); - expect(getIdleRunnerCount(scaleDownConfig)).toEqual(0); + it('normalizes provided path prefix', async () => { + mockSSMClient + .on(GetParametersByPathCommand, { + Path: '/configs', + Recursive: false, + WithDecryption: true, + }) + .resolves({ + Parameters: [ + { + Name: '/configs/env', + Value: JSON.stringify({ + idle_config: [], + minimum_running_time_in_minutes: 5, + runner_boot_time_in_minutes: 2, + }), + }, + ], + }); + + await loadEnvironmentScaleDownConfigFromSsm('configs'); + + expect(mockSSMClient).toHaveReceivedCommandWith(GetParametersByPathCommand, { + Path: '/configs', + Recursive: false, + WithDecryption: true, }); + }); + + it('throws when no parameters are found', async () => { + mockSSMClient + .on(GetParametersByPathCommand, { + Path: '/configs', + Recursive: false, + WithDecryption: true, + }) + .resolves({ Parameters: [] }); - it('1 of 2 cron configurations be active', async () => { - const scaleDownConfig = getConfig(['* * * * * ' + ((now.day() + 1) % 7), '* * * * * ' + (now.day() % 7)]); - expect(getIdleRunnerCount(scaleDownConfig)).toEqual(DEFAULT_IDLE_COUNT); + await expect(loadEnvironmentScaleDownConfigFromSsm('/configs')).rejects.toMatchObject({ + prefix: '/configs', }); }); - describe('Determine eviction strategy.', () => { - it('Default eviction strategy', async () => { - const scaleDownConfig = getConfig(['* * * * * *']); - expect(getEvictionStrategy(scaleDownConfig)).toEqual('oldest_first'); + it('throws when configuration is invalid JSON', async () => { + mockSSMClient + .on(GetParametersByPathCommand, { + Path: '/configs', + Recursive: 
false, + WithDecryption: true, + }) + .resolves({ + Parameters: [ + { + Name: '/configs/env', + Value: '{invalid json', + }, + ], + }); + + await expect(loadEnvironmentScaleDownConfigFromSsm('/configs')).rejects.toMatchObject({ + path: '/configs/env', }); + }); - it('Overriding eviction strategy to newest_first', async () => { - const scaleDownConfig = getConfig(['* * * * * *'], 'newest_first'); - expect(getEvictionStrategy(scaleDownConfig)).toEqual('newest_first'); + it.each([ + { + description: 'minimum_running_time_in_minutes is missing', + config: { + idle_config: [], + runner_boot_time_in_minutes: 5, + }, + }, + { + description: 'minimum_running_time_in_minutes is NaN', + config: { + idle_config: [], + minimum_running_time_in_minutes: NaN, + runner_boot_time_in_minutes: 5, + }, + }, + { + description: 'runner_boot_time_in_minutes is missing', + config: { + idle_config: [], + minimum_running_time_in_minutes: 5, + }, + }, + { + description: 'runner_boot_time_in_minutes is NaN', + config: { + idle_config: [], + minimum_running_time_in_minutes: 5, + runner_boot_time_in_minutes: NaN, + }, + }, + ])('throws when $description', async ({ config }) => { + mockSSMClient + .on(GetParametersByPathCommand, { + Path: '/configs', + Recursive: false, + WithDecryption: true, + }) + .resolves({ + Parameters: [ + { + Name: '/configs/env', + Value: JSON.stringify(config), + }, + ], + }); + + await expect(loadEnvironmentScaleDownConfigFromSsm('/configs')).rejects.toMatchObject({ + path: '/configs/env', }); + }); - it('No active cron configuration', async () => { - const scaleDownConfig = getConfig(['* * * * * ' + ((now.day() + 1) % 7)]); - expect(getEvictionStrategy(scaleDownConfig)).toEqual(DEFAULT_EVICTION_STRATEGY); + it('throws when environment name contains slashes', async () => { + mockSSMClient + .on(GetParametersByPathCommand, { + Path: '/configs', + Recursive: false, + WithDecryption: true, + }) + .resolves({ + Parameters: [ + { + Name: '/configs/env/nested', + Value: 
JSON.stringify({ + idle_config: [], + minimum_running_time_in_minutes: 5, + runner_boot_time_in_minutes: 2, + }), + }, + ], + }); + + await expect(loadEnvironmentScaleDownConfigFromSsm('/configs')).rejects.toMatchObject({ + path: '/configs/env/nested', + environment: 'env/nested', }); }); + + it.each([ + { + description: 'idle_config is not an array', + config: { + idle_config: 'not an array', + minimum_running_time_in_minutes: 5, + runner_boot_time_in_minutes: 2, + }, + expectedError: InvalidIdleConfigError, + }, + { + description: 'idle_config entry is missing cron', + config: { + idle_config: [{ timeZone: 'UTC', idleCount: 1 }], + minimum_running_time_in_minutes: 5, + runner_boot_time_in_minutes: 2, + }, + expectedError: InvalidIdleConfigEntryError, + expectedMatch: { index: 0 }, + }, + { + description: 'idle_config entry is missing timeZone', + config: { + idle_config: [{ cron: '* * * * * *', idleCount: 1 }], + minimum_running_time_in_minutes: 5, + runner_boot_time_in_minutes: 2, + }, + expectedError: InvalidIdleConfigEntryError, + expectedMatch: { index: 0 }, + }, + { + description: 'idle_config entry is missing idleCount', + config: { + idle_config: [{ cron: '* * * * * *', timeZone: 'UTC' }], + minimum_running_time_in_minutes: 5, + runner_boot_time_in_minutes: 2, + }, + expectedError: InvalidIdleConfigEntryError, + expectedMatch: { index: 0 }, + }, + { + description: 'idle_config entry has invalid evictionStrategy', + config: { + idle_config: [ + { + cron: '* * * * * *', + timeZone: 'UTC', + idleCount: 1, + evictionStrategy: 'invalid_strategy', + }, + ], + minimum_running_time_in_minutes: 5, + runner_boot_time_in_minutes: 2, + }, + expectedError: InvalidEvictionStrategyError, + expectedMatch: { index: 0, evictionStrategy: 'invalid_strategy' }, + }, + ])('throws when $description', async ({ config, expectedError, expectedMatch }) => { + mockSSMClient + .on(GetParametersByPathCommand, { + Path: '/configs', + Recursive: false, + WithDecryption: true, + }) + 
.resolves({ + Parameters: [ + { + Name: '/configs/env', + Value: JSON.stringify(config), + }, + ], + }); + + const expectation = expect(loadEnvironmentScaleDownConfigFromSsm('/configs')).rejects; + if (expectedMatch) { + await expectation.toMatchObject({ + path: '/configs/env', + ...expectedMatch, + }); + return; + } + await expectation.toThrow(expectedError); + }); +}); + +describe('scale-down config helpers', () => { + const DEFAULT_TIMEZONE = 'America/Los_Angeles'; + const DEFAULT_IDLE_COUNT = 1; + + function buildConfig(cronExpressions: string[], evictionStrategy?: 'oldest_first' | 'newest_first') { + return cronExpressions.map((cron) => ({ + cron, + idleCount: DEFAULT_IDLE_COUNT, + timeZone: DEFAULT_TIMEZONE, + evictionStrategy, + })); + } + + it('returns idle runner count when cron expression matches current time', () => { + const result = getIdleRunnerCount(buildConfig(['* * * * * *'])); + expect(result).toBe(DEFAULT_IDLE_COUNT); + }); + + it('returns zero when no cron expressions match', () => { + const now = moment(); + const mismatch = `* * * * * ${(now.day() + 1) % 7}`; + const result = getIdleRunnerCount(buildConfig([mismatch])); + expect(result).toBe(0); + }); + + it('prefers eviction strategy from matching cron expression', () => { + const configs = buildConfig(['* * * * * *'], 'newest_first'); + expect(getEvictionStrategy(configs)).toBe('newest_first'); + }); + + it('falls back to oldest_first when no cron matches', () => { + const now = moment(); + const mismatch = `* * * * * ${(now.day() + 1) % 7}`; + const configs = buildConfig([mismatch], 'newest_first'); + expect(getEvictionStrategy(configs)).toBe('oldest_first'); + }); }); diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down-config.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down-config.ts index 50fa191ca5..60f5628b45 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down-config.ts +++ 
b/lambdas/functions/control-plane/src/scale-runners/scale-down-config.ts @@ -1,4 +1,5 @@ import { createChildLogger } from '@aws-github-runner/aws-powertools-util'; +import { getParametersByPath } from '@aws-github-runner/aws-ssm-util'; import parser from 'cron-parser'; import moment from 'moment'; @@ -20,6 +21,201 @@ export interface EnvironmentScaleDownConfig { const logger = createChildLogger('scale-down-config.ts'); +export abstract class ScaleDownConfigError extends Error { + constructor( + public readonly path: string, + message: string, + options?: { cause?: unknown }, + ) { + super(`Scale-down configuration '${path}' ${message}`, options); + this.name = new.target.name; + } +} + +export class ScaleDownConfigPathError extends Error { + constructor() { + super('SSM parameter path prefix is required but was not provided or is empty.'); + this.name = 'ScaleDownConfigPathError'; + } +} + +export class ScaleDownConfigurationsNotFoundError extends Error { + constructor(public readonly prefix: string) { + super(`No scale-down configuration parameters found under '${prefix}'.`); + this.name = 'ScaleDownConfigurationsNotFoundError'; + } +} + +export class InvalidIdleConfigError extends ScaleDownConfigError { + constructor(path: string) { + super(path, "is missing a valid 'idle_config' array."); + } +} + +export class InvalidIdleConfigEntryError extends ScaleDownConfigError { + constructor( + path: string, + public readonly index: number, + ) { + super(path, `has an invalid idle_config entry at index ${index}.`); + } +} + +export class InvalidEvictionStrategyError extends ScaleDownConfigError { + constructor( + path: string, + public readonly index: number, + public readonly evictionStrategy: string, + ) { + super(path, `has an invalid evictionStrategy '${evictionStrategy}' at index ${index}.`); + } +} + +export class InvalidJsonError extends ScaleDownConfigError { + constructor(path: string, cause: unknown) { + super(path, 'is not valid JSON.', { cause }); + } +} + 
+export class InvalidPrefixError extends ScaleDownConfigError { + constructor( + path: string, + public readonly expectedPrefix: string, + ) { + super(path, `does not start with expected prefix '${expectedPrefix}'.`); + } +} + +export class MissingEnvironmentSuffixError extends ScaleDownConfigError { + constructor(path: string) { + super(path, 'is missing an environment suffix.'); + } +} + +export class MissingMinimumRunningTimeError extends ScaleDownConfigError { + constructor(path: string) { + super(path, "must include 'minimum_running_time_in_minutes' as a number."); + } +} + +export class MissingRunnerBootTimeError extends ScaleDownConfigError { + constructor(path: string) { + super(path, "must include 'runner_boot_time_in_minutes' as a number."); + } +} + +export class InvalidEnvironmentNameError extends ScaleDownConfigError { + constructor( + path: string, + public readonly environment: string, + ) { + super(path, `has an invalid environment name '${environment}' (environment names cannot contain slashes).`); + } +} + +type RawEnvironmentScaleDownConfig = Partial & { + environment?: string; +}; + +function normalizePathPrefix(path: string): string { + if (!path) { + throw new ScaleDownConfigPathError(); + } + + let normalized = path.startsWith('/') ? path : `/${path}`; + normalized = normalized.replace(/\/+$/, ''); + + return normalized.length === 0 ? 
'/' : normalized; +} + +function validateIdleConfig(idleConfig: unknown, parameterName: string): ScalingDownConfig[] { + if (!Array.isArray(idleConfig)) { + throw new InvalidIdleConfigError(parameterName); + } + + return idleConfig.map((config, index) => { + if ( + typeof config !== 'object' || + config === null || + typeof (config as ScalingDownConfig).cron !== 'string' || + typeof (config as ScalingDownConfig).timeZone !== 'string' || + typeof (config as ScalingDownConfig).idleCount !== 'number' + ) { + throw new InvalidIdleConfigEntryError(parameterName, index); + } + + const evictionStrategy = (config as ScalingDownConfig).evictionStrategy; + if (evictionStrategy && !['newest_first', 'oldest_first'].includes(evictionStrategy)) { + throw new InvalidEvictionStrategyError(parameterName, index, evictionStrategy); + } + + return config as ScalingDownConfig; + }); +} + +function parseEnvironmentConfig(prefix: string, parameterName: string, rawValue: string): EnvironmentScaleDownConfig { + let parsed: RawEnvironmentScaleDownConfig; + try { + parsed = JSON.parse(rawValue); + } catch (error) { + throw new InvalidJsonError(parameterName, error); + } + + const prefixWithSeparator = prefix === '/' ? '/' : `${prefix}/`; + if (!parameterName.startsWith(prefixWithSeparator)) { + throw new InvalidPrefixError(parameterName, prefix); + } + + const environment = parameterName.slice(prefixWithSeparator.length); + if (!environment) { + throw new MissingEnvironmentSuffixError(parameterName); + } + + if (environment.includes('/')) { + throw new InvalidEnvironmentNameError(parameterName, environment); + } + + if (parsed.environment && parsed.environment !== environment) { + logger.warn( + `Scale-down configuration for parameter '${parameterName}' declares environment '${parsed.environment}', ` + + `but is stored under '${environment}'. 
Using parameter name as source of truth.`, + ); + } + + const minimumRunning = parsed.minimum_running_time_in_minutes; + if (typeof minimumRunning !== 'number' || Number.isNaN(minimumRunning)) { + throw new MissingMinimumRunningTimeError(parameterName); + } + + const runnerBootTime = parsed.runner_boot_time_in_minutes; + if (typeof runnerBootTime !== 'number' || Number.isNaN(runnerBootTime)) { + throw new MissingRunnerBootTimeError(parameterName); + } + + const idleConfig = validateIdleConfig(parsed.idle_config, parameterName); + + return { + environment, + idle_config: idleConfig, + minimum_running_time_in_minutes: minimumRunning, + runner_boot_time_in_minutes: runnerBootTime, + }; +} + +export async function loadEnvironmentScaleDownConfigFromSsm(pathPrefix: string): Promise { + const normalizedPrefix = normalizePathPrefix(pathPrefix); + const parameters = await getParametersByPath(normalizedPrefix, { + recursive: false, + }); + + const parameterEntries = Object.entries(parameters); + if (parameterEntries.length === 0) { + throw new ScaleDownConfigurationsNotFoundError(normalizedPrefix); + } + + return parameterEntries.map(([name, value]) => parseEnvironmentConfig(normalizedPrefix, name, value)); +} + function inPeriod(period: ScalingDownConfig): boolean { const now = moment(new Date()); const expr = parser.parse(period.cron, { diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index 1dc2ac8690..e03b7f08d5 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -1,14 +1,18 @@ -import { Octokit } from '@octokit/rest'; import { RequestError } from '@octokit/request-error'; +import { Octokit } from '@octokit/rest'; import moment from 'moment'; import nock from 'nock'; - -import { RunnerInfo, RunnerList } from '../aws/runners.d'; +import { beforeEach, describe, 
expect, it, vi } from 'vitest'; +import { listEC2Runners, tag, terminateRunner, untag } from './../aws/runners'; +import type { RunnerInfo, RunnerList } from '../aws/runners.d'; import * as ghAuth from '../github/auth'; -import { listEC2Runners, terminateRunner, tag, untag } from './../aws/runners'; import { githubCache } from './cache'; -import { newestFirstStrategy, oldestFirstStrategy, scaleDown } from './scale-down'; -import { describe, it, expect, beforeEach, vi } from 'vitest'; +import { newestFirstStrategy, oldestFirstStrategy, scaleDown, scaleDownEnvironment } from './scale-down'; +import { + type EnvironmentScaleDownConfig, + type EvictionStrategy, + loadEnvironmentScaleDownConfigFromSsm, +} from './scale-down-config'; const mockOctokit = { apps: { @@ -58,6 +62,14 @@ vi.mock('./cache', async () => ({ }, })); +vi.mock('./scale-down-config', async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + loadEnvironmentScaleDownConfigFromSsm: vi.fn(), + }; +}); + const mocktokit = Octokit as vi.MockedClass; const mockedAppAuth = vi.mocked(ghAuth.createGithubAppAuth); const mockedInstallationAuth = vi.mocked(ghAuth.createGithubInstallationAuth); @@ -66,6 +78,7 @@ const mockListRunners = vi.mocked(listEC2Runners); const mockTagRunners = vi.mocked(tag); const mockUntagRunners = vi.mocked(untag); const mockTerminateRunners = vi.mocked(terminateRunner); +const mockLoadEnvironmentScaleDownConfigFromSsm = vi.mocked(loadEnvironmentScaleDownConfigFromSsm); export interface TestData { repositoryName: string; @@ -82,6 +95,13 @@ const TEST_DATA: TestData = { repositoryOwner: 'Codertocat', }; +const defaultEnvironmentConfig: EnvironmentScaleDownConfig = { + environment: ENVIRONMENT, + idle_config: [], + minimum_running_time_in_minutes: MINIMUM_TIME_RUNNING_IN_MINUTES, + runner_boot_time_in_minutes: MINIMUM_BOOT_TIME, +}; + interface RunnerTestItem extends RunnerList { registered: boolean; orphan: boolean; @@ -96,18 +116,14 @@ 
describe('Scale down runners', () => { process.env.GITHUB_APP_CLIENT_ID = 'TEST_CLIENT_ID'; process.env.GITHUB_APP_CLIENT_SECRET = 'TEST_CLIENT_SECRET'; process.env.RUNNERS_MAXIMUM_COUNT = '3'; - process.env.ENVIRONMENT_CONFIGS = JSON.stringify([{ - environment: ENVIRONMENT, - idle_config: [], - minimum_running_time_in_minutes: MINIMUM_TIME_RUNNING_IN_MINUTES, - runner_boot_time_in_minutes: MINIMUM_BOOT_TIME, - }]); nock.disableNetConnect(); vi.clearAllMocks(); vi.resetModules(); githubCache.clients.clear(); githubCache.runners.clear(); + mockLoadEnvironmentScaleDownConfigFromSsm.mockReset(); + mockLoadEnvironmentScaleDownConfigFromSsm.mockResolvedValue([defaultEnvironmentConfig]); mockOctokit.apps.getOrgInstallation.mockImplementation(() => ({ data: { id: 'ORG', @@ -183,6 +199,13 @@ describe('Scale down runners', () => { mockCreateClient.mockResolvedValue(new mocktokit()); }); + it('should handle empty environment configs gracefully', async () => { + mockLoadEnvironmentScaleDownConfigFromSsm.mockResolvedValue([]); + + await expect(scaleDown()).resolves.not.toThrow(); + expect(mockListRunners).not.toHaveBeenCalled(); + }); + const endpoints = ['https://api.github.com', 'https://github.enterprise.something', 'https://companyname.ghe.com']; describe.each(endpoints)('for %s', (endpoint) => { @@ -373,11 +396,21 @@ describe('Scale down runners', () => { if (type === 'Repo') { mockOctokit.actions.getSelfHostedRunnerForRepo.mockResolvedValueOnce({ - data: { id: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'online' }, + data: { + id: 1234567890, + name: orphanRunner.instanceId, + busy: true, + status: 'online', + }, }); } else { mockOctokit.actions.getSelfHostedRunnerForOrg.mockResolvedValueOnce({ - data: { id: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'online' }, + data: { + id: 1234567890, + name: orphanRunner.instanceId, + busy: true, + status: 'online', + }, }); } @@ -391,11 +424,21 @@ describe('Scale down runners', () => { // 
arrange if (type === 'Repo') { mockOctokit.actions.getSelfHostedRunnerForRepo.mockResolvedValueOnce({ - data: { runnerId: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'offline' }, + data: { + runnerId: 1234567890, + name: orphanRunner.instanceId, + busy: true, + status: 'offline', + }, }); } else { mockOctokit.actions.getSelfHostedRunnerForOrg.mockResolvedValueOnce({ - data: { runnerId: 1234567890, name: orphanRunner.instanceId, busy: true, status: 'offline' }, + data: { + runnerId: 1234567890, + name: orphanRunner.instanceId, + busy: true, + status: 'offline', + }, }); } @@ -612,7 +655,7 @@ describe('Scale down runners', () => { await expect(scaleDown()).resolves.not.toThrow(); }); - const evictionStrategies = ['oldest_first', 'newest_first']; + const evictionStrategies: EvictionStrategy[] = ['oldest_first', 'newest_first']; describe.each(evictionStrategies)('When idle config defined', (evictionStrategy) => { const defaultConfig = { idleCount: 1, @@ -622,12 +665,14 @@ describe('Scale down runners', () => { }; beforeEach(() => { - process.env.ENVIRONMENT_CONFIGS = JSON.stringify([{ - environment: ENVIRONMENT, - idle_config: [defaultConfig], - minimum_running_time_in_minutes: MINIMUM_TIME_RUNNING_IN_MINUTES, - runner_boot_time_in_minutes: MINIMUM_BOOT_TIME, - }]); + mockLoadEnvironmentScaleDownConfigFromSsm.mockResolvedValue([ + { + environment: ENVIRONMENT, + idle_config: [defaultConfig], + minimum_running_time_in_minutes: MINIMUM_TIME_RUNNING_IN_MINUTES, + runner_boot_time_in_minutes: MINIMUM_BOOT_TIME, + }, + ]); }); it(`Should terminate based on the the idle config with ${evictionStrategy} eviction strategy`, async () => { @@ -768,7 +813,7 @@ describe('Scale down runners', () => { const minTime1 = 10; const minTime2 = 20; - process.env.ENVIRONMENT_CONFIGS = JSON.stringify([ + mockLoadEnvironmentScaleDownConfigFromSsm.mockResolvedValue([ { environment: environment1, idle_config: [], @@ -794,8 +839,8 @@ describe('Scale down runners', () => { ]; 
mockListRunners.mockImplementation(async (filter) => { - const allRunners = filter?.environment === environment1 ? runners1 : - filter?.environment === environment2 ? runners2 : []; + const allRunners = + filter?.environment === environment1 ? runners1 : filter?.environment === environment2 ? runners2 : []; // Filter by orphan flag if specified return allRunners.filter((r) => !filter?.orphan || r.orphan === filter.orphan); }); @@ -804,9 +849,7 @@ describe('Scale down runners', () => { mockOctokit.paginate.mockImplementation((fn, params: any) => { const allRunners = [...runners1, ...runners2]; return Promise.resolve( - allRunners - .filter((r) => r.owner === params.org) - .map((r) => ({ id: r.instanceId, name: r.instanceId })) + allRunners.filter((r) => r.owner === params.org).map((r) => ({ id: r.instanceId, name: r.instanceId })), ); }); @@ -814,8 +857,12 @@ describe('Scale down runners', () => { await scaleDown(); // assert - should have been called for both environments - expect(listEC2Runners).toHaveBeenCalledWith({ environment: environment1 }); - expect(listEC2Runners).toHaveBeenCalledWith({ environment: environment2 }); + expect(listEC2Runners).toHaveBeenCalledWith({ + environment: environment1, + }); + expect(listEC2Runners).toHaveBeenCalledWith({ + environment: environment2, + }); // env1 runner that exceeded minTime1 should be terminated expect(terminateRunner).toHaveBeenCalledWith(runners1[0].instanceId); @@ -833,10 +880,18 @@ describe('Scale down runners', () => { const environment1 = 'env-1'; const environment2 = 'env-2'; - const idleConfig1 = { cron: '* * * * * *', idleCount: 2, timeZone: 'UTC' }; - const idleConfig2 = { cron: '* * * * * *', idleCount: 0, timeZone: 'UTC' }; - - process.env.ENVIRONMENT_CONFIGS = JSON.stringify([ + const idleConfig1 = { + cron: '* * * * * *', + idleCount: 2, + timeZone: 'UTC', + }; + const idleConfig2 = { + cron: '* * * * * *', + idleCount: 0, + timeZone: 'UTC', + }; + + 
mockLoadEnvironmentScaleDownConfigFromSsm.mockResolvedValue([ { environment: environment1, idle_config: [idleConfig1], @@ -852,7 +907,7 @@ describe('Scale down runners', () => { ]); const runners1 = [ - createRunnerTestData('env1-idle-1', 'Org', MINIMUM_TIME_RUNNING_IN_MINUTES + 5, true, false, true, 'owner1'), // oldest - should terminate + createRunnerTestData('env1-idle-1', 'Org', MINIMUM_TIME_RUNNING_IN_MINUTES + 5, true, false, true, 'owner1'), // oldest - should terminate createRunnerTestData('env1-idle-2', 'Org', MINIMUM_TIME_RUNNING_IN_MINUTES + 4, true, false, false, 'owner1'), // middle - keep createRunnerTestData('env1-idle-3', 'Org', MINIMUM_TIME_RUNNING_IN_MINUTES + 3, true, false, false, 'owner1'), // newest - keep ]; @@ -863,8 +918,8 @@ describe('Scale down runners', () => { ]; mockListRunners.mockImplementation(async (filter) => { - const allRunners = filter?.environment === environment1 ? runners1 : - filter?.environment === environment2 ? runners2 : []; + const allRunners = + filter?.environment === environment1 ? runners1 : filter?.environment === environment2 ? 
runners2 : []; // Filter by orphan flag if specified return allRunners.filter((r) => !filter?.orphan || r.orphan === filter.orphan); }); @@ -873,9 +928,7 @@ describe('Scale down runners', () => { mockOctokit.paginate.mockImplementation((fn, params: any) => { const allRunners = [...runners1, ...runners2]; return Promise.resolve( - allRunners - .filter((r) => r.owner === params.org) - .map((r) => ({ id: r.instanceId, name: r.instanceId })) + allRunners.filter((r) => r.owner === params.org).map((r) => ({ id: r.instanceId, name: r.instanceId })), ); }); @@ -884,7 +937,7 @@ describe('Scale down runners', () => { // assert // env1 has idleCount=2, so terminate oldest, keep 2 newest - expect(terminateRunner).toHaveBeenCalledWith(runners1[0].instanceId); // oldest - terminated + expect(terminateRunner).toHaveBeenCalledWith(runners1[0].instanceId); // oldest - terminated expect(terminateRunner).not.toHaveBeenCalledWith(runners1[1].instanceId); // middle - kept expect(terminateRunner).not.toHaveBeenCalledWith(runners1[2].instanceId); // newest - kept diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 3c61c73890..a201621472 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -1,20 +1,20 @@ -import { Octokit } from '@octokit/rest'; -import { Endpoints } from '@octokit/types'; -import { RequestError } from '@octokit/request-error'; import { createChildLogger } from '@aws-github-runner/aws-powertools-util'; +import { RequestError } from '@octokit/request-error'; +import type { Octokit } from '@octokit/rest'; +import type { Endpoints } from '@octokit/types'; import moment from 'moment'; - +import { bootTimeExceeded, listEC2Runners, tag, terminateRunner, untag } from './../aws/runners'; +import type { RunnerInfo, RunnerList } from './../aws/runners.d'; import { createGithubAppAuth, 
createGithubInstallationAuth, createOctokitClient } from '../github/auth'; -import { bootTimeExceeded, listEC2Runners, tag, untag, terminateRunner } from './../aws/runners'; -import { RunnerInfo, RunnerList } from './../aws/runners.d'; -import { GhRunners, githubCache } from './cache'; +import { metricGitHubAppRateLimit } from '../github/rate-limit'; +import { type GhRunners, githubCache } from './cache'; import { - ScalingDownConfig, - EnvironmentScaleDownConfig, + type EnvironmentScaleDownConfig, getEvictionStrategy, getIdleRunnerCount, + loadEnvironmentScaleDownConfigFromSsm, + type ScalingDownConfig, } from './scale-down-config'; -import { metricGitHubAppRateLimit } from '../github/rate-limit'; import { getGitHubEnterpriseApiUrl } from './scale-up'; const logger = createChildLogger('scale-down'); @@ -229,7 +229,9 @@ async function markOrphan(instanceId: string): Promise { await tag(instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); logger.info(`Runner '${instanceId}' tagged as orphan.`); } catch (e) { - logger.error(`Failed to tag runner '${instanceId}' as orphan.`, { error: e }); + logger.error(`Failed to tag runner '${instanceId}' as orphan.`, { + error: e, + }); } } @@ -238,7 +240,9 @@ async function unMarkOrphan(instanceId: string): Promise { await untag(instanceId, [{ Key: 'ghr:orphan', Value: 'true' }]); logger.info(`Runner '${instanceId}' untagged as orphan.`); } catch (e) { - logger.error(`Failed to un-tag runner '${instanceId}' as orphan.`, { error: e }); + logger.error(`Failed to un-tag runner '${instanceId}' as orphan.`, { + error: e, + }); } } @@ -313,14 +317,15 @@ function filterRunners(ec2runners: RunnerList[]): RunnerInfo[] { export async function scaleDown(): Promise { githubCache.reset(); - const environmentConfigs = JSON.parse(process.env.ENVIRONMENT_CONFIGS) as EnvironmentScaleDownConfig[]; + const configPathPrefix = process.env.SCALE_DOWN_CONFIG_SSM_PATH_PREFIX; + const environmentConfigs = await 
loadEnvironmentScaleDownConfigFromSsm(configPathPrefix ?? ''); for (const envConfig of environmentConfigs) { await scaleDownEnvironment(envConfig); } } -async function scaleDownEnvironment(envConfig: EnvironmentScaleDownConfig): Promise { +export async function scaleDownEnvironment(envConfig: EnvironmentScaleDownConfig): Promise { const { environment, idle_config, minimum_running_time_in_minutes, runner_boot_time_in_minutes } = envConfig; logger.info(`Processing scale-down for environment: ${environment}`); diff --git a/lambdas/libs/aws-ssm-util/src/index.test.ts b/lambdas/libs/aws-ssm-util/src/index.test.ts index 52e4242fdb..ec05841454 100644 --- a/lambdas/libs/aws-ssm-util/src/index.test.ts +++ b/lambdas/libs/aws-ssm-util/src/index.test.ts @@ -1,6 +1,8 @@ import { GetParameterCommand, GetParameterCommandOutput, + GetParametersByPathCommand, + GetParametersByPathCommandOutput, PutParameterCommand, PutParameterCommandOutput, SSMClient, @@ -9,7 +11,7 @@ import 'aws-sdk-client-mock-jest/vitest'; import { mockClient } from 'aws-sdk-client-mock'; import nock from 'nock'; -import { getParameter, putParameter, SSM_ADVANCED_TIER_THRESHOLD } from '.'; +import { getParameter, getParametersByPath, putParameter, SSM_ADVANCED_TIER_THRESHOLD } from '.'; import { describe, it, expect, beforeEach, vi } from 'vitest'; const mockSSMClient = mockClient(SSMClient); @@ -18,6 +20,7 @@ const cleanEnv = process.env; beforeEach(() => { vi.resetModules(); vi.clearAllMocks(); + mockSSMClient.reset(); process.env = { ...cleanEnv }; nock.disableNetConnect(); }); @@ -165,4 +168,127 @@ describe('Test getParameter and putParameter', () => { Tier: expectedTier, }); }); + + describe('getParametersByPath', () => { + it('returns parameters for single page result', async () => { + const output: GetParametersByPathCommandOutput = { + Parameters: [ + { Name: '/path/param1', Value: 'value1' }, + { Name: '/path/param2', Value: 'value2' }, + ], + $metadata: { httpStatusCode: 200 }, + }; + + 
mockSSMClient.on(GetParametersByPathCommand).resolves(output); + + await expect(getParametersByPath('/path')).resolves.toEqual({ + '/path/param1': 'value1', + '/path/param2': 'value2', + }); + }); + + it('paginates over multiple responses', async () => { + const firstPage: GetParametersByPathCommandOutput = { + Parameters: [{ Name: '/path/param1', Value: 'value1' }], + NextToken: 'TOKEN', + $metadata: { httpStatusCode: 200 }, + }; + const secondPage: GetParametersByPathCommandOutput = { + Parameters: [{ Name: '/path/param2', Value: 'value2' }], + $metadata: { httpStatusCode: 200 }, + }; + + mockSSMClient + .on(GetParametersByPathCommand, { Path: '/path', Recursive: false, WithDecryption: true }) + .resolvesOnce(firstPage) + .resolvesOnce(secondPage); + + const result = await getParametersByPath('/path'); + + expect(result).toEqual({ '/path/param1': 'value1', '/path/param2': 'value2' }); + expect(mockSSMClient).toHaveReceivedCommandTimes(GetParametersByPathCommand, 2); + }); + + it('returns empty record when path is empty', async () => { + await expect(getParametersByPath('')).resolves.toEqual({}); + expect(mockSSMClient).not.toHaveReceivedCommand(GetParametersByPathCommand); + }); + + it('returns empty record when no parameters exist at path', async () => { + const output: GetParametersByPathCommandOutput = { + Parameters: [], + $metadata: { httpStatusCode: 200 }, + }; + + mockSSMClient.on(GetParametersByPathCommand).resolves(output); + + await expect(getParametersByPath('/path')).resolves.toEqual({}); + }); + + it('uses recursive option when specified', async () => { + const output: GetParametersByPathCommandOutput = { + Parameters: [{ Name: '/path/nested/param1', Value: 'value1' }], + $metadata: { httpStatusCode: 200 }, + }; + + mockSSMClient + .on(GetParametersByPathCommand, { Path: '/path', Recursive: true, WithDecryption: true }) + .resolves(output); + + const result = await getParametersByPath('/path', { recursive: true }); + + expect(result).toEqual({ 
'/path/nested/param1': 'value1' }); + expect(mockSSMClient).toHaveReceivedCommandWith(GetParametersByPathCommand, { + Path: '/path', + Recursive: true, + WithDecryption: true, + }); + }); + + it.each([ + { + description: 'filters out parameters with missing Name', + mockParameters: [ + { Name: '/path/param1', Value: 'value1' }, + { Value: 'value2' }, // no Name + ], + expectedOutput: { '/path/param1': 'value1' }, + }, + { + description: 'filters out parameters with undefined Value', + mockParameters: [ + { Name: '/path/param1', Value: 'value1' }, + { Name: '/path/param2' }, // undefined Value + ], + expectedOutput: { '/path/param1': 'value1' }, + }, + { + description: 'includes parameters with empty string Value', + mockParameters: [ + { Name: '/path/param1', Value: '' }, + { Name: '/path/param2', Value: 'value2' }, + ], + expectedOutput: { '/path/param1': '', '/path/param2': 'value2' }, + }, + { + description: 'handles mix of valid and invalid parameters', + mockParameters: [ + { Name: '/path/param1', Value: 'value1' }, + { Value: 'no-name' }, + { Name: '/path/param2' }, // undefined Value + { Name: '/path/param3', Value: 'value3' }, + ], + expectedOutput: { '/path/param1': 'value1', '/path/param3': 'value3' }, + }, + ])('$description', async ({ mockParameters, expectedOutput }) => { + const output: GetParametersByPathCommandOutput = { + Parameters: mockParameters, + $metadata: { httpStatusCode: 200 }, + }; + + mockSSMClient.on(GetParametersByPathCommand).resolves(output); + + await expect(getParametersByPath('/path')).resolves.toEqual(expectedOutput); + }); + }); }); diff --git a/lambdas/libs/aws-ssm-util/src/index.ts b/lambdas/libs/aws-ssm-util/src/index.ts index 0b4925c17d..b05aa34194 100644 --- a/lambdas/libs/aws-ssm-util/src/index.ts +++ b/lambdas/libs/aws-ssm-util/src/index.ts @@ -1,6 +1,6 @@ -import { PutParameterCommand, SSMClient, Tag } from '@aws-sdk/client-ssm'; import { getTracedAWSV3Client } from '@aws-github-runner/aws-powertools-util'; import { 
SSMProvider } from '@aws-lambda-powertools/parameters/ssm'; +import { GetParametersByPathCommand, PutParameterCommand, SSMClient, type Tag } from '@aws-sdk/client-ssm'; export async function getParameter(parameter_name: string): Promise { const ssmClient = getTracedAWSV3Client(new SSMClient({ region: process.env.AWS_REGION })); @@ -19,6 +19,53 @@ export async function getParameter(parameter_name: string): Promise { export const SSM_ADVANCED_TIER_THRESHOLD = 4000; +/** + * Retrieve all SSM parameters under a given path. + * + * @remarks + * - Always requests decrypted values (`WithDecryption: true`). + * - Supports automatic pagination when the result spans multiple pages. + * + * @param parameter_path - Exact SSM path prefix (including leading slash). + * @param options.recursive - When true, recurse into all nested paths. + * @returns Map of parameter name to value for every parameter found beneath the path. + */ +export async function getParametersByPath( + parameter_path: string, + options: { recursive?: boolean } = {}, +): Promise> { + if (!parameter_path) { + return {}; + } + + const ssmClient = getTracedAWSV3Client(new SSMClient({ region: process.env.AWS_REGION })); + const parameters = >{}; + let nextToken: string | undefined; + + do { + const response = await ssmClient.send( + new GetParametersByPathCommand({ + Path: parameter_path, + Recursive: options.recursive ?? false, + WithDecryption: true, + NextToken: nextToken, + }), + ); + + for (const parameter of response.Parameters ?? 
[]) { + if (!parameter.Name || parameter.Value === undefined) { + continue; + } + + parameters[parameter.Name] = parameter.Value; + } + + nextToken = response.NextToken; + } while (nextToken); + + return parameters; +} + export async function putParameter( parameter_name: string, parameter_value: string, diff --git a/main.tf b/main.tf index 9c72614808..be25c537e5 100644 --- a/main.tf +++ b/main.tf @@ -195,6 +195,7 @@ module "runners" { enable_managed_runner_security_group = var.enable_managed_runner_security_group enable_runner_detailed_monitoring = var.enable_runner_detailed_monitoring scale_down_schedule_expression = var.scale_down_schedule_expression + scale_down_parameter_store_tier = var.scale_down_parameter_store_tier minimum_running_time_in_minutes = var.minimum_running_time_in_minutes runner_boot_time_in_minutes = var.runner_boot_time_in_minutes runner_disable_default_labels = var.runner_disable_default_labels diff --git a/modules/multi-runner/README.md b/modules/multi-runner/README.md index edd4a1980c..a39fb1d0c1 100644 --- a/modules/multi-runner/README.md +++ b/modules/multi-runner/README.md @@ -171,6 +171,7 @@ module "multi-runner" { | [runners\_scale\_up\_lambda\_timeout](#input\_runners\_scale\_up\_lambda\_timeout) | Time out for the scale up lambda in seconds. | `number` | `30` | no | | [runners\_ssm\_housekeeper](#input\_runners\_ssm\_housekeeper) | Configuration for the SSM housekeeper lambda. This lambda deletes token / JIT config from SSM.

`schedule_expression`: is used to configure the schedule for the lambda.
`enabled`: enable or disable the lambda trigger via the EventBridge.
`lambda_memory_size`: lambda memory size limit.
`lambda_timeout`: timeout for the lambda in seconds.
`config`: configuration for the lambda function. Token path will be read by default from the module. |
object({
schedule_expression = optional(string, "rate(1 day)")
enabled = optional(bool, true)
lambda_memory_size = optional(number, 512)
lambda_timeout = optional(number, 60)
config = object({
tokenPath = optional(string)
minimumDaysOld = optional(number, 1)
dryRun = optional(bool, false)
})
})
|
{
"config": {}
}
| no | | [scale\_down\_lambda\_memory\_size](#input\_scale\_down\_lambda\_memory\_size) | Memory size limit in MB for scale down. | `number` | `512` | no | +| [scale\_down\_parameter\_store\_tier](#input\_scale\_down\_parameter\_store\_tier) | SSM Parameter Store tier used to store consolidated scale-down configuration. | `string` | `"Standard"` | no | | [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. | `string` | `"cron(*/5 * * * ? *)"` | no | | [scale\_up\_lambda\_memory\_size](#input\_scale\_up\_lambda\_memory\_size) | Memory size limit in MB for scale\_up lambda. | `number` | `512` | no | | [ssm\_paths](#input\_ssm\_paths) | The root path used in SSM to store configuration and secrets. |
object({
root = optional(string, "github-action-runners")
app = optional(string, "app")
runners = optional(string, "runners")
webhook = optional(string, "webhook")
})
| `{}` | no | diff --git a/modules/multi-runner/main.tf b/modules/multi-runner/main.tf index efafb629c4..38b9286af4 100644 --- a/modules/multi-runner/main.tf +++ b/modules/multi-runner/main.tf @@ -17,6 +17,8 @@ locals { unique_os_and_arch = { for i, v in local.tmp_distinct_list_unique_os_and_arch : "${v.os_type}_${v.architecture}" => v } ssm_root_path = "/${var.ssm_paths.root}/${var.prefix}" + + scale_down_parameter_path_prefix = "${local.ssm_root_path}/scale-down" } resource "random_string" "random" { @@ -45,6 +47,8 @@ module "scale_down" { environments = local.scale_down_environment_configs prefix = var.prefix schedule_expression = var.scale_down_schedule_expression + ssm_parameter_path_prefix = local.scale_down_parameter_path_prefix + scale_down_parameter_store_tier = var.scale_down_parameter_store_tier github_app_parameters = local.github_app_parameters lambda_s3_bucket = var.lambda_s3_bucket diff --git a/modules/multi-runner/outputs.tf b/modules/multi-runner/outputs.tf index 7d15c14ad8..137f3efcc2 100644 --- a/modules/multi-runner/outputs.tf +++ b/modules/multi-runner/outputs.tf @@ -25,6 +25,7 @@ output "scale_down" { lambda_log_group = module.scale_down.lambda_log_group role = module.scale_down.role event_rule = module.scale_down.cloudwatch_event_rule + ssm_parameters = module.scale_down.ssm_parameters } } diff --git a/modules/multi-runner/runners.tf b/modules/multi-runner/runners.tf index 7a2c84c5a6..4032b4e368 100644 --- a/modules/multi-runner/runners.tf +++ b/modules/multi-runner/runners.tf @@ -69,6 +69,7 @@ module "runners" { lambda_timeout_scale_up = var.runners_scale_up_lambda_timeout lambda_scale_down_memory_size = var.scale_down_lambda_memory_size lambda_timeout_scale_down = var.runners_scale_down_lambda_timeout + scale_down_parameter_store_tier = var.scale_down_parameter_store_tier lambda_subnet_ids = var.lambda_subnet_ids lambda_security_group_ids = var.lambda_security_group_ids lambda_tags = var.lambda_tags diff --git 
a/modules/multi-runner/variables.tf b/modules/multi-runner/variables.tf index 1c40672eb7..cff6350f48 100644 --- a/modules/multi-runner/variables.tf +++ b/modules/multi-runner/variables.tf @@ -272,6 +272,16 @@ variable "scale_down_schedule_expression" { default = "cron(*/5 * * * ? *)" } +variable "scale_down_parameter_store_tier" { + description = "SSM Parameter Store tier used to store consolidated scale-down configuration." + type = string + default = "Standard" + validation { + condition = contains(["Standard", "Advanced"], var.scale_down_parameter_store_tier) + error_message = "`scale_down_parameter_store_tier` must be either `Standard` or `Advanced`." + } +} + variable "webhook_lambda_zip" { description = "File location of the webhook lambda zip file." type = string diff --git a/modules/runners/README.md b/modules/runners/README.md index f1bf526255..ed8d80a1ce 100644 --- a/modules/runners/README.md +++ b/modules/runners/README.md @@ -206,6 +206,7 @@ yarn run dist | [runners\_lambda\_s3\_object\_version](#input\_runners\_lambda\_s3\_object\_version) | S3 object version for runners lambda function. Useful if S3 versioning is enabled on source bucket. | `string` | `null` | no | | [runners\_maximum\_count](#input\_runners\_maximum\_count) | The maximum number of runners that will be created. Setting the variable to `-1` disables the maximum check. | `number` | `3` | no | | [s3\_runner\_binaries](#input\_s3\_runner\_binaries) | Bucket details for cached GitHub binary. |
object({
arn = string
id = string
key = string
})
| n/a | yes | +| [scale\_down\_parameter\_store\_tier](#input\_scale\_down\_parameter\_store\_tier) | SSM Parameter Store tier used to store scale-down configuration. | `string` | `"Standard"` | no | | [scale\_down\_schedule\_expression](#input\_scale\_down\_schedule\_expression) | Scheduler expression to check every x for scale down. Set to null to disable scale-down Lambda creation. | `string` | `"cron(*/5 * * * ? *)"` | no | | [scale\_up\_reserved\_concurrent\_executions](#input\_scale\_up\_reserved\_concurrent\_executions) | Amount of reserved concurrent executions for the scale-up lambda function. A value of 0 disables lambda from being triggered and -1 removes any concurrency limitations. | `number` | `1` | no | | [sqs\_build\_queue](#input\_sqs\_build\_queue) | SQS queue to consume accepted build events. |
object({
arn = string
url = string
})
| n/a | yes | @@ -236,6 +237,7 @@ yarn run dist | [role\_pool](#output\_role\_pool) | n/a | | [role\_runner](#output\_role\_runner) | n/a | | [role\_scale\_down](#output\_role\_scale\_down) | n/a | +| [scale\_down\_ssm\_parameters](#output\_scale\_down\_ssm\_parameters) | n/a | | [role\_scale\_up](#output\_role\_scale\_up) | n/a | | [runners\_log\_groups](#output\_runners\_log\_groups) | List of log groups from different log files of runner machine. | diff --git a/modules/runners/outputs.tf b/modules/runners/outputs.tf index 407cfbe0ca..5a7fbef083 100644 --- a/modules/runners/outputs.tf +++ b/modules/runners/outputs.tf @@ -30,6 +30,10 @@ output "role_scale_down" { value = try(module.scale_down[0].role, null) } +output "scale_down_ssm_parameters" { + value = try(module.scale_down[0].ssm_parameters, null) +} + output "lambda_pool" { value = try(module.pool[0].lambda, null) } diff --git a/modules/runners/scale-down.tf b/modules/runners/scale-down.tf index cd72483a60..4cf0e86487 100644 --- a/modules/runners/scale-down.tf +++ b/modules/runners/scale-down.tf @@ -11,6 +11,8 @@ locals { minimum_running_time_in_minutes = coalesce(var.minimum_running_time_in_minutes, local.min_runtime_defaults[var.runner_os]) runner_boot_time_in_minutes = var.runner_boot_time_in_minutes } + + scale_down_ssm_parameter_prefix = format("/%s/scale-down", trim(var.ssm_paths.root, "/")) } module "scale_down" { @@ -20,6 +22,8 @@ module "scale_down" { environments = [local.scale_down_environment_config] prefix = var.prefix schedule_expression = var.scale_down_schedule_expression + ssm_parameter_path_prefix = local.scale_down_ssm_parameter_prefix + scale_down_parameter_store_tier = var.scale_down_parameter_store_tier github_app_parameters = var.github_app_parameters lambda_s3_bucket = var.lambda_s3_bucket diff --git a/modules/runners/scale-down/README.md b/modules/runners/scale-down/README.md index c2906801b3..3e902bb427 100644 --- a/modules/runners/scale-down/README.md +++ 
b/modules/runners/scale-down/README.md @@ -69,9 +69,11 @@ No modules. | [runners\_lambda\_s3\_key](#input\_runners\_lambda\_s3\_key) | S3 key for Lambda deployment package | `string` | `null` | no | | [runners\_lambda\_s3\_object\_version](#input\_runners\_lambda\_s3\_object\_version) | S3 object version for Lambda deployment package | `string` | `null` | no | | [schedule\_expression](#input\_schedule\_expression) | CloudWatch Event schedule expression | `string` | `"cron(*/5 * * * ? *)"` | no | +| [scale\_down\_parameter\_store\_tier](#input\_scale\_down\_parameter\_store\_tier) | SSM Parameter Store tier used to store scale-down configuration. | `string` | `"Standard"` | no | | [tags](#input\_tags) | Tags to apply to resources | `map(string)` | `{}` | no | | [tracing\_config](#input\_tracing\_config) | Lambda tracing configuration |
object({
mode = optional(string, null)
capture_http_requests = optional(string, "false")
capture_error = optional(string, "false")
})
| n/a | yes | | [user\_agent](#input\_user\_agent) | User agent string for GitHub API requests | `string` | `null` | no | +| [ssm\_parameter\_path\_prefix](#input\_ssm\_parameter\_path\_prefix) | Base SSM parameter path prefix used to store scale-down configuration (without environment suffix). | `string` | n/a | yes | ## Outputs @@ -81,4 +83,5 @@ No modules. | [lambda](#output\_lambda) | Scale-down Lambda function | | [lambda\_log\_group](#output\_lambda\_log\_group) | Scale-down Lambda log group | | [role](#output\_role) | Scale-down Lambda IAM role | - \ No newline at end of file +| [ssm\_parameters](#output\_ssm\_parameters) | Scale-down configuration parameters stored in SSM | + diff --git a/modules/runners/scale-down/main.tf b/modules/runners/scale-down/main.tf index 0ae49ea71a..e3c0a31391 100644 --- a/modules/runners/scale-down/main.tf +++ b/modules/runners/scale-down/main.tf @@ -1,5 +1,33 @@ locals { - managed_environments = [for e in var.environments : e.environment] + managed_environments = [for e in var.environments : e.environment] + environment_map = { for env in var.environments : env.environment => env } + normalized_ssm_parameter_prefix = "/${trim(var.ssm_parameter_path_prefix, "/")}" + scale_down_parameter_name_prefix = local.normalized_ssm_parameter_prefix +} + +data "aws_caller_identity" "current" {} + +data "aws_region" "current" {} + +locals { + arn_ssm_parameters_path_scale_down_config = "arn:${var.aws_partition}:ssm:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:parameter${local.scale_down_parameter_name_prefix}" +} + +resource "aws_ssm_parameter" "scale_down_config" { + for_each = local.environment_map + + name = "${local.scale_down_parameter_name_prefix}/${each.key}" + description = "Scale-down configuration for environment ${each.key}" + type = "String" + tier = var.scale_down_parameter_store_tier + overwrite = true + value = jsonencode({ + environment = each.key + idle_config = each.value.idle_config + 
minimum_running_time_in_minutes = each.value.minimum_running_time_in_minutes + runner_boot_time_in_minutes = each.value.runner_boot_time_in_minutes + }) + tags = var.tags } # IAM assume role policy for Lambda @@ -49,20 +77,20 @@ resource "aws_lambda_function" "scale_down" { environment { variables = { - ENVIRONMENT_CONFIGS = jsonencode(var.environments) ENABLE_METRIC_GITHUB_APP_RATE_LIMIT = var.metrics.enable && var.metrics.metric.enable_github_app_rate_limit GHES_URL = var.ghes_url - USER_AGENT = var.user_agent LOG_LEVEL = var.log_level NODE_TLS_REJECT_UNAUTHORIZED = var.ghes_url != null && !var.ghes_ssl_verify ? 0 : 1 PARAMETER_GITHUB_APP_ID_NAME = var.github_app_parameters.id.name PARAMETER_GITHUB_APP_KEY_BASE64_NAME = var.github_app_parameters.key_base64.name POWERTOOLS_LOGGER_LOG_EVENT = var.log_level == "debug" ? "true" : "false" - POWERTOOLS_SERVICE_NAME = "runners-scale-down" POWERTOOLS_METRICS_NAMESPACE = var.metrics.namespace + POWERTOOLS_SERVICE_NAME = "runners-scale-down" POWERTOOLS_TRACE_ENABLED = var.tracing_config.mode != null ? 
true : false - POWERTOOLS_TRACER_CAPTURE_HTTPS_REQUESTS = var.tracing_config.capture_http_requests POWERTOOLS_TRACER_CAPTURE_ERROR = var.tracing_config.capture_error + POWERTOOLS_TRACER_CAPTURE_HTTPS_REQUESTS = var.tracing_config.capture_http_requests + SCALE_DOWN_CONFIG_SSM_PATH_PREFIX = local.scale_down_parameter_name_prefix + USER_AGENT = var.user_agent } } @@ -120,10 +148,11 @@ resource "aws_iam_role_policy" "scale_down" { name = "scale-down-policy" role = aws_iam_role.scale_down.name policy = templatefile("${path.module}/policies/lambda-scale-down.json", { - environments = jsonencode(local.managed_environments) - github_app_id_arn = var.github_app_parameters.id.arn - github_app_key_base64_arn = var.github_app_parameters.key_base64.arn - kms_key_arn = var.kms_key_arn + environments = jsonencode(local.managed_environments) + github_app_id_arn = var.github_app_parameters.id.arn + github_app_key_base64_arn = var.github_app_parameters.key_base64.arn + kms_key_arn = var.kms_key_arn + arn_ssm_parameters_path_scale_down_config = local.arn_ssm_parameters_path_scale_down_config }) } diff --git a/modules/runners/scale-down/outputs.tf b/modules/runners/scale-down/outputs.tf index 293314f76a..ff275fdcf7 100644 --- a/modules/runners/scale-down/outputs.tf +++ b/modules/runners/scale-down/outputs.tf @@ -17,3 +17,8 @@ output "cloudwatch_event_rule" { description = "CloudWatch Event Rule for scale-down" value = aws_cloudwatch_event_rule.scale_down } + +output "ssm_parameters" { + description = "Scale-down configuration parameters stored in SSM" + value = aws_ssm_parameter.scale_down_config +} diff --git a/modules/runners/scale-down/policies/lambda-scale-down.json b/modules/runners/scale-down/policies/lambda-scale-down.json index c5137f6ab7..9bb4582b6f 100644 --- a/modules/runners/scale-down/policies/lambda-scale-down.json +++ b/modules/runners/scale-down/policies/lambda-scale-down.json @@ -52,6 +52,16 @@ "${github_app_key_base64_arn}", "${github_app_id_arn}" ] + }, + { + 
"Effect": "Allow", + "Action": [ + "ssm:GetParametersByPath" + ], + "Resource": [ + "${arn_ssm_parameters_path_scale_down_config}", + "${arn_ssm_parameters_path_scale_down_config}/*" + ] %{ if kms_key_arn != "" ~} }, { diff --git a/modules/runners/scale-down/variables.tf b/modules/runners/scale-down/variables.tf index 76bb66a134..b4fe035b5e 100644 --- a/modules/runners/scale-down/variables.tf +++ b/modules/runners/scale-down/variables.tf @@ -11,6 +11,28 @@ variable "environments" { minimum_running_time_in_minutes = number runner_boot_time_in_minutes = number })) + validation { + condition = alltrue([ + for env in var.environments : + !can(regex("/", env.environment)) + ]) + error_message = "Environment names cannot contain slashes. Each environment must be stored as a direct child of the SSM parameter path prefix." + } +} + +variable "ssm_parameter_path_prefix" { + description = "Base SSM parameter path prefix used to store scale-down configuration (without environment suffix)." + type = string +} + +variable "scale_down_parameter_store_tier" { + description = "SSM Parameter Store tier to use for persisted scale-down configuration." + type = string + default = "Standard" + validation { + condition = contains(["Standard", "Advanced"], var.scale_down_parameter_store_tier) + error_message = "`scale_down_parameter_store_tier` must be either `Standard` or `Advanced`." + } } variable "prefix" { diff --git a/modules/runners/variables.tf b/modules/runners/variables.tf index b1d7037bb6..01e444c92f 100644 --- a/modules/runners/variables.tf +++ b/modules/runners/variables.tf @@ -239,6 +239,16 @@ variable "scale_down_schedule_expression" { default = "cron(*/5 * * * ? *)" } +variable "scale_down_parameter_store_tier" { + description = "SSM Parameter Store tier used to store scale-down configuration." 
+ type = string + default = "Standard" + validation { + condition = contains(["Standard", "Advanced"], var.scale_down_parameter_store_tier) + error_message = "`scale_down_parameter_store_tier` must be either `Standard` or `Advanced`." + } +} + variable "minimum_running_time_in_minutes" { description = "The time an ec2 action runner should be running at minimum before terminated if non busy. If not set the default is calculated based on the OS." type = number diff --git a/variables.tf b/variables.tf index f412d2a486..9d0f283179 100644 --- a/variables.tf +++ b/variables.tf @@ -73,6 +73,16 @@ variable "scale_down_schedule_expression" { default = "cron(*/5 * * * ? *)" } +variable "scale_down_parameter_store_tier" { + description = "SSM Parameter Store tier to use for scale-down configuration parameters." + type = string + default = "Standard" + validation { + condition = contains(["Standard", "Advanced"], var.scale_down_parameter_store_tier) + error_message = "`scale_down_parameter_store_tier` must be either `Standard` or `Advanced`." + } +} + variable "minimum_running_time_in_minutes" { description = "The time an ec2 action runner should be running at minimum before terminated, if not busy." 
type = number From d8b16282d9841ec04ce1d829ba8ec41824504573 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 21 Oct 2025 17:11:44 +0000 Subject: [PATCH 3/3] docs: auto update terraform docs --- modules/runners/README.md | 2 +- modules/runners/scale-down/README.md | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/modules/runners/README.md b/modules/runners/README.md index ed8d80a1ce..d0aa472979 100644 --- a/modules/runners/README.md +++ b/modules/runners/README.md @@ -237,7 +237,7 @@ yarn run dist | [role\_pool](#output\_role\_pool) | n/a | | [role\_runner](#output\_role\_runner) | n/a | | [role\_scale\_down](#output\_role\_scale\_down) | n/a | -| [scale\_down\_ssm\_parameters](#output\_scale\_down\_ssm\_parameters) | n/a | | [role\_scale\_up](#output\_role\_scale\_up) | n/a | | [runners\_log\_groups](#output\_runners\_log\_groups) | List of log groups from different log files of runner machine. | +| [scale\_down\_ssm\_parameters](#output\_scale\_down\_ssm\_parameters) | n/a | diff --git a/modules/runners/scale-down/README.md b/modules/runners/scale-down/README.md index 3e902bb427..54b06e93f5 100644 --- a/modules/runners/scale-down/README.md +++ b/modules/runners/scale-down/README.md @@ -37,8 +37,11 @@ No modules. 
| [aws_iam_role_policy_attachment.scale_down_vpc_execution_role](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_role_policy_attachment) | resource | | [aws_lambda_function.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_function) | resource | | [aws_lambda_permission.scale_down](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/lambda_permission) | resource | +| [aws_ssm_parameter.scale_down_config](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/ssm_parameter) | resource | +| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | | [aws_iam_policy_document.lambda_assume_role_policy](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | | [aws_iam_policy_document.lambda_xray](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source | +| [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source | ## Inputs @@ -62,18 +65,18 @@ No modules. | [log\_level](#input\_log\_level) | Log level for Lambda function | `string` | `"info"` | no | | [logging\_kms\_key\_id](#input\_logging\_kms\_key\_id) | KMS key ID for CloudWatch log encryption | `string` | `null` | no | | [logging\_retention\_in\_days](#input\_logging\_retention\_in\_days) | CloudWatch log retention in days | `number` | n/a | yes | -| [metrics](#input\_metrics) | Metrics configuration |
object({
enable = bool
namespace = string
metric = object({
enable_github_app_rate_limit = bool
})
})
| n/a | yes | +| [metrics](#input\_metrics) | Metrics configuration |
object({
enable = optional(bool, false)
namespace = optional(string, "GitHub Runners")
metric = optional(object({
enable_github_app_rate_limit = optional(bool, true)
}), {})
})
| `{}` | no | | [prefix](#input\_prefix) | Prefix for Lambda function name | `string` | n/a | yes | | [role\_path](#input\_role\_path) | IAM role path | `string` | n/a | yes | | [role\_permissions\_boundary](#input\_role\_permissions\_boundary) | IAM role permissions boundary ARN | `string` | `null` | no | | [runners\_lambda\_s3\_key](#input\_runners\_lambda\_s3\_key) | S3 key for Lambda deployment package | `string` | `null` | no | | [runners\_lambda\_s3\_object\_version](#input\_runners\_lambda\_s3\_object\_version) | S3 object version for Lambda deployment package | `string` | `null` | no | +| [scale\_down\_parameter\_store\_tier](#input\_scale\_down\_parameter\_store\_tier) | SSM Parameter Store tier to use for persisted scale-down configuration. | `string` | `"Standard"` | no | | [schedule\_expression](#input\_schedule\_expression) | CloudWatch Event schedule expression | `string` | `"cron(*/5 * * * ? *)"` | no | -| [scale\_down\_parameter\_store\_tier](#input\_scale\_down\_parameter\_store\_tier) | SSM Parameter Store tier used to store scale-down configuration. | `string` | `"Standard"` | no | +| [ssm\_parameter\_path\_prefix](#input\_ssm\_parameter\_path\_prefix) | Base SSM parameter path prefix used to store scale-down configuration (without environment suffix). | `string` | n/a | yes | | [tags](#input\_tags) | Tags to apply to resources | `map(string)` | `{}` | no | -| [tracing\_config](#input\_tracing\_config) | Lambda tracing configuration |
object({
mode = optional(string, null)
capture_http_requests = optional(string, "false")
capture_error = optional(string, "false")
})
| n/a | yes | +| [tracing\_config](#input\_tracing\_config) | Lambda tracing configuration |
object({
mode = optional(string, null)
capture_http_requests = optional(bool, false)
capture_error = optional(bool, false)
})
| `{}` | no | | [user\_agent](#input\_user\_agent) | User agent string for GitHub API requests | `string` | `null` | no | -| [ssm\_parameter\_path\_prefix](#input\_ssm\_parameter\_path\_prefix) | Base SSM parameter path prefix used to store scale-down configuration (without environment suffix). | `string` | n/a | yes | ## Outputs