Skip to content

Commit d77d232

Browse files
committed
fix: only retry when we don't create enough ephemeral runners
Prior to this change, we retried whenever _any_ runner failed to be created, whether due to capacity (the maximum allowed runner count being reached) or due to an EC2 error. These two cases are now handled separately: if runners can't be created because the maximum allowed runner count has been reached, we retry only when creating ephemeral runners; if EC2 creation fails, we always retry.
1 parent 8dee5a0 commit d77d232

File tree

1 file changed

+18
-9
lines changed
  • lambdas/functions/control-plane/src/scale-runners

1 file changed

+18
-9
lines changed

lambdas/functions/control-plane/src/scale-runners/scale-up.ts

Lines changed: 18 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -383,14 +383,23 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
383383
: // Otherwise, we do have a limit, so work out if `scaleUp` would exceed it.
384384
Math.min(scaleUp, maximumRunners - currentRunners);
385385

386-
if (newRunners <= 0) {
387-
logger.info('No runners will be created for this group, maximum number of runners reached.', {
386+
const missingInstanceCount = Math.max(0, scaleUp - newRunners);
387+
388+
if (missingInstanceCount > 0) {
389+
logger.info('Not all runners will be created for this group, maximum number of runners reached.', {
388390
desiredNewRunners: scaleUp,
389391
});
390392

391-
invalidMessages.push(...messages.map((message) => message.messageId));
393+
if (ephemeralEnabled) {
394+
// This removes `missingInstanceCount` items from the start of the array
395+
// so that, if we retry more messages later, we pick fresh ones.
396+
invalidMessages.push(...messages.splice(0, missingInstanceCount).map(({ messageId }) => messageId));
397+
}
392398

393-
continue;
399+
// No runners will be created, so skip calling the EC2 API.
400+
if (missingInstanceCount === scaleUp) {
401+
continue;
402+
}
394403
}
395404

396405
logger.info(`Attempting to launch new runners`, {
@@ -431,16 +440,16 @@ export async function scaleUp(payloads: ActionRequestMessageSQS[]): Promise<stri
431440

432441
// Not all runners we wanted were created, let's reject enough items so that
433442
// the same number of entries will be retried.
434-
if (instances.length !== scaleUp) {
435-
const missingInstanceCount = scaleUp - instances.length;
443+
if (instances.length !== newRunners) {
444+
const failedInstanceCount = newRunners - instances.length;
436445

437-
logger.warn('Not enough runners were created, rejecting some messages so the requests are retried', {
446+
logger.warn('Some runners failed to be created, rejecting some messages so the requests are retried', {
438447
wanted: newRunners,
439448
got: instances.length,
440-
missingInstanceCount,
449+
missingInstanceCount: failedInstanceCount,
441450
});
442451

443-
invalidMessages.push(...messages.slice(0, missingInstanceCount).map((message) => message.messageId));
452+
invalidMessages.push(...messages.slice(0, failedInstanceCount).map(({ messageId }) => messageId));
444453
}
445454
}
446455

0 commit comments

Comments
 (0)