Skip to content

Commit acf3998

Browse files
committed
🤖 refactor: simplify adaptive concurrency to hardcoded 1-16 bounds
Make adaptive concurrency the default and only mode for terminal-bench: - Hardcode MIN_CONCURRENT=1, MAX_CONCURRENT=16 in adaptive_bench.py - Remove --max-concurrent CLI argument (no longer needed) - Make `benchmark-terminal` an alias for `benchmark-terminal-adaptive` - Simplify workflow inputs (remove adaptive_mode toggle, concurrency input) - Update documentation to reflect simplified interface This removes unnecessary configuration complexity while providing sensible bounds for all hardware configurations. The 1-16 range covers: - Single-core systems (min=1) - High-core systems (max=16 is reasonable parallelism for Docker containers) - Load-based adjustment within these bounds _Generated with `cmux`_
1 parent 0494e3d commit acf3998

File tree

4 files changed

+57
-86
lines changed

4 files changed

+57
-86
lines changed

.github/workflows/terminal-bench.yml

Lines changed: 30 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -16,25 +16,20 @@ on:
1616
required: false
1717
type: string
1818
default: 'terminal-bench-core==0.1.1'
19+
concurrency:
20+
description: 'Number of concurrent tasks (--n-concurrent)'
21+
required: false
22+
type: string
23+
default: '4'
1924
livestream:
20-
description: 'Enable livestream mode (verbose output to console)'
25+
description: 'Enable livestream mode'
2126
required: false
2227
type: boolean
23-
default: false
28+
default: true
2429
sample_size:
2530
description: 'Number of random tasks to run (empty = all tasks)'
2631
required: false
2732
type: string
28-
load_threshold:
29-
description: 'Load threshold for adaptive concurrency (default: 1.0)'
30-
required: false
31-
type: string
32-
default: '1.0'
33-
check_interval:
34-
description: 'Seconds between adaptive bursts (default: 60)'
35-
required: false
36-
type: string
37-
default: '60'
3833
extra_args:
3934
description: 'Additional arguments to pass to terminal-bench'
4035
required: false
@@ -52,9 +47,9 @@ on:
5247
default: 'terminal-bench-core==0.1.1'
5348
type: string
5449
livestream:
55-
description: 'Enable livestream mode (verbose output to console)'
50+
description: 'Enable livestream mode'
5651
required: false
57-
default: false
52+
default: true
5853
type: boolean
5954
sample_size:
6055
description: 'Number of random tasks to run (empty = all tasks)'
@@ -68,29 +63,28 @@ on:
6863
description: 'Thinking level (off, low, medium, high)'
6964
required: false
7065
type: string
66+
extra_args:
67+
description: 'Additional arguments to pass to terminal-bench'
68+
required: false
69+
type: string
7170
load_threshold:
72-
description: 'Load threshold for adaptive concurrency (default: 1.0)'
71+
description: 'Load average threshold for adaptive concurrency (default: 1.0)'
7372
required: false
7473
default: '1.0'
7574
type: string
7675
check_interval:
77-
description: 'Seconds between adaptive bursts (default: 60)'
76+
description: 'Seconds between bursts for adaptive concurrency (default: 60)'
7877
required: false
7978
default: '60'
8079
type: string
81-
extra_args:
82-
description: 'Additional arguments to pass to terminal-bench'
83-
required: false
84-
type: string
8580

8681
jobs:
8782
benchmark:
8883
name: Run Terminal-Bench${{ inputs.model_name && format(' ({0})', inputs.model_name) || '' }}
8984
runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
90-
# Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes typically
91-
# Set 4-hour timeout to handle occasional API slowdowns while preventing infinite hangs
92-
# If consistently hitting this timeout, investigate task-level issues
93-
timeout-minutes: 240
85+
# Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes
86+
# Allow 3 hours for safety margin and slower tasks
87+
timeout-minutes: 180
9488
steps:
9589
- name: Checkout code
9690
uses: actions/checkout@v4
@@ -111,8 +105,8 @@ jobs:
111105
- name: Build dist/ (skip icons - not needed for benchmark)
112106
run: make build-main build-preload
113107

114-
- name: Run Terminal-Bench
115-
run: make benchmark-terminal 2>&1 | tee benchmark.log
108+
- name: Run Terminal-Bench (adaptive concurrency 1-16)
109+
run: make benchmark-terminal
116110
env:
117111
TB_DATASET: ${{ inputs.dataset }}
118112
TB_LOAD_THRESHOLD: ${{ inputs.load_threshold }}
@@ -127,12 +121,18 @@ jobs:
127121
if: always()
128122
run: |
129123
echo "=== Terminal-Bench Results Summary ==="
130-
if [ -f "$(find runs -name 'results.json' 2>/dev/null | head -1)" ]; then
124+
if [ -f "$(find runs -name 'results.json' | head -1)" ]; then
131125
RESULTS_FILE=$(find runs -name 'results.json' | head -1)
132-
cat "$RESULTS_FILE" | jq '{n_resolved, n_unresolved, accuracy}' 2>/dev/null || cat "$RESULTS_FILE"
126+
echo "Results file: $RESULTS_FILE"
127+
echo ""
128+
echo "Full results.json:"
129+
cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
130+
echo ""
131+
echo "Per-task summary:"
132+
cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
133133
else
134-
echo "No results.json found"
135-
ls -laR runs/ 2>/dev/null || echo "runs/ directory missing"
134+
echo "No results.json found in runs/"
135+
ls -la runs/
136136
fi
137137
138138
- name: Set artifact name
@@ -155,7 +155,6 @@ jobs:
155155
name: ${{ steps.artifact-name.outputs.name }}
156156
path: |
157157
runs/
158-
benchmark.log
159158
if-no-files-found: warn
160159
retention-days: 30
161160

Makefile

Lines changed: 3 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -295,46 +295,12 @@ chromatic: node_modules/.installed ## Run Chromatic for visual regression testin
295295
@bun x chromatic --exit-zero-on-changes
296296

297297
## Benchmarks
298-
benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB_SAMPLE_SIZE/TB_TIMEOUT/TB_ARGS to customize)
299-
@TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \
300-
TB_TIMEOUT=$${TB_TIMEOUT:-1800}; \
301-
CONCURRENCY_FLAG=$${TB_CONCURRENCY:+--n-concurrent $$TB_CONCURRENCY}; \
302-
LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \
303-
TASK_ID_FLAGS=""; \
304-
if [ -n "$$TB_SAMPLE_SIZE" ]; then \
305-
echo "Ensuring dataset $$TB_DATASET is downloaded..."; \
306-
uvx terminal-bench datasets download --dataset "$$TB_DATASET" 2>&1 | grep -v "already exists" || true; \
307-
echo "Sampling $$TB_SAMPLE_SIZE tasks from $$TB_DATASET..."; \
308-
TASK_IDS=$$(python3 benchmarks/terminal_bench/sample_tasks.py --dataset "$$TB_DATASET" --sample-size "$$TB_SAMPLE_SIZE" --format space) || { \
309-
echo "Error: Failed to sample tasks" >&2; \
310-
exit 1; \
311-
}; \
312-
if [ -z "$$TASK_IDS" ]; then \
313-
echo "Error: Sampling returned no task IDs" >&2; \
314-
exit 1; \
315-
fi; \
316-
for task_id in $$TASK_IDS; do \
317-
TASK_ID_FLAGS="$$TASK_ID_FLAGS --task-id $$task_id"; \
318-
done; \
319-
echo "Selected task IDs: $$TASK_IDS"; \
320-
fi; \
321-
echo "Using timeout: $$TB_TIMEOUT seconds"; \
322-
echo "Running Terminal-Bench with dataset $$TB_DATASET"; \
323-
export CMUX_TIMEOUT_MS=$$((TB_TIMEOUT * 1000)); \
324-
uvx terminal-bench run \
325-
--dataset "$$TB_DATASET" \
326-
--agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \
327-
--global-agent-timeout-sec $$TB_TIMEOUT \
328-
$$CONCURRENCY_FLAG \
329-
$$LIVESTREAM_FLAG \
330-
$$TASK_ID_FLAGS \
331-
$${TB_ARGS}
298+
benchmark-terminal: benchmark-terminal-adaptive ## Run Terminal-Bench with adaptive concurrency (alias)
332299

333300
.PHONY: benchmark-terminal-adaptive
334-
benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (use TB_MAX_CONCURRENT/TB_LOAD_THRESHOLD/TB_CHECK_INTERVAL)
301+
benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (auto-scales 1-16, use TB_LOAD_THRESHOLD/TB_CHECK_INTERVAL)
335302
@TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \
336303
TB_TIMEOUT=$${TB_TIMEOUT:-1800}; \
337-
TB_MAX_CONCURRENT=$${TB_MAX_CONCURRENT:-16}; \
338304
TB_LOAD_THRESHOLD=$${TB_LOAD_THRESHOLD:-1.0}; \
339305
TB_CHECK_INTERVAL=$${TB_CHECK_INTERVAL:-60}; \
340306
LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \
@@ -356,9 +322,8 @@ benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (us
356322
done; \
357323
echo "Selected task IDs: $$TASK_IDS"; \
358324
fi; \
359-
echo "Running adaptive terminal-bench (max concurrency: $$TB_MAX_CONCURRENT, load threshold: $$TB_LOAD_THRESHOLD)"; \
325+
echo "Running adaptive terminal-bench (auto-scaling 1-16, load threshold: $$TB_LOAD_THRESHOLD)"; \
360326
python3 benchmarks/terminal_bench/adaptive_bench.py \
361-
--max-concurrent $$TB_MAX_CONCURRENT \
362327
--load-threshold $$TB_LOAD_THRESHOLD \
363328
--check-interval $$TB_CHECK_INTERVAL \
364329
-- \

benchmarks/terminal_bench/README.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,18 @@ This directory contains the cmux agent adapter for [Terminal-Bench](https://gith
44

55
## Quick Start
66

7+
Terminal-bench now runs with **adaptive concurrency by default**, automatically scaling from 1-16 concurrent tasks based on system load.
8+
79
```bash
8-
# Run full benchmark suite (80 tasks, ~2.5 hours)
10+
# Run full benchmark suite (80 tasks, ~2.5 hours) with adaptive concurrency
911
make benchmark-terminal
1012

1113
# Run with sample of 5 tasks
1214
TB_SAMPLE_SIZE=5 make benchmark-terminal
1315

16+
# Adjust load threshold (default: 1.0)
17+
TB_LOAD_THRESHOLD=2.0 make benchmark-terminal
18+
1419
# Run specific tasks
1520
make benchmark-terminal TB_ARGS="--task-id hello-world --task-id chess-best-move"
1621

@@ -24,7 +29,8 @@ make benchmark-terminal TB_ARGS="--agent-kwarg model_name=anthropic:claude-opus-
2429

2530
- `TB_DATASET`: Dataset to use (default: `terminal-bench-core==0.1.1`)
2631
- `TB_SAMPLE_SIZE`: Number of random tasks to run (default: all 80 tasks)
27-
- `TB_CONCURRENCY`: Number of concurrent tasks (default: 4)
32+
- `TB_LOAD_THRESHOLD`: Load average threshold for concurrency adjustments (default: 1.0)
33+
- `TB_CHECK_INTERVAL`: Seconds between bursts (default: 60)
2834
- `TB_LIVESTREAM`: Enable livestream mode (set to `1` to enable)
2935
- `TB_TIMEOUT`: Global timeout in seconds (default: 1800 = 30 minutes)
3036
- `TB_ARGS`: Additional arguments passed to terminal-bench

benchmarks/terminal_bench/adaptive_bench.py

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,28 @@
1717

1818

1919
class AdaptiveBench:
20+
"""
21+
Adaptive concurrency wrapper for terminal-bench.
22+
23+
Concurrency is automatically bounded to [1, 16] for optimal performance
24+
across different hardware configurations.
25+
"""
26+
27+
MIN_CONCURRENT = 1
28+
MAX_CONCURRENT = 16
29+
2030
def __init__(
2131
self,
2232
load_threshold: float,
2333
check_interval: int,
24-
max_concurrent: int,
2534
runs_dir: Path,
2635
tb_args: list[str],
2736
):
2837
self.load_threshold = load_threshold
2938
self.check_interval = check_interval
30-
self.max_concurrent = max_concurrent
3139
self.runs_dir = runs_dir
3240
self.tb_args = tb_args
33-
self.current_concurrent = 1
41+
self.current_concurrent = self.MIN_CONCURRENT
3442
self.run_id: Optional[str] = None
3543
self.burst_count = 0
3644

@@ -78,12 +86,12 @@ def adjust_concurrency(self) -> bool:
7886
load = self.get_load_avg()
7987
old_concurrent = self.current_concurrent
8088

81-
if load < self.load_threshold and self.current_concurrent < self.max_concurrent:
89+
if load < self.load_threshold and self.current_concurrent < self.MAX_CONCURRENT:
8290
self.current_concurrent = min(
83-
self.current_concurrent * 2, self.max_concurrent
91+
self.current_concurrent * 2, self.MAX_CONCURRENT
8492
)
85-
elif load > self.load_threshold and self.current_concurrent > 1:
86-
self.current_concurrent = max(self.current_concurrent // 2, 1)
93+
elif load > self.load_threshold and self.current_concurrent > self.MIN_CONCURRENT:
94+
self.current_concurrent = max(self.current_concurrent // 2, self.MIN_CONCURRENT)
8795

8896
if self.current_concurrent != old_concurrent:
8997
print(
@@ -221,7 +229,7 @@ def run(self):
221229

222230
def main():
223231
parser = argparse.ArgumentParser(
224-
description="Run terminal-bench with adaptive concurrency via burst-and-resume"
232+
description="Run terminal-bench with adaptive concurrency (auto-scales 1-16 based on load)"
225233
)
226234
parser.add_argument(
227235
"--load-threshold",
@@ -235,12 +243,6 @@ def main():
235243
default=60,
236244
help="Seconds between bursts (default: 60)",
237245
)
238-
parser.add_argument(
239-
"--max-concurrent",
240-
type=int,
241-
required=True,
242-
help="Maximum concurrency limit",
243-
)
244246
parser.add_argument(
245247
"--runs-dir",
246248
type=Path,
@@ -263,7 +265,6 @@ def main():
263265
bench = AdaptiveBench(
264266
load_threshold=args.load_threshold,
265267
check_interval=args.check_interval,
266-
max_concurrent=args.max_concurrent,
267268
runs_dir=args.runs_dir,
268269
tb_args=tb_args,
269270
)

0 commit comments

Comments
 (0)