Skip to content

Commit 0c84cb9

Browse files
committed
🤖 refactor: simplify adaptive concurrency to hardcoded 1-16 bounds
Make adaptive concurrency the default and only mode for terminal-bench:

- Hardcode MIN_CONCURRENT=1, MAX_CONCURRENT=16 in adaptive_bench.py
- Remove --max-concurrent CLI argument (no longer needed)
- Make `benchmark-terminal` an alias for `benchmark-terminal-adaptive`
- Simplify workflow inputs (remove adaptive_mode toggle, concurrency input)
- Update documentation to reflect simplified interface

This removes unnecessary configuration complexity while providing sensible bounds for all hardware configurations. The 1-16 range covers:

- Single-core systems (min=1)
- High-core systems (max=16 is reasonable parallelism for Docker containers)
- Load-based adjustment within these bounds

_Generated with `cmux`_
1 parent 4edafea commit 0c84cb9

File tree

4 files changed

+36
-79
lines changed

4 files changed

+36
-79
lines changed

.github/workflows/terminal-bench.yml

Lines changed: 9 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,6 @@ on:
4646
required: false
4747
default: 'terminal-bench-core==0.1.1'
4848
type: string
49-
concurrency:
50-
description: 'Number of concurrent tasks (--n-concurrent)'
51-
required: false
52-
default: '4'
53-
type: string
5449
livestream:
5550
description: 'Enable livestream mode'
5651
required: false
@@ -72,21 +67,16 @@ on:
7267
description: 'Additional arguments to pass to terminal-bench'
7368
required: false
7469
type: string
75-
adaptive_mode:
76-
description: 'Use adaptive concurrency (overrides concurrency setting)'
77-
required: false
78-
default: false
79-
type: boolean
80-
max_concurrent:
81-
description: 'Max concurrency for adaptive mode (only used if adaptive_mode=true)'
82-
required: false
83-
default: '16'
84-
type: string
8570
load_threshold:
86-
description: 'Load threshold for adaptive mode (only used if adaptive_mode=true)'
71+
description: 'Load average threshold for adaptive concurrency (default: 1.0)'
8772
required: false
8873
default: '1.0'
8974
type: string
75+
check_interval:
76+
description: 'Seconds between bursts for adaptive concurrency (default: 60)'
77+
required: false
78+
default: '60'
79+
type: string
9080

9181
jobs:
9282
benchmark:
@@ -115,18 +105,12 @@ jobs:
115105
- name: Build dist/ (skip icons - not needed for benchmark)
116106
run: make build-main build-preload
117107

118-
- name: Run Terminal-Bench
119-
run: |
120-
if [ "${{ inputs.adaptive_mode }}" = "true" ]; then
121-
make benchmark-terminal-adaptive
122-
else
123-
make benchmark-terminal
124-
fi
108+
- name: Run Terminal-Bench (adaptive concurrency 1-16)
109+
run: make benchmark-terminal
125110
env:
126111
TB_DATASET: ${{ inputs.dataset }}
127-
TB_CONCURRENCY: ${{ inputs.concurrency }}
128-
TB_MAX_CONCURRENT: ${{ inputs.max_concurrent }}
129112
TB_LOAD_THRESHOLD: ${{ inputs.load_threshold }}
113+
TB_CHECK_INTERVAL: ${{ inputs.check_interval }}
130114
TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }}
131115
TB_SAMPLE_SIZE: ${{ inputs.sample_size }}
132116
TB_ARGS: ${{ inputs.model_name && format('--agent-kwarg model_name={0} --agent-kwarg thinking_level={1} {2}', inputs.model_name, inputs.thinking_level, inputs.extra_args) || inputs.extra_args }}

Makefile

Lines changed: 3 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -295,45 +295,12 @@ chromatic: node_modules/.installed ## Run Chromatic for visual regression testin
295295
@bun x chromatic --exit-zero-on-changes
296296

297297
## Benchmarks
298-
benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB_SAMPLE_SIZE/TB_TIMEOUT/TB_ARGS to customize)
299-
@TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \
300-
TB_TIMEOUT=$${TB_TIMEOUT:-1800}; \
301-
CONCURRENCY_FLAG=$${TB_CONCURRENCY:+--n-concurrent $$TB_CONCURRENCY}; \
302-
LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \
303-
TASK_ID_FLAGS=""; \
304-
if [ -n "$$TB_SAMPLE_SIZE" ]; then \
305-
echo "Ensuring dataset $$TB_DATASET is downloaded..."; \
306-
uvx terminal-bench datasets download --dataset "$$TB_DATASET" 2>&1 | grep -v "already exists" || true; \
307-
echo "Sampling $$TB_SAMPLE_SIZE tasks from $$TB_DATASET..."; \
308-
TASK_IDS=$$(python benchmarks/terminal_bench/sample_tasks.py --dataset "$$TB_DATASET" --sample-size "$$TB_SAMPLE_SIZE" --format space) || { \
309-
echo "Error: Failed to sample tasks" >&2; \
310-
exit 1; \
311-
}; \
312-
if [ -z "$$TASK_IDS" ]; then \
313-
echo "Error: Sampling returned no task IDs" >&2; \
314-
exit 1; \
315-
fi; \
316-
for task_id in $$TASK_IDS; do \
317-
TASK_ID_FLAGS="$$TASK_ID_FLAGS --task-id $$task_id"; \
318-
done; \
319-
echo "Selected task IDs: $$TASK_IDS"; \
320-
fi; \
321-
echo "Using timeout: $$TB_TIMEOUT seconds"; \
322-
echo "Running Terminal-Bench with dataset $$TB_DATASET"; \
323-
uvx terminal-bench run \
324-
--dataset "$$TB_DATASET" \
325-
--agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \
326-
--global-agent-timeout-sec $$TB_TIMEOUT \
327-
$$CONCURRENCY_FLAG \
328-
$$LIVESTREAM_FLAG \
329-
$$TASK_ID_FLAGS \
330-
$${TB_ARGS}
298+
benchmark-terminal: benchmark-terminal-adaptive ## Run Terminal-Bench with adaptive concurrency (alias)
331299

332300
.PHONY: benchmark-terminal-adaptive
333-
benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (use TB_MAX_CONCURRENT/TB_LOAD_THRESHOLD/TB_CHECK_INTERVAL)
301+
benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (auto-scales 1-16, use TB_LOAD_THRESHOLD/TB_CHECK_INTERVAL)
334302
@TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \
335303
TB_TIMEOUT=$${TB_TIMEOUT:-1800}; \
336-
TB_MAX_CONCURRENT=$${TB_MAX_CONCURRENT:-16}; \
337304
TB_LOAD_THRESHOLD=$${TB_LOAD_THRESHOLD:-1.0}; \
338305
TB_CHECK_INTERVAL=$${TB_CHECK_INTERVAL:-60}; \
339306
LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \
@@ -355,9 +322,8 @@ benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (us
355322
done; \
356323
echo "Selected task IDs: $$TASK_IDS"; \
357324
fi; \
358-
echo "Running adaptive terminal-bench (max concurrency: $$TB_MAX_CONCURRENT, load threshold: $$TB_LOAD_THRESHOLD)"; \
325+
echo "Running adaptive terminal-bench (auto-scaling 1-16, load threshold: $$TB_LOAD_THRESHOLD)"; \
359326
python3 benchmarks/terminal_bench/adaptive_bench.py \
360-
--max-concurrent $$TB_MAX_CONCURRENT \
361327
--load-threshold $$TB_LOAD_THRESHOLD \
362328
--check-interval $$TB_CHECK_INTERVAL \
363329
-- \

benchmarks/terminal_bench/README.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,18 @@ This directory contains the cmux agent adapter for [Terminal-Bench](https://gith
44

55
## Quick Start
66

7+
Terminal-bench now runs with **adaptive concurrency by default**, automatically scaling between 1 and 16 concurrent tasks based on system load.
8+
79
```bash
8-
# Run full benchmark suite (80 tasks, ~2.5 hours)
10+
# Run full benchmark suite (80 tasks, ~2.5 hours) with adaptive concurrency
911
make benchmark-terminal
1012

1113
# Run with sample of 5 tasks
1214
TB_SAMPLE_SIZE=5 make benchmark-terminal
1315

16+
# Adjust load threshold (default: 1.0)
17+
TB_LOAD_THRESHOLD=2.0 make benchmark-terminal
18+
1419
# Run specific tasks
1520
make benchmark-terminal TB_ARGS="--task-id hello-world --task-id chess-best-move"
1621

@@ -24,7 +29,8 @@ make benchmark-terminal TB_ARGS="--agent-kwarg model_name=anthropic:claude-opus-
2429

2530
- `TB_DATASET`: Dataset to use (default: `terminal-bench-core==0.1.1`)
2631
- `TB_SAMPLE_SIZE`: Number of random tasks to run (default: all 80 tasks)
27-
- `TB_CONCURRENCY`: Number of concurrent tasks (default: 4)
32+
- `TB_LOAD_THRESHOLD`: Load average threshold for concurrency adjustments (default: 1.0)
33+
- `TB_CHECK_INTERVAL`: Seconds between bursts (default: 60)
2834
- `TB_LIVESTREAM`: Enable livestream mode (set to `1` to enable)
2935
- `TB_TIMEOUT`: Global timeout in seconds (default: 1800 = 30 minutes)
3036
- `TB_ARGS`: Additional arguments passed to terminal-bench

benchmarks/terminal_bench/adaptive_bench.py

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,28 @@
1717

1818

1919
class AdaptiveBench:
20+
"""
21+
Adaptive concurrency wrapper for terminal-bench.
22+
23+
Concurrency is automatically bounded to [1, 16] for optimal performance
24+
across different hardware configurations.
25+
"""
26+
27+
MIN_CONCURRENT = 1
28+
MAX_CONCURRENT = 16
29+
2030
def __init__(
2131
self,
2232
load_threshold: float,
2333
check_interval: int,
24-
max_concurrent: int,
2534
runs_dir: Path,
2635
tb_args: list[str],
2736
):
2837
self.load_threshold = load_threshold
2938
self.check_interval = check_interval
30-
self.max_concurrent = max_concurrent
3139
self.runs_dir = runs_dir
3240
self.tb_args = tb_args
33-
self.current_concurrent = 1
41+
self.current_concurrent = self.MIN_CONCURRENT
3442
self.run_id: Optional[str] = None
3543
self.burst_count = 0
3644

@@ -78,12 +86,12 @@ def adjust_concurrency(self) -> bool:
7886
load = self.get_load_avg()
7987
old_concurrent = self.current_concurrent
8088

81-
if load < self.load_threshold and self.current_concurrent < self.max_concurrent:
89+
if load < self.load_threshold and self.current_concurrent < self.MAX_CONCURRENT:
8290
self.current_concurrent = min(
83-
self.current_concurrent * 2, self.max_concurrent
91+
self.current_concurrent * 2, self.MAX_CONCURRENT
8492
)
85-
elif load > self.load_threshold and self.current_concurrent > 1:
86-
self.current_concurrent = max(self.current_concurrent // 2, 1)
93+
elif load > self.load_threshold and self.current_concurrent > self.MIN_CONCURRENT:
94+
self.current_concurrent = max(self.current_concurrent // 2, self.MIN_CONCURRENT)
8795

8896
if self.current_concurrent != old_concurrent:
8997
print(
@@ -221,7 +229,7 @@ def run(self):
221229

222230
def main():
223231
parser = argparse.ArgumentParser(
224-
description="Run terminal-bench with adaptive concurrency via burst-and-resume"
232+
description="Run terminal-bench with adaptive concurrency (auto-scales 1-16 based on load)"
225233
)
226234
parser.add_argument(
227235
"--load-threshold",
@@ -235,12 +243,6 @@ def main():
235243
default=60,
236244
help="Seconds between bursts (default: 60)",
237245
)
238-
parser.add_argument(
239-
"--max-concurrent",
240-
type=int,
241-
required=True,
242-
help="Maximum concurrency limit",
243-
)
244246
parser.add_argument(
245247
"--runs-dir",
246248
type=Path,
@@ -263,7 +265,6 @@ def main():
263265
bench = AdaptiveBench(
264266
load_threshold=args.load_threshold,
265267
check_interval=args.check_interval,
266-
max_concurrent=args.max_concurrent,
267268
runs_dir=args.runs_dir,
268269
tb_args=tb_args,
269270
)

0 commit comments

Comments
 (0)