coder · ammar-agent · Nov 8, 2025 · Nov 8, 2025 · Nov 8, 2025 · Nov 8, 2025
diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
@@ -22,10 +22,10 @@ on:
         type: string
         default: '4'
       livestream:
-        description: 'Enable livestream mode (verbose output to console)'
+        description: 'Enable livestream mode'
         required: false
         type: boolean
-        default: false
+        default: true
       sample_size:
         description: 'Number of random tasks to run (empty = all tasks)'
         required: false
@@ -46,15 +46,10 @@ on:
         required: false
         default: 'terminal-bench-core==0.1.1'
         type: string
-      concurrency:
-        description: 'Number of concurrent tasks (--n-concurrent)'
-        required: false
-        default: '4'
-        type: string
       livestream:
-        description: 'Enable livestream mode (verbose output to console)'
+        description: 'Enable livestream mode'
         required: false
-        default: false
+        default: true
         type: boolean
       sample_size:
         description: 'Number of random tasks to run (empty = all tasks)'
@@ -72,15 +67,24 @@ on:
         description: 'Additional arguments to pass to terminal-bench'
         required: false
         type: string
+      load_threshold:
+        description: 'Load average threshold for adaptive concurrency (default: 1.0)'
+        required: false
+        default: '1.0'
+        type: string
+      check_interval:
+        description: 'Seconds between bursts for adaptive concurrency (default: 60)'
+        required: false
+        default: '60'
+        type: string
 
 jobs:
   benchmark:
     name: Run Terminal-Bench${{ inputs.model_name && format(' ({0})', inputs.model_name) || '' }}
     runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
-    # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes typically
-    # Set 4-hour timeout to handle occasional API slowdowns while preventing infinite hangs
-    # If consistently hitting this timeout, investigate task-level issues
-    timeout-minutes: 240
+    # Full suite (~80 tasks) took ~60-90 minutes at a fixed concurrency of 4;
+    # adaptive concurrency may differ, so allow 3 hours as a safety margin
+    timeout-minutes: 180
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -101,11 +105,12 @@ jobs:
       - name: Build dist/ (skip icons - not needed for benchmark)
         run: make build-main build-preload
 
-      - name: Run Terminal-Bench
-        run: make benchmark-terminal 2>&1 | tee benchmark.log
+      - name: Run Terminal-Bench (adaptive concurrency 1-16)
+        run: make benchmark-terminal
         env:
           TB_DATASET: ${{ inputs.dataset }}
-          TB_CONCURRENCY: ${{ inputs.concurrency }}
+          TB_LOAD_THRESHOLD: ${{ inputs.load_threshold }}
+          TB_CHECK_INTERVAL: ${{ inputs.check_interval }}
           TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }}
           TB_SAMPLE_SIZE: ${{ inputs.sample_size }}
           TB_ARGS: ${{ inputs.model_name && format('--agent-kwarg model_name={0} --agent-kwarg thinking_level={1} {2}', inputs.model_name, inputs.thinking_level, inputs.extra_args) || inputs.extra_args }}
@@ -116,12 +121,18 @@ jobs:
         if: always()
         run: |
           echo "=== Terminal-Bench Results Summary ==="
-          if [ -f "$(find runs -name 'results.json' 2>/dev/null | head -1)" ]; then
+          if [ -f "$(find runs -name 'results.json' | head -1)" ]; then
             RESULTS_FILE=$(find runs -name 'results.json' | head -1)
-            cat "$RESULTS_FILE" | jq '{n_resolved, n_unresolved, accuracy}' 2>/dev/null || cat "$RESULTS_FILE"
+            echo "Results file: $RESULTS_FILE"
+            echo ""
+            echo "Full results.json:"
+            cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
+            echo ""
+            echo "Per-task summary:"
+            cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
           else
-            echo "❌ No results.json found"
-            ls -laR runs/ 2>/dev/null || echo "runs/ directory missing"
+            echo "No results.json found in runs/"
+            ls -la runs/
           fi
 
       - name: Set artifact name
@@ -144,7 +155,6 @@ jobs:
           name: ${{ steps.artifact-name.outputs.name }}
           path: |
             runs/
-            benchmark.log
           if-no-files-found: warn
           retention-days: 30
 
diff --git a/Makefile b/Makefile
@@ -39,7 +39,7 @@ include fmt.mk
 .PHONY: dist dist-mac dist-win dist-linux
 .PHONY: docs docs-build docs-watch
 .PHONY: storybook storybook-build test-storybook chromatic
-.PHONY: benchmark-terminal
+.PHONY: benchmark-terminal benchmark-terminal-adaptive
 .PHONY: ensure-deps
 .PHONY: check-eager-imports check-bundle-size check-startup
 
@@ -295,10 +295,14 @@ chromatic: node_modules/.installed ## Run Chromatic for visual regression testin
 	@bun x chromatic --exit-zero-on-changes
 
 ## Benchmarks
-benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB_SAMPLE_SIZE/TB_TIMEOUT/TB_ARGS to customize)
+benchmark-terminal: benchmark-terminal-adaptive ## Run Terminal-Bench with adaptive concurrency (alias)
+
+.PHONY: benchmark-terminal-adaptive
+benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (auto-scales 1-16, use TB_LOAD_THRESHOLD/TB_CHECK_INTERVAL)
 	@TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \
 	TB_TIMEOUT=$${TB_TIMEOUT:-1800}; \
-	CONCURRENCY_FLAG=$${TB_CONCURRENCY:+--n-concurrent $$TB_CONCURRENCY}; \
+	TB_LOAD_THRESHOLD=$${TB_LOAD_THRESHOLD:-1.0}; \
+	TB_CHECK_INTERVAL=$${TB_CHECK_INTERVAL:-60}; \
 	LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \
 	TASK_ID_FLAGS=""; \
 	if [ -n "$$TB_SAMPLE_SIZE" ]; then \
@@ -318,14 +322,14 @@ benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB
 		done; \
 		echo "Selected task IDs: $$TASK_IDS"; \
 	fi; \
-	echo "Using timeout: $$TB_TIMEOUT seconds"; \
-	echo "Running Terminal-Bench with dataset $$TB_DATASET"; \
-	export CMUX_TIMEOUT_MS=$$((TB_TIMEOUT * 1000)); \
-	uvx terminal-bench run \
+	echo "Running adaptive terminal-bench (auto-scaling 1-16, load threshold: $$TB_LOAD_THRESHOLD)"; \
+	python3 benchmarks/terminal_bench/adaptive_bench.py \
+		--load-threshold $$TB_LOAD_THRESHOLD \
+		--check-interval $$TB_CHECK_INTERVAL \
+		-- \
 		--dataset "$$TB_DATASET" \
 		--agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \
 		--global-agent-timeout-sec $$TB_TIMEOUT \
-		$$CONCURRENCY_FLAG \
 		$$LIVESTREAM_FLAG \
 		$$TASK_ID_FLAGS \
 		$${TB_ARGS}

diff --git a/benchmarks/terminal_bench/README.md b/benchmarks/terminal_bench/README.md
@@ -4,13 +4,18 @@ This directory contains the cmux agent adapter for [Terminal-Bench](https://gith
 
 ## Quick Start
 
+Terminal-Bench now runs with **adaptive concurrency by default**, automatically scaling between 1 and 16 concurrent tasks based on system load.
+
 ```bash
-# Run full benchmark suite (80 tasks, ~2.5 hours)
+# Run full benchmark suite (80 tasks, ~2.5 hours) with adaptive concurrency
 make benchmark-terminal
 
 # Run with sample of 5 tasks
 TB_SAMPLE_SIZE=5 make benchmark-terminal
 
+# Adjust load threshold (default: 1.0)
+TB_LOAD_THRESHOLD=2.0 make benchmark-terminal
+
 # Run specific tasks
 make benchmark-terminal TB_ARGS="--task-id hello-world --task-id chess-best-move"
 
@@ -24,7 +29,8 @@ make benchmark-terminal TB_ARGS="--agent-kwarg model_name=anthropic:claude-opus-
 
 - `TB_DATASET`: Dataset to use (default: `terminal-bench-core==0.1.1`)
 - `TB_SAMPLE_SIZE`: Number of random tasks to run (default: all 80 tasks)
-- `TB_CONCURRENCY`: Number of concurrent tasks (default: 4)
+- `TB_LOAD_THRESHOLD`: Load average threshold for concurrency adjustments (default: 1.0)
+- `TB_CHECK_INTERVAL`: Seconds between bursts (default: 60)
 - `TB_LIVESTREAM`: Enable livestream mode (set to `1` to enable)
 - `TB_TIMEOUT`: Global timeout in seconds (default: 1800 = 30 minutes)
 - `TB_ARGS`: Additional arguments passed to terminal-bench
@@ -99,10 +105,55 @@ Based on analysis of the Oct 30 nightly run (15-minute timeout):
 
 **Impact of 30-minute timeout**: Expected to reduce false timeout failures by ~50% and improve pass rates by 10-15 percentage points (from ~42% to ~52-57%).
 
+## Adaptive Concurrency
+
+Terminal-Bench uses **adaptive concurrency** that automatically scales between 1 and 16 concurrent tasks based on system load, using a **burst-and-resume pattern**:
+
+### How It Works
+
+1. **Starts with concurrency=1** and runs a burst
+2. **Monitors system load** (1-minute average) after each burst completes
+3. **Adjusts concurrency** by comparing the load against the threshold:
+   - **Double** when load < threshold (default: 1.0)
+   - **Halve** when load > threshold
+   - **Bounded to [1, 16]** for optimal performance
+4. **Resumes** the run with updated concurrency (skips completed tasks)
+
+The burst-and-resume pattern leverages terminal-bench's native resume capability. Each burst runs to completion with no mid-task interruption, ensuring clean Docker container lifecycle.
+
+### Configuration
+
+```bash
+# Adjust load threshold (default: 1.0)
+TB_LOAD_THRESHOLD=2.0 make benchmark-terminal
+
+# Faster adjustments (default: 60s between bursts)
+TB_CHECK_INTERVAL=30 make benchmark-terminal
+
+# Sample 5 tasks with adaptive concurrency
+TB_SAMPLE_SIZE=5 make benchmark-terminal
+```
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `TB_LOAD_THRESHOLD` | 1.0 | Load average threshold for adjusting concurrency |
+| `TB_CHECK_INTERVAL` | 60 | Seconds to wait between bursts |
+
+### Tradeoffs
+
+- ✅ Automatically finds optimal concurrency for hardware
+- ✅ Prevents system overload
+- ✅ Clean container lifecycle (no mid-task kills)
+- ✅ Bounded to [1, 16] for safety
+- ⚠️ Burst overhead (~2-5s, negligible for 6+ min avg tasks)
+- ⚠️ Adjustment latency = burst duration + check interval
+
 ## Files
 
 - `cmux_agent.py`: Main agent adapter implementing Terminal-Bench's agent interface
 - `cmux-run.sh`: Shell script that sets up environment and invokes cmux CLI
 - `cmux_payload.py`: Helper to package cmux app for containerized execution
 - `cmux_setup.sh.j2`: Jinja2 template for agent installation script
 - `sample_tasks.py`: Utility to randomly sample tasks from dataset
+- `adaptive_bench.py`: Adaptive concurrency wrapper using burst-and-resume pattern
+- `adaptive_bench_test.py`: Unit tests for adaptive_bench.py