coder
diff --git a/‎Makefile‎
Lines changed: 40 additions & 1 deletion b/‎Makefile‎
Lines changed: 40 additions & 1 deletion
diff --git a/‎benchmarks/terminal_bench/README.md‎
Lines changed: 61 additions & 0 deletions b/‎benchmarks/terminal_bench/README.md‎
Lines changed: 61 additions & 0 deletions
@@ -39,7 +39,7 @@ include fmt.mk
 .PHONY: dist dist-mac dist-win dist-linux
 .PHONY: docs docs-build docs-watch
 .PHONY: storybook storybook-build test-storybook chromatic
-.PHONY: benchmark-terminal
+.PHONY: benchmark-terminal benchmark-terminal-adaptive
 .PHONY: ensure-deps
 .PHONY: check-eager-imports check-bundle-size check-startup
 
@@ -329,6 +329,45 @@ benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB
 		$$TASK_ID_FLAGS \
 		$${TB_ARGS}
 
+.PHONY: benchmark-terminal-adaptive
+benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (use TB_MAX_CONCURRENT/TB_LOAD_THRESHOLD/TB_CHECK_INTERVAL)
+	@TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \
+	TB_TIMEOUT=$${TB_TIMEOUT:-1800}; \
+	TB_MAX_CONCURRENT=$${TB_MAX_CONCURRENT:-16}; \
+	TB_LOAD_THRESHOLD=$${TB_LOAD_THRESHOLD:-1.0}; \
+	TB_CHECK_INTERVAL=$${TB_CHECK_INTERVAL:-60}; \
+	LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \
+	TASK_ID_FLAGS=""; \
+	if [ -n "$$TB_SAMPLE_SIZE" ]; then \
+		echo "Ensuring dataset $$TB_DATASET is downloaded..."; \
+		uvx terminal-bench datasets download --dataset "$$TB_DATASET" 2>&1 | grep -v "already exists" || true; \
+		echo "Sampling $$TB_SAMPLE_SIZE tasks from $$TB_DATASET..."; \
+		TASK_IDS=$$(python3 benchmarks/terminal_bench/sample_tasks.py --dataset "$$TB_DATASET" --sample-size "$$TB_SAMPLE_SIZE" --format space) || { \
+			echo "Error: Failed to sample tasks" >&2; \
+			exit 1; \
+		}; \
+		if [ -z "$$TASK_IDS" ]; then \
+			echo "Error: Sampling returned no task IDs" >&2; \
+			exit 1; \
+		fi; \
+		for task_id in $$TASK_IDS; do \
+			TASK_ID_FLAGS="$$TASK_ID_FLAGS --task-id $$task_id"; \
+		done; \
+		echo "Selected task IDs: $$TASK_IDS"; \
+	fi; \
+	echo "Running adaptive terminal-bench (max concurrency: $$TB_MAX_CONCURRENT, load threshold: $$TB_LOAD_THRESHOLD)"; \
+	python3 benchmarks/terminal_bench/adaptive_bench.py \
+		--max-concurrent $$TB_MAX_CONCURRENT \
+		--load-threshold $$TB_LOAD_THRESHOLD \
+		--check-interval $$TB_CHECK_INTERVAL \
+		-- \
+		--dataset "$$TB_DATASET" \
+		--agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \
+		--global-agent-timeout-sec $$TB_TIMEOUT \
+		$$LIVESTREAM_FLAG \
+		$$TASK_ID_FLAGS \
+		$${TB_ARGS}
+
 ## Clean
 clean: ## Clean build artifacts
 	@echo "Cleaning build artifacts..."
 
@@ -99,10 +99,71 @@ Based on analysis of the Oct 30 nightly run (15-minute timeout):
 
 **Impact of 30-minute timeout**: Expected to reduce false timeout failures by ~50% and improve pass rates by 10-15 percentage points (from ~42% to ~52-57%).
 
+## Adaptive Concurrency Mode
+
+The `benchmark-terminal-adaptive` target automatically adjusts concurrency based on system load using a **burst-and-resume pattern**:
+
+```bash
+# Start with concurrency=1, scale up to max 16 based on load
+TB_MAX_CONCURRENT=16 make benchmark-terminal-adaptive
+
+# More conservative: max 8, higher load threshold
+TB_MAX_CONCURRENT=8 TB_LOAD_THRESHOLD=2.0 make benchmark-terminal-adaptive
+
+# Faster adjustments: check every 30 seconds
+TB_CHECK_INTERVAL=30 TB_MAX_CONCURRENT=16 make benchmark-terminal-adaptive
+
+# Sample 5 tasks with adaptive concurrency
+TB_SAMPLE_SIZE=5 TB_MAX_CONCURRENT=8 make benchmark-terminal-adaptive
+```
+
+### How It Works
+
+1. **Runs terminal-bench in bursts** with current concurrency
+2. **Monitors system load** after each burst completes
+3. **Adjusts concurrency** using hysteresis:
+   - **Double** when 1-minute load avg < threshold
+   - **Halve** when 1-minute load avg > threshold
+4. **Resumes** the run with updated concurrency
+
+The burst-and-resume pattern leverages terminal-bench's native resume capability to skip completed tasks. Each burst runs to completion (no mid-task interruption), ensuring clean Docker container lifecycle.
+
+### Configuration
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `TB_MAX_CONCURRENT` | 16 | Maximum concurrency limit |
+| `TB_LOAD_THRESHOLD` | 1.0 | Load average threshold for adjusting concurrency |
+| `TB_CHECK_INTERVAL` | 60 | Seconds to wait between bursts |
+
+### When to Use Adaptive Mode
+
+**Use adaptive mode when:**
+- Running on shared hardware with variable load
+- Unsure of optimal concurrency for your system
+- Want to maximize throughput without overloading
+- Running long benchmark suites (full 80-task suite)
+
+**Use fixed concurrency when:**
+- Running on dedicated hardware
+- Know optimal concurrency for your setup
+- Running small task samples (< 10 tasks)
+- Burst overhead (2-5s) matters for very short tasks
+
+### Tradeoffs
+
+- ✅ Automatically finds optimal concurrency
+- ✅ Prevents system overload
+- ✅ Clean container lifecycle (no mid-task kills)
+- ⚠️ Burst overhead (~2-5s between bursts)
+- ⚠️ Adjustment latency = burst duration + check interval
+
 ## Files
 
 - `cmux_agent.py`: Main agent adapter implementing Terminal-Bench's agent interface
 - `cmux-run.sh`: Shell script that sets up environment and invokes cmux CLI
 - `cmux_payload.py`: Helper to package cmux app for containerized execution
 - `cmux_setup.sh.j2`: Jinja2 template for agent installation script
 - `sample_tasks.py`: Utility to randomly sample tasks from dataset
+- `adaptive_bench.py`: Adaptive concurrency wrapper using burst-and-resume pattern
+- `adaptive_bench_test.py`: Unit tests for adaptive_bench.py