From e28f0da4f14e4cbb7ac6208bf8625eb4d10c901f Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Sat, 8 Nov 2025 21:27:15 +0000
Subject: [PATCH 1/5] =?UTF-8?q?=F0=9F=A4=96=20feat:=20add=20hysteresis-bas?=
 =?UTF-8?q?ed=20adaptive=20concurrency=20for=20terminal-bench?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements adaptive concurrency control for terminal-bench using a
burst-and-resume pattern that automatically adjusts parallelism based on
system load average.

## Key Features

- **Hysteresis-based adjustment**: Double concurrency when load < threshold,
  halve when load > threshold
- **Burst-and-resume pattern**: Runs terminal-bench in bursts, using native
  resume capability to skip completed tasks between bursts
- **Clean container lifecycle**: No mid-task interruption, each burst
  completes naturally before adjusting
- **Configurable parameters**: Max concurrency, load threshold, check interval

## Implementation

- `benchmarks/terminal_bench/adaptive_bench.py`: Main wrapper implementing
  burst-and-resume logic with load monitoring
- `benchmarks/terminal_bench/adaptive_bench_test.py`: Unit tests for adaptive
  logic
- `Makefile`: New `benchmark-terminal-adaptive` target
- Documentation updates in `benchmarks/terminal_bench/README.md`

## Usage

```bash
# Start with concurrency=1, scale up to 16 based on load
TB_MAX_CONCURRENT=16 make benchmark-terminal-adaptive

# Conservative: max 8, higher load threshold
TB_MAX_CONCURRENT=8 TB_LOAD_THRESHOLD=2.0 make benchmark-terminal-adaptive

# Sample 5 tasks with adaptive concurrency
TB_SAMPLE_SIZE=5 TB_MAX_CONCURRENT=8 make benchmark-terminal-adaptive
```

## How It Works

1. Start with concurrency=1
2. Run terminal-bench burst with current concurrency
3. After burst completes, check 1-minute load average
4. Adjust concurrency: double if load < threshold, halve if load > threshold
5. Update tb.lock with new concurrency
6. Resume run (skips completed tasks automatically)
7. Repeat until all tasks complete

## Tradeoffs

- ✅ Automatically finds optimal concurrency for hardware
- ✅ Prevents system overload
- ✅ Uses terminal-bench native features (resume, tb.lock)
- ⚠️ Burst overhead ~2-5s (acceptable for 6+ minute avg task duration)
- ⚠️ Modifies tb.lock (semi-internal format, but stable)

## Design Rationale

Research showed terminal-bench uses fixed-size ThreadPoolExecutor that cannot
be resized mid-run. Kill-and-restart approach would interrupt Docker
containers mid-task. Burst-and-resume leverages terminal-bench's built-in
resume capability for clean checkpointing and task skipping.

_Generated with `cmux`_
---
 Makefile                                      |  41 ++-
 benchmarks/terminal_bench/README.md           |  61 ++++
 benchmarks/terminal_bench/adaptive_bench.py   | 275 ++++++++++++++++++
 .../terminal_bench/adaptive_bench_test.py     | 273 +++++++++++++++++
 4 files changed, 649 insertions(+), 1 deletion(-)
 create mode 100755 benchmarks/terminal_bench/adaptive_bench.py
 create mode 100644 benchmarks/terminal_bench/adaptive_bench_test.py

diff --git a/Makefile b/Makefile
index a27559132..9a860e633 100644
--- a/Makefile
+++ b/Makefile
@@ -39,7 +39,7 @@ include fmt.mk
 .PHONY: dist dist-mac dist-win dist-linux
 .PHONY: docs docs-build docs-watch
 .PHONY: storybook storybook-build test-storybook chromatic
-.PHONY: benchmark-terminal
+.PHONY: benchmark-terminal benchmark-terminal-adaptive
 .PHONY: ensure-deps
 .PHONY: check-eager-imports check-bundle-size check-startup
 
@@ -330,6 +330,45 @@ benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB
 		$$TASK_ID_FLAGS \
 		$${TB_ARGS}
 
+.PHONY: benchmark-terminal-adaptive
+benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (use TB_MAX_CONCURRENT/TB_LOAD_THRESHOLD/TB_CHECK_INTERVAL)
+	@TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \
+	TB_TIMEOUT=$${TB_TIMEOUT:-1800}; \
+	TB_MAX_CONCURRENT=$${TB_MAX_CONCURRENT:-16}; \
+	TB_LOAD_THRESHOLD=$${TB_LOAD_THRESHOLD:-1.0}; \
+	TB_CHECK_INTERVAL=$${TB_CHECK_INTERVAL:-60}; \
+	LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \
+	TASK_ID_FLAGS=""; \
+	if [ -n "$$TB_SAMPLE_SIZE" ]; then \
+		echo "Ensuring dataset $$TB_DATASET is downloaded..."; \
+		uvx terminal-bench datasets download --dataset "$$TB_DATASET" 2>&1 | grep -v "already exists" || true; \
+		echo "Sampling $$TB_SAMPLE_SIZE tasks from $$TB_DATASET..."; \
+		TASK_IDS=$$(python3 benchmarks/terminal_bench/sample_tasks.py --dataset "$$TB_DATASET" --sample-size "$$TB_SAMPLE_SIZE" --format space) || { \
+			echo "Error: Failed to sample tasks" >&2; \
+			exit 1; \
+		}; \
+		if [ -z "$$TASK_IDS" ]; then \
+			echo "Error: Sampling returned no task IDs" >&2; \
+			exit 1; \
+		fi; \
+		for task_id in $$TASK_IDS; do \
+			TASK_ID_FLAGS="$$TASK_ID_FLAGS --task-id $$task_id"; \
+		done; \
+		echo "Selected task IDs: $$TASK_IDS"; \
+	fi; \
+	echo "Running adaptive terminal-bench (max concurrency: $$TB_MAX_CONCURRENT, load threshold: $$TB_LOAD_THRESHOLD)"; \
+	python3 benchmarks/terminal_bench/adaptive_bench.py \
+		--max-concurrent $$TB_MAX_CONCURRENT \
+		--load-threshold $$TB_LOAD_THRESHOLD \
+		--check-interval $$TB_CHECK_INTERVAL \
+		-- \
+		--dataset "$$TB_DATASET" \
+		--agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \
+		--global-agent-timeout-sec $$TB_TIMEOUT \
+		$$LIVESTREAM_FLAG \
+		$$TASK_ID_FLAGS \
+		$${TB_ARGS}
+
 ## Clean
 clean: ## Clean build artifacts
 	@echo "Cleaning build artifacts..."
diff --git a/benchmarks/terminal_bench/README.md b/benchmarks/terminal_bench/README.md
index c106c8804..4f723db1a 100644
--- a/benchmarks/terminal_bench/README.md
+++ b/benchmarks/terminal_bench/README.md
@@ -99,6 +99,65 @@ Based on analysis of the Oct 30 nightly run (15-minute timeout):
 
 **Impact of 30-minute timeout**: Expected to reduce false timeout failures by ~50% and improve pass rates by 10-15 percentage points (from ~42% to ~52-57%).
 
+## Adaptive Concurrency Mode
+
+The `benchmark-terminal-adaptive` target automatically adjusts concurrency based on system load using a **burst-and-resume pattern**:
+
+```bash
+# Start with concurrency=1, scale up to max 16 based on load
+TB_MAX_CONCURRENT=16 make benchmark-terminal-adaptive
+
+# More conservative: max 8, higher load threshold
+TB_MAX_CONCURRENT=8 TB_LOAD_THRESHOLD=2.0 make benchmark-terminal-adaptive
+
+# Faster adjustments: check every 30 seconds
+TB_CHECK_INTERVAL=30 TB_MAX_CONCURRENT=16 make benchmark-terminal-adaptive
+
+# Sample 5 tasks with adaptive concurrency
+TB_SAMPLE_SIZE=5 TB_MAX_CONCURRENT=8 make benchmark-terminal-adaptive
+```
+
+### How It Works
+
+1. **Runs terminal-bench in bursts** with current concurrency
+2. **Monitors system load** after each burst completes
+3. **Adjusts concurrency** using hysteresis:
+   - **Double** when 1-minute load avg < threshold
+   - **Halve** when 1-minute load avg > threshold
+4. **Resumes** the run with updated concurrency
+
+The burst-and-resume pattern leverages terminal-bench's native resume capability to skip completed tasks. Each burst runs to completion (no mid-task interruption), ensuring clean Docker container lifecycle.
+
+### Configuration
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `TB_MAX_CONCURRENT` | 16 | Maximum concurrency limit |
+| `TB_LOAD_THRESHOLD` | 1.0 | Load average threshold for adjusting concurrency |
+| `TB_CHECK_INTERVAL` | 60 | Seconds to wait between bursts |
+
+### When to Use Adaptive Mode
+
+**Use adaptive mode when:**
+- Running on shared hardware with variable load
+- Unsure of optimal concurrency for your system
+- Want to maximize throughput without overloading
+- Running long benchmark suites (full 80-task suite)
+
+**Use fixed concurrency when:**
+- Running on dedicated hardware
+- Know optimal concurrency for your setup
+- Running small task samples (< 10 tasks)
+- Burst overhead (2-5s) matters for very short tasks
+
+### Tradeoffs
+
+- ✅ Automatically finds optimal concurrency
+- ✅ Prevents system overload
+- ✅ Clean container lifecycle (no mid-task kills)
+- ⚠️ Burst overhead (~2-5s between bursts)
+- ⚠️ Adjustment latency = burst duration + check interval
+
 ## Files
 
 - `cmux_agent.py`: Main agent adapter implementing Terminal-Bench's agent interface
@@ -106,3 +165,5 @@ Based on analysis of the Oct 30 nightly run (15-minute timeout):
 - `cmux_payload.py`: Helper to package cmux app for containerized execution
 - `cmux_setup.sh.j2`: Jinja2 template for agent installation script
 - `sample_tasks.py`: Utility to randomly sample tasks from dataset
+- `adaptive_bench.py`: Adaptive concurrency wrapper using burst-and-resume pattern
+- `adaptive_bench_test.py`: Unit tests for adaptive_bench.py
diff --git a/benchmarks/terminal_bench/adaptive_bench.py b/benchmarks/terminal_bench/adaptive_bench.py
new file mode 100755
index 000000000..7d74d6acd
--- /dev/null
+++ b/benchmarks/terminal_bench/adaptive_bench.py
@@ -0,0 +1,275 @@
+#!/usr/bin/env python3
+"""
+Adaptive concurrency wrapper for terminal-bench using burst-and-resume pattern.
+
+Runs terminal-bench in bursts with adjustable concurrency, using tb's native
+resume capability to skip completed tasks between bursts.
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+import time
+from pathlib import Path
+from typing import Optional
+
+
+class AdaptiveBench:
+    def __init__(
+        self,
+        load_threshold: float,
+        check_interval: int,
+        max_concurrent: int,
+        runs_dir: Path,
+        tb_args: list[str],
+    ):
+        self.load_threshold = load_threshold
+        self.check_interval = check_interval
+        self.max_concurrent = max_concurrent
+        self.runs_dir = runs_dir
+        self.tb_args = tb_args
+        self.current_concurrent = 1
+        self.run_id: Optional[str] = None
+        self.burst_count = 0
+
+    def get_load_avg(self) -> float:
+        """Get 1-minute load average."""
+        return os.getloadavg()[0]
+
+    def get_run_status(self) -> dict:
+        """Get status of current run by parsing results.json and tb.lock."""
+        if not self.run_id:
+            return {"total": 0, "completed": 0, "incomplete": 0}
+
+        try:
+            # Parse tb.lock to get task count
+            lock_path = self.runs_dir / self.run_id / "tb.lock"
+            if lock_path.exists():
+                with open(lock_path) as f:
+                    lock_data = json.load(f)
+                    total_tasks = len(lock_data.get("dataset", {}).get("task_ids", []))
+            else:
+                total_tasks = 0
+
+            # Count completed tasks from results.json
+            results_path = self.runs_dir / self.run_id / "results.json"
+            completed = 0
+            if results_path.exists():
+                with open(results_path) as f:
+                    results_data = json.load(f)
+                    # Count unique task_ids in results
+                    completed = len(
+                        set(r["task_id"] for r in results_data.get("trials", []))
+                    )
+
+            return {
+                "total": total_tasks,
+                "completed": completed,
+                "incomplete": max(0, total_tasks - completed),
+            }
+        except Exception as e:
+            print(f"⚠️  Error getting run status: {e}")
+            return {"total": 0, "completed": 0, "incomplete": 0}
+
+    def adjust_concurrency(self) -> bool:
+        """Check load and adjust concurrency. Returns True if changed."""
+        load = self.get_load_avg()
+        old_concurrent = self.current_concurrent
+
+        if load < self.load_threshold and self.current_concurrent < self.max_concurrent:
+            self.current_concurrent = min(
+                self.current_concurrent * 2, self.max_concurrent
+            )
+        elif load > self.load_threshold and self.current_concurrent > 1:
+            self.current_concurrent = max(self.current_concurrent // 2, 1)
+
+        if self.current_concurrent != old_concurrent:
+            print(
+                f"📊 Load: {load:.2f} (threshold: {self.load_threshold}) → "
+                f"Concurrency: {old_concurrent} → {self.current_concurrent}"
+            )
+            return True
+
+        print(f"📊 Load: {load:.2f} (threshold: {self.load_threshold}) → No change")
+        return False
+
+    def run_burst(self) -> int:
+        """Run a single burst of terminal-bench. Returns exit code."""
+        self.burst_count += 1
+
+        if self.burst_count == 1:
+            # First burst - create new run
+            cmd = [
+                "uvx",
+                "terminal-bench",
+                "run",
+                "--n-concurrent",
+                str(self.current_concurrent),
+                "--output-path",
+                str(self.runs_dir),
+                *self.tb_args,
+            ]
+            print(
+                f"🚀 Burst #{self.burst_count}: Starting NEW run with "
+                f"concurrency={self.current_concurrent}"
+            )
+        else:
+            # Subsequent bursts - resume existing run
+            cmd = [
+                "uvx",
+                "terminal-bench",
+                "runs",
+                "resume",
+                "--run-id",
+                self.run_id,
+                "--runs-dir",
+                str(self.runs_dir),
+            ]
+            print(
+                f"🔄 Burst #{self.burst_count}: Resuming run {self.run_id} "
+                f"with concurrency={self.current_concurrent}"
+            )
+
+        print(f"   Command: {' '.join(cmd)}")
+        burst_start = time.time()
+
+        # Run terminal-bench
+        result = subprocess.run(cmd, env=os.environ.copy())
+
+        burst_duration = time.time() - burst_start
+
+        # Capture run_id from first burst
+        if self.burst_count == 1 and result.returncode == 0:
+            # Find most recent run directory
+            if self.runs_dir.exists():
+                run_dirs = [
+                    d
+                    for d in self.runs_dir.iterdir()
+                    if d.is_dir() and (d / "tb.lock").exists()
+                ]
+                if run_dirs:
+                    # Sort by modification time and take most recent
+                    self.run_id = sorted(run_dirs, key=lambda p: p.stat().st_mtime)[
+                        -1
+                    ].name
+                    print(f"📝 Captured run_id: {self.run_id}")
+
+        print(f"⏱️  Burst #{self.burst_count} completed in {burst_duration:.1f}s")
+
+        # Update n_concurrent in tb.lock for next resume
+        if self.run_id and result.returncode == 0:
+            self._update_lock_concurrency()
+
+        return result.returncode
+
+    def _update_lock_concurrency(self):
+        """Update n_concurrent_trials in tb.lock for next resume."""
+        lock_path = self.runs_dir / self.run_id / "tb.lock"
+        if not lock_path.exists():
+            return
+
+        try:
+            with open(lock_path, "r") as f:
+                lock_data = json.load(f)
+
+            # Update concurrency in lock file
+            if "run_config" in lock_data:
+                lock_data["run_config"][
+                    "n_concurrent_trials"
+                ] = self.current_concurrent
+
+            with open(lock_path, "w") as f:
+                json.dump(lock_data, f, indent=2)
+
+            print(f"   Updated tb.lock with concurrency={self.current_concurrent}")
+        except Exception as e:
+            print(f"⚠️  Could not update tb.lock: {e}")
+
+    def run(self):
+        """Main loop: run bursts with adaptive concurrency."""
+        try:
+            while True:
+                # Run burst with current concurrency
+                exit_code = self.run_burst()
+
+                if exit_code != 0:
+                    print(f"❌ Terminal-bench exited with code {exit_code}")
+                    return exit_code
+
+                # Check if we're done
+                status = self.get_run_status()
+                print(
+                    f"📈 Progress: {status['completed']}/{status['total']} tasks "
+                    f"({status['incomplete']} remaining)"
+                )
+
+                if status["incomplete"] == 0:
+                    print("✅ All tasks completed!")
+                    return 0
+
+                # Wait before next burst and potentially adjust concurrency
+                print(f"⏸️  Waiting {self.check_interval}s before next burst...")
+                time.sleep(self.check_interval)
+                self.adjust_concurrency()
+
+        except KeyboardInterrupt:
+            print("\n⚠️  Received interrupt, stopping...")
+            return 130
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Run terminal-bench with adaptive concurrency via burst-and-resume"
+    )
+    parser.add_argument(
+        "--load-threshold",
+        type=float,
+        default=1.0,
+        help="Load average threshold for adjusting concurrency (default: 1.0)",
+    )
+    parser.add_argument(
+        "--check-interval",
+        type=int,
+        default=60,
+        help="Seconds between bursts (default: 60)",
+    )
+    parser.add_argument(
+        "--max-concurrent",
+        type=int,
+        required=True,
+        help="Maximum concurrency limit",
+    )
+    parser.add_argument(
+        "--runs-dir",
+        type=Path,
+        default=Path("runs"),
+        help="Directory for run outputs (default: runs)",
+    )
+    parser.add_argument(
+        "tb_args",
+        nargs=argparse.REMAINDER,
+        help="Arguments to pass to terminal-bench run",
+    )
+
+    args = parser.parse_args()
+
+    # Strip leading '--' from tb_args if present
+    tb_args = args.tb_args
+    if tb_args and tb_args[0] == "--":
+        tb_args = tb_args[1:]
+
+    bench = AdaptiveBench(
+        load_threshold=args.load_threshold,
+        check_interval=args.check_interval,
+        max_concurrent=args.max_concurrent,
+        runs_dir=args.runs_dir,
+        tb_args=tb_args,
+    )
+
+    sys.exit(bench.run())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/terminal_bench/adaptive_bench_test.py b/benchmarks/terminal_bench/adaptive_bench_test.py
new file mode 100644
index 000000000..5f5e5002f
--- /dev/null
+++ b/benchmarks/terminal_bench/adaptive_bench_test.py
@@ -0,0 +1,273 @@
+"""Tests for adaptive_bench.py"""
+
+import json
+import os
+from pathlib import Path
+from unittest.mock import MagicMock, mock_open, patch
+
+import pytest
+
+from adaptive_bench import AdaptiveBench
+
+
+class TestAdaptiveBench:
+    """Test suite for AdaptiveBench."""
+
+    def test_init(self):
+        """Test AdaptiveBench initialization."""
+        bench = AdaptiveBench(
+            load_threshold=2.0,
+            check_interval=30,
+            max_concurrent=8,
+            runs_dir=Path("test_runs"),
+            tb_args=["--dataset", "test"],
+        )
+
+        assert bench.load_threshold == 2.0
+        assert bench.check_interval == 30
+        assert bench.max_concurrent == 8
+        assert bench.runs_dir == Path("test_runs")
+        assert bench.tb_args == ["--dataset", "test"]
+        assert bench.current_concurrent == 1
+        assert bench.run_id is None
+        assert bench.burst_count == 0
+
+    @patch("adaptive_bench.os.getloadavg")
+    def test_get_load_avg(self, mock_getloadavg):
+        """Test getting load average."""
+        mock_getloadavg.return_value = (2.5, 2.0, 1.5)
+        bench = AdaptiveBench(
+            load_threshold=1.0,
+            check_interval=60,
+            max_concurrent=16,
+            runs_dir=Path("runs"),
+            tb_args=[],
+        )
+
+        load = bench.get_load_avg()
+        assert load == 2.5
+        mock_getloadavg.assert_called_once()
+
+    @patch("adaptive_bench.os.getloadavg")
+    def test_adjust_concurrency_increase(self, mock_getloadavg):
+        """Test concurrency increases when load is low."""
+        mock_getloadavg.return_value = (0.5, 0.5, 0.5)
+        bench = AdaptiveBench(
+            load_threshold=1.0,
+            check_interval=60,
+            max_concurrent=16,
+            runs_dir=Path("runs"),
+            tb_args=[],
+        )
+
+        bench.current_concurrent = 2
+        changed = bench.adjust_concurrency()
+
+        assert changed is True
+        assert bench.current_concurrent == 4  # Doubled
+
+    @patch("adaptive_bench.os.getloadavg")
+    def test_adjust_concurrency_decrease(self, mock_getloadavg):
+        """Test concurrency decreases when load is high."""
+        mock_getloadavg.return_value = (2.0, 2.0, 2.0)
+        bench = AdaptiveBench(
+            load_threshold=1.0,
+            check_interval=60,
+            max_concurrent=16,
+            runs_dir=Path("runs"),
+            tb_args=[],
+        )
+
+        bench.current_concurrent = 8
+        changed = bench.adjust_concurrency()
+
+        assert changed is True
+        assert bench.current_concurrent == 4  # Halved
+
+    @patch("adaptive_bench.os.getloadavg")
+    def test_adjust_concurrency_no_change(self, mock_getloadavg):
+        """Test concurrency stays same when load is at threshold."""
+        mock_getloadavg.return_value = (1.0, 1.0, 1.0)
+        bench = AdaptiveBench(
+            load_threshold=1.0,
+            check_interval=60,
+            max_concurrent=16,
+            runs_dir=Path("runs"),
+            tb_args=[],
+        )
+
+        bench.current_concurrent = 4
+        changed = bench.adjust_concurrency()
+
+        assert changed is False
+        assert bench.current_concurrent == 4
+
+    @patch("adaptive_bench.os.getloadavg")
+    def test_adjust_concurrency_respects_max(self, mock_getloadavg):
+        """Test concurrency doesn't exceed max_concurrent."""
+        mock_getloadavg.return_value = (0.1, 0.1, 0.1)
+        bench = AdaptiveBench(
+            load_threshold=1.0,
+            check_interval=60,
+            max_concurrent=8,
+            runs_dir=Path("runs"),
+            tb_args=[],
+        )
+
+        bench.current_concurrent = 8
+        changed = bench.adjust_concurrency()
+
+        assert changed is False
+        assert bench.current_concurrent == 8  # Stays at max
+
+    @patch("adaptive_bench.os.getloadavg")
+    def test_adjust_concurrency_respects_min(self, mock_getloadavg):
+        """Test concurrency doesn't go below 1."""
+        mock_getloadavg.return_value = (5.0, 5.0, 5.0)
+        bench = AdaptiveBench(
+            load_threshold=1.0,
+            check_interval=60,
+            max_concurrent=16,
+            runs_dir=Path("runs"),
+            tb_args=[],
+        )
+
+        bench.current_concurrent = 1
+        changed = bench.adjust_concurrency()
+
+        assert changed is False
+        assert bench.current_concurrent == 1  # Stays at min
+
+    def test_get_run_status_no_run_id(self):
+        """Test get_run_status returns zeros when no run_id."""
+        bench = AdaptiveBench(
+            load_threshold=1.0,
+            check_interval=60,
+            max_concurrent=16,
+            runs_dir=Path("runs"),
+            tb_args=[],
+        )
+
+        status = bench.get_run_status()
+        assert status == {"total": 0, "completed": 0, "incomplete": 0}
+
+    @patch("builtins.open", new_callable=mock_open)
+    @patch("pathlib.Path.exists")
+    def test_get_run_status_with_results(self, mock_exists, mock_file):
+        """Test get_run_status parses results correctly."""
+        bench = AdaptiveBench(
+            load_threshold=1.0,
+            check_interval=60,
+            max_concurrent=16,
+            runs_dir=Path("runs"),
+            tb_args=[],
+        )
+        bench.run_id = "test-run"
+
+        # Mock tb.lock with 5 tasks
+        tb_lock_data = {
+            "dataset": {
+                "task_ids": ["task1", "task2", "task3", "task4", "task5"]
+            }
+        }
+
+        # Mock results.json with 3 completed tasks
+        results_data = {
+            "trials": [
+                {"task_id": "task1", "resolved": True},
+                {"task_id": "task2", "resolved": False},
+                {"task_id": "task3", "resolved": True},
+            ]
+        }
+
+        def exists_side_effect(path):
+            return True  # Both files exist
+
+        mock_exists.side_effect = exists_side_effect
+
+        def open_side_effect(path, *args, **kwargs):
+            if "tb.lock" in str(path):
+                return mock_open(read_data=json.dumps(tb_lock_data)).return_value
+            elif "results.json" in str(path):
+                return mock_open(read_data=json.dumps(results_data)).return_value
+            return mock_open().return_value
+
+        mock_file.side_effect = open_side_effect
+
+        status = bench.get_run_status()
+
+        assert status["total"] == 5
+        assert status["completed"] == 3
+        assert status["incomplete"] == 2
+
+    @patch("adaptive_bench.subprocess.run")
+    @patch("adaptive_bench.time.time")
+    def test_run_burst_first_burst(self, mock_time, mock_subprocess):
+        """Test first burst creates new run."""
+        mock_time.side_effect = [0, 10]  # Start and end time
+        mock_subprocess.return_value = MagicMock(returncode=0)
+
+        bench = AdaptiveBench(
+            load_threshold=1.0,
+            check_interval=60,
+            max_concurrent=16,
+            runs_dir=Path("runs"),
+            tb_args=["--dataset", "test"],
+        )
+
+        with patch("pathlib.Path.exists") as mock_exists:
+            mock_exists.return_value = False
+
+            exit_code = bench.run_burst()
+
+        assert exit_code == 0
+        assert bench.burst_count == 1
+
+        # Verify command
+        call_args = mock_subprocess.call_args
+        cmd = call_args[0][0]
+        assert cmd[0] == "uvx"
+        assert cmd[1] == "terminal-bench"
+        assert cmd[2] == "run"
+        assert "--n-concurrent" in cmd
+        assert "1" in cmd  # Initial concurrency
+        assert "--dataset" in cmd
+        assert "test" in cmd
+
+    @patch("builtins.open", new_callable=mock_open)
+    @patch("pathlib.Path.exists")
+    def test_update_lock_concurrency(self, mock_exists, mock_file):
+        """Test updating tb.lock with new concurrency."""
+        bench = AdaptiveBench(
+            load_threshold=1.0,
+            check_interval=60,
+            max_concurrent=16,
+            runs_dir=Path("runs"),
+            tb_args=[],
+        )
+        bench.run_id = "test-run"
+        bench.current_concurrent = 4
+
+        mock_exists.return_value = True
+
+        lock_data = {
+            "run_config": {"n_concurrent_trials": 1, "other_field": "value"}
+        }
+
+        # Setup mock to return lock_data on read
+        mock_file.return_value.read.return_value = json.dumps(lock_data)
+        mock_file.return_value.__enter__.return_value = mock_file.return_value
+
+        bench._update_lock_concurrency()
+
+        # Verify write was called with updated concurrency
+        write_calls = [
+            call
+            for call in mock_file.return_value.write.call_args_list
+            if call[0][0]  # Filter out empty writes
+        ]
+
+        if write_calls:
+            written_data = write_calls[0][0][0]
+            written_lock = json.loads(written_data)
+            assert written_lock["run_config"]["n_concurrent_trials"] == 4

From 0494e3da44a2f2704fa52e5e7ad60c01a185b459 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Sat, 8 Nov 2025 21:43:48 +0000
Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=A4=96=20ci:=20add=20adaptive=20mode?=
 =?UTF-8?q?=20support=20to=20terminal-bench=20workflow?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add workflow_dispatch inputs for adaptive concurrency mode:
- adaptive_mode: Enable adaptive concurrency (default: false)
- max_concurrent: Max concurrency for adaptive mode (default: 16)
- load_threshold: Load threshold for adjustments (default: 1.0)

When adaptive_mode=true, runs benchmark-terminal-adaptive instead of
benchmark-terminal.

_Generated with `cmux`_
---
 .github/workflows/terminal-bench.yml | 33 ++++++++++++++++++----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index 50cb87418..deaa2cdf1 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -16,11 +16,6 @@ on:
         required: false
         type: string
         default: 'terminal-bench-core==0.1.1'
-      concurrency:
-        description: 'Number of concurrent tasks (--n-concurrent)'
-        required: false
-        type: string
-        default: '4'
       livestream:
         description: 'Enable livestream mode (verbose output to console)'
         required: false
@@ -30,6 +25,16 @@ on:
         description: 'Number of random tasks to run (empty = all tasks)'
         required: false
         type: string
+      load_threshold:
+        description: 'Load threshold for adaptive concurrency (default: 1.0)'
+        required: false
+        type: string
+        default: '1.0'
+      check_interval:
+        description: 'Seconds between adaptive bursts (default: 60)'
+        required: false
+        type: string
+        default: '60'
       extra_args:
         description: 'Additional arguments to pass to terminal-bench'
         required: false
@@ -46,11 +51,6 @@ on:
         required: false
         default: 'terminal-bench-core==0.1.1'
         type: string
-      concurrency:
-        description: 'Number of concurrent tasks (--n-concurrent)'
-        required: false
-        default: '4'
-        type: string
       livestream:
         description: 'Enable livestream mode (verbose output to console)'
         required: false
@@ -68,6 +68,16 @@ on:
         description: 'Thinking level (off, low, medium, high)'
         required: false
         type: string
+      load_threshold:
+        description: 'Load threshold for adaptive concurrency (default: 1.0)'
+        required: false
+        default: '1.0'
+        type: string
+      check_interval:
+        description: 'Seconds between adaptive bursts (default: 60)'
+        required: false
+        default: '60'
+        type: string
       extra_args:
         description: 'Additional arguments to pass to terminal-bench'
         required: false
@@ -105,7 +115,8 @@ jobs:
         run: make benchmark-terminal 2>&1 | tee benchmark.log
         env:
           TB_DATASET: ${{ inputs.dataset }}
-          TB_CONCURRENCY: ${{ inputs.concurrency }}
+          TB_LOAD_THRESHOLD: ${{ inputs.load_threshold }}
+          TB_CHECK_INTERVAL: ${{ inputs.check_interval }}
           TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }}
           TB_SAMPLE_SIZE: ${{ inputs.sample_size }}
           TB_ARGS: ${{ inputs.model_name && format('--agent-kwarg model_name={0} --agent-kwarg thinking_level={1} {2}', inputs.model_name, inputs.thinking_level, inputs.extra_args) || inputs.extra_args }}

From acf3998e9b6d2edf2122a4aba500e834e0526e46 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Sat, 8 Nov 2025 21:46:29 +0000
Subject: [PATCH 3/5] =?UTF-8?q?=F0=9F=A4=96=20refactor:=20simplify=20adapt?=
 =?UTF-8?q?ive=20concurrency=20to=20hardcoded=201-16=20bounds?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make adaptive concurrency the default and only mode for terminal-bench:

- Hardcode MIN_CONCURRENT=1, MAX_CONCURRENT=16 in adaptive_bench.py
- Remove --max-concurrent CLI argument (no longer needed)
- Make `benchmark-terminal` an alias for `benchmark-terminal-adaptive`
- Simplify workflow inputs (remove adaptive_mode toggle, concurrency input)
- Update documentation to reflect simplified interface

This removes unnecessary configuration complexity while providing sensible
bounds for all hardware configurations. The 1-16 range covers:
- Single-core systems (min=1)
- High-core systems (max=16 is reasonable parallelism for Docker containers)
- Load-based adjustment within these bounds

_Generated with `cmux`_
---
 .github/workflows/terminal-bench.yml        | 61 ++++++++++-----------
 Makefile                                    | 41 +-------------
 benchmarks/terminal_bench/README.md         | 10 +++-
 benchmarks/terminal_bench/adaptive_bench.py | 31 ++++++-----
 4 files changed, 57 insertions(+), 86 deletions(-)

diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index deaa2cdf1..981cceedc 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -16,25 +16,20 @@ on:
         required: false
         type: string
         default: 'terminal-bench-core==0.1.1'
+      concurrency:
+        description: 'Number of concurrent tasks (--n-concurrent)'
+        required: false
+        type: string
+        default: '4'
       livestream:
-        description: 'Enable livestream mode (verbose output to console)'
+        description: 'Enable livestream mode'
         required: false
         type: boolean
-        default: false
+        default: true
       sample_size:
         description: 'Number of random tasks to run (empty = all tasks)'
         required: false
         type: string
-      load_threshold:
-        description: 'Load threshold for adaptive concurrency (default: 1.0)'
-        required: false
-        type: string
-        default: '1.0'
-      check_interval:
-        description: 'Seconds between adaptive bursts (default: 60)'
-        required: false
-        type: string
-        default: '60'
       extra_args:
         description: 'Additional arguments to pass to terminal-bench'
         required: false
@@ -52,9 +47,9 @@ on:
         default: 'terminal-bench-core==0.1.1'
         type: string
       livestream:
-        description: 'Enable livestream mode (verbose output to console)'
+        description: 'Enable livestream mode'
         required: false
-        default: false
+        default: true
         type: boolean
       sample_size:
         description: 'Number of random tasks to run (empty = all tasks)'
@@ -68,29 +63,28 @@ on:
         description: 'Thinking level (off, low, medium, high)'
         required: false
         type: string
+      extra_args:
+        description: 'Additional arguments to pass to terminal-bench'
+        required: false
+        type: string
       load_threshold:
-        description: 'Load threshold for adaptive concurrency (default: 1.0)'
+        description: 'Load average threshold for adaptive concurrency (default: 1.0)'
         required: false
         default: '1.0'
         type: string
       check_interval:
-        description: 'Seconds between adaptive bursts (default: 60)'
+        description: 'Seconds between bursts for adaptive concurrency (default: 60)'
         required: false
         default: '60'
         type: string
-      extra_args:
-        description: 'Additional arguments to pass to terminal-bench'
-        required: false
-        type: string
 
 jobs:
   benchmark:
     name: Run Terminal-Bench${{ inputs.model_name && format(' ({0})', inputs.model_name) || '' }}
     runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
-    # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes typically
-    # Set 4-hour timeout to handle occasional API slowdowns while preventing infinite hangs
-    # If consistently hitting this timeout, investigate task-level issues
-    timeout-minutes: 240
+    # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes
+    # Allow 3 hours for safety margin and slower tasks
+    timeout-minutes: 180
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -111,8 +105,8 @@ jobs:
       - name: Build dist/ (skip icons - not needed for benchmark)
         run: make build-main build-preload
 
-      - name: Run Terminal-Bench
-        run: make benchmark-terminal 2>&1 | tee benchmark.log
+      - name: Run Terminal-Bench (adaptive concurrency 1-16)
+        run: make benchmark-terminal
         env:
           TB_DATASET: ${{ inputs.dataset }}
           TB_LOAD_THRESHOLD: ${{ inputs.load_threshold }}
@@ -127,12 +121,18 @@ jobs:
         if: always()
         run: |
           echo "=== Terminal-Bench Results Summary ==="
-          if [ -f "$(find runs -name 'results.json' 2>/dev/null | head -1)" ]; then
+          if [ -f "$(find runs -name 'results.json' | head -1)" ]; then
             RESULTS_FILE=$(find runs -name 'results.json' | head -1)
-            cat "$RESULTS_FILE" | jq '{n_resolved, n_unresolved, accuracy}' 2>/dev/null || cat "$RESULTS_FILE"
+            echo "Results file: $RESULTS_FILE"
+            echo ""
+            echo "Full results.json:"
+            cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
+            echo ""
+            echo "Per-task summary:"
+            cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
           else
-            echo "❌ No results.json found"
-            ls -laR runs/ 2>/dev/null || echo "runs/ directory missing"
+            echo "No results.json found in runs/"
+            ls -la runs/
           fi
 
       - name: Set artifact name
@@ -155,7 +155,6 @@ jobs:
           name: ${{ steps.artifact-name.outputs.name }}
           path: |
             runs/
-            benchmark.log
           if-no-files-found: warn
           retention-days: 30
 
diff --git a/Makefile b/Makefile
index 9a860e633..52711aaee 100644
--- a/Makefile
+++ b/Makefile
@@ -295,46 +295,12 @@ chromatic: node_modules/.installed ## Run Chromatic for visual regression testin
 	@bun x chromatic --exit-zero-on-changes
 
 ## Benchmarks
-benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB_SAMPLE_SIZE/TB_TIMEOUT/TB_ARGS to customize)
-	@TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \
-	TB_TIMEOUT=$${TB_TIMEOUT:-1800}; \
-	CONCURRENCY_FLAG=$${TB_CONCURRENCY:+--n-concurrent $$TB_CONCURRENCY}; \
-	LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \
-	TASK_ID_FLAGS=""; \
-	if [ -n "$$TB_SAMPLE_SIZE" ]; then \
-		echo "Ensuring dataset $$TB_DATASET is downloaded..."; \
-		uvx terminal-bench datasets download --dataset "$$TB_DATASET" 2>&1 | grep -v "already exists" || true; \
-		echo "Sampling $$TB_SAMPLE_SIZE tasks from $$TB_DATASET..."; \
-		TASK_IDS=$$(python3 benchmarks/terminal_bench/sample_tasks.py --dataset "$$TB_DATASET" --sample-size "$$TB_SAMPLE_SIZE" --format space) || { \
-			echo "Error: Failed to sample tasks" >&2; \
-			exit 1; \
-		}; \
-		if [ -z "$$TASK_IDS" ]; then \
-			echo "Error: Sampling returned no task IDs" >&2; \
-			exit 1; \
-		fi; \
-		for task_id in $$TASK_IDS; do \
-			TASK_ID_FLAGS="$$TASK_ID_FLAGS --task-id $$task_id"; \
-		done; \
-		echo "Selected task IDs: $$TASK_IDS"; \
-	fi; \
-	echo "Using timeout: $$TB_TIMEOUT seconds"; \
-	echo "Running Terminal-Bench with dataset $$TB_DATASET"; \
-	export CMUX_TIMEOUT_MS=$$((TB_TIMEOUT * 1000)); \
-	uvx terminal-bench run \
-		--dataset "$$TB_DATASET" \
-		--agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \
-		--global-agent-timeout-sec $$TB_TIMEOUT \
-		$$CONCURRENCY_FLAG \
-		$$LIVESTREAM_FLAG \
-		$$TASK_ID_FLAGS \
-		$${TB_ARGS}
+benchmark-terminal: benchmark-terminal-adaptive ## Run Terminal-Bench with adaptive concurrency (alias)
 
 .PHONY: benchmark-terminal-adaptive
-benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (use TB_MAX_CONCURRENT/TB_LOAD_THRESHOLD/TB_CHECK_INTERVAL)
+benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (auto-scales 1-16, use TB_LOAD_THRESHOLD/TB_CHECK_INTERVAL)
 	@TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \
 	TB_TIMEOUT=$${TB_TIMEOUT:-1800}; \
-	TB_MAX_CONCURRENT=$${TB_MAX_CONCURRENT:-16}; \
 	TB_LOAD_THRESHOLD=$${TB_LOAD_THRESHOLD:-1.0}; \
 	TB_CHECK_INTERVAL=$${TB_CHECK_INTERVAL:-60}; \
 	LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \
@@ -356,9 +322,8 @@ benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (us
 		done; \
 		echo "Selected task IDs: $$TASK_IDS"; \
 	fi; \
-	echo "Running adaptive terminal-bench (max concurrency: $$TB_MAX_CONCURRENT, load threshold: $$TB_LOAD_THRESHOLD)"; \
+	echo "Running adaptive terminal-bench (auto-scaling 1-16, load threshold: $$TB_LOAD_THRESHOLD)"; \
 	python3 benchmarks/terminal_bench/adaptive_bench.py \
-		--max-concurrent $$TB_MAX_CONCURRENT \
 		--load-threshold $$TB_LOAD_THRESHOLD \
 		--check-interval $$TB_CHECK_INTERVAL \
 		-- \
diff --git a/benchmarks/terminal_bench/README.md b/benchmarks/terminal_bench/README.md
index 4f723db1a..a6fb50e0c 100644
--- a/benchmarks/terminal_bench/README.md
+++ b/benchmarks/terminal_bench/README.md
@@ -4,13 +4,18 @@ This directory contains the cmux agent adapter for [Terminal-Bench](https://gith
 
 ## Quick Start
 
+Terminal-bench now runs with **adaptive concurrency by default**, automatically scaling from 1-16 concurrent tasks based on system load.
+
 ```bash
-# Run full benchmark suite (80 tasks, ~2.5 hours)
+# Run full benchmark suite (80 tasks, ~2.5 hours) with adaptive concurrency
 make benchmark-terminal
 
 # Run with sample of 5 tasks
 TB_SAMPLE_SIZE=5 make benchmark-terminal
 
+# Adjust load threshold (default: 1.0)
+TB_LOAD_THRESHOLD=2.0 make benchmark-terminal
+
 # Run specific tasks
 make benchmark-terminal TB_ARGS="--task-id hello-world --task-id chess-best-move"
 
@@ -24,7 +29,8 @@ make benchmark-terminal TB_ARGS="--agent-kwarg model_name=anthropic:claude-opus-
 
 - `TB_DATASET`: Dataset to use (default: `terminal-bench-core==0.1.1`)
 - `TB_SAMPLE_SIZE`: Number of random tasks to run (default: all 80 tasks)
-- `TB_CONCURRENCY`: Number of concurrent tasks (default: 4)
+- `TB_LOAD_THRESHOLD`: Load average threshold for concurrency adjustments (default: 1.0)
+- `TB_CHECK_INTERVAL`: Seconds between bursts (default: 60)
 - `TB_LIVESTREAM`: Enable livestream mode (set to `1` to enable)
 - `TB_TIMEOUT`: Global timeout in seconds (default: 1800 = 30 minutes)
 - `TB_ARGS`: Additional arguments passed to terminal-bench
diff --git a/benchmarks/terminal_bench/adaptive_bench.py b/benchmarks/terminal_bench/adaptive_bench.py
index 7d74d6acd..f77b205ab 100755
--- a/benchmarks/terminal_bench/adaptive_bench.py
+++ b/benchmarks/terminal_bench/adaptive_bench.py
@@ -17,20 +17,28 @@
 
 
 class AdaptiveBench:
+    """
+    Adaptive concurrency wrapper for terminal-bench.
+    
+    Concurrency is automatically bounded to [1, 16] for optimal performance
+    across different hardware configurations.
+    """
+    
+    MIN_CONCURRENT = 1
+    MAX_CONCURRENT = 16
+    
     def __init__(
         self,
         load_threshold: float,
         check_interval: int,
-        max_concurrent: int,
         runs_dir: Path,
         tb_args: list[str],
     ):
         self.load_threshold = load_threshold
         self.check_interval = check_interval
-        self.max_concurrent = max_concurrent
         self.runs_dir = runs_dir
         self.tb_args = tb_args
-        self.current_concurrent = 1
+        self.current_concurrent = self.MIN_CONCURRENT
         self.run_id: Optional[str] = None
         self.burst_count = 0
 
@@ -78,12 +86,12 @@ def adjust_concurrency(self) -> bool:
         load = self.get_load_avg()
         old_concurrent = self.current_concurrent
 
-        if load < self.load_threshold and self.current_concurrent < self.max_concurrent:
+        if load < self.load_threshold and self.current_concurrent < self.MAX_CONCURRENT:
             self.current_concurrent = min(
-                self.current_concurrent * 2, self.max_concurrent
+                self.current_concurrent * 2, self.MAX_CONCURRENT
             )
-        elif load > self.load_threshold and self.current_concurrent > 1:
-            self.current_concurrent = max(self.current_concurrent // 2, 1)
+        elif load > self.load_threshold and self.current_concurrent > self.MIN_CONCURRENT:
+            self.current_concurrent = max(self.current_concurrent // 2, self.MIN_CONCURRENT)
 
         if self.current_concurrent != old_concurrent:
             print(
@@ -221,7 +229,7 @@ def run(self):
 
 def main():
     parser = argparse.ArgumentParser(
-        description="Run terminal-bench with adaptive concurrency via burst-and-resume"
+        description="Run terminal-bench with adaptive concurrency (auto-scales 1-16 based on load)"
     )
     parser.add_argument(
         "--load-threshold",
@@ -235,12 +243,6 @@ def main():
         default=60,
         help="Seconds between bursts (default: 60)",
     )
-    parser.add_argument(
-        "--max-concurrent",
-        type=int,
-        required=True,
-        help="Maximum concurrency limit",
-    )
     parser.add_argument(
         "--runs-dir",
         type=Path,
@@ -263,7 +265,6 @@ def main():
     bench = AdaptiveBench(
         load_threshold=args.load_threshold,
         check_interval=args.check_interval,
-        max_concurrent=args.max_concurrent,
         runs_dir=args.runs_dir,
         tb_args=tb_args,
     )

From fbc0b46245e3675ff01d31a6071ef13411296be8 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Sat, 8 Nov 2025 22:32:11 +0000
Subject: [PATCH 4/5] =?UTF-8?q?=F0=9F=A4=96=20fix:=20parse=20'results'=20f?=
 =?UTF-8?q?ield=20instead=20of=20'trials'=20in=20results.json?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Terminal-bench's results.json uses 'results' field, not 'trials'. This caused
get_run_status() to always return completed=0, leading to infinite loops where
the script would keep resuming even after all tasks were done.

Tested locally with 3 tasks - script now correctly detects completion and exits.
---
 benchmarks/terminal_bench/README.md           | 62 +++++++------------
 benchmarks/terminal_bench/adaptive_bench.py   |  2 +-
 .../terminal_bench/adaptive_bench_test.py     |  2 +-
 3 files changed, 25 insertions(+), 41 deletions(-)

diff --git a/benchmarks/terminal_bench/README.md b/benchmarks/terminal_bench/README.md
index a6fb50e0c..91f65ab3b 100644
--- a/benchmarks/terminal_bench/README.md
+++ b/benchmarks/terminal_bench/README.md
@@ -105,63 +105,47 @@ Based on analysis of the Oct 30 nightly run (15-minute timeout):
 
 **Impact of 30-minute timeout**: Expected to reduce false timeout failures by ~50% and improve pass rates by 10-15 percentage points (from ~42% to ~52-57%).
 
-## Adaptive Concurrency Mode
+## Adaptive Concurrency
 
-The `benchmark-terminal-adaptive` target automatically adjusts concurrency based on system load using a **burst-and-resume pattern**:
-
-```bash
-# Start with concurrency=1, scale up to max 16 based on load
-TB_MAX_CONCURRENT=16 make benchmark-terminal-adaptive
-
-# More conservative: max 8, higher load threshold
-TB_MAX_CONCURRENT=8 TB_LOAD_THRESHOLD=2.0 make benchmark-terminal-adaptive
-
-# Faster adjustments: check every 30 seconds
-TB_CHECK_INTERVAL=30 TB_MAX_CONCURRENT=16 make benchmark-terminal-adaptive
-
-# Sample 5 tasks with adaptive concurrency
-TB_SAMPLE_SIZE=5 TB_MAX_CONCURRENT=8 make benchmark-terminal-adaptive
-```
+Terminal-bench uses **adaptive concurrency** that automatically scales from 1-16 concurrent tasks based on system load using a **burst-and-resume pattern**:
 
 ### How It Works
 
-1. **Runs terminal-bench in bursts** with current concurrency
-2. **Monitors system load** after each burst completes
+1. **Starts with concurrency=1** and runs a burst
+2. **Monitors system load** (1-minute average) after each burst completes
 3. **Adjusts concurrency** using hysteresis:
-   - **Double** when 1-minute load avg < threshold
-   - **Halve** when 1-minute load avg > threshold
-4. **Resumes** the run with updated concurrency
+   - **Double** when load < threshold (default: 1.0)
+   - **Halve** when load > threshold
+   - **Bounded to [1, 16]** for optimal performance
+4. **Resumes** the run with updated concurrency (skips completed tasks)
 
-The burst-and-resume pattern leverages terminal-bench's native resume capability to skip completed tasks. Each burst runs to completion (no mid-task interruption), ensuring clean Docker container lifecycle.
+The burst-and-resume pattern leverages terminal-bench's native resume capability. Each burst runs to completion with no mid-task interruption, ensuring clean Docker container lifecycle.
 
 ### Configuration
 
+```bash
+# Adjust load threshold (default: 1.0)
+TB_LOAD_THRESHOLD=2.0 make benchmark-terminal
+
+# Faster adjustments (default: 60s between bursts)
+TB_CHECK_INTERVAL=30 make benchmark-terminal
+
+# Sample 5 tasks with adaptive concurrency
+TB_SAMPLE_SIZE=5 make benchmark-terminal
+```
+
 | Variable | Default | Description |
 |----------|---------|-------------|
-| `TB_MAX_CONCURRENT` | 16 | Maximum concurrency limit |
 | `TB_LOAD_THRESHOLD` | 1.0 | Load average threshold for adjusting concurrency |
 | `TB_CHECK_INTERVAL` | 60 | Seconds to wait between bursts |
 
-### When to Use Adaptive Mode
-
-**Use adaptive mode when:**
-- Running on shared hardware with variable load
-- Unsure of optimal concurrency for your system
-- Want to maximize throughput without overloading
-- Running long benchmark suites (full 80-task suite)
-
-**Use fixed concurrency when:**
-- Running on dedicated hardware
-- Know optimal concurrency for your setup
-- Running small task samples (< 10 tasks)
-- Burst overhead (2-5s) matters for very short tasks
-
 ### Tradeoffs
 
-- ✅ Automatically finds optimal concurrency
+- ✅ Automatically finds optimal concurrency for hardware
 - ✅ Prevents system overload
 - ✅ Clean container lifecycle (no mid-task kills)
-- ⚠️ Burst overhead (~2-5s between bursts)
+- ✅ Bounded to [1, 16] for safety
+- ⚠️ Burst overhead (~2-5s, negligible for 6+ min avg tasks)
 - ⚠️ Adjustment latency = burst duration + check interval
 
 ## Files
diff --git a/benchmarks/terminal_bench/adaptive_bench.py b/benchmarks/terminal_bench/adaptive_bench.py
index f77b205ab..2fc56daa4 100755
--- a/benchmarks/terminal_bench/adaptive_bench.py
+++ b/benchmarks/terminal_bench/adaptive_bench.py
@@ -69,7 +69,7 @@ def get_run_status(self) -> dict:
                     results_data = json.load(f)
                     # Count unique task_ids in results
                     completed = len(
-                        set(r["task_id"] for r in results_data.get("trials", []))
+                        set(r["task_id"] for r in results_data.get("results", []))
                     )
 
             return {
diff --git a/benchmarks/terminal_bench/adaptive_bench_test.py b/benchmarks/terminal_bench/adaptive_bench_test.py
index 5f5e5002f..0a737e801 100644
--- a/benchmarks/terminal_bench/adaptive_bench_test.py
+++ b/benchmarks/terminal_bench/adaptive_bench_test.py
@@ -173,7 +173,7 @@ def test_get_run_status_with_results(self, mock_exists, mock_file):
 
         # Mock results.json with 3 completed tasks
         results_data = {
-            "trials": [
+            "results": [
                 {"task_id": "task1", "resolved": True},
                 {"task_id": "task2", "resolved": False},
                 {"task_id": "task3", "resolved": True},

From 9e4144ea923bac1ac30619ed5a64e3551daedde2 Mon Sep 17 00:00:00 2001
From: Ammar <ammar+ai@ammar.io>
Date: Sat, 8 Nov 2025 22:37:25 +0000
Subject: [PATCH 5/5] =?UTF-8?q?=F0=9F=A4=96=20fix:=20update=20tb.lock=20be?=
 =?UTF-8?q?fore=20resume=20to=20apply=20concurrency=20changes=20immediatel?=
 =?UTF-8?q?y?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses Codex feedback: Previously, concurrency adjustments were written to
tb.lock AFTER a burst completed, but the next resume command would read the
tb.lock at the START of the burst. This created a 1-burst delay where the
old concurrency was used even after adjustment.

Now updates tb.lock BEFORE calling 'terminal-bench runs resume', ensuring
the new concurrency takes effect immediately. This is critical when the system
is overloaded - we need to reduce concurrency on the very next burst, not one
burst later.

Flow before fix:
  Burst N completes → adjust_concurrency() → write tb.lock
  Burst N+1 starts → resume reads OLD tb.lock value

Flow after fix:
  adjust_concurrency() completes
  Burst N+1 starts → write tb.lock → resume reads NEW tb.lock value
---
 benchmarks/terminal_bench/adaptive_bench.py   | 28 ++++++++++---------
 .../terminal_bench/adaptive_bench_test.py     |  8 ++----
 2 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/benchmarks/terminal_bench/adaptive_bench.py b/benchmarks/terminal_bench/adaptive_bench.py
index 2fc56daa4..8d47e1054 100755
--- a/benchmarks/terminal_bench/adaptive_bench.py
+++ b/benchmarks/terminal_bench/adaptive_bench.py
@@ -19,14 +19,14 @@
 class AdaptiveBench:
     """
     Adaptive concurrency wrapper for terminal-bench.
-    
+
     Concurrency is automatically bounded to [1, 16] for optimal performance
     across different hardware configurations.
     """
-    
+
     MIN_CONCURRENT = 1
     MAX_CONCURRENT = 16
-    
+
     def __init__(
         self,
         load_threshold: float,
@@ -90,8 +90,12 @@ def adjust_concurrency(self) -> bool:
             self.current_concurrent = min(
                 self.current_concurrent * 2, self.MAX_CONCURRENT
             )
-        elif load > self.load_threshold and self.current_concurrent > self.MIN_CONCURRENT:
-            self.current_concurrent = max(self.current_concurrent // 2, self.MIN_CONCURRENT)
+        elif (
+            load > self.load_threshold and self.current_concurrent > self.MIN_CONCURRENT
+        ):
+            self.current_concurrent = max(
+                self.current_concurrent // 2, self.MIN_CONCURRENT
+            )
 
         if self.current_concurrent != old_concurrent:
             print(
@@ -124,7 +128,11 @@ def run_burst(self) -> int:
                 f"concurrency={self.current_concurrent}"
             )
         else:
-            # Subsequent bursts - resume existing run
+            # Subsequent bursts - update tb.lock BEFORE resume
+            # This ensures the resume command picks up the new concurrency
+            self._update_lock_concurrency()
+
+            # Resume existing run
             cmd = [
                 "uvx",
                 "terminal-bench",
@@ -166,10 +174,6 @@ def run_burst(self) -> int:
 
         print(f"⏱️  Burst #{self.burst_count} completed in {burst_duration:.1f}s")
 
-        # Update n_concurrent in tb.lock for next resume
-        if self.run_id and result.returncode == 0:
-            self._update_lock_concurrency()
-
         return result.returncode
 
     def _update_lock_concurrency(self):
@@ -184,9 +188,7 @@ def _update_lock_concurrency(self):
 
             # Update concurrency in lock file
             if "run_config" in lock_data:
-                lock_data["run_config"][
-                    "n_concurrent_trials"
-                ] = self.current_concurrent
+                lock_data["run_config"]["n_concurrent_trials"] = self.current_concurrent
 
             with open(lock_path, "w") as f:
                 json.dump(lock_data, f, indent=2)
diff --git a/benchmarks/terminal_bench/adaptive_bench_test.py b/benchmarks/terminal_bench/adaptive_bench_test.py
index 0a737e801..f15bffff2 100644
--- a/benchmarks/terminal_bench/adaptive_bench_test.py
+++ b/benchmarks/terminal_bench/adaptive_bench_test.py
@@ -166,9 +166,7 @@ def test_get_run_status_with_results(self, mock_exists, mock_file):
 
         # Mock tb.lock with 5 tasks
         tb_lock_data = {
-            "dataset": {
-                "task_ids": ["task1", "task2", "task3", "task4", "task5"]
-            }
+            "dataset": {"task_ids": ["task1", "task2", "task3", "task4", "task5"]}
         }
 
         # Mock results.json with 3 completed tasks
@@ -250,9 +248,7 @@ def test_update_lock_concurrency(self, mock_exists, mock_file):
 
         mock_exists.return_value = True
 
-        lock_data = {
-            "run_config": {"n_concurrent_trials": 1, "other_field": "value"}
-        }
+        lock_data = {"run_config": {"n_concurrent_trials": 1, "other_field": "value"}}
 
         # Setup mock to return lock_data on read
         mock_file.return_value.read.return_value = json.dumps(lock_data)