🤖 refactor: simplify adaptive concurrency to hardcoded 1-16 bounds

ammar-agent · ammar-agent · commit acf3998e9b6d · 2025-11-08T22:36:27.000Z
Make adaptive concurrency the default and only mode for terminal-bench:

- Hardcode MIN_CONCURRENT=1, MAX_CONCURRENT=16 in adaptive_bench.py
- Remove --max-concurrent CLI argument (no longer needed)
- Make `benchmark-terminal` an alias for `benchmark-terminal-adaptive`
- Simplify workflow inputs (remove adaptive_mode toggle, concurrency input)
- Update documentation to reflect simplified interface

This removes unnecessary configuration complexity while providing sensible
bounds for all hardware configurations. The 1-16 range covers:
- Single-core systems (min=1)
- High-core systems (max=16 is reasonable parallelism for Docker containers)
- Load-based adjustment within these bounds

_Generated with `cmux`_
diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
@@ -16,25 +16,20 @@ on:
         required: false
         type: string
         default: 'terminal-bench-core==0.1.1'
+      concurrency:
+        description: 'Number of concurrent tasks (--n-concurrent)'
+        required: false
+        type: string
+        default: '4'
       livestream:
-        description: 'Enable livestream mode (verbose output to console)'
+        description: 'Enable livestream mode'
         required: false
         type: boolean
-        default: false
+        default: true
       sample_size:
         description: 'Number of random tasks to run (empty = all tasks)'
         required: false
         type: string
-      load_threshold:
-        description: 'Load threshold for adaptive concurrency (default: 1.0)'
-        required: false
-        type: string
-        default: '1.0'
-      check_interval:
-        description: 'Seconds between adaptive bursts (default: 60)'
-        required: false
-        type: string
-        default: '60'
       extra_args:
         description: 'Additional arguments to pass to terminal-bench'
         required: false
@@ -52,9 +47,9 @@ on:
         default: 'terminal-bench-core==0.1.1'
         type: string
       livestream:
-        description: 'Enable livestream mode (verbose output to console)'
+        description: 'Enable livestream mode'
         required: false
-        default: false
+        default: true
         type: boolean
       sample_size:
         description: 'Number of random tasks to run (empty = all tasks)'
@@ -68,29 +63,28 @@ on:
         description: 'Thinking level (off, low, medium, high)'
         required: false
         type: string
+      extra_args:
+        description: 'Additional arguments to pass to terminal-bench'
+        required: false
+        type: string
       load_threshold:
-        description: 'Load threshold for adaptive concurrency (default: 1.0)'
+        description: 'Load average threshold for adaptive concurrency (default: 1.0)'
         required: false
         default: '1.0'
         type: string
       check_interval:
-        description: 'Seconds between adaptive bursts (default: 60)'
+        description: 'Seconds between bursts for adaptive concurrency (default: 60)'
         required: false
         default: '60'
         type: string
-      extra_args:
-        description: 'Additional arguments to pass to terminal-bench'
-        required: false
-        type: string
 
 jobs:
   benchmark:
     name: Run Terminal-Bench${{ inputs.model_name && format(' ({0})', inputs.model_name) || '' }}
     runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
-    # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes typically
-    # Set 4-hour timeout to handle occasional API slowdowns while preventing infinite hangs
-    # If consistently hitting this timeout, investigate task-level issues
-    timeout-minutes: 240
+    # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes
+    # Allow 3 hours for safety margin and slower tasks
+    timeout-minutes: 180
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -111,8 +105,8 @@ jobs:
       - name: Build dist/ (skip icons - not needed for benchmark)
         run: make build-main build-preload
 
-      - name: Run Terminal-Bench
-        run: make benchmark-terminal 2>&1 | tee benchmark.log
+      - name: Run Terminal-Bench (adaptive concurrency 1-16)
+        run: make benchmark-terminal
         env:
           TB_DATASET: ${{ inputs.dataset }}
           TB_LOAD_THRESHOLD: ${{ inputs.load_threshold }}
@@ -127,12 +121,18 @@ jobs:
         if: always()
         run: |
           echo "=== Terminal-Bench Results Summary ==="
-          if [ -f "$(find runs -name 'results.json' 2>/dev/null | head -1)" ]; then
+          if [ -f "$(find runs -name 'results.json' | head -1)" ]; then
             RESULTS_FILE=$(find runs -name 'results.json' | head -1)
-            cat "$RESULTS_FILE" | jq '{n_resolved, n_unresolved, accuracy}' 2>/dev/null || cat "$RESULTS_FILE"
+            echo "Results file: $RESULTS_FILE"
+            echo ""
+            echo "Full results.json:"
+            cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
+            echo ""
+            echo "Per-task summary:"
+            cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
           else
-            echo "❌ No results.json found"
-            ls -laR runs/ 2>/dev/null || echo "runs/ directory missing"
+            echo "No results.json found in runs/"
+            ls -la runs/
           fi
 
       - name: Set artifact name
@@ -155,7 +155,6 @@ jobs:
           name: ${{ steps.artifact-name.outputs.name }}
           path: |
             runs/
-            benchmark.log
           if-no-files-found: warn
           retention-days: 30
 
diff --git a/Makefile b/Makefile
@@ -295,46 +295,12 @@ chromatic: node_modules/.installed ## Run Chromatic for visual regression testin
 	@bun x chromatic --exit-zero-on-changes
 
 ## Benchmarks
-benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB_SAMPLE_SIZE/TB_TIMEOUT/TB_ARGS to customize)
-	@TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \
-	TB_TIMEOUT=$${TB_TIMEOUT:-1800}; \
-	CONCURRENCY_FLAG=$${TB_CONCURRENCY:+--n-concurrent $$TB_CONCURRENCY}; \
-	LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \
-	TASK_ID_FLAGS=""; \
-	if [ -n "$$TB_SAMPLE_SIZE" ]; then \
-		echo "Ensuring dataset $$TB_DATASET is downloaded..."; \
-		uvx terminal-bench datasets download --dataset "$$TB_DATASET" 2>&1 | grep -v "already exists" || true; \
-		echo "Sampling $$TB_SAMPLE_SIZE tasks from $$TB_DATASET..."; \
-		TASK_IDS=$$(python3 benchmarks/terminal_bench/sample_tasks.py --dataset "$$TB_DATASET" --sample-size "$$TB_SAMPLE_SIZE" --format space) || { \
-			echo "Error: Failed to sample tasks" >&2; \
-			exit 1; \
-		}; \
-		if [ -z "$$TASK_IDS" ]; then \
-			echo "Error: Sampling returned no task IDs" >&2; \
-			exit 1; \
-		fi; \
-		for task_id in $$TASK_IDS; do \
-			TASK_ID_FLAGS="$$TASK_ID_FLAGS --task-id $$task_id"; \
-		done; \
-		echo "Selected task IDs: $$TASK_IDS"; \
-	fi; \
-	echo "Using timeout: $$TB_TIMEOUT seconds"; \
-	echo "Running Terminal-Bench with dataset $$TB_DATASET"; \
-	export CMUX_TIMEOUT_MS=$$((TB_TIMEOUT * 1000)); \
-	uvx terminal-bench run \
-		--dataset "$$TB_DATASET" \
-		--agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \
-		--global-agent-timeout-sec $$TB_TIMEOUT \
-		$$CONCURRENCY_FLAG \
-		$$LIVESTREAM_FLAG \
-		$$TASK_ID_FLAGS \
-		$${TB_ARGS}
+benchmark-terminal: benchmark-terminal-adaptive ## Run Terminal-Bench with adaptive concurrency (alias)
 
 .PHONY: benchmark-terminal-adaptive
-benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (use TB_MAX_CONCURRENT/TB_LOAD_THRESHOLD/TB_CHECK_INTERVAL)
+benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (auto-scales 1-16, use TB_LOAD_THRESHOLD/TB_CHECK_INTERVAL)
 	@TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \
 	TB_TIMEOUT=$${TB_TIMEOUT:-1800}; \
-	TB_MAX_CONCURRENT=$${TB_MAX_CONCURRENT:-16}; \
 	TB_LOAD_THRESHOLD=$${TB_LOAD_THRESHOLD:-1.0}; \
 	TB_CHECK_INTERVAL=$${TB_CHECK_INTERVAL:-60}; \
 	LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \
@@ -356,9 +322,8 @@ benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (us
 		done; \
 		echo "Selected task IDs: $$TASK_IDS"; \
 	fi; \
-	echo "Running adaptive terminal-bench (max concurrency: $$TB_MAX_CONCURRENT, load threshold: $$TB_LOAD_THRESHOLD)"; \
+	echo "Running adaptive terminal-bench (auto-scaling 1-16, load threshold: $$TB_LOAD_THRESHOLD)"; \
 	python3 benchmarks/terminal_bench/adaptive_bench.py \
-		--max-concurrent $$TB_MAX_CONCURRENT \
 		--load-threshold $$TB_LOAD_THRESHOLD \
 		--check-interval $$TB_CHECK_INTERVAL \
 		-- \
diff --git a/benchmarks/terminal_bench/README.md b/benchmarks/terminal_bench/README.md
@@ -4,13 +4,18 @@ This directory contains the cmux agent adapter for [Terminal-Bench](https://gith
 
 ## Quick Start
 
+Terminal-bench now runs with **adaptive concurrency by default**, automatically scaling from 1-16 concurrent tasks based on system load.
+
 ```bash
-# Run full benchmark suite (80 tasks, ~2.5 hours)
+# Run full benchmark suite (80 tasks, ~2.5 hours) with adaptive concurrency
 make benchmark-terminal
 
 # Run with sample of 5 tasks
 TB_SAMPLE_SIZE=5 make benchmark-terminal
 
+# Adjust load threshold (default: 1.0)
+TB_LOAD_THRESHOLD=2.0 make benchmark-terminal
+
 # Run specific tasks
 make benchmark-terminal TB_ARGS="--task-id hello-world --task-id chess-best-move"
 
@@ -24,7 +29,8 @@ make benchmark-terminal TB_ARGS="--agent-kwarg model_name=anthropic:claude-opus-
 
 - `TB_DATASET`: Dataset to use (default: `terminal-bench-core==0.1.1`)
 - `TB_SAMPLE_SIZE`: Number of random tasks to run (default: all 80 tasks)
-- `TB_CONCURRENCY`: Number of concurrent tasks (default: 4)
+- `TB_LOAD_THRESHOLD`: Load average threshold for concurrency adjustments (default: 1.0)
+- `TB_CHECK_INTERVAL`: Seconds between bursts (default: 60)
 - `TB_LIVESTREAM`: Enable livestream mode (set to `1` to enable)
 - `TB_TIMEOUT`: Global timeout in seconds (default: 1800 = 30 minutes)
 - `TB_ARGS`: Additional arguments passed to terminal-bench
diff --git a/benchmarks/terminal_bench/adaptive_bench.py b/benchmarks/terminal_bench/adaptive_bench.py
@@ -17,20 +17,28 @@
 
 
 class AdaptiveBench:
+    """
+    Adaptive concurrency wrapper for terminal-bench.
+    
+    Concurrency is automatically bounded to [1, 16] for optimal performance
+    across different hardware configurations.
+    """
+    
+    MIN_CONCURRENT = 1
+    MAX_CONCURRENT = 16
+    
     def __init__(
         self,
         load_threshold: float,
         check_interval: int,
-        max_concurrent: int,
         runs_dir: Path,
         tb_args: list[str],
     ):
         self.load_threshold = load_threshold
         self.check_interval = check_interval
-        self.max_concurrent = max_concurrent
         self.runs_dir = runs_dir
         self.tb_args = tb_args
-        self.current_concurrent = 1
+        self.current_concurrent = self.MIN_CONCURRENT
         self.run_id: Optional[str] = None
         self.burst_count = 0
 
@@ -78,12 +86,12 @@ def adjust_concurrency(self) -> bool:
         load = self.get_load_avg()
         old_concurrent = self.current_concurrent
 
-        if load < self.load_threshold and self.current_concurrent < self.max_concurrent:
+        if load < self.load_threshold and self.current_concurrent < self.MAX_CONCURRENT:
             self.current_concurrent = min(
-                self.current_concurrent * 2, self.max_concurrent
+                self.current_concurrent * 2, self.MAX_CONCURRENT
             )
-        elif load > self.load_threshold and self.current_concurrent > 1:
-            self.current_concurrent = max(self.current_concurrent // 2, 1)
+        elif load > self.load_threshold and self.current_concurrent > self.MIN_CONCURRENT:
+            self.current_concurrent = max(self.current_concurrent // 2, self.MIN_CONCURRENT)
 
         if self.current_concurrent != old_concurrent:
             print(
@@ -221,7 +229,7 @@ def run(self):
 
 def main():
     parser = argparse.ArgumentParser(
-        description="Run terminal-bench with adaptive concurrency via burst-and-resume"
+        description="Run terminal-bench with adaptive concurrency (auto-scales 1-16 based on load)"
     )
     parser.add_argument(
         "--load-threshold",
@@ -235,12 +243,6 @@ def main():
         default=60,
         help="Seconds between bursts (default: 60)",
     )
-    parser.add_argument(
-        "--max-concurrent",
-        type=int,
-        required=True,
-        help="Maximum concurrency limit",
-    )
     parser.add_argument(
         "--runs-dir",
         type=Path,
@@ -263,7 +265,6 @@ def main():
     bench = AdaptiveBench(
         load_threshold=args.load_threshold,
         check_interval=args.check_interval,
-        max_concurrent=args.max_concurrent,
         runs_dir=args.runs_dir,
         tb_args=tb_args,
     )