🤖 refactor: simplify adaptive concurrency to hardcoded 1-16 bounds

ammar-agent · ammar-agent · commit 0c84cb99f4c3 · 2025-11-08T21:46:29.000Z
Make adaptive concurrency the default and only mode for terminal-bench:

- Hardcode MIN_CONCURRENT=1, MAX_CONCURRENT=16 in adaptive_bench.py
- Remove --max-concurrent CLI argument (no longer needed)
- Make `benchmark-terminal` an alias for `benchmark-terminal-adaptive`
- Simplify workflow inputs (remove concurrency, adaptive_mode, and max_concurrent; add check_interval)
- Update documentation to reflect simplified interface

This removes unnecessary configuration complexity while providing sensible
bounds for all hardware configurations. The 1-16 range works because:
- min=1 is safe on single-core systems
- max=16 is reasonable parallelism for Docker containers on high-core systems
- concurrency is still adjusted based on load within these bounds

_Generated with `cmux`_
diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
@@ -46,11 +46,6 @@ on:
         required: false
         default: 'terminal-bench-core==0.1.1'
         type: string
-      concurrency:
-        description: 'Number of concurrent tasks (--n-concurrent)'
-        required: false
-        default: '4'
-        type: string
       livestream:
         description: 'Enable livestream mode'
         required: false
@@ -72,21 +67,16 @@ on:
         description: 'Additional arguments to pass to terminal-bench'
         required: false
         type: string
-      adaptive_mode:
-        description: 'Use adaptive concurrency (overrides concurrency setting)'
-        required: false
-        default: false
-        type: boolean
-      max_concurrent:
-        description: 'Max concurrency for adaptive mode (only used if adaptive_mode=true)'
-        required: false
-        default: '16'
-        type: string
       load_threshold:
-        description: 'Load threshold for adaptive mode (only used if adaptive_mode=true)'
+        description: 'Load average threshold for adaptive concurrency (default: 1.0)'
         required: false
         default: '1.0'
         type: string
+      check_interval:
+        description: 'Seconds between bursts for adaptive concurrency (default: 60)'
+        required: false
+        default: '60'
+        type: string
 
 jobs:
   benchmark:
@@ -115,18 +105,12 @@ jobs:
       - name: Build dist/ (skip icons - not needed for benchmark)
         run: make build-main build-preload
 
-      - name: Run Terminal-Bench
-        run: |
-          if [ "${{ inputs.adaptive_mode }}" = "true" ]; then
-            make benchmark-terminal-adaptive
-          else
-            make benchmark-terminal
-          fi
+      - name: Run Terminal-Bench (adaptive concurrency 1-16)
+        run: make benchmark-terminal
         env:
           TB_DATASET: ${{ inputs.dataset }}
-          TB_CONCURRENCY: ${{ inputs.concurrency }}
-          TB_MAX_CONCURRENT: ${{ inputs.max_concurrent }}
           TB_LOAD_THRESHOLD: ${{ inputs.load_threshold }}
+          TB_CHECK_INTERVAL: ${{ inputs.check_interval }}
           TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }}
           TB_SAMPLE_SIZE: ${{ inputs.sample_size }}
           TB_ARGS: ${{ inputs.model_name && format('--agent-kwarg model_name={0} --agent-kwarg thinking_level={1} {2}', inputs.model_name, inputs.thinking_level, inputs.extra_args) || inputs.extra_args }}
diff --git a/Makefile b/Makefile
@@ -295,45 +295,12 @@ chromatic: node_modules/.installed ## Run Chromatic for visual regression testin
 	@bun x chromatic --exit-zero-on-changes
 
 ## Benchmarks
-benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB_SAMPLE_SIZE/TB_TIMEOUT/TB_ARGS to customize)
-	@TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \
-	TB_TIMEOUT=$${TB_TIMEOUT:-1800}; \
-	CONCURRENCY_FLAG=$${TB_CONCURRENCY:+--n-concurrent $$TB_CONCURRENCY}; \
-	LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \
-	TASK_ID_FLAGS=""; \
-	if [ -n "$$TB_SAMPLE_SIZE" ]; then \
-		echo "Ensuring dataset $$TB_DATASET is downloaded..."; \
-		uvx terminal-bench datasets download --dataset "$$TB_DATASET" 2>&1 | grep -v "already exists" || true; \
-		echo "Sampling $$TB_SAMPLE_SIZE tasks from $$TB_DATASET..."; \
-		TASK_IDS=$$(python benchmarks/terminal_bench/sample_tasks.py --dataset "$$TB_DATASET" --sample-size "$$TB_SAMPLE_SIZE" --format space) || { \
-			echo "Error: Failed to sample tasks" >&2; \
-			exit 1; \
-		}; \
-		if [ -z "$$TASK_IDS" ]; then \
-			echo "Error: Sampling returned no task IDs" >&2; \
-			exit 1; \
-		fi; \
-		for task_id in $$TASK_IDS; do \
-			TASK_ID_FLAGS="$$TASK_ID_FLAGS --task-id $$task_id"; \
-		done; \
-		echo "Selected task IDs: $$TASK_IDS"; \
-	fi; \
-	echo "Using timeout: $$TB_TIMEOUT seconds"; \
-	echo "Running Terminal-Bench with dataset $$TB_DATASET"; \
-	uvx terminal-bench run \
-		--dataset "$$TB_DATASET" \
-		--agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \
-		--global-agent-timeout-sec $$TB_TIMEOUT \
-		$$CONCURRENCY_FLAG \
-		$$LIVESTREAM_FLAG \
-		$$TASK_ID_FLAGS \
-		$${TB_ARGS}
+benchmark-terminal: benchmark-terminal-adaptive ## Run Terminal-Bench with adaptive concurrency (alias)
 
 .PHONY: benchmark-terminal-adaptive
-benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (use TB_MAX_CONCURRENT/TB_LOAD_THRESHOLD/TB_CHECK_INTERVAL)
+benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (auto-scales 1-16, use TB_LOAD_THRESHOLD/TB_CHECK_INTERVAL)
 	@TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \
 	TB_TIMEOUT=$${TB_TIMEOUT:-1800}; \
-	TB_MAX_CONCURRENT=$${TB_MAX_CONCURRENT:-16}; \
 	TB_LOAD_THRESHOLD=$${TB_LOAD_THRESHOLD:-1.0}; \
 	TB_CHECK_INTERVAL=$${TB_CHECK_INTERVAL:-60}; \
 	LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \
@@ -355,9 +322,8 @@ benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (us
 		done; \
 		echo "Selected task IDs: $$TASK_IDS"; \
 	fi; \
-	echo "Running adaptive terminal-bench (max concurrency: $$TB_MAX_CONCURRENT, load threshold: $$TB_LOAD_THRESHOLD)"; \
+	echo "Running adaptive terminal-bench (auto-scaling 1-16, load threshold: $$TB_LOAD_THRESHOLD)"; \
 	python3 benchmarks/terminal_bench/adaptive_bench.py \
-		--max-concurrent $$TB_MAX_CONCURRENT \
 		--load-threshold $$TB_LOAD_THRESHOLD \
 		--check-interval $$TB_CHECK_INTERVAL \
 		-- \
diff --git a/benchmarks/terminal_bench/README.md b/benchmarks/terminal_bench/README.md
@@ -4,13 +4,18 @@ This directory contains the cmux agent adapter for [Terminal-Bench](https://gith
 
 ## Quick Start
 
+Terminal-bench now always runs with **adaptive concurrency**, automatically scaling between 1 and 16 concurrent tasks based on system load.
+
 ```bash
-# Run full benchmark suite (80 tasks, ~2.5 hours)
+# Run full benchmark suite (80 tasks, ~2.5 hours) with adaptive concurrency
 make benchmark-terminal
 
 # Run with sample of 5 tasks
 TB_SAMPLE_SIZE=5 make benchmark-terminal
 
+# Adjust load threshold (default: 1.0)
+TB_LOAD_THRESHOLD=2.0 make benchmark-terminal
+
 # Run specific tasks
 make benchmark-terminal TB_ARGS="--task-id hello-world --task-id chess-best-move"
 
@@ -24,7 +29,8 @@ make benchmark-terminal TB_ARGS="--agent-kwarg model_name=anthropic:claude-opus-
 
 - `TB_DATASET`: Dataset to use (default: `terminal-bench-core==0.1.1`)
 - `TB_SAMPLE_SIZE`: Number of random tasks to run (default: all 80 tasks)
-- `TB_CONCURRENCY`: Number of concurrent tasks (default: 4)
+- `TB_LOAD_THRESHOLD`: Load average threshold; concurrency doubles when load is below it and halves when load is above it (default: 1.0)
+- `TB_CHECK_INTERVAL`: Seconds between bursts (default: 60)
 - `TB_LIVESTREAM`: Enable livestream mode (set to `1` to enable)
 - `TB_TIMEOUT`: Global timeout in seconds (default: 1800 = 30 minutes)
 - `TB_ARGS`: Additional arguments passed to terminal-bench
diff --git a/benchmarks/terminal_bench/adaptive_bench.py b/benchmarks/terminal_bench/adaptive_bench.py
@@ -17,20 +17,28 @@
 
 
 class AdaptiveBench:
+    """
+    Adaptive concurrency wrapper for terminal-bench.
+    
+    Concurrency is automatically bounded to [1, 16] for optimal performance
+    across different hardware configurations.
+    """
+    
+    MIN_CONCURRENT = 1
+    MAX_CONCURRENT = 16
+    
     def __init__(
         self,
         load_threshold: float,
         check_interval: int,
-        max_concurrent: int,
         runs_dir: Path,
         tb_args: list[str],
     ):
         self.load_threshold = load_threshold
         self.check_interval = check_interval
-        self.max_concurrent = max_concurrent
         self.runs_dir = runs_dir
         self.tb_args = tb_args
-        self.current_concurrent = 1
+        self.current_concurrent = self.MIN_CONCURRENT
         self.run_id: Optional[str] = None
         self.burst_count = 0
 
@@ -78,12 +86,12 @@ def adjust_concurrency(self) -> bool:
         load = self.get_load_avg()
         old_concurrent = self.current_concurrent
 
-        if load < self.load_threshold and self.current_concurrent < self.max_concurrent:
+        if load < self.load_threshold and self.current_concurrent < self.MAX_CONCURRENT:
             self.current_concurrent = min(
-                self.current_concurrent * 2, self.max_concurrent
+                self.current_concurrent * 2, self.MAX_CONCURRENT
             )
-        elif load > self.load_threshold and self.current_concurrent > 1:
-            self.current_concurrent = max(self.current_concurrent // 2, 1)
+        elif load > self.load_threshold and self.current_concurrent > self.MIN_CONCURRENT:
+            self.current_concurrent = max(self.current_concurrent // 2, self.MIN_CONCURRENT)
 
         if self.current_concurrent != old_concurrent:
             print(
@@ -221,7 +229,7 @@ def run(self):
 
 def main():
     parser = argparse.ArgumentParser(
-        description="Run terminal-bench with adaptive concurrency via burst-and-resume"
+        description="Run terminal-bench with adaptive concurrency (auto-scales 1-16 based on load)"
     )
     parser.add_argument(
         "--load-threshold",
@@ -235,12 +243,6 @@ def main():
         default=60,
         help="Seconds between bursts (default: 60)",
     )
-    parser.add_argument(
-        "--max-concurrent",
-        type=int,
-        required=True,
-        help="Maximum concurrency limit",
-    )
     parser.add_argument(
         "--runs-dir",
         type=Path,
@@ -263,7 +265,6 @@ def main():
     bench = AdaptiveBench(
         load_threshold=args.load_threshold,
         check_interval=args.check_interval,
-        max_concurrent=args.max_concurrent,
         runs_dir=args.runs_dir,
         tb_args=tb_args,
     )