diff --git a/.github/workflows/terminal-bench.yml b/.github/workflows/terminal-bench.yml
index 50cb87418..981cceedc 100644
--- a/.github/workflows/terminal-bench.yml
+++ b/.github/workflows/terminal-bench.yml
@@ -22,10 +22,10 @@ on:
         type: string
         default: '4'
       livestream:
-        description: 'Enable livestream mode (verbose output to console)'
+        description: 'Enable livestream mode'
         required: false
         type: boolean
-        default: false
+        default: true
       sample_size:
         description: 'Number of random tasks to run (empty = all tasks)'
         required: false
@@ -46,15 +46,10 @@ on:
         required: false
         default: 'terminal-bench-core==0.1.1'
         type: string
-      concurrency:
-        description: 'Number of concurrent tasks (--n-concurrent)'
-        required: false
-        default: '4'
-        type: string
       livestream:
-        description: 'Enable livestream mode (verbose output to console)'
+        description: 'Enable livestream mode'
         required: false
-        default: false
+        default: true
         type: boolean
       sample_size:
         description: 'Number of random tasks to run (empty = all tasks)'
@@ -72,15 +67,24 @@ on:
         description: 'Additional arguments to pass to terminal-bench'
         required: false
         type: string
+      load_threshold:
+        description: 'Load average threshold for adaptive concurrency (default: 1.0)'
+        required: false
+        default: '1.0'
+        type: string
+      check_interval:
+        description: 'Seconds between bursts for adaptive concurrency (default: 60)'
+        required: false
+        default: '60'
+        type: string

 jobs:
   benchmark:
     name: Run Terminal-Bench${{ inputs.model_name && format(' ({0})', inputs.model_name) || '' }}
     runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
-    # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes typically
-    # Set 4-hour timeout to handle occasional API slowdowns while preventing infinite hangs
-    # If consistently hitting this timeout, investigate task-level issues
-    timeout-minutes: 240
+    # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes
+    # Allow 3 hours for safety margin and slower tasks
+    timeout-minutes: 180
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -101,11 +105,12 @@ jobs:
       - name: Build dist/ (skip icons - not needed for benchmark)
         run: make build-main build-preload

-      - name: Run Terminal-Bench
-        run: make benchmark-terminal 2>&1 | tee benchmark.log
+      - name: Run Terminal-Bench (adaptive concurrency 1-16)
+        run: make benchmark-terminal
         env:
           TB_DATASET: ${{ inputs.dataset }}
-          TB_CONCURRENCY: ${{ inputs.concurrency }}
+          TB_LOAD_THRESHOLD: ${{ inputs.load_threshold }}
+          TB_CHECK_INTERVAL: ${{ inputs.check_interval }}
           TB_LIVESTREAM: ${{ inputs.livestream && '1' || '' }}
           TB_SAMPLE_SIZE: ${{ inputs.sample_size }}
           TB_ARGS: ${{ inputs.model_name && format('--agent-kwarg model_name={0} --agent-kwarg thinking_level={1} {2}', inputs.model_name, inputs.thinking_level, inputs.extra_args) || inputs.extra_args }}
@@ -116,12 +121,18 @@ jobs:
         if: always()
         run: |
           echo "=== Terminal-Bench Results Summary ==="
-          if [ -f "$(find runs -name 'results.json' 2>/dev/null | head -1)" ]; then
+          if [ -f "$(find runs -name 'results.json' | head -1)" ]; then
            RESULTS_FILE=$(find runs -name 'results.json' | head -1)
-            cat "$RESULTS_FILE" | jq '{n_resolved, n_unresolved, accuracy}' 2>/dev/null || cat "$RESULTS_FILE"
+            echo "Results file: $RESULTS_FILE"
+            echo ""
+            echo "Full results.json:"
+            cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
+            echo ""
+            echo "Per-task summary:"
+            cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
           else
-            echo "❌ No results.json found"
-            ls -laR runs/ 2>/dev/null || echo "runs/ directory missing"
+            echo "No results.json found in runs/"
+            ls -la runs/
           fi

       - name: Set artifact name
@@ -144,7 +155,6 @@ jobs:
           name: ${{ steps.artifact-name.outputs.name }}
           path: |
             runs/
-            benchmark.log
           if-no-files-found: warn
           retention-days: 30

diff --git a/Makefile b/Makefile
index a27559132..52711aaee 100644
--- a/Makefile
+++ b/Makefile
@@ -39,7 +39,7 @@ include fmt.mk
 .PHONY: dist dist-mac dist-win dist-linux
 .PHONY: docs docs-build docs-watch
 .PHONY: storybook storybook-build test-storybook chromatic
-.PHONY: benchmark-terminal
+.PHONY: benchmark-terminal benchmark-terminal-adaptive
 .PHONY: ensure-deps
 .PHONY: check-eager-imports check-bundle-size check-startup

@@ -295,10 +295,14 @@ chromatic: node_modules/.installed ## Run Chromatic for visual regression testin
 	@bun x chromatic --exit-zero-on-changes

 ## Benchmarks
-benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB_SAMPLE_SIZE/TB_TIMEOUT/TB_ARGS to customize)
+benchmark-terminal: benchmark-terminal-adaptive ## Run Terminal-Bench with adaptive concurrency (alias)
+
+.PHONY: benchmark-terminal-adaptive
+benchmark-terminal-adaptive: ## Run Terminal-Bench with adaptive concurrency (auto-scales 1-16, use TB_LOAD_THRESHOLD/TB_CHECK_INTERVAL)
 	@TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \
 	TB_TIMEOUT=$${TB_TIMEOUT:-1800}; \
-	CONCURRENCY_FLAG=$${TB_CONCURRENCY:+--n-concurrent $$TB_CONCURRENCY}; \
+	TB_LOAD_THRESHOLD=$${TB_LOAD_THRESHOLD:-1.0}; \
+	TB_CHECK_INTERVAL=$${TB_CHECK_INTERVAL:-60}; \
 	LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \
 	TASK_ID_FLAGS=""; \
 	if [ -n "$$TB_SAMPLE_SIZE" ]; then \
@@ -318,14 +322,14 @@ benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB
 	done; \
 	echo "Selected task IDs: $$TASK_IDS"; \
 	fi; \
-	echo "Using timeout: $$TB_TIMEOUT seconds"; \
-	echo "Running Terminal-Bench with dataset $$TB_DATASET"; \
-	export CMUX_TIMEOUT_MS=$$((TB_TIMEOUT * 1000)); \
-	uvx terminal-bench run \
+	echo "Running adaptive terminal-bench (auto-scaling 1-16, load threshold: $$TB_LOAD_THRESHOLD)"; \
+	python3 benchmarks/terminal_bench/adaptive_bench.py \
+		--load-threshold $$TB_LOAD_THRESHOLD \
+		--check-interval $$TB_CHECK_INTERVAL \
+		-- \
 		--dataset "$$TB_DATASET" \
 		--agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \
 		--global-agent-timeout-sec $$TB_TIMEOUT \
-		$$CONCURRENCY_FLAG \
 		$$LIVESTREAM_FLAG \
 		$$TASK_ID_FLAGS \
 		$${TB_ARGS}

diff --git a/benchmarks/terminal_bench/README.md b/benchmarks/terminal_bench/README.md
index c106c8804..91f65ab3b 100644
--- a/benchmarks/terminal_bench/README.md
+++ b/benchmarks/terminal_bench/README.md
@@ -4,13 +4,18 @@ This directory contains the cmux agent adapter for [Terminal-Bench](https://gith

 ## Quick Start

+Terminal-bench now runs with **adaptive concurrency by default**, automatically scaling from 1-16 concurrent tasks based on system load.
+
 ```bash
-# Run full benchmark suite (80 tasks, ~2.5 hours)
+# Run full benchmark suite (80 tasks, ~2.5 hours) with adaptive concurrency
 make benchmark-terminal

 # Run with sample of 5 tasks
 TB_SAMPLE_SIZE=5 make benchmark-terminal

+# Adjust load threshold (default: 1.0)
+TB_LOAD_THRESHOLD=2.0 make benchmark-terminal
+
 # Run specific tasks
 make benchmark-terminal TB_ARGS="--task-id hello-world --task-id chess-best-move"
@@ -24,7 +29,8 @@ make benchmark-terminal TB_ARGS="--agent-kwarg model_name=anthropic:claude-opus-

 - `TB_DATASET`: Dataset to use (default: `terminal-bench-core==0.1.1`)
 - `TB_SAMPLE_SIZE`: Number of random tasks to run (default: all 80 tasks)
-- `TB_CONCURRENCY`: Number of concurrent tasks (default: 4)
+- `TB_LOAD_THRESHOLD`: Load average threshold for concurrency adjustments (default: 1.0)
+- `TB_CHECK_INTERVAL`: Seconds between bursts (default: 60)
 - `TB_LIVESTREAM`: Enable livestream mode (set to `1` to enable)
 - `TB_TIMEOUT`: Global timeout in seconds (default: 1800 = 30 minutes)
 - `TB_ARGS`: Additional arguments passed to terminal-bench
@@ -99,6 +105,49 @@ Based on analysis of the Oct 30 nightly run (15-minute timeout):

 **Impact of 30-minute timeout**: Expected to reduce false timeout failures by ~50% and improve pass rates by 10-15 percentage points (from ~42% to ~52-57%).

+## Adaptive Concurrency
+
+Terminal-bench scales automatically from 1 to 16 concurrent tasks based on system load, using a **burst-and-resume pattern**:
+
+### How It Works
+
+1. **Starts with concurrency=1** and runs a burst
+2. **Monitors system load** (1-minute average) after each burst completes
+3. **Adjusts concurrency** relative to the load threshold:
+   - **Double** when load < threshold (default: 1.0)
+   - **Halve** when load > threshold
+   - **Bounded to [1, 16]** in all cases
+4. **Resumes** the run with updated concurrency (skips completed tasks)
+
+The burst-and-resume pattern leverages terminal-bench's native resume capability. Each burst runs to completion with no mid-task interruption, ensuring a clean Docker container lifecycle.
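+
+The adjustment rule between bursts reduces to roughly the following sketch (simplified from `adaptive_bench.py`; `next_concurrency` is an illustrative helper, not a function exported by the module):
+
+```python
+import os
+
+MIN_CONCURRENT, MAX_CONCURRENT = 1, 16
+
+
+def next_concurrency(current: int, load_threshold: float = 1.0) -> int:
+    """Double below the load threshold, halve above it, clamp to [1, 16]."""
+    load = os.getloadavg()[0]  # 1-minute load average
+    if load < load_threshold:
+        return min(current * 2, MAX_CONCURRENT)
+    if load > load_threshold:
+        return max(current // 2, MIN_CONCURRENT)
+    return current
+```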
+
+### Configuration
+
+```bash
+# Adjust load threshold (default: 1.0)
+TB_LOAD_THRESHOLD=2.0 make benchmark-terminal
+
+# Faster adjustments (default: 60s between bursts)
+TB_CHECK_INTERVAL=30 make benchmark-terminal
+
+# Sample 5 tasks with adaptive concurrency
+TB_SAMPLE_SIZE=5 make benchmark-terminal
+```
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `TB_LOAD_THRESHOLD` | 1.0 | Load average threshold for adjusting concurrency |
+| `TB_CHECK_INTERVAL` | 60 | Seconds to wait between bursts |
+
+### Tradeoffs
+
+- ✅ Automatically finds a workable concurrency for the hardware
+- ✅ Prevents system overload
+- ✅ Clean container lifecycle (no mid-task kills)
+- ✅ Bounded to [1, 16] for safety
+- ⚠️ Burst overhead (~2-5s, negligible for 6+ min avg tasks)
+- ⚠️ Adjustment latency = burst duration + check interval
+
 ## Files

 - `cmux_agent.py`: Main agent adapter implementing Terminal-Bench's agent interface
@@ -106,3 +155,5 @@ Based on analysis of the Oct 30 nightly run (15-minute timeout):
 - `cmux_payload.py`: Helper to package cmux app for containerized execution
 - `cmux_setup.sh.j2`: Jinja2 template for agent installation script
 - `sample_tasks.py`: Utility to randomly sample tasks from dataset
+- `adaptive_bench.py`: Adaptive concurrency wrapper using the burst-and-resume pattern
+- `adaptive_bench_test.py`: Unit tests for adaptive_bench.py
diff --git a/benchmarks/terminal_bench/adaptive_bench.py b/benchmarks/terminal_bench/adaptive_bench.py
new file mode 100755
index 000000000..8d47e1054
--- /dev/null
+++ b/benchmarks/terminal_bench/adaptive_bench.py
@@ -0,0 +1,278 @@
+#!/usr/bin/env python3
+"""
+Adaptive concurrency wrapper for terminal-bench using the burst-and-resume pattern.
+
+Runs terminal-bench in bursts with adjustable concurrency, using tb's native
+resume capability to skip completed tasks between bursts.
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+import time
+from pathlib import Path
+from typing import Optional
+
+
+class AdaptiveBench:
+    """
+    Adaptive concurrency wrapper for terminal-bench.
+
+    Concurrency is automatically bounded to [1, 16] to keep resource usage
+    predictable across different hardware configurations.
+ """ + + MIN_CONCURRENT = 1 + MAX_CONCURRENT = 16 + + def __init__( + self, + load_threshold: float, + check_interval: int, + runs_dir: Path, + tb_args: list[str], + ): + self.load_threshold = load_threshold + self.check_interval = check_interval + self.runs_dir = runs_dir + self.tb_args = tb_args + self.current_concurrent = self.MIN_CONCURRENT + self.run_id: Optional[str] = None + self.burst_count = 0 + + def get_load_avg(self) -> float: + """Get 1-minute load average.""" + return os.getloadavg()[0] + + def get_run_status(self) -> dict: + """Get status of current run by parsing results.json and tb.lock.""" + if not self.run_id: + return {"total": 0, "completed": 0, "incomplete": 0} + + try: + # Parse tb.lock to get task count + lock_path = self.runs_dir / self.run_id / "tb.lock" + if lock_path.exists(): + with open(lock_path) as f: + lock_data = json.load(f) + total_tasks = len(lock_data.get("dataset", {}).get("task_ids", [])) + else: + total_tasks = 0 + + # Count completed tasks from results.json + results_path = self.runs_dir / self.run_id / "results.json" + completed = 0 + if results_path.exists(): + with open(results_path) as f: + results_data = json.load(f) + # Count unique task_ids in results + completed = len( + set(r["task_id"] for r in results_data.get("results", [])) + ) + + return { + "total": total_tasks, + "completed": completed, + "incomplete": max(0, total_tasks - completed), + } + except Exception as e: + print(f"⚠️ Error getting run status: {e}") + return {"total": 0, "completed": 0, "incomplete": 0} + + def adjust_concurrency(self) -> bool: + """Check load and adjust concurrency. Returns True if changed.""" + load = self.get_load_avg() + old_concurrent = self.current_concurrent + + if load < self.load_threshold and self.current_concurrent < self.MAX_CONCURRENT: + self.current_concurrent = min( + self.current_concurrent * 2, self.MAX_CONCURRENT + ) + elif ( + load > self.load_threshold and self.current_concurrent > self.MIN_CONCURRENT + ): + self.current_concurrent = max( + self.current_concurrent // 2, self.MIN_CONCURRENT + ) + + if self.current_concurrent != old_concurrent: + print( + f"📊 Load: {load:.2f} (threshold: {self.load_threshold}) → " + f"Concurrency: {old_concurrent} → {self.current_concurrent}" + ) + return True + + print(f"📊 Load: {load:.2f} (threshold: {self.load_threshold}) → No change") + return False + + def run_burst(self) -> int: + """Run a single burst of terminal-bench. 
+        self.burst_count += 1
+
+        if self.burst_count == 1:
+            # First burst - create new run
+            cmd = [
+                "uvx",
+                "terminal-bench",
+                "run",
+                "--n-concurrent",
+                str(self.current_concurrent),
+                "--output-path",
+                str(self.runs_dir),
+                *self.tb_args,
+            ]
+            print(
+                f"🚀 Burst #{self.burst_count}: Starting NEW run with "
+                f"concurrency={self.current_concurrent}"
+            )
+        else:
+            # Subsequent bursts - update tb.lock BEFORE resume
+            # This ensures the resume command picks up the new concurrency
+            self._update_lock_concurrency()
+
+            # Resume existing run
+            cmd = [
+                "uvx",
+                "terminal-bench",
+                "runs",
+                "resume",
+                "--run-id",
+                self.run_id,
+                "--runs-dir",
+                str(self.runs_dir),
+            ]
+            print(
+                f"🔄 Burst #{self.burst_count}: Resuming run {self.run_id} "
+                f"with concurrency={self.current_concurrent}"
+            )
+
+        print(f"   Command: {' '.join(cmd)}")
+        burst_start = time.time()
+
+        # Run terminal-bench
+        result = subprocess.run(cmd, env=os.environ.copy())
+
+        burst_duration = time.time() - burst_start
+
+        # Capture run_id from first burst
+        if self.burst_count == 1 and result.returncode == 0:
+            # Find most recent run directory
+            if self.runs_dir.exists():
+                run_dirs = [
+                    d
+                    for d in self.runs_dir.iterdir()
+                    if d.is_dir() and (d / "tb.lock").exists()
+                ]
+                if run_dirs:
+                    # Sort by modification time and take most recent
+                    self.run_id = sorted(run_dirs, key=lambda p: p.stat().st_mtime)[
+                        -1
+                    ].name
+                    print(f"📝 Captured run_id: {self.run_id}")
+
+        print(f"⏱️ Burst #{self.burst_count} completed in {burst_duration:.1f}s")
+
+        return result.returncode
+
+    def _update_lock_concurrency(self):
+        """Update n_concurrent_trials in tb.lock for next resume."""
+        lock_path = self.runs_dir / self.run_id / "tb.lock"
+        if not lock_path.exists():
+            return
+
+        try:
+            with open(lock_path, "r") as f:
+                lock_data = json.load(f)
+
+            # Update concurrency in lock file
+            if "run_config" in lock_data:
+                lock_data["run_config"]["n_concurrent_trials"] = self.current_concurrent
+
+            with open(lock_path, "w") as f:
+                json.dump(lock_data, f, indent=2)
+
+            print(f"   Updated tb.lock with concurrency={self.current_concurrent}")
+        except Exception as e:
+            print(f"⚠️ Could not update tb.lock: {e}")
+
+    def run(self):
+        """Main loop: run bursts with adaptive concurrency."""
+        try:
+            while True:
+                # Run burst with current concurrency
+                exit_code = self.run_burst()
+
+                if exit_code != 0:
+                    print(f"❌ Terminal-bench exited with code {exit_code}")
+                    return exit_code
+
+                # Check if we're done
+                status = self.get_run_status()
+                print(
+                    f"📈 Progress: {status['completed']}/{status['total']} tasks "
+                    f"({status['incomplete']} remaining)"
+                )
+
+                if status["incomplete"] == 0:
+                    print("✅ All tasks completed!")
+                    return 0
+
+                # Wait before next burst and potentially adjust concurrency
+                print(f"⏸️ Waiting {self.check_interval}s before next burst...")
+                time.sleep(self.check_interval)
+                self.adjust_concurrency()
+
+        except KeyboardInterrupt:
+            print("\n⚠️ Received interrupt, stopping...")
+            return 130
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Run terminal-bench with adaptive concurrency (auto-scales 1-16 based on load)"
+    )
+    parser.add_argument(
+        "--load-threshold",
+        type=float,
+        default=1.0,
+        help="Load average threshold for adjusting concurrency (default: 1.0)",
+    )
+    parser.add_argument(
+        "--check-interval",
+        type=int,
+        default=60,
+        help="Seconds between bursts (default: 60)",
+    )
+    parser.add_argument(
+        "--runs-dir",
+        type=Path,
+        default=Path("runs"),
+        help="Directory for run outputs (default: runs)",
runs)", + ) + parser.add_argument( + "tb_args", + nargs=argparse.REMAINDER, + help="Arguments to pass to terminal-bench run", + ) + + args = parser.parse_args() + + # Strip leading '--' from tb_args if present + tb_args = args.tb_args + if tb_args and tb_args[0] == "--": + tb_args = tb_args[1:] + + bench = AdaptiveBench( + load_threshold=args.load_threshold, + check_interval=args.check_interval, + runs_dir=args.runs_dir, + tb_args=tb_args, + ) + + sys.exit(bench.run()) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/terminal_bench/adaptive_bench_test.py b/benchmarks/terminal_bench/adaptive_bench_test.py new file mode 100644 index 000000000..f15bffff2 --- /dev/null +++ b/benchmarks/terminal_bench/adaptive_bench_test.py @@ -0,0 +1,269 @@ +"""Tests for adaptive_bench.py""" + +import json +import os +from pathlib import Path +from unittest.mock import MagicMock, mock_open, patch + +import pytest + +from adaptive_bench import AdaptiveBench + + +class TestAdaptiveBench: + """Test suite for AdaptiveBench.""" + + def test_init(self): + """Test AdaptiveBench initialization.""" + bench = AdaptiveBench( + load_threshold=2.0, + check_interval=30, + max_concurrent=8, + runs_dir=Path("test_runs"), + tb_args=["--dataset", "test"], + ) + + assert bench.load_threshold == 2.0 + assert bench.check_interval == 30 + assert bench.max_concurrent == 8 + assert bench.runs_dir == Path("test_runs") + assert bench.tb_args == ["--dataset", "test"] + assert bench.current_concurrent == 1 + assert bench.run_id is None + assert bench.burst_count == 0 + + @patch("adaptive_bench.os.getloadavg") + def test_get_load_avg(self, mock_getloadavg): + """Test getting load average.""" + mock_getloadavg.return_value = (2.5, 2.0, 1.5) + bench = AdaptiveBench( + load_threshold=1.0, + check_interval=60, + max_concurrent=16, + runs_dir=Path("runs"), + tb_args=[], + ) + + load = bench.get_load_avg() + assert load == 2.5 + mock_getloadavg.assert_called_once() + + @patch("adaptive_bench.os.getloadavg") + def test_adjust_concurrency_increase(self, mock_getloadavg): + """Test concurrency increases when load is low.""" + mock_getloadavg.return_value = (0.5, 0.5, 0.5) + bench = AdaptiveBench( + load_threshold=1.0, + check_interval=60, + max_concurrent=16, + runs_dir=Path("runs"), + tb_args=[], + ) + + bench.current_concurrent = 2 + changed = bench.adjust_concurrency() + + assert changed is True + assert bench.current_concurrent == 4 # Doubled + + @patch("adaptive_bench.os.getloadavg") + def test_adjust_concurrency_decrease(self, mock_getloadavg): + """Test concurrency decreases when load is high.""" + mock_getloadavg.return_value = (2.0, 2.0, 2.0) + bench = AdaptiveBench( + load_threshold=1.0, + check_interval=60, + max_concurrent=16, + runs_dir=Path("runs"), + tb_args=[], + ) + + bench.current_concurrent = 8 + changed = bench.adjust_concurrency() + + assert changed is True + assert bench.current_concurrent == 4 # Halved + + @patch("adaptive_bench.os.getloadavg") + def test_adjust_concurrency_no_change(self, mock_getloadavg): + """Test concurrency stays same when load is at threshold.""" + mock_getloadavg.return_value = (1.0, 1.0, 1.0) + bench = AdaptiveBench( + load_threshold=1.0, + check_interval=60, + max_concurrent=16, + runs_dir=Path("runs"), + tb_args=[], + ) + + bench.current_concurrent = 4 + changed = bench.adjust_concurrency() + + assert changed is False + assert bench.current_concurrent == 4 + + @patch("adaptive_bench.os.getloadavg") + def test_adjust_concurrency_respects_max(self, mock_getloadavg): + """Test 
+        mock_getloadavg.return_value = (0.1, 0.1, 0.1)
+        bench = AdaptiveBench(
+            load_threshold=1.0,
+            check_interval=60,
+            runs_dir=Path("runs"),
+            tb_args=[],
+        )
+
+        bench.current_concurrent = bench.MAX_CONCURRENT
+        changed = bench.adjust_concurrency()
+
+        assert changed is False
+        assert bench.current_concurrent == bench.MAX_CONCURRENT  # Stays at max
+
+    @patch("adaptive_bench.os.getloadavg")
+    def test_adjust_concurrency_respects_min(self, mock_getloadavg):
+        """Test concurrency doesn't go below 1."""
+        mock_getloadavg.return_value = (5.0, 5.0, 5.0)
+        bench = AdaptiveBench(
+            load_threshold=1.0,
+            check_interval=60,
+            runs_dir=Path("runs"),
+            tb_args=[],
+        )
+
+        bench.current_concurrent = 1
+        changed = bench.adjust_concurrency()
+
+        assert changed is False
+        assert bench.current_concurrent == 1  # Stays at min
+
+    def test_get_run_status_no_run_id(self):
+        """Test get_run_status returns zeros when no run_id."""
+        bench = AdaptiveBench(
+            load_threshold=1.0,
+            check_interval=60,
+            runs_dir=Path("runs"),
+            tb_args=[],
+        )
+
+        status = bench.get_run_status()
+        assert status == {"total": 0, "completed": 0, "incomplete": 0}
+
+    @patch("builtins.open", new_callable=mock_open)
+    @patch("pathlib.Path.exists")
+    def test_get_run_status_with_results(self, mock_exists, mock_file):
+        """Test get_run_status parses results correctly."""
+        bench = AdaptiveBench(
+            load_threshold=1.0,
+            check_interval=60,
+            runs_dir=Path("runs"),
+            tb_args=[],
+        )
+        bench.run_id = "test-run"
+
+        # Mock tb.lock with 5 tasks
+        tb_lock_data = {
+            "dataset": {"task_ids": ["task1", "task2", "task3", "task4", "task5"]}
+        }
+
+        # Mock results.json with 3 completed tasks
+        results_data = {
+            "results": [
+                {"task_id": "task1", "resolved": True},
+                {"task_id": "task2", "resolved": False},
+                {"task_id": "task3", "resolved": True},
+            ]
+        }
+
+        # The patched Path.exists is called with no arguments, so a plain
+        # return value is enough to make both files "exist"
+        mock_exists.return_value = True
+
+        def open_side_effect(path, *args, **kwargs):
+            if "tb.lock" in str(path):
+                return mock_open(read_data=json.dumps(tb_lock_data)).return_value
+            elif "results.json" in str(path):
+                return mock_open(read_data=json.dumps(results_data)).return_value
+            return mock_open().return_value
+
+        mock_file.side_effect = open_side_effect
+
+        status = bench.get_run_status()
+
+        assert status["total"] == 5
+        assert status["completed"] == 3
+        assert status["incomplete"] == 2
+
+    @patch("adaptive_bench.subprocess.run")
+    @patch("adaptive_bench.time.time")
+    def test_run_burst_first_burst(self, mock_time, mock_subprocess):
+        """Test first burst creates new run."""
+        mock_time.side_effect = [0, 10]  # Start and end time
+        mock_subprocess.return_value = MagicMock(returncode=0)
+
+        bench = AdaptiveBench(
+            load_threshold=1.0,
+            check_interval=60,
+            runs_dir=Path("runs"),
+            tb_args=["--dataset", "test"],
+        )
+
+        with patch("pathlib.Path.exists") as mock_exists:
+            mock_exists.return_value = False
+
+            exit_code = bench.run_burst()
+
+        assert exit_code == 0
+        assert bench.burst_count == 1
+
+        # Verify command
+        call_args = mock_subprocess.call_args
+        cmd = call_args[0][0]
+        assert cmd[0] == "uvx"
+        assert cmd[1] == "terminal-bench"
+        assert cmd[2] == "run"
+        assert "--n-concurrent" in cmd
+        assert "1" in cmd  # Initial concurrency
+        assert "--dataset" in cmd
+        assert "test" in cmd
+
+    @patch("builtins.open", new_callable=mock_open)
+    @patch("pathlib.Path.exists")
+    def test_update_lock_concurrency(self, mock_exists, mock_file):
+        """Test updating tb.lock with new concurrency."""
+        bench = AdaptiveBench(
+            load_threshold=1.0,
+            check_interval=60,
+            runs_dir=Path("runs"),
+            tb_args=[],
+        )
+        bench.run_id = "test-run"
+        bench.current_concurrent = 4
+
+        mock_exists.return_value = True
+
+        lock_data = {"run_config": {"n_concurrent_trials": 1, "other_field": "value"}}
+
+        # Setup mock to return lock_data on read
+        mock_file.return_value.read.return_value = json.dumps(lock_data)
+
+        bench._update_lock_concurrency()
+
+        # json.dump writes in small chunks; join every write call to recover
+        # the full payload before parsing it back
+        written_data = "".join(
+            call.args[0] for call in mock_file.return_value.write.call_args_list
+        )
+        assert written_data, "expected tb.lock to be rewritten"
+
+        written_lock = json.loads(written_data)
+        assert written_lock["run_config"]["n_concurrent_trials"] == 4
+        assert written_lock["run_config"]["other_field"] == "value"