🤖 fix: update tb.lock before resume to apply concurrency changes immediately

ammar-agent · ammar-agent · commit 9e4144ea923b · 2025-11-08T22:38:59.000Z
Addresses Codex feedback: previously, the concurrency value was written to
tb.lock at the END of a burst, while the resume command reads tb.lock at the
START of the next burst. An adjustment made between bursts therefore reached
tb.lock one burst late, so the old concurrency was still used for the burst
immediately following the adjustment.

Now updates tb.lock BEFORE calling 'terminal-bench runs resume', ensuring
the new concurrency takes effect immediately. This is critical when the system
is overloaded - we need to reduce concurrency on the very next burst, not one
burst later.

Flow before fix:
  Burst N completes → write tb.lock (old value) → adjust_concurrency()
  Burst N+1 starts → resume reads OLD tb.lock value

Flow after fix:
  adjust_concurrency() completes
  Burst N+1 starts → write tb.lock → resume reads NEW tb.lock value
diff --git a/benchmarks/terminal_bench/adaptive_bench.py b/benchmarks/terminal_bench/adaptive_bench.py
@@ -19,14 +19,14 @@
 class AdaptiveBench:
     """
     Adaptive concurrency wrapper for terminal-bench.
-    
+
     Concurrency is automatically bounded to [1, 16] for optimal performance
     across different hardware configurations.
     """
-    
+
     MIN_CONCURRENT = 1
     MAX_CONCURRENT = 16
-    
+
     def __init__(
         self,
         load_threshold: float,
@@ -90,8 +90,12 @@ def adjust_concurrency(self) -> bool:
             self.current_concurrent = min(
                 self.current_concurrent * 2, self.MAX_CONCURRENT
             )
-        elif load > self.load_threshold and self.current_concurrent > self.MIN_CONCURRENT:
-            self.current_concurrent = max(self.current_concurrent // 2, self.MIN_CONCURRENT)
+        elif (
+            load > self.load_threshold and self.current_concurrent > self.MIN_CONCURRENT
+        ):
+            self.current_concurrent = max(
+                self.current_concurrent // 2, self.MIN_CONCURRENT
+            )
 
         if self.current_concurrent != old_concurrent:
             print(
@@ -124,7 +128,11 @@ def run_burst(self) -> int:
                 f"concurrency={self.current_concurrent}"
             )
         else:
-            # Subsequent bursts - resume existing run
+            # Subsequent bursts - update tb.lock BEFORE resume
+            # This ensures the resume command picks up the new concurrency
+            self._update_lock_concurrency()
+
+            # Resume existing run
             cmd = [
                 "uvx",
                 "terminal-bench",
@@ -166,10 +174,6 @@ def run_burst(self) -> int:
 
         print(f"⏱️  Burst #{self.burst_count} completed in {burst_duration:.1f}s")
 
-        # Update n_concurrent in tb.lock for next resume
-        if self.run_id and result.returncode == 0:
-            self._update_lock_concurrency()
-
         return result.returncode
 
     def _update_lock_concurrency(self):
@@ -184,9 +188,7 @@ def _update_lock_concurrency(self):
 
             # Update concurrency in lock file
             if "run_config" in lock_data:
-                lock_data["run_config"][
-                    "n_concurrent_trials"
-                ] = self.current_concurrent
+                lock_data["run_config"]["n_concurrent_trials"] = self.current_concurrent
 
             with open(lock_path, "w") as f:
                 json.dump(lock_data, f, indent=2)
diff --git a/benchmarks/terminal_bench/adaptive_bench_test.py b/benchmarks/terminal_bench/adaptive_bench_test.py
@@ -166,9 +166,7 @@ def test_get_run_status_with_results(self, mock_exists, mock_file):
 
         # Mock tb.lock with 5 tasks
         tb_lock_data = {
-            "dataset": {
-                "task_ids": ["task1", "task2", "task3", "task4", "task5"]
-            }
+            "dataset": {"task_ids": ["task1", "task2", "task3", "task4", "task5"]}
         }
 
         # Mock results.json with 3 completed tasks
@@ -250,9 +248,7 @@ def test_update_lock_concurrency(self, mock_exists, mock_file):
 
         mock_exists.return_value = True
 
-        lock_data = {
-            "run_config": {"n_concurrent_trials": 1, "other_field": "value"}
-        }
+        lock_data = {"run_config": {"n_concurrent_trials": 1, "other_field": "value"}}
 
         # Setup mock to return lock_data on read
         mock_file.return_value.read.return_value = json.dumps(lock_data)