1616 required : false
1717 type : string
1818 default : ' terminal-bench-core==0.1.1'
19+ concurrency :
20+ description : ' Number of concurrent tasks (--n-concurrent)'
21+ required : false
22+ type : string
23+ default : ' 4'
1924 livestream :
20- description : ' Enable livestream mode (verbose output to console) '
25+ description : ' Enable livestream mode'
2126 required : false
2227 type : boolean
23- default : false
28+ default : true
2429 sample_size :
2530 description : ' Number of random tasks to run (empty = all tasks)'
2631 required : false
2732 type : string
28- load_threshold :
29- description : ' Load threshold for adaptive concurrency (default: 1.0)'
30- required : false
31- type : string
32- default : ' 1.0'
33- check_interval :
34- description : ' Seconds between adaptive bursts (default: 60)'
35- required : false
36- type : string
37- default : ' 60'
3833 extra_args :
3934 description : ' Additional arguments to pass to terminal-bench'
4035 required : false
5247 default : ' terminal-bench-core==0.1.1'
5348 type : string
5449 livestream :
55- description : ' Enable livestream mode (verbose output to console) '
50+ description : ' Enable livestream mode'
5651 required : false
57- default : false
52+ default : true
5853 type : boolean
5954 sample_size :
6055 description : ' Number of random tasks to run (empty = all tasks)'
6863 description : ' Thinking level (off, low, medium, high)'
6964 required : false
7065 type : string
66+ extra_args :
67+ description : ' Additional arguments to pass to terminal-bench'
68+ required : false
69+ type : string
7170 load_threshold :
72- description : ' Load threshold for adaptive concurrency (default: 1.0)'
71+ description : ' Load average threshold for adaptive concurrency (default: 1.0)'
7372 required : false
7473 default : ' 1.0'
7574 type : string
7675 check_interval :
77- description : ' Seconds between adaptive bursts (default: 60)'
76+ description : ' Seconds between bursts for adaptive concurrency (default: 60)'
7877 required : false
7978 default : ' 60'
8079 type : string
81- extra_args :
82- description : ' Additional arguments to pass to terminal-bench'
83- required : false
84- type : string
8580
8681jobs :
8782 benchmark :
8883 name : Run Terminal-Bench${{ inputs.model_name && format(' ({0})', inputs.model_name) || '' }}
8984 runs-on : ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
90- # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes typically
91- # Set 4-hour timeout to handle occasional API slowdowns while preventing infinite hangs
92- # If consistently hitting this timeout, investigate task-level issues
93- timeout-minutes : 240
85+ # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes
86+ # Allow 3 hours for safety margin and slower tasks
87+ timeout-minutes : 180
9488 steps :
9589 - name : Checkout code
9690 uses : actions/checkout@v4
@@ -111,8 +105,8 @@ jobs:
111105 - name : Build dist/ (skip icons - not needed for benchmark)
112106 run : make build-main build-preload
113107
114- - name : Run Terminal-Bench
115- run : make benchmark-terminal 2>&1 | tee benchmark.log
108+ - name : Run Terminal-Bench (adaptive concurrency 1-16)
109+ run : make benchmark-terminal
116110 env :
117111 TB_DATASET : ${{ inputs.dataset }}
118112 TB_LOAD_THRESHOLD : ${{ inputs.load_threshold }}
@@ -127,12 +121,18 @@ jobs:
127121 if : always()
128122 run : |
129123 echo "=== Terminal-Bench Results Summary ==="
130- if [ -f "$(find runs -name 'results.json' 2>/dev/null | head -1)" ]; then
124+ if [ -f "$(find runs -name 'results.json' | head -1)" ]; then
131125 RESULTS_FILE=$(find runs -name 'results.json' | head -1)
132- cat "$RESULTS_FILE" | jq '{n_resolved, n_unresolved, accuracy}' 2>/dev/null || cat "$RESULTS_FILE"
126+ echo "Results file: $RESULTS_FILE"
127+ echo ""
128+ echo "Full results.json:"
129+ cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
130+ echo ""
131+ echo "Per-task summary:"
132+ cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
133133 else
134- echo "❌ No results.json found"
135- ls -laR runs/ 2>/dev/null || echo "runs/ directory missing"
134+ echo "No results.json found in runs/ "
135+ ls -la runs/ 2>/dev/null || echo "runs/ directory missing"  # summary must not fail the step when runs/ was never created
136136 fi
137137
138138 - name : Set artifact name
@@ -155,7 +155,6 @@ jobs:
155155 name : ${{ steps.artifact-name.outputs.name }}
156156 path : |
157157 runs/
158- benchmark.log
159158 if-no-files-found : warn
160159 retention-days : 30
161160
0 commit comments