2222 type : string
2323 default : ' 4'
2424 livestream :
25- description : ' Enable livestream mode'
25+ description : ' Enable livestream mode (verbose output to console) '
2626 required : false
2727 type : boolean
28- default : true
28+ default : false
2929 sample_size :
3030 description : ' Number of random tasks to run (empty = all tasks)'
3131 required : false
5252 default : ' 4'
5353 type : string
5454 livestream :
55- description : ' Enable livestream mode'
55+ description : ' Enable livestream mode (verbose output to console) '
5656 required : false
57- default : true
57+ default : false
5858 type : boolean
5959 sample_size :
6060 description : ' Number of random tasks to run (empty = all tasks)'
7777 benchmark :
7878 name : Run Terminal-Bench${{ inputs.model_name && format(' ({0})', inputs.model_name) || '' }}
7979 runs-on : ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
80- # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes
81- # Allow 3 hours for safety margin and slower tasks
82- timeout-minutes : 180
80+ # Full suite (~80 tasks) at concurrency=4 takes ~60-90 minutes typically
81+ # Set 4-hour timeout to handle occasional API slowdowns while preventing infinite hangs
82+ # If consistently hitting this timeout, investigate task-level issues
83+ timeout-minutes : 240
8384 steps :
8485 - name : Checkout code
8586 uses : actions/checkout@v4
@@ -101,7 +102,7 @@ jobs:
101102 run : make build-main build-preload
102103
103104 - name : Run Terminal-Bench
104- run : make benchmark-terminal
105+ run : make benchmark-terminal 2>&1 | tee benchmark.log
105106 env :
106107 TB_DATASET : ${{ inputs.dataset }}
107108 TB_CONCURRENCY : ${{ inputs.concurrency }}
@@ -115,18 +116,12 @@ jobs:
115116 if : always()
116117 run : |
117118 echo "=== Terminal-Bench Results Summary ==="
118- if [ -f "$(find runs -name 'results.json' | head -1)" ]; then
119+ if [ -f "$(find runs -name 'results.json' 2>/dev/null | head -1)" ]; then
119120 RESULTS_FILE=$(find runs -name 'results.json' | head -1)
120- echo "Results file: $RESULTS_FILE"
121- echo ""
122- echo "Full results.json:"
123- cat "$RESULTS_FILE" | jq '.' || cat "$RESULTS_FILE"
124- echo ""
125- echo "Per-task summary:"
126- cat "$RESULTS_FILE" | jq -r '.trials[] | "\(.task_id): \(if .resolved then "✓ PASS" else "✗ FAIL" end)"' 2>/dev/null || echo "Failed to parse task details"
121+ cat "$RESULTS_FILE" | jq '{n_resolved, n_unresolved, accuracy}' 2>/dev/null || cat "$RESULTS_FILE"
127122 else
128- echo "No results.json found in runs/ "
129- ls -la runs/
123+ echo "❌ No results.json found"
124+ ls -laR runs/ 2>/dev/null || echo "runs/ directory missing"
130125 fi
131126
132127 - name : Set artifact name
@@ -149,6 +144,7 @@ jobs:
149144 name : ${{ steps.artifact-name.outputs.name }}
150145 path : |
151146 runs/
147+ benchmark.log
152148 if-no-files-found : warn
153149 retention-days : 30
154150
0 commit comments