OpenHands · simonrosenberg · Dec 2, 2025 · Dec 3, 2025 · Dec 3, 2025 · Dec 3, 2025
diff --git a/.github/workflows/build-gaia-image.yml → .github/workflows/build-gaia-eval-image.yml b/.github/workflows/build-gaia-image.yml → .github/workflows/build-gaia-eval-image.yml
@@ -9,14 +9,6 @@ on:
         description: 'Software Agent SDK commit/ref to use'
         required: true
         type: string
-      target:
-        description: 'Build target (default: binary-minimal)'
-        required: false
-        default: 'binary-minimal'
-        type: choice
-        options:
-          - binary-minimal
-          - source-minimal
 
 concurrency:
   group: build-gaia-${{ github.ref }}
@@ -65,7 +57,7 @@ jobs:
           git add vendor/software-agent-sdk
           echo "Updated SDK submodule to $SDK_SHA (from ${{ inputs.sdk-commit }})"
 
-      - name: Set up Docker Buildx with Blacksmith
+      - name: Set up Docker Buildx
         uses: useblacksmith/setup-docker-builder@v1
 
       - name: Log in to GitHub Container Registry
@@ -88,7 +80,8 @@ jobs:
         run: |
           set -euo pipefail
 
-          TARGET="${{ inputs.target || 'binary-minimal' }}"
+          # GAIA requires 'binary' target to include Chromium for browser operations
+          TARGET="binary"
 
           CMD="uv run benchmarks/gaia/build_images.py \
             --image ghcr.io/openhands/eval-agent-server \
@@ -101,6 +94,39 @@ jobs:
           DOCKER_BUILDKIT: 1
           BUILDKIT_PROGRESS: plain
 
+      - name: Build and push GAIA image with MCP pre-installed
+        run: |
+          set -euo pipefail
+
+          # Short (7-char) SHA of the checked-out SDK submodule; the leading +/- status marker is stripped
+          SDK_SHA=$(git submodule status vendor/software-agent-sdk | awk '{print $1}' | sed 's/^[+-]//' | cut -c1-7)
+
+          # Compute base and MCP image tags. The base image is expected to be the one produced
+          # by the preceding build_images.py step, which uses the 'binary' target (includes
+          # Chromium for browser operations).
+          # NOTE(review): assumes that step tags the image "<sha>-gaia" - confirm against build_images.py.
+          BASE_IMAGE="ghcr.io/openhands/eval-agent-server:${SDK_SHA}-gaia"
+          MCP_IMAGE="ghcr.io/openhands/eval-agent-server:${SDK_SHA}-gaia-with-mcp"
+
+          echo "Building MCP-enhanced image..."
+          echo "  Base image: ${BASE_IMAGE}"
+          echo "  MCP image:  ${MCP_IMAGE}"
+
+          # Build the derived image with MCP pre-cached (Dockerfile.gaia takes the base via SDK_IMAGE)
+          docker build \
+            -f benchmarks/gaia/Dockerfile.gaia \
+            --build-arg SDK_IMAGE="${BASE_IMAGE}" \
+            -t "${MCP_IMAGE}" \
+            .
+
+          # Push the image
+          docker push "${MCP_IMAGE}"
+
+          echo "✅ MCP-enhanced image built and pushed: ${MCP_IMAGE}"
+        env:
+          DOCKER_BUILDKIT: 1
+          BUILDKIT_PROGRESS: plain
+
       - name: Archive build logs
         if: always()
         run: |
@@ -157,6 +183,7 @@ jobs:
         run: |
           # Get SDK version
           SDK_SHA=$(git submodule status vendor/software-agent-sdk | awk '{print $1}' | sed 's/^[+-]//')
+          SDK_SHA_SHORT=${SDK_SHA:0:7}
 
           # Read the single manifest file
           MANIFEST_FILE=$(find builds -name "manifest.jsonl" -type f 2>/dev/null | head -1 || true)
@@ -167,18 +194,16 @@ jobs:
           fi
 
           # Extract the image tag from the manifest
-          IMAGE_TAG=$(cat "$MANIFEST_FILE" | python3 -c "
-import sys, json
-data = json.loads(sys.stdin.read())
-tags = data.get('tags', [])
-print(tags[0] if tags else 'unknown')
-          ")
+          IMAGE_TAG=$(cat "$MANIFEST_FILE" | python3 -c "import sys, json; data = json.loads(sys.stdin.read()); tags = data.get('tags', []); print(tags[0] if tags else 'unknown')")
 
           if [ "$IMAGE_TAG" = "unknown" ]; then
             echo "No valid image tag found in manifest"
             exit 0
           fi
 
+          # Construct MCP image tag (always binary for GAIA)
+          MCP_IMAGE_TAG="ghcr.io/openhands/eval-agent-server:${SDK_SHA_SHORT}-gaia-with-mcp"
+
           # Determine trigger source
           if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
             TRIGGER="Manual trigger (workflow_dispatch)"
@@ -188,22 +213,23 @@ print(tags[0] if tags else 'unknown')
             TRIGGER="${{ github.event_name }}"
           fi
 
-          # Post comment
-          COMMENT_BODY=$(cat <<EOF
-## GAIA Image Build Complete ✅
-
-**SDK Version:** [\`${SDK_SHA:0:7}\`](https://github.com/OpenHands/software-agent-sdk/commit/${SDK_SHA})
-**Image Tag:** \`${IMAGE_TAG}\`
-**Workflow Run:** [#${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
-**Triggered by:** ${TRIGGER}
-EOF
-          )
-
+          # Post comment using jq to properly handle multi-line content
+          jq -n \
+            --arg sdk_short "${SDK_SHA_SHORT}" \
+            --arg sdk_full "${SDK_SHA}" \
+            --arg image "${IMAGE_TAG}" \
+            --arg mcp_image "${MCP_IMAGE_TAG}" \
+            --arg run_id "${{ github.run_id }}" \
+            --arg server_url "${{ github.server_url }}" \
+            --arg repo "${{ github.repository }}" \
+            --arg trigger "${TRIGGER}" \
+            '{body: "## GAIA Image Build Complete ✅\n\n**SDK Version:** [`\($sdk_short)`](https://github.com/OpenHands/software-agent-sdk/commit/\($sdk_full))\n**Base Image:** `\($image)`\n**MCP Image:** `\($mcp_image)` ⚡ _(MCP server pre-cached)_\n**Workflow Run:** [#\($run_id)](\($server_url)/\($repo)/actions/runs/\($run_id))\n**Triggered by:** \($trigger)\n\nThe MCP-enhanced image includes pre-cached `mcp-server-fetch` to eliminate 1-18 minute startup delays."}' | \
           curl -L -X POST \
             -H "Accept: application/vnd.github+json" \
             -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
             -H "X-GitHub-Api-Version: 2022-11-28" \
+            -H "Content-Type: application/json" \
             "${{ github.api_url }}/repos/${{ github.repository }}/issues/81/comments" \
-            -d "$(jq -n --arg body "$COMMENT_BODY" '{body: $body}')"
+            -d @-
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/build-swt-bench-images.yml b/.github/workflows/build-swt-bench-images.yml
@@ -0,0 +1,218 @@
+name: Build SWT-Bench Images
+
+on:
+  pull_request_target:  # runs in the base-repo context; the job below only proceeds for the 'build-swt-bench' label
+    types: [labeled]
+  workflow_dispatch:  # manual run; these inputs are empty on label-triggered runs
+    inputs:
+      dataset:
+        description: 'Dataset name (e.g., princeton-nlp/SWE-bench_Verified)'
+        required: true
+        default: 'princeton-nlp/SWE-bench_Verified'
+        type: string
+      split:
+        description: 'Dataset split'
+        required: true
+        default: 'test'
+        type: string
+      max-workers:
+        description: 'Maximum number of parallel workers'
+        required: false
+        default: '4'
+        type: string
+      n-limit:
+        description: 'Limit number of images to build (0 for all)'
+        required: false
+        default: '0'
+        type: string
+      sdk-commit:
+        description: 'Software Agent SDK commit/ref to use'
+        required: true
+        type: string
+
+concurrency:
+  group: build-swt-bench-${{ github.ref }}  # one concurrency group per ref
+  cancel-in-progress: false  # a build already running is not cancelled by a newer run
+
+jobs:
+  build-and-push:
+    if: >
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request_target' &&
+       github.event.label.name == 'build-swt-bench')
+
+    runs-on:
+      labels: ubuntu-latest
+
+    permissions:
+      contents: read
+      packages: write  # push images to ghcr.io
+      issues: write  # post the comment on the tracker issue
+    steps:
+      - name: Determine checkout ref
+        id: checkout-ref
+        run: |
+          PR_HEAD_SHA="${{ github.event.pull_request.head.sha }}"  # empty unless the event carries a pull request
+          echo "ref=${PR_HEAD_SHA}" >> "$GITHUB_OUTPUT"
+          if [ -z "$PR_HEAD_SHA" ]; then
+            echo "Using default ref (the commit that triggered this workflow)"
+          else
+            echo "Using PR head SHA: ${PR_HEAD_SHA}"
+          fi
+
+      - uses: actions/checkout@v4  # NOTE(review): on pull_request_target this checks out PR code in a job holding a packages:write token - keep the trigger label maintainer-only
+        with:
+          ref: ${{ steps.checkout-ref.outputs.ref }}  # PR head SHA, or empty for the default ref
+          submodules: recursive  # later steps read vendor/software-agent-sdk
+
+      - name: Update SDK submodule
+        if: ${{ github.event_name == 'workflow_dispatch' && inputs.sdk-commit != '' }}
+        env:
+          SDK_COMMIT: ${{ inputs.sdk-commit }}  # passed via env so the user-supplied ref is never spliced into the script text
+        run: |
+          git -C vendor/software-agent-sdk fetch origin "$SDK_COMMIT"
+          git -C vendor/software-agent-sdk checkout FETCH_HEAD
+          SDK_SHA=$(git -C vendor/software-agent-sdk rev-parse HEAD)
+          git add vendor/software-agent-sdk  # stage the new submodule pointer; nothing is committed in this step
+          echo "Updated SDK submodule to $SDK_SHA (from $SDK_COMMIT)"
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}  # job-scoped token; 'packages: write' above allows the push
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          enable-cache: true
+
+      - name: Install dependencies
+        run: |
+          make build  # NOTE(review): presumably installs the project environment via the repo Makefile - target not visible here
+
+      - name: Build and push SWT-Bench images
+        run: |
+          set -euo pipefail
+
+          # SWT-Bench uses source-minimal target (same as SWE-bench)
+          TARGET="source-minimal"
+
+          # Build argv as an array so input values are passed verbatim and never re-parsed by the shell
+          CMD=(uv run benchmarks/swt_bench/build_images.py
+            --dataset "${DATASET}"
+            --split "${SPLIT}"
+            --image ghcr.io/openhands/eval-agent-server
+            --target "${TARGET}"
+            --max-workers "${MAX_WORKERS}"
+            --push)
+
+          # Add n-limit if specified (0 means build everything)
+          if [ "$N_LIMIT" != "0" ]; then
+            CMD+=(--n-limit "${N_LIMIT}")
+          fi
+
+          echo "Running: ${CMD[*]}"
+          "${CMD[@]}"
+        env:
+          # Inputs reach the script through env, with defaults for label-triggered runs where inputs are empty
+          DATASET: ${{ inputs.dataset || 'princeton-nlp/SWE-bench_Verified' }}
+          SPLIT: ${{ inputs.split || 'test' }}
+          MAX_WORKERS: ${{ inputs.max-workers || '4' }}
+          N_LIMIT: ${{ inputs.n-limit || '0' }}
+          DOCKER_BUILDKIT: 1
+          BUILDKIT_PROGRESS: plain
+
+      - name: Archive build logs
+        if: always()
+        run: |
+          if [ ! -d builds ]; then
+            echo "No builds directory found"
+          else
+            tar -czf build-logs.tar.gz builds/
+            echo "Build logs archived successfully"
+          fi
+
+      - name: Upload build logs
+        if: always()  # upload even when an earlier step failed
+        uses: actions/upload-artifact@v4
+        with:
+          name: build-logs-${{ github.run_id }}
+          path: build-logs.tar.gz
+          retention-days: 7
+          if-no-files-found: warn  # the archive is absent when no builds/ directory was produced
+
+      - name: Display build summary
+        if: always()
+        run: |
+          MANIFEST_FILE=$(find builds -name "manifest.jsonl" -type f 2>/dev/null | head -1 || true)
+
+          if [ -z "$MANIFEST_FILE" ]; then
+            echo "## Build Summary" >> "$GITHUB_STEP_SUMMARY"
+            echo "❌ Build failed - no manifest found" >> "$GITHUB_STEP_SUMMARY"
+            exit 0
+          fi
+
+          # One manifest line per image. grep -c prints 0 itself when nothing matches (exit 1), so only the exit status is neutralised
+          TOTAL_IMAGES=$(wc -l < "$MANIFEST_FILE")
+          SUCCESS_COUNT=$(grep -cE '"error":[[:space:]]*null' "$MANIFEST_FILE" || true)
+          FAIL_COUNT=$((TOTAL_IMAGES - ${SUCCESS_COUNT:-0}))
+
+          echo "## SWT-Bench Image Build Summary" >> "$GITHUB_STEP_SUMMARY"
+          echo "" >> "$GITHUB_STEP_SUMMARY"
+          echo "**Total Images:** $TOTAL_IMAGES" >> "$GITHUB_STEP_SUMMARY"
+          echo "**Successful:** $SUCCESS_COUNT ✅" >> "$GITHUB_STEP_SUMMARY"
+          echo "**Failed:** $FAIL_COUNT ❌" >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Comment on tracker issue
+        if: success()
+        run: |
+          # Full and short SHA of the checked-out SDK submodule (leading +/- status marker stripped)
+          SDK_SHA=$(git submodule status vendor/software-agent-sdk | awk '{print $1}' | sed 's/^[+-]//')
+          SDK_SHA_SHORT=${SDK_SHA:0:7}
+
+          # Read build summary; without a manifest there is nothing to report, so skip the comment
+          MANIFEST_FILE=$(find builds -name "manifest.jsonl" -type f 2>/dev/null | head -1 || true)
+
+          if [ -z "$MANIFEST_FILE" ]; then
+            echo "No manifest file found"
+            exit 0
+          fi
+
+          TOTAL_IMAGES=$(wc -l < "$MANIFEST_FILE")
+          SUCCESS_COUNT=$(grep -cE '"error":[[:space:]]*null' "$MANIFEST_FILE" || true)  # grep -c prints 0 itself on no match
+
+          # Determine trigger source (PR runs of this workflow arrive as pull_request_target)
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            TRIGGER="Manual trigger (workflow_dispatch)"
+          elif [ "${{ github.event_name }}" = "pull_request_target" ]; then
+            TRIGGER="Pull request [#${{ github.event.pull_request.number }}](${{ github.event.pull_request.html_url }})"
+          else
+            TRIGGER="${{ github.event_name }}"
+          fi
+
+          # Build the JSON body with jq (handles escaping) and stream it to the issue-comments endpoint
+          jq -n \
+            --arg sdk_short "${SDK_SHA_SHORT}" \
+            --arg sdk_full "${SDK_SHA}" \
+            --arg total "$TOTAL_IMAGES" \
+            --arg success "$SUCCESS_COUNT" \
+            --arg run_id "${{ github.run_id }}" \
+            --arg server_url "${{ github.server_url }}" \
+            --arg repo "${{ github.repository }}" \
+            --arg trigger "${TRIGGER}" \
+            '{body: "## SWT-Bench Image Build Complete ✅\n\n**SDK Version:** [`\($sdk_short)`](https://github.com/OpenHands/software-agent-sdk/commit/\($sdk_full))\n**Images Built:** \($success)/\($total)\n**Workflow Run:** [#\($run_id)](\($server_url)/\($repo)/actions/runs/\($run_id))\n**Triggered by:** \($trigger)\n\nSWT-Bench images have been built and pushed to ghcr.io/openhands/eval-agent-server."}' | \
+          curl -L -X POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${GITHUB_TOKEN}" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            -H "Content-Type: application/json" \
+            "${{ github.api_url }}/repos/${{ github.repository }}/issues/81/comments" \
+            -d @-
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/benchmarks/gaia/Dockerfile.gaia b/benchmarks/gaia/Dockerfile.gaia
@@ -0,0 +1,17 @@
+# Dockerfile for GAIA evaluation with MCP server pre-installed
+# Extends the base SDK image (SDK_IMAGE build arg) to pre-cache mcp-server-fetch and eliminate startup delays
+
+ARG SDK_IMAGE=ghcr.io/openhands/eval-agent-server:f715937-gaia-binary-minimal
+FROM ${SDK_IMAGE}
+
+# Run the pre-cache step as root. NOTE(review): the image later runs as 'openhands' - confirm both users share the same uv cache directory
+USER root
+
+# Pre-install MCP server to avoid 1-18 minute startup delays during agent initialization.
+# '--help' exits 0 once uvx has fetched the package, so a failed download fails the build instead of being masked.
+RUN uvx mcp-server-fetch --help
+
+# Switch back to openhands user
+USER openhands
+
+# Inherit all other settings from base image