diff --git a/.github/workflows/assign-reviews.yml b/.github/workflows/assign-reviews.yml index 6a2e1485f7..5a736050dc 100644 --- a/.github/workflows/assign-reviews.yml +++ b/.github/workflows/assign-reviews.yml @@ -41,7 +41,8 @@ jobs: Find all open PRs where: 1. The PR is waiting for review (there are no open review comments or change requests) 2. The PR is in a "clean" state (CI passing, no merge conflicts) - 3. The PR has had no activity (comments, commits, reviews) for more than 3 days. + 3. The PR is not marked as draft (draft: false) + 4. The PR has had no activity (comments, commits, reviews) for more than 3 days. In this case, send a message to the reviewers: [Automatic Post]: This PR seems to be currently waiting for review. @@ -50,7 +51,7 @@ jobs: # Need Author Action Find all open PRs where the most recent change or comment was made on the pull - request more than 5 days ago. Then do the following in order: + request more than 5 days ago (use 14 days if the PR is marked as draft). And send a message to the author: @@ -60,13 +61,36 @@ jobs: # Need Reviewers - Find all open pull requests that: - 1. Have no reviewers assigned to them. - 2. Are not marked as draft. - 3. Were created more than 1 day ago. - 4. CI is passing and there are no merge conflicts. + Find all open pull requests that TRULY have NO reviewers assigned. To do this correctly: - For each of these pull requests, read the git blame information for the files, + 1. Use the GitHub API to fetch PR details: GET /repos/{owner}/{repo}/pulls/{pull_number} + 2. Check the "requested_reviewers" and "requested_teams" arrays + 3. ALSO check for submitted reviews: GET /repos/{owner}/{repo}/pulls/{pull_number}/reviews + 4. A PR needs reviewers ONLY if ALL of these are true: + - The "requested_reviewers" array is empty (no pending review requests) + - The "requested_teams" array is empty (no pending team review requests) + - The reviews array is empty (no reviews have been submitted yet) + 5. IMPORTANT: If ANY of these has entries, SKIP this PR - it already has or had reviewers! + + Example API responses showing a PR that DOES NOT need reviewers (skip this): + + Case 1 - Has requested reviewers: + GET /pulls/{number}: {"requested_reviewers": [{"login": "someuser"}], "requested_teams": []} + + Case 2 - Has submitted reviews (even if requested_reviewers is empty): + GET /pulls/{number}: {"requested_reviewers": [], "requested_teams": []} + GET /pulls/{number}/reviews: [{"user": {"login": "someuser"}, "state": "COMMENTED"}] + + Example API response showing a PR that DOES need reviewers (process this): + GET /pulls/{number}: {"requested_reviewers": [], "requested_teams": []} + GET /pulls/{number}/reviews: [] + + Additional criteria for PRs that need reviewers: + 1. Are not marked as draft (draft: false) + 2. Were created more than 1 day ago + 3. CI is passing and there are no merge conflicts + + For each PR that truly has NO reviewers, read the git blame information for the files, and find the most recent and active contributors to the file/location of the changes. Assign one of these people as a reviewer, but try not to assign too many reviews to any single person. 
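The "truly has no reviewers" check described above translates to only a couple of API calls. Below is a minimal Python sketch of that decision logic (an illustration only, not part of this diff; it assumes the `requests` library and a `GITHUB_TOKEN` environment variable, and omits the draft/age/CI criteria, which would be checked separately):

```python
# Sketch of the reviewer check from the prompt above. Hypothetical helper,
# not part of the PR. Endpoints match those named in the prompt.
import os

import requests

API = "https://api.github.com"
HEADERS = {
    "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
    "Accept": "application/vnd.github+json",
}


def needs_reviewers(owner: str, repo: str, pull_number: int) -> bool:
    """True only if the PR has no requested reviewers, no requested teams,
    and no submitted reviews."""
    pr = requests.get(
        f"{API}/repos/{owner}/{repo}/pulls/{pull_number}", headers=HEADERS
    ).json()
    if pr.get("requested_reviewers") or pr.get("requested_teams"):
        return False  # pending review requests exist - skip this PR
    reviews = requests.get(
        f"{API}/repos/{owner}/{repo}/pulls/{pull_number}/reviews", headers=HEADERS
    ).json()
    return len(reviews) == 0  # any submitted review also disqualifies the PR
```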
Add this message: diff --git a/.github/workflows/run-examples.yml b/.github/workflows/run-examples.yml index 8acb656688..89e4d45462 100644 --- a/.github/workflows/run-examples.yml +++ b/.github/workflows/run-examples.yml @@ -1,17 +1,26 @@ --- -name: Run Examples Scripts on PR +name: Run Examples Scripts on: pull_request: types: [labeled] + workflow_dispatch: + inputs: + reason: + description: Reason for manual trigger + required: true + default: '' + schedule: + - cron: 30 22 * * * # Runs at 10:30pm UTC every day permissions: contents: read pull-requests: write + issues: write jobs: test-examples: - if: github.event.label.name == 'test-examples' + if: github.event.label.name == 'test-examples' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' runs-on: blacksmith-2vcpu-ubuntu-2404 timeout-minutes: 60 steps: @@ -75,11 +84,18 @@ jobs: "examples/02_remote_agent_server/02_convo_with_docker_sandboxed_server.py" ) - # GitHub API setup - API_URL="https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/issues/${PR_NUMBER}/comments" + # GitHub API setup (only for PR events) + if [ "${{ github.event_name }}" = "pull_request" ]; then + API_URL="https://api.github.com/repos/${REPO_OWNER}/${REPO_NAME}/issues/${PR_NUMBER}/comments" + fi # Function to update PR comment update_comment() { + # Skip if not a PR event + if [ "${{ github.event_name }}" != "pull_request" ]; then + return + fi + local comment_body="$1" local response @@ -102,19 +118,31 @@ jobs: fi } + # Function to format cost with 2 decimal places + format_cost() { + local cost="$1" + if [ -z "$cost" ] || [ "$cost" = "N/A" ]; then + echo "N/A" + else + printf "\$%.2f" "$cost" 2>/dev/null || echo "N/A" + fi + } + # Function to generate markdown table generate_table() { local header="## 🔄 Running Examples with \`${LLM_MODEL}\`\n\n" header+="_Last updated: $(date -u '+%Y-%m-%d %H:%M:%S UTC')_\n\n" - header+="| Example | Status | Duration |\n" - header+="|---------|--------|----------|\n" + header+="| Example | Status | Duration | Cost |\n" + header+="|---------|--------|----------|------|\n" local rows="" for example in "${EXAMPLES[@]}"; do - local short_name="${example#examples/01_standalone_sdk/}" + # Strip examples/ prefix and show relative path from there + local short_name="${example#examples/}" local status="${TEST_STATUS[$example]:-⏳ Pending}" local duration="${TEST_DURATION[$example]:--}" - rows+="| ${short_name} | ${status} | ${duration} |\n" + local cost="${TEST_COST[$example]:--}" + rows+="| ${short_name} | ${status} | ${duration} | ${cost} |\n" done local summary="\n---\n\n" @@ -124,8 +152,14 @@ jobs: else summary+="### ❌ Some tests failed\n\n" fi - summary+="**Total:** ${#EXAMPLES[@]} | **Passed:** ${PASSED} | **Failed:** ${FAILED}\n\n" - summary+="[View full workflow run](${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID})" + summary+="**Total:** ${#EXAMPLES[@]} | **Passed:** ${PASSED} | **Failed:** ${FAILED}" + + # Calculate and display total cost if available + if [ -n "$TOTAL_COST" ]; then + summary+=" | **Total Cost:** $(format_cost $TOTAL_COST)" + fi + + summary+="\n\n[View full workflow run](${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID})" else summary+="**Progress:** ${COMPLETED}/${#EXAMPLES[@]} completed | **Passed:** ${PASSED} | **Failed:** ${FAILED}" fi @@ -136,9 +170,11 @@ jobs: # Initialize tracking variables declare -A TEST_STATUS declare -A TEST_DURATION + declare -A TEST_COST FAILED=0 PASSED=0 COMPLETED=0 + TOTAL_COST=0 FAILED_EXAMPLES=() 
RESULTS_FILE="test-results.txt" COMMENT_ID="" @@ -146,9 +182,11 @@ jobs: # Clear results file > "$RESULTS_FILE" - # Create initial comment with all tests pending - echo "Creating initial PR comment..." - update_comment "$(generate_table)" + # Create initial comment with all tests pending (only for PR events) + if [ "${{ github.event_name }}" = "pull_request" ]; then + echo "Creating initial PR comment..." + update_comment "$(generate_table)" + fi echo "==========================================" echo "Running ${#EXAMPLES[@]} examples with $LLM_MODEL" @@ -161,33 +199,60 @@ jobs: START_TIME=$(date +%s) + # Create temp file to capture output + OUTPUT_FILE=$(mktemp) + # Run example with timeout (20 minutes per example) - if timeout 1200 uv run python "$example"; then + # Capture output while still displaying it + if timeout 1200 uv run python "$example" 2>&1 | tee "$OUTPUT_FILE"; then END_TIME=$(date +%s) DURATION=$((END_TIME - START_TIME)) DURATION_STR="${DURATION}s" - echo "✓ PASSED: $example (${DURATION_STR})" + # Extract cost from output + COST=$(grep "EXAMPLE_COST:" "$OUTPUT_FILE" | awk '{print $2}' | tail -1) + if [ -z "$COST" ]; then + COST="0.00" + fi + + # Accumulate total cost + TOTAL_COST=$(echo "$TOTAL_COST + $COST" | bc -l) + + echo "✓ PASSED: $example (${DURATION_STR}, cost: \$${COST})" PASSED=$((PASSED + 1)) COMPLETED=$((COMPLETED + 1)) TEST_STATUS[$example]="✅ PASS" TEST_DURATION[$example]="${DURATION_STR}" - echo "PASS|$example|${DURATION}" >> "$RESULTS_FILE" + TEST_COST[$example]="$(format_cost $COST)" + echo "PASS|$example|${DURATION}|${COST}" >> "$RESULTS_FILE" else EXIT_CODE=$? END_TIME=$(date +%s) DURATION=$((END_TIME - START_TIME)) DURATION_STR="${DURATION}s" - echo "✗ FAILED: $example (exit code: $EXIT_CODE, ${DURATION_STR})" + # Try to extract cost even for failed tests + COST=$(grep "EXAMPLE_COST:" "$OUTPUT_FILE" | awk '{print $2}' | tail -1) + if [ -z "$COST" ]; then + COST="0.00" + fi + + # Accumulate total cost + TOTAL_COST=$(echo "$TOTAL_COST + $COST" | bc -l) + + echo "✗ FAILED: $example (exit code: $EXIT_CODE, ${DURATION_STR}, cost: \$${COST})" FAILED=$((FAILED + 1)) COMPLETED=$((COMPLETED + 1)) FAILED_EXAMPLES+=("$example") TEST_STATUS[$example]="❌ FAIL (exit: ${EXIT_CODE})" TEST_DURATION[$example]="${DURATION_STR}" - echo "FAIL|$example|$EXIT_CODE|${DURATION}" >> "$RESULTS_FILE" + TEST_COST[$example]="$(format_cost $COST)" + echo "FAIL|$example|$EXIT_CODE|${DURATION}|${COST}" >> "$RESULTS_FILE" fi + # Clean up temp file + rm -f "$OUTPUT_FILE" + # Update PR comment after each test echo "Updating PR comment..." update_comment "$(generate_table)" @@ -200,6 +265,12 @@ jobs: echo "Total: ${#EXAMPLES[@]}" echo "Passed: $PASSED" echo "Failed: $FAILED" + echo "Total Cost: $(format_cost $TOTAL_COST)" + + # Generate final report and save to file + FINAL_REPORT=$(generate_table) + echo "$FINAL_REPORT" > examples_report.md + echo "Final report saved to examples_report.md" if [ $FAILED -gt 0 ]; then echo "" @@ -212,3 +283,29 @@ jobs: echo "" echo "All examples passed! 
✓" + + - name: Read examples report for issue comment + if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' + id: read_report + run: | + if [ -f examples_report.md ]; then + REPORT_CONTENT=$(cat examples_report.md) + echo "report<> $GITHUB_OUTPUT + echo "$REPORT_CONTENT" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + else + echo "report=Report file not found" >> $GITHUB_OUTPUT + fi + + - name: Comment with results on tracker issue + if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' + uses: KeisukeYamashita/create-comment@v1 + with: + number: 976 + unique: false + comment: | + **Trigger:** ${{ github.event_name == 'schedule' && 'Nightly Scheduled Run' || format('Manual Trigger: {0}', github.event.inputs.reason) }} + **Commit:** ${{ github.sha }} + **Workflow Run:** ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + + ${{ steps.read_report.outputs.report }} diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 5fed59bd29..e3114d5339 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -3,7 +3,7 @@ name: Agent Server on: push: - branches: [main] + branches: [rel-1.0.0a6] tags: - build-docker pull_request: @@ -145,7 +145,7 @@ jobs: make test-server-schema build-and-push-image: - name: Build & Push + name: Build & Push (${{ matrix.variant }}-${{ matrix.arch }}) # Run on push events and pull requests from the same repository (not forks) # Fork PRs cannot push to GHCR and would fail authentication if: > @@ -155,33 +155,58 @@ jobs: strategy: fail-fast: false matrix: + # Explicit matrix: 3 variants × 2 architectures = 6 jobs + # Each job specifies exactly what it builds and where it runs include: - # Python + Node.js variant - - name: python + # Python variant + - variant: python + arch: amd64 + base_image: nikolaik/python-nodejs:python3.12-nodejs22 runner: blacksmith-8vcpu-ubuntu-2404 - platforms: linux/amd64,linux/arm64 + platform: linux/amd64 + + - variant: python + arch: arm64 base_image: nikolaik/python-nodejs:python3.12-nodejs22 + runner: blacksmith-8vcpu-ubuntu-2404-arm + platform: linux/arm64 - # Java variant (OpenJDK base) - - name: java + # Java variant + - variant: java + arch: amd64 + base_image: eclipse-temurin:17-jdk runner: blacksmith-8vcpu-ubuntu-2404 - platforms: linux/amd64,linux/arm64 + platform: linux/amd64 + + - variant: java + arch: arm64 base_image: eclipse-temurin:17-jdk + runner: blacksmith-8vcpu-ubuntu-2404-arm + platform: linux/arm64 - # Golang variant (Go base) - - name: golang + # Golang variant + - variant: golang + arch: amd64 + base_image: golang:1.21-bookworm runner: blacksmith-8vcpu-ubuntu-2404 - platforms: linux/amd64,linux/arm64 + platform: linux/amd64 + + - variant: golang + arch: arm64 base_image: golang:1.21-bookworm + runner: blacksmith-8vcpu-ubuntu-2404-arm + platform: linux/arm64 runs-on: ${{ matrix.runner }} env: IMAGE: ${{ inputs.image != '' && inputs.image || 'ghcr.io/openhands/agent-server' }} BASE_IMAGE: ${{ inputs.base_image != '' && inputs.base_image || matrix.base_image }} - CUSTOM_TAGS: ${{ matrix.name }} + CUSTOM_TAGS: ${{ matrix.variant }} + VARIANT: ${{ matrix.variant }} + ARCH: ${{ matrix.arch }} TARGET: binary - PLATFORMS: ${{ matrix.platforms }} + PLATFORM: ${{ matrix.platform }} GITHUB_SHA: ${{ github.sha }} GITHUB_REF: ${{ github.ref }} CI: 'true' @@ -195,7 +220,7 @@ jobs: with: version: latest - - name: Set up Docker Buildx + - name: Set up Docker Buildx with Blacksmith uses: 
useblacksmith/setup-docker-builder@v1 - name: Log in to GHCR @@ -205,43 +230,181 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Build & Push (${{ matrix.name }}) - id: build + - name: Prepare build context and metadata + id: prep run: | uv sync --frozen - uv run ./openhands-agent-server/openhands/agent_server/docker/build.py - - name: Summary (${{ matrix.name }}) - outputs + # Generate build context and tags with arch suffix + # build.py now handles architecture tagging internally via --arch flag + uv run ./openhands-agent-server/openhands/agent_server/docker/build.py \ + --build-ctx-only \ + --arch ${{ matrix.arch }} + + # Alias tags_csv output to tags for the build action + TAGS=$(grep '^tags_csv=' $GITHUB_OUTPUT | cut -d= -f2-) + echo "tags=$TAGS" >> $GITHUB_OUTPUT + + # Extract short SHA for consolidation + SHORT_SHA=$(echo ${{ github.sha }} | cut -c1-7) + echo "short_sha=$SHORT_SHA" >> $GITHUB_OUTPUT + + # Verify outputs + echo "=== Build outputs ===" + echo "Build context: $(grep '^build_context=' $GITHUB_OUTPUT | cut -d= -f2-)" + echo "Tags: $TAGS" + echo "Short SHA: $SHORT_SHA" + echo "====================" + + - name: Build & Push with Blacksmith (${{ matrix.variant }}-${{ matrix.arch }}) + id: build + uses: useblacksmith/build-push-action@v2 + with: + context: ${{ steps.prep.outputs.build_context }} + file: ${{ steps.prep.outputs.dockerfile }} + target: ${{ env.TARGET }} + platforms: ${{ env.PLATFORM }} + push: true + tags: ${{ steps.prep.outputs.tags }} + build-args: | + BASE_IMAGE=${{ env.BASE_IMAGE }} + + - name: Cleanup build context + if: always() + run: | + if [ -n "${{ steps.prep.outputs.build_context }}" ] && [ -d "${{ steps.prep.outputs.build_context }}" ]; then + echo "Cleaning up build context: ${{ steps.prep.outputs.build_context }}" + rm -rf "${{ steps.prep.outputs.build_context }}" + fi + + - name: Summary (${{ matrix.variant }}-${{ matrix.arch }}) - outputs run: | - echo "Image: ${{ steps.build.outputs.image }}" - echo "Short SHA: ${{ steps.build.outputs.short_sha }}" - echo "Versioned Tag: ${{ steps.build.outputs.versioned_tag }}" - echo "Multiline tags:" - echo "${{ steps.build.outputs.tags }}" + echo "Image: ${{ env.IMAGE }}" + echo "Variant: ${{ env.VARIANT }}" + echo "Architecture: ${{ env.ARCH }}" + echo "Platform: ${{ env.PLATFORM }}" + echo "Short SHA: ${{ steps.prep.outputs.short_sha }}" + echo "Tags: ${{ steps.prep.outputs.tags }}" + echo "Build digest: ${{ steps.build.outputs.digest }}" - name: Save build info for consolidation run: | mkdir -p build-info - cat > "build-info/${{ matrix.name }}.json" << EOF + cat > "build-info/${{ matrix.variant }}-${{ matrix.arch }}.json" << EOF { - "custom_tags": "${{ matrix.name }}", + "variant": "${{ matrix.variant }}", + "arch": "${{ matrix.arch }}", "base_image": "${{ matrix.base_image }}", - "image": "${{ steps.build.outputs.image }}", - "short_sha": "${{ steps.build.outputs.short_sha }}", - "tags": "${{ steps.build.outputs.tags_csv }}" + "image": "${{ env.IMAGE }}", + "short_sha": "${{ steps.prep.outputs.short_sha }}", + "tags": "${{ steps.prep.outputs.tags }}", + "platform": "${{ env.PLATFORM }}" } EOF - name: Upload build info artifact uses: actions/upload-artifact@v5 with: - name: build-info-${{ matrix.name }} - path: build-info/${{ matrix.name }}.json + name: build-info-${{ matrix.variant }}-${{ matrix.arch }} + path: build-info/${{ matrix.variant }}-${{ matrix.arch }}.json + retention-days: 1 + + merge-manifests: + name: Merge Multi-Arch Manifests + needs: 
build-and-push-image + if: > + github.event_name == 'push' || + (github.event_name == 'pull_request' && + !github.event.pull_request.head.repo.fork) + runs-on: blacksmith-2vcpu-ubuntu-2404 + strategy: + matrix: + variant: [python, java, golang] + env: + IMAGE: ${{ inputs.image != '' && inputs.image || 'ghcr.io/openhands/agent-server' }} + + steps: + - name: Download build info to extract SHORT_SHA + uses: actions/download-artifact@v6 + with: + pattern: build-info-${{ matrix.variant }}-* + merge-multiple: true + path: build-info + + - name: Extract SHORT_SHA from build info + id: get_sha + run: | + # Get SHORT_SHA from any build info artifact for this variant + SHORT_SHA=$(jq -r '.short_sha' build-info/${{ matrix.variant }}-amd64.json) + echo "short_sha=$SHORT_SHA" >> $GITHUB_OUTPUT + echo "Using SHORT_SHA: $SHORT_SHA" + + - name: Set up Docker Buildx with Blacksmith + uses: useblacksmith/setup-docker-builder@v1 + + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Create and push multi-arch manifest for ${{ matrix.variant }} + id: create_manifest + run: | + SHORT_SHA=${{ steps.get_sha.outputs.short_sha }} + VARIANT=${{ matrix.variant }} + MANIFEST_TAG="${SHORT_SHA}-${VARIANT}" + + # Create multi-arch manifest combining amd64 and arm64 using buildx imagetools + # This properly handles manifest lists from Blacksmith builds + echo "Creating multi-arch manifest: ${IMAGE}:${MANIFEST_TAG}" + docker buildx imagetools create -t ${IMAGE}:${MANIFEST_TAG} \ + ${IMAGE}:${SHORT_SHA}-${VARIANT}-amd64 \ + ${IMAGE}:${SHORT_SHA}-${VARIANT}-arm64 + + # Verify the multi-arch manifest + echo "Inspecting multi-arch manifest:" + docker buildx imagetools inspect ${IMAGE}:${MANIFEST_TAG} + + echo "✓ Multi-arch manifest created: ${IMAGE}:${MANIFEST_TAG}" + + # Create latest manifest if on main branch + if [ "${{ github.ref }}" == "refs/heads/main" ]; then + LATEST_TAG="latest-${VARIANT}" + echo "Creating latest multi-arch manifest: ${IMAGE}:${LATEST_TAG}" + docker buildx imagetools create -t ${IMAGE}:${LATEST_TAG} \ + ${IMAGE}:main-${VARIANT}-amd64 \ + ${IMAGE}:main-${VARIANT}-arm64 + + echo "Inspecting latest multi-arch manifest:" + docker buildx imagetools inspect ${IMAGE}:${LATEST_TAG} + echo "✓ Latest multi-arch manifest created: ${IMAGE}:${LATEST_TAG}" + + MANIFEST_TAG="${MANIFEST_TAG},${LATEST_TAG}" + fi + + # Save manifest info for consolidation + mkdir -p manifest-info + cat > "manifest-info/${VARIANT}.json" << EOF + { + "variant": "${VARIANT}", + "image": "${IMAGE}", + "short_sha": "${SHORT_SHA}", + "manifest_tag": "${MANIFEST_TAG}" + } + EOF + + - name: Upload manifest info artifact + uses: actions/upload-artifact@v5 + with: + name: manifest-info-${{ matrix.variant }} + path: manifest-info/${{ matrix.variant }}.json retention-days: 1 consolidate-build-info: name: Consolidate Build Information - needs: build-and-push-image + needs: [build-and-push-image, merge-manifests] # Run if it's a PR and the matrix job completed (even if some variants failed) if: github.event_name == 'pull_request' && always() && (needs.build-and-push-image.result == 'success' || needs.build-and-push-image.result == 'failure') @@ -249,13 +412,20 @@ jobs: outputs: build_summary: ${{ steps.consolidate.outputs.build_summary }} steps: - - name: Download all build info artifacts + - name: Download build info artifacts uses: actions/download-artifact@v6 with: pattern: build-info-* merge-multiple: true path: build-info + - 
name: Download manifest info artifacts + uses: actions/download-artifact@v6 + with: + pattern: manifest-info-* + merge-multiple: true + path: manifest-info + - name: Consolidate build information from artifacts id: consolidate run: | @@ -267,7 +437,10 @@ jobs: IMAGE="" SHORT_SHA="" ALL_TAGS="" - VARIANTS_JSON="[]" + + # Use associative arrays to track variants (bash 4+) + declare -A VARIANT_BASE_IMAGE + declare -A VARIANT_ARCHS # Process each build info for info_file in build-info/*.json; do @@ -281,7 +454,8 @@ jobs: echo "=== End of $info_file ===" # Extract information from JSON - CUSTOM_TAGS=$(jq -r '.custom_tags' "$info_file") + VARIANT=$(jq -r '.variant' "$info_file") + ARCH=$(jq -r '.arch' "$info_file") BASE_IMAGE=$(jq -r '.base_image' "$info_file") VARIANT_IMAGE=$(jq -r '.image' "$info_file") VARIANT_SHA=$(jq -r '.short_sha' "$info_file") @@ -293,14 +467,13 @@ jobs: SHORT_SHA="$VARIANT_SHA" fi - # Add to JSON array - VARIANTS_JSON=$(echo "$VARIANTS_JSON" | jq \ - --arg custom_tags "$CUSTOM_TAGS" \ - --arg base_image "$BASE_IMAGE" \ - '. += [{custom_tags: $custom_tags, base_image: $base_image}]') - - echo "Added custom_tags $CUSTOM_TAGS, current variants JSON:" - echo "$VARIANTS_JSON" | jq . + # Store variant information + VARIANT_BASE_IMAGE[$VARIANT]=$BASE_IMAGE + if [[ -z "${VARIANT_ARCHS[$VARIANT]}" ]]; then + VARIANT_ARCHS[$VARIANT]=$ARCH + else + VARIANT_ARCHS[$VARIANT]="${VARIANT_ARCHS[$VARIANT]}, $ARCH" + fi # Collect tags (comma-separated to newline-separated) if [[ -n "$VARIANT_TAGS" ]]; then @@ -313,6 +486,57 @@ jobs: fi done + # Build variants JSON array from collected data + VARIANTS_JSON="[]" + for VARIANT in "${!VARIANT_BASE_IMAGE[@]}"; do + BASE_IMG="${VARIANT_BASE_IMAGE[$VARIANT]}" + ARCHS="${VARIANT_ARCHS[$VARIANT]}" + VARIANTS_JSON=$(echo "$VARIANTS_JSON" | jq \ + --arg variant "$VARIANT" \ + --arg base_image "$BASE_IMG" \ + --arg archs "$ARCHS" \ + '. += [{custom_tags: $variant, base_image: $base_image, architectures: $archs}]') + + echo "Added variant $VARIANT ($ARCHS), current variants JSON:" + echo "$VARIANTS_JSON" | jq . + done + + # Process manifest info artifacts + echo "Processing manifest info artifacts..." 
+ if [[ -d "manifest-info" ]]; then + ls -la manifest-info/ + + MANIFEST_TAGS="" + for manifest_file in manifest-info/*.json; do + if [[ -f "$manifest_file" ]]; then + echo "=== Processing $manifest_file ===" + cat "$manifest_file" + + MANIFEST_TAG_CSV=$(jq -r '.manifest_tag' "$manifest_file") + # Convert comma-separated tags to newline-separated + MANIFEST_TAG_LIST=$(echo "$MANIFEST_TAG_CSV" | tr ',' '\n' | sed "s|^|${IMAGE}:|") + + if [[ -n "$MANIFEST_TAGS" ]]; then + MANIFEST_TAGS="${MANIFEST_TAGS}"$'\n'"${MANIFEST_TAG_LIST}" + else + MANIFEST_TAGS="$MANIFEST_TAG_LIST" + fi + fi + done + + # Add manifest tags to ALL_TAGS + if [[ -n "$MANIFEST_TAGS" ]]; then + echo "Adding manifest tags to output" + if [[ -n "$ALL_TAGS" ]]; then + ALL_TAGS="${ALL_TAGS}"$'\n'"${MANIFEST_TAGS}" + else + ALL_TAGS="$MANIFEST_TAGS" + fi + fi + else + echo "No manifest-info directory found (merge-manifests may not have run)" + fi + # Create consolidated build summary BUILD_SUMMARY=$(jq -n \ --arg image "$IMAGE" \ @@ -385,10 +609,11 @@ jobs: echo "DEBUG: Processing build JSON: $VARIANT_JSON" CUSTOM_TAGS=$(echo "$VARIANT_JSON" | jq -r '.custom_tags') BASE_IMAGE=$(echo "$VARIANT_JSON" | jq -r '.base_image') + ARCHS=$(echo "$VARIANT_JSON" | jq -r '.architectures // "amd64, arm64"') - echo "DEBUG: Adding custom_tags $CUSTOM_TAGS with base image $BASE_IMAGE" - # Add to variants table - VARIANTS_TABLE="${VARIANTS_TABLE}| ${CUSTOM_TAGS} | \`${BASE_IMAGE}\` | [Link](https://hub.docker.com/_/${BASE_IMAGE}) |"$'\n' + echo "DEBUG: Adding variant $CUSTOM_TAGS with base image $BASE_IMAGE (archs: $ARCHS)" + # Add to variants table with architecture info + VARIANTS_TABLE="${VARIANTS_TABLE}| ${CUSTOM_TAGS} | ${ARCHS} | \`${BASE_IMAGE}\` | [Link](https://hub.docker.com/_/${BASE_IMAGE}) |"$'\n' done echo "DEBUG: Final variants table:" @@ -404,12 +629,13 @@ jobs: • **GHCR package:** ${GHCR_URL} **Variants & Base Images** - | Variant | Base Image | Docs / Tags | - |---|---|---| + | Variant | Architectures | Base Image | Docs / Tags | + |---|---|---|---| ${VARIANTS_TABLE} **Pull (multi-arch manifest)** \`\`\`bash + # Each variant is a multi-arch manifest supporting both amd64 and arm64 docker pull ${IMAGE}:${SHORT_SHA}-python \`\`\` @@ -426,7 +652,10 @@ jobs: ${ALL_TAGS} \`\`\` - _The \`${SHORT_SHA}\` tag is a multi-arch manifest (amd64/arm64); your client pulls the right arch automatically._ + **About Multi-Architecture Support** + - Each variant tag (e.g., \`${SHORT_SHA}-python\`) is a **multi-arch manifest** supporting both **amd64** and **arm64** + - Docker automatically pulls the correct architecture for your platform + - Individual architecture tags (e.g., \`${SHORT_SHA}-python-amd64\`) are also available if needed EOF ) diff --git a/README.md b/README.md index 7a20ae84d7..a6b37fc65d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,28 @@ -# OpenHands Agent SDK - -Build AI agents that write software. A clean, modular SDK with production-ready tools. + + +
+ [Logo]
+
+ OpenHands Software Agent SDK
+
+ [Check out the documentation] | [MIT License] | [Join our Slack community]
+
+ Deutsch | Español | français | 日本語 | 한국어 | Português | Русский | 中文
+
The OpenHands SDK allows you to build applications with agents that write software. This SDK also powers [OpenHands](https://github.com/OpenHands/OpenHands), an all-batteries-included coding agent that you can access through a GUI, CLI, or API. @@ -25,7 +47,7 @@ from openhands.sdk import LLM, Conversation from openhands.tools.preset.default import get_default_agent # Configure LLM and create agent -llm = LLM(model="openhands/claude-sonnet-4-5-20250929", api_key=api_key) +llm = LLM(model="openhands/claude-sonnet-4-5-20250929", api_key='...') agent = get_default_agent(llm=llm) # Start a conversation @@ -65,7 +87,3 @@ For development setup, testing, and contribution guidelines, see [DEVELOPMENT.md - [Join Slack](https://openhands.dev/joinslack) - Connect with the OpenHands community - [GitHub Repository](https://github.com/OpenHands/agent-sdk) - Source code and issues - [Documentation](https://docs.openhands.dev/sdk) - Complete documentation - -## License - -MIT License - see [LICENSE](LICENSE) for details. diff --git a/examples/01_standalone_sdk/01_hello_world.py b/examples/01_standalone_sdk/01_hello_world.py index ab7d493319..105c4d10c9 100644 --- a/examples/01_standalone_sdk/01_hello_world.py +++ b/examples/01_standalone_sdk/01_hello_world.py @@ -27,3 +27,7 @@ # Send a message and let the agent run conversation.send_message("Write 3 facts about the current project into FACTS.txt.") conversation.run() + +# Report cost +cost = llm.metrics.accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/02_custom_tools.py b/examples/01_standalone_sdk/02_custom_tools.py index 6b7dc3bf34..4c399d467f 100644 --- a/examples/01_standalone_sdk/02_custom_tools.py +++ b/examples/01_standalone_sdk/02_custom_tools.py @@ -26,15 +26,14 @@ ) from openhands.tools.execute_bash import ( BashExecutor, + BashTool, ExecuteBashAction, - execute_bash_tool, ) from openhands.tools.file_editor import FileEditorTool logger = get_logger(__name__) - # --- Action / Observation --- @@ -115,6 +114,41 @@ def __call__(self, action: GrepAction, conversation=None) -> GrepObservation: # * When you are doing an open ended search that may require multiple rounds of globbing and grepping, use the Agent tool instead """ # noqa: E501 + +# --- Tool Definition --- + + +class GrepTool(ToolDefinition[GrepAction, GrepObservation]): + """A custom grep tool that searches file contents using regular expressions.""" + + @classmethod + def create( + cls, conv_state, bash_executor: BashExecutor | None = None + ) -> Sequence[ToolDefinition]: + """Create GrepTool instance with a GrepExecutor. + + Args: + conv_state: Conversation state to get working directory from. + bash_executor: Optional bash executor to reuse. If not provided, + a new one will be created. + + Returns: + A sequence containing a single GrepTool instance. + """ + if bash_executor is None: + bash_executor = BashExecutor(working_dir=conv_state.workspace.working_dir) + grep_executor = GrepExecutor(bash_executor) + + return [ + cls( + description=_GREP_DESCRIPTION, + action_type=GrepAction, + observation_type=GrepObservation, + executor=grep_executor, + ) + ] + + # Configure LLM api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." 
@@ -135,25 +169,19 @@ def _make_bash_and_grep_tools(conv_state) -> list[ToolDefinition]: """Create execute_bash and custom grep tools sharing one executor.""" bash_executor = BashExecutor(working_dir=conv_state.workspace.working_dir) - bash_tool = execute_bash_tool.set_executor(executor=bash_executor) - - grep_executor = GrepExecutor(bash_executor) - grep_tool = ToolDefinition( - name="grep", - description=_GREP_DESCRIPTION, - action_type=GrepAction, - observation_type=GrepObservation, - executor=grep_executor, - ) + # bash_tool = execute_bash_tool.set_executor(executor=bash_executor) + bash_tool = BashTool.create(conv_state, executor=bash_executor)[0] + + # Use the GrepTool.create() method with shared bash_executor + grep_tool = GrepTool.create(conv_state, bash_executor=bash_executor)[0] return [bash_tool, grep_tool] -register_tool("FileEditorTool", FileEditorTool) register_tool("BashAndGrepToolSet", _make_bash_and_grep_tools) tools = [ - Tool(name="FileEditorTool"), + Tool(name=FileEditorTool.name), Tool(name="BashAndGrepToolSet"), ] @@ -186,3 +214,7 @@ def conversation_callback(event: Event): print("Conversation finished. Got the following LLM messages:") for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") + +# Report cost +cost = llm.metrics.accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/03_activate_skill.py b/examples/01_standalone_sdk/03_activate_skill.py index ee8bc5a27a..d2da4b0a57 100644 --- a/examples/01_standalone_sdk/03_activate_skill.py +++ b/examples/01_standalone_sdk/03_activate_skill.py @@ -15,7 +15,7 @@ KeywordTrigger, Skill, ) -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool from openhands.tools.execute_bash import BashTool from openhands.tools.file_editor import FileEditorTool @@ -36,13 +36,11 @@ # Tools cwd = os.getcwd() -register_tool("BashTool", BashTool) -register_tool("FileEditorTool", FileEditorTool) tools = [ Tool( - name="BashTool", + name=BashTool.name, ), - Tool(name="FileEditorTool"), + Tool(name=FileEditorTool.name), ] agent_context = AgentContext( @@ -73,11 +71,9 @@ user_message_suffix="The first character of your response should be 'I'", ) - # Agent agent = Agent(llm=llm, tools=tools, agent_context=agent_context) - llm_messages = [] # collect raw LLM messages @@ -104,3 +100,7 @@ def conversation_callback(event: Event): print("Conversation finished. Got the following LLM messages:") for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") + +# Report cost +cost = llm.metrics.accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/04_confirmation_mode_example.py b/examples/01_standalone_sdk/04_confirmation_mode_example.py index 3b12d49adc..c34596c996 100644 --- a/examples/01_standalone_sdk/04_confirmation_mode_example.py +++ b/examples/01_standalone_sdk/04_confirmation_mode_example.py @@ -7,7 +7,10 @@ from pydantic import SecretStr from openhands.sdk import LLM, BaseConversation, Conversation -from openhands.sdk.conversation.state import AgentExecutionStatus, ConversationState +from openhands.sdk.conversation.state import ( + ConversationExecutionStatus, + ConversationState, +) from openhands.sdk.security.confirmation_policy import AlwaysConfirm, NeverConfirm from openhands.tools.preset.default import get_default_agent @@ -56,10 +59,10 @@ def run_until_finished(conversation: BaseConversation, confirmer: Callable) -> N on reject, call reject_pending_actions(). 
Preserves original error if agent waits but no actions exist. """ - while conversation.state.agent_status != AgentExecutionStatus.FINISHED: + while conversation.state.execution_status != ConversationExecutionStatus.FINISHED: if ( - conversation.state.agent_status - == AgentExecutionStatus.WAITING_FOR_CONFIRMATION + conversation.state.execution_status + == ConversationExecutionStatus.WAITING_FOR_CONFIRMATION ): pending = ConversationState.get_unmatched_actions(conversation.state.events) if not pending: @@ -125,7 +128,7 @@ def run_until_finished(conversation: BaseConversation, confirmer: Callable) -> N print("Key points:") print( "- conversation.run() creates actions; confirmation mode " - "sets agent_status=WAITING_FOR_CONFIRMATION" + "sets execution_status=WAITING_FOR_CONFIRMATION" ) print("- User confirmation is handled via a single reusable function") print("- Rejection uses conversation.reject_pending_actions() and the loop continues") diff --git a/examples/01_standalone_sdk/05_use_llm_registry.py b/examples/01_standalone_sdk/05_use_llm_registry.py index 6cad805467..f0bbfdaf4c 100644 --- a/examples/01_standalone_sdk/05_use_llm_registry.py +++ b/examples/01_standalone_sdk/05_use_llm_registry.py @@ -13,7 +13,7 @@ TextContent, get_logger, ) -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool from openhands.tools.execute_bash import BashTool @@ -42,8 +42,7 @@ # Tools cwd = os.getcwd() -register_tool("BashTool", BashTool) -tools = [Tool(name="BashTool")] +tools = [Tool(name=BashTool.name)] # Agent agent = Agent(llm=llm, tools=tools) @@ -76,15 +75,16 @@ def conversation_callback(event: Event): print(f"Same LLM instance: {llm is same_llm}") # Demonstrate requesting a completion directly from an LLM -completion_response = llm.completion( +resp = llm.completion( messages=[ Message(role="user", content=[TextContent(text="Say hello in one word.")]) ] ) -# Access the response content -raw_response = completion_response.raw_response -if raw_response.choices and raw_response.choices[0].message: # type: ignore - content = raw_response.choices[0].message.content # type: ignore - print(f"Direct completion response: {content}") -else: - print("No response content available") +# Access the response content via OpenHands LLMResponse +msg = resp.message +texts = [c.text for c in msg.content if isinstance(c, TextContent)] +print(f"Direct completion response: {texts[0] if texts else str(msg)}") + +# Report cost +cost = llm.metrics.accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/06_interactive_terminal_w_reasoning.py b/examples/01_standalone_sdk/06_interactive_terminal_w_reasoning.py index c8510d6b9c..a3b5c9c48a 100644 --- a/examples/01_standalone_sdk/06_interactive_terminal_w_reasoning.py +++ b/examples/01_standalone_sdk/06_interactive_terminal_w_reasoning.py @@ -10,7 +10,7 @@ LLMConvertibleEvent, get_logger, ) -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool from openhands.tools.execute_bash import BashTool @@ -30,10 +30,9 @@ # Tools cwd = os.getcwd() -register_tool("BashTool", BashTool) tools = [ Tool( - name="BashTool", + name=BashTool.name, params={"no_change_timeout_seconds": 3}, ) ] diff --git a/examples/01_standalone_sdk/07_mcp_integration.py b/examples/01_standalone_sdk/07_mcp_integration.py index 6a498cb8d8..5e5acf6cdf 100644 --- a/examples/01_standalone_sdk/07_mcp_integration.py +++ b/examples/01_standalone_sdk/07_mcp_integration.py @@ -11,7 +11,7 @@ get_logger, ) from 
openhands.sdk.security.llm_analyzer import LLMSecurityAnalyzer -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool from openhands.tools.execute_bash import BashTool from openhands.tools.file_editor import FileEditorTool @@ -31,11 +31,9 @@ ) cwd = os.getcwd() -register_tool("BashTool", BashTool) -register_tool("FileEditorTool", FileEditorTool) tools = [ - Tool(name="BashTool"), - Tool(name="FileEditorTool"), + Tool(name=BashTool.name), + Tool(name=FileEditorTool.name), ] # Add MCP Tools @@ -84,3 +82,7 @@ def conversation_callback(event: Event): print("Conversation finished. Got the following LLM messages:") for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") + +# Report cost +cost = llm.metrics.accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/08_mcp_with_oauth.py b/examples/01_standalone_sdk/08_mcp_with_oauth.py index 79d6be8593..364dd3b01e 100644 --- a/examples/01_standalone_sdk/08_mcp_with_oauth.py +++ b/examples/01_standalone_sdk/08_mcp_with_oauth.py @@ -10,7 +10,7 @@ LLMConvertibleEvent, get_logger, ) -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool from openhands.tools.execute_bash import BashTool from openhands.tools.file_editor import FileEditorTool @@ -30,13 +30,11 @@ ) cwd = os.getcwd() -register_tool("BashTool", BashTool) -register_tool("FileEditorTool", FileEditorTool) tools = [ Tool( - name="BashTool", + name=BashTool.name, ), - Tool(name="FileEditorTool"), + Tool(name=FileEditorTool.name), ] mcp_config = { diff --git a/examples/01_standalone_sdk/09_pause_example.py b/examples/01_standalone_sdk/09_pause_example.py index 80159ea3d0..4e2a460329 100644 --- a/examples/01_standalone_sdk/09_pause_example.py +++ b/examples/01_standalone_sdk/09_pause_example.py @@ -9,7 +9,7 @@ Agent, Conversation, ) -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool from openhands.tools.execute_bash import BashTool from openhands.tools.file_editor import FileEditorTool @@ -27,20 +27,17 @@ ) # Tools -register_tool("BashTool", BashTool) -register_tool("FileEditorTool", FileEditorTool) tools = [ Tool( - name="BashTool", + name=BashTool.name, ), - Tool(name="FileEditorTool"), + Tool(name=FileEditorTool.name), ] # Agent agent = Agent(llm=llm, tools=tools) conversation = Conversation(agent, workspace=os.getcwd()) - print("=" * 60) print("Pause and Continue Example") print("=" * 60) @@ -53,7 +50,7 @@ "one number per line. After you finish, summarize what you did." 
) -print(f"Initial status: {conversation.state.agent_status}") +print(f"Initial status: {conversation.state.execution_status}") print() # Start the agent in a background thread @@ -72,10 +69,9 @@ # Wait for the thread to finish (it will stop when paused) thread.join() -print(f"Agent status after pause: {conversation.state.agent_status}") +print(f"Agent status after pause: {conversation.state.execution_status}") print() - # Phase 3: Send a new message while paused print("Phase 3: Sending a new message while agent is paused...") conversation.send_message( @@ -86,9 +82,13 @@ # Phase 4: Resume the agent with .run() print("Phase 4: Resuming agent with .run()...") -print(f"Status before resume: {conversation.state.agent_status}") +print(f"Status before resume: {conversation.state.execution_status}") # Resume execution conversation.run() -print(f"Final status: {conversation.state.agent_status}") +print(f"Final status: {conversation.state.execution_status}") + +# Report cost +cost = llm.metrics.accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/10_persistence.py b/examples/01_standalone_sdk/10_persistence.py index f5b8b046ca..7bdc145954 100644 --- a/examples/01_standalone_sdk/10_persistence.py +++ b/examples/01_standalone_sdk/10_persistence.py @@ -11,7 +11,7 @@ LLMConvertibleEvent, get_logger, ) -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool from openhands.tools.execute_bash import BashTool from openhands.tools.file_editor import FileEditorTool @@ -32,11 +32,9 @@ # Tools cwd = os.getcwd() -register_tool("BashTool", BashTool) -register_tool("FileEditorTool", FileEditorTool) tools = [ - Tool(name="BashTool"), - Tool(name="FileEditorTool"), + Tool(name=BashTool.name), + Tool(name=FileEditorTool.name), ] # Add MCP Tools @@ -98,3 +96,7 @@ def conversation_callback(event: Event): print("Sending message to deserialized conversation...") conversation.send_message("Hey what did you create? 
Return an agent finish action") conversation.run() + +# Report cost +cost = llm.metrics.accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/11_async.py b/examples/01_standalone_sdk/11_async.py index c6fc1c45f6..7dc7c7a8be 100644 --- a/examples/01_standalone_sdk/11_async.py +++ b/examples/01_standalone_sdk/11_async.py @@ -18,7 +18,7 @@ get_logger, ) from openhands.sdk.conversation.types import ConversationCallbackType -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool from openhands.sdk.utils.async_utils import AsyncCallbackWrapper from openhands.tools.execute_bash import BashTool from openhands.tools.file_editor import FileEditorTool @@ -41,15 +41,12 @@ # Tools cwd = os.getcwd() -register_tool("BashTool", BashTool) -register_tool("FileEditorTool", FileEditorTool) -register_tool("TaskTrackerTool", TaskTrackerTool) tools = [ Tool( - name="BashTool", + name=BashTool.name, ), - Tool(name="FileEditorTool"), - Tool(name="TaskTrackerTool"), + Tool(name=FileEditorTool.name), + Tool(name=TaskTrackerTool.name), ] # Agent @@ -92,6 +89,10 @@ async def main(): for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") + # Report cost + cost = llm.metrics.accumulated_cost + print(f"EXAMPLE_COST: {cost}") + if __name__ == "__main__": asyncio.run(main()) diff --git a/examples/01_standalone_sdk/12_custom_secrets.py b/examples/01_standalone_sdk/12_custom_secrets.py index 24b4c7b6b3..2b07c45a6c 100644 --- a/examples/01_standalone_sdk/12_custom_secrets.py +++ b/examples/01_standalone_sdk/12_custom_secrets.py @@ -8,7 +8,7 @@ Conversation, ) from openhands.sdk.conversation.secret_source import SecretSource -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool from openhands.tools.execute_bash import BashTool from openhands.tools.file_editor import FileEditorTool @@ -26,11 +26,9 @@ ) # Tools -register_tool("BashTool", BashTool) -register_tool("FileEditorTool", FileEditorTool) tools = [ - Tool(name="BashTool"), - Tool(name="FileEditorTool"), + Tool(name=BashTool.name), + Tool(name=FileEditorTool.name), ] # Agent @@ -54,3 +52,7 @@ def get_value(self) -> str: conversation.send_message("just echo $SECRET_FUNCTION_TOKEN") conversation.run() + +# Report cost +cost = llm.metrics.accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/13_get_llm_metrics.py b/examples/01_standalone_sdk/13_get_llm_metrics.py index 9c011246ea..c13dde69bc 100644 --- a/examples/01_standalone_sdk/13_get_llm_metrics.py +++ b/examples/01_standalone_sdk/13_get_llm_metrics.py @@ -10,7 +10,7 @@ LLMConvertibleEvent, get_logger, ) -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool from openhands.tools.execute_bash import BashTool from openhands.tools.file_editor import FileEditorTool @@ -30,11 +30,9 @@ ) cwd = os.getcwd() -register_tool("BashTool", BashTool) -register_tool("FileEditorTool", FileEditorTool) tools = [ - Tool(name="BashTool"), - Tool(name="FileEditorTool"), + Tool(name=BashTool.name), + Tool(name=FileEditorTool.name), ] # Add MCP Tools @@ -77,3 +75,7 @@ def conversation_callback(event: Event): print( f"Conversation finished. 
Final LLM metrics with details: {llm.metrics.model_dump()}" ) + +# Report cost +cost = llm.metrics.accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/14_context_condenser.py b/examples/01_standalone_sdk/14_context_condenser.py index 41fed80bfd..ae2d3a76af 100644 --- a/examples/01_standalone_sdk/14_context_condenser.py +++ b/examples/01_standalone_sdk/14_context_condenser.py @@ -18,7 +18,7 @@ get_logger, ) from openhands.sdk.context.condenser import LLMSummarizingCondenser -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool from openhands.tools.execute_bash import BashTool from openhands.tools.file_editor import FileEditorTool from openhands.tools.task_tracker import TaskTrackerTool @@ -40,15 +40,12 @@ # Tools cwd = os.getcwd() -register_tool("BashTool", BashTool) -register_tool("FileEditorTool", FileEditorTool) -register_tool("TaskTrackerTool", TaskTrackerTool) tools = [ Tool( - name="BashTool", + name=BashTool.name, ), - Tool(name="FileEditorTool"), - Tool(name="TaskTrackerTool"), + Tool(name=FileEditorTool.name), + Tool(name=TaskTrackerTool.name), ] # Create a condenser to manage the context. The condenser will automatically truncate @@ -105,7 +102,6 @@ def conversation_callback(event: Event): ) conversation.run() - print("=" * 100) print("Conversation finished. Got the following LLM messages:") for i, message in enumerate(llm_messages): @@ -129,10 +125,13 @@ def conversation_callback(event: Event): conversation.send_message("Finally, clean up by deleting both files.") conversation.run() - print("=" * 100) print("Conversation finished with LLM Summarizing Condenser.") print(f"Total LLM messages collected: {len(llm_messages)}") print("\nThe condenser automatically summarized older conversation history") print("when the conversation exceeded the configured max_size threshold.") print("This helps manage context length while preserving important information.") + +# Report cost +cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/15_browser_use.py b/examples/01_standalone_sdk/15_browser_use.py index 19c57963af..f57c8055fb 100644 --- a/examples/01_standalone_sdk/15_browser_use.py +++ b/examples/01_standalone_sdk/15_browser_use.py @@ -10,7 +10,7 @@ LLMConvertibleEvent, get_logger, ) -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool from openhands.tools.browser_use import BrowserToolSet from openhands.tools.execute_bash import BashTool from openhands.tools.file_editor import FileEditorTool @@ -32,15 +32,12 @@ # Tools cwd = os.getcwd() -register_tool("BashTool", BashTool) -register_tool("FileEditorTool", FileEditorTool) -register_tool("BrowserToolSet", BrowserToolSet) tools = [ Tool( - name="BashTool", + name=BashTool.name, ), - Tool(name="FileEditorTool"), - Tool(name="BrowserToolSet"), + Tool(name=FileEditorTool.name), + Tool(name=BrowserToolSet.name), ] # If you need fine-grained browser control, you can manually register individual browser @@ -68,7 +65,6 @@ def conversation_callback(event: Event): ) conversation.run() - print("=" * 100) print("Conversation finished. 
Got the following LLM messages:") for i, message in enumerate(llm_messages): diff --git a/examples/01_standalone_sdk/16_llm_security_analyzer.py b/examples/01_standalone_sdk/16_llm_security_analyzer.py index 5e4af8695a..ee6cb83805 100644 --- a/examples/01_standalone_sdk/16_llm_security_analyzer.py +++ b/examples/01_standalone_sdk/16_llm_security_analyzer.py @@ -11,10 +11,13 @@ from pydantic import SecretStr from openhands.sdk import LLM, Agent, BaseConversation, Conversation -from openhands.sdk.conversation.state import AgentExecutionStatus, ConversationState +from openhands.sdk.conversation.state import ( + ConversationExecutionStatus, + ConversationState, +) from openhands.sdk.security.confirmation_policy import ConfirmRisky from openhands.sdk.security.llm_analyzer import LLMSecurityAnalyzer -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool from openhands.tools.execute_bash import BashTool from openhands.tools.file_editor import FileEditorTool @@ -65,14 +68,14 @@ def run_until_finished_with_security( """ Drive the conversation until FINISHED. - If WAITING_FOR_CONFIRMATION: ask the confirmer. - * On approve: set agent_status = IDLE (keeps original example’s behavior). + * On approve: set execution_status = IDLE (keeps original example’s behavior). * On reject: conversation.reject_pending_actions(...). - If WAITING but no pending actions: print warning and set IDLE (matches original). """ - while conversation.state.agent_status != AgentExecutionStatus.FINISHED: + while conversation.state.execution_status != ConversationExecutionStatus.FINISHED: if ( - conversation.state.agent_status - == AgentExecutionStatus.WAITING_FOR_CONFIRMATION + conversation.state.execution_status + == ConversationExecutionStatus.WAITING_FOR_CONFIRMATION ): pending = ConversationState.get_unmatched_actions(conversation.state.events) if not pending: @@ -101,13 +104,11 @@ def run_until_finished_with_security( ) # Tools -register_tool("BashTool", BashTool) -register_tool("FileEditorTool", FileEditorTool) tools = [ Tool( - name="BashTool", + name=BashTool.name, ), - Tool(name="FileEditorTool"), + Tool(name=FileEditorTool.name), ] # Agent with security analyzer diff --git a/examples/01_standalone_sdk/17_image_input.py b/examples/01_standalone_sdk/17_image_input.py index 717adfe2e3..f20236592c 100644 --- a/examples/01_standalone_sdk/17_image_input.py +++ b/examples/01_standalone_sdk/17_image_input.py @@ -19,7 +19,6 @@ TextContent, get_logger, ) -from openhands.sdk.tool.registry import register_tool from openhands.sdk.tool.spec import Tool from openhands.tools.execute_bash import BashTool from openhands.tools.file_editor import FileEditorTool @@ -43,18 +42,14 @@ cwd = os.getcwd() -register_tool("BashTool", BashTool) -register_tool("FileEditorTool", FileEditorTool) -register_tool("TaskTrackerTool", TaskTrackerTool) - agent = Agent( llm=llm, tools=[ Tool( - name="BashTool", + name=BashTool.name, ), - Tool(name="FileEditorTool"), - Tool(name="TaskTrackerTool"), + Tool(name=FileEditorTool.name), + Tool(name=TaskTrackerTool.name), ], ) @@ -93,8 +88,11 @@ def conversation_callback(event: Event) -> None: ) conversation.run() - print("=" * 100) print("Conversation finished. 
Got the following LLM messages:") for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") + +# Report cost +cost = llm.metrics.accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/18_send_message_while_processing.py b/examples/01_standalone_sdk/18_send_message_while_processing.py index 24551fcf97..1586784f9a 100644 --- a/examples/01_standalone_sdk/18_send_message_while_processing.py +++ b/examples/01_standalone_sdk/18_send_message_while_processing.py @@ -50,7 +50,7 @@ Agent, Conversation, ) -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool from openhands.tools.execute_bash import BashTool from openhands.tools.file_editor import FileEditorTool @@ -69,13 +69,11 @@ # Tools cwd = os.getcwd() -register_tool("BashTool", BashTool) -register_tool("FileEditorTool", FileEditorTool) tools = [ Tool( - name="BashTool", + name=BashTool.name, ), - Tool(name="FileEditorTool"), + Tool(name=FileEditorTool.name), ] # Agent @@ -140,3 +138,7 @@ def timestamp() -> str: os.remove(document_path) else: print("WARNING: Document.txt was not created") + +# Report cost +cost = llm.metrics.accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/19_llm_routing.py b/examples/01_standalone_sdk/19_llm_routing.py index 513103b365..166e2a2b4d 100644 --- a/examples/01_standalone_sdk/19_llm_routing.py +++ b/examples/01_standalone_sdk/19_llm_routing.py @@ -89,8 +89,11 @@ def conversation_callback(event: Event): ) conversation.run() - print("=" * 100) print("Conversation finished. Got the following LLM messages:") for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") + +# Report cost +cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/20_stuck_detector.py b/examples/01_standalone_sdk/20_stuck_detector.py index 78eb23c92f..1aacda4304 100644 --- a/examples/01_standalone_sdk/20_stuck_detector.py +++ b/examples/01_standalone_sdk/20_stuck_detector.py @@ -62,3 +62,7 @@ def conversation_callback(event: Event): print("Conversation finished. 
Got the following LLM messages:") for i, message in enumerate(llm_messages): print(f"Message {i}: {str(message)[:200]}") + +# Report cost +cost = llm.metrics.accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/21_generate_extraneous_conversation_costs.py b/examples/01_standalone_sdk/21_generate_extraneous_conversation_costs.py index 9a85e1694d..20b2e0c177 100644 --- a/examples/01_standalone_sdk/21_generate_extraneous_conversation_costs.py +++ b/examples/01_standalone_sdk/21_generate_extraneous_conversation_costs.py @@ -12,11 +12,8 @@ TextContent, get_logger, ) -from openhands.sdk.tool.registry import register_tool from openhands.sdk.tool.spec import Tool -from openhands.tools.execute_bash import ( - BashTool, -) +from openhands.tools.execute_bash import BashTool logger = get_logger(__name__) @@ -43,8 +40,6 @@ ) # Tools -register_tool("BashTool", BashTool) - condenser = LLMSummarizingCondenser(llm=llm_condenser, max_size=10, keep_first=2) cwd = os.getcwd() @@ -52,7 +47,7 @@ llm=llm, tools=[ Tool( - name="BashTool", + name=BashTool.name, ), ], condenser=condenser, @@ -67,7 +62,6 @@ ) conversation.run() - # Demonstrate extraneous costs part of the conversation second_llm = LLM( usage_id="demo-secondary", @@ -80,7 +74,6 @@ messages=[Message(role="user", content=[TextContent(text="echo 'More spend!'")])] ) - # Access total spend spend = conversation.conversation_stats.get_combined_metrics() print("\n=== Total Spend for Conversation ===\n") @@ -91,7 +84,6 @@ print(f"Cache Read Tokens: {spend.accumulated_token_usage.cache_read_tokens}") print(f"Cache Write Tokens: {spend.accumulated_token_usage.cache_write_tokens}") - spend_per_usage = conversation.conversation_stats.usage_to_metrics print("\n=== Spend Breakdown by Usage ID ===\n") rows = [] @@ -116,3 +108,7 @@ tablefmt="github", ) ) + +# Report cost +cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/22_anthropic_thinking.py b/examples/01_standalone_sdk/22_anthropic_thinking.py index 18c08b25bd..890d45b94f 100644 --- a/examples/01_standalone_sdk/22_anthropic_thinking.py +++ b/examples/01_standalone_sdk/22_anthropic_thinking.py @@ -13,7 +13,7 @@ RedactedThinkingBlock, ThinkingBlock, ) -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool from openhands.tools.execute_bash import BashTool @@ -31,8 +31,7 @@ ) # Setup agent with bash tool -register_tool("BashTool", BashTool) -agent = Agent(llm=llm, tools=[Tool(name="BashTool")]) +agent = Agent(llm=llm, tools=[Tool(name=BashTool.name)]) # Callback to display thinking blocks @@ -63,3 +62,7 @@ def show_thinking(event: Event): ) conversation.run() print("✅ Done!") + +# Report cost +cost = llm.metrics.accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/23_responses_reasoning.py b/examples/01_standalone_sdk/23_responses_reasoning.py index db0284e2b4..74ac651941 100644 --- a/examples/01_standalone_sdk/23_responses_reasoning.py +++ b/examples/01_standalone_sdk/23_responses_reasoning.py @@ -23,11 +23,10 @@ logger = get_logger(__name__) - api_key = os.getenv("LLM_API_KEY") or os.getenv("OPENAI_API_KEY") assert api_key, "Set LLM_API_KEY or OPENAI_API_KEY in your environment." 
-model = os.getenv("LLM_MODEL", "openhands/gpt-5-codex") +model = "openhands/gpt-5-mini-2025-08-07" # Use a model that supports Responses API base_url = os.getenv("LLM_BASE_URL") llm = LLM( @@ -73,3 +72,7 @@ def conversation_callback(event: Event): for i, message in enumerate(llm_messages): ms = str(message) print(f"Message {i}: {ms[:200]}{'...' if len(ms) > 200 else ''}") + +# Report cost +cost = llm.metrics.accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/24_planning_agent_workflow.py b/examples/01_standalone_sdk/24_planning_agent_workflow.py index 6c673f43d8..ada17fcd26 100644 --- a/examples/01_standalone_sdk/24_planning_agent_workflow.py +++ b/examples/01_standalone_sdk/24_planning_agent_workflow.py @@ -131,3 +131,7 @@ def get_event_content(event): for file_path in workspace_dir.rglob("*"): if file_path.is_file(): print(f" - {file_path.relative_to(workspace_dir)}") + +# Report cost +cost = llm.metrics.accumulated_cost +print(f"EXAMPLE_COST: {cost}") diff --git a/examples/02_remote_agent_server/01_convo_with_local_agent_server.py b/examples/02_remote_agent_server/01_convo_with_local_agent_server.py index e7a2b02c2e..9305e39a07 100644 --- a/examples/02_remote_agent_server/01_convo_with_local_agent_server.py +++ b/examples/02_remote_agent_server/01_convo_with_local_agent_server.py @@ -190,7 +190,7 @@ def event_callback(event): conversation.run() logger.info("✅ First task completed!") - logger.info(f"Agent status: {conversation.state.agent_status}") + logger.info(f"Agent status: {conversation.state.execution_status}") # Wait for events to stop coming (no events for 2 seconds) logger.info("⏳ Waiting for events to stop...") @@ -237,6 +237,10 @@ def event_callback(event): if isinstance(event, ConversationStateUpdateEvent): logger.info(f" - {event}") + # Report cost (must be before conversation.close()) + cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost + print(f"EXAMPLE_COST: {cost}") + finally: # Clean up print("\n🧹 Cleaning up conversation...") diff --git a/examples/02_remote_agent_server/02_convo_with_docker_sandboxed_server.py b/examples/02_remote_agent_server/02_convo_with_docker_sandboxed_server.py index c927996f50..a2bd67dfbc 100644 --- a/examples/02_remote_agent_server/02_convo_with_docker_sandboxed_server.py +++ b/examples/02_remote_agent_server/02_convo_with_docker_sandboxed_server.py @@ -16,7 +16,6 @@ logger = get_logger(__name__) - # 1) Ensure we have LLM API key api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." @@ -89,7 +88,7 @@ def event_callback(event) -> None: logger.info("🚀 Running conversation...") conversation.run() logger.info("✅ First task completed!") - logger.info(f"Agent status: {conversation.state.agent_status}") + logger.info(f"Agent status: {conversation.state.execution_status}") # Wait for events to settle (no events for 2 seconds) logger.info("⏳ Waiting for events to stop...") @@ -101,6 +100,10 @@ def event_callback(event) -> None: conversation.send_message("Great! 
Now delete that file.") conversation.run() logger.info("✅ Second task completed!") + + # Report cost (must be before conversation.close()) + cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost + print(f"EXAMPLE_COST: {cost}") finally: print("\n🧹 Cleaning up conversation...") conversation.close() diff --git a/examples/02_remote_agent_server/03_browser_use_with_docker_sandboxed_server.py b/examples/02_remote_agent_server/03_browser_use_with_docker_sandboxed_server.py index b81784cbe4..f0bc6e3dd9 100644 --- a/examples/02_remote_agent_server/03_browser_use_with_docker_sandboxed_server.py +++ b/examples/02_remote_agent_server/03_browser_use_with_docker_sandboxed_server.py @@ -12,7 +12,6 @@ logger = get_logger(__name__) - api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." diff --git a/examples/02_remote_agent_server/04_convo_with_api_sandboxed_server.py b/examples/02_remote_agent_server/04_convo_with_api_sandboxed_server.py index 8024b44d9c..b9f9538af9 100644 --- a/examples/02_remote_agent_server/04_convo_with_api_sandboxed_server.py +++ b/examples/02_remote_agent_server/04_convo_with_api_sandboxed_server.py @@ -44,7 +44,6 @@ logger.error("RUNTIME_API_KEY required") exit(1) - with APIRemoteWorkspace( runtime_api_url=os.getenv("RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"), runtime_api_key=runtime_api_key, diff --git a/examples/02_remote_agent_server/04_vscode_with_docker_sandboxed_server.py b/examples/02_remote_agent_server/04_vscode_with_docker_sandboxed_server.py index b46b295385..dfe3fff6f0 100644 --- a/examples/02_remote_agent_server/04_vscode_with_docker_sandboxed_server.py +++ b/examples/02_remote_agent_server/04_vscode_with_docker_sandboxed_server.py @@ -12,7 +12,6 @@ logger = get_logger(__name__) - api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." diff --git a/examples/03_github_workflows/01_basic_action/assign-reviews.yml b/examples/03_github_workflows/01_basic_action/assign-reviews.yml index b003c836ef..3109ff81af 100644 --- a/examples/03_github_workflows/01_basic_action/assign-reviews.yml +++ b/examples/03_github_workflows/01_basic_action/assign-reviews.yml @@ -41,7 +41,8 @@ jobs: Find all open PRs where: 1. The PR is waiting for review (there are no open review comments or change requests) 2. The PR is in a "clean" state (CI passing, no merge conflicts) - 3. The PR has had no activity (comments, commits, reviews) for more than 3 days. + 3. The PR is not marked as draft (draft: false) + 4. The PR has had no activity (comments, commits, reviews) for more than 3 days. In this case, send a message to the reviewers: [Automatic Post]: This PR seems to be currently waiting for review. @@ -50,7 +51,7 @@ jobs: # Need Author Action Find all open PRs where the most recent change or comment was made on the pull - request more than 5 days ago. Then do the following in order: + request more than 5 days ago (use 14 days if the PR is marked as draft). And send a message to the author: diff --git a/examples/03_github_workflows/03_todo_management/prompt.py b/examples/03_github_workflows/03_todo_management/prompt.py index 21326d9ac6..2a50d32a81 100644 --- a/examples/03_github_workflows/03_todo_management/prompt.py +++ b/examples/03_github_workflows/03_todo_management/prompt.py @@ -21,7 +21,6 @@ active contributors to the file/location of the changes. Assign one of these people as a reviewer. 
- Please make sure to: - Create a descriptive branch name related to the TODO - Fix the issue with clean code diff --git a/examples/03_github_workflows/04_datadog_debugging/README.md b/examples/03_github_workflows/04_datadog_debugging/README.md new file mode 100644 index 0000000000..12be38a0b2 --- /dev/null +++ b/examples/03_github_workflows/04_datadog_debugging/README.md @@ -0,0 +1,299 @@ +# Datadog Error Debugging Workflow + +This example demonstrates how to use OpenHands agents to automatically debug errors from Datadog in a GitHub Actions workflow. + +## Overview + +The workflow: +1. Fetches errors from Datadog based on configurable queries +2. Searches for or creates GitHub issues to track errors +3. Clones relevant repositories for comprehensive analysis +4. Uses OpenHands AI agents to analyze code and identify root causes +5. Posts debugging insights as comments on GitHub issues + +## Files + +- `workflow.yml` - GitHub Actions workflow with manual trigger +- `datadog_debugging.py` - Main debugging script +- `debug_prompt.jinja` - Template for AI debugging prompts + +## Features + +### Manual Trigger +Run on-demand via GitHub Actions UI with configurable inputs: +- **Query Type**: Choose between `log-query` (search) or `log-error-id` (specific error ID) +- **Datadog Query**: + - For `log-query`: Search query like `service:deploy ClientDisconnect` + - For `log-error-id`: Specific error tracking ID like `2adba034-ab5a-11f0-b04e-da7ad0900000` +- Repository list to analyze +- Issue repository for tracking +- Parent issue for organization +- LLM model selection + +### Smart Issue Management +- Searches for existing issues before creating duplicates +- Uses URL encoding for proper GitHub API queries +- Selects oldest matching issue when duplicates exist +- Links to parent tracking issue + +### Multi-Repository Analysis +- Clone multiple repositories for comprehensive context +- Agent has full view of all relevant codebases +- Identifies root causes across repository boundaries + +### AI-Powered Debugging +- Automatic code analysis using OpenHands agents +- Identifies error locations and root causes +- Provides actionable fix recommendations +- Posts detailed findings as GitHub comments + +## Setup + +### Required Secrets + +Configure these in your repository Settings → Secrets and variables → Actions: + +```yaml +DD_API_KEY: Your Datadog API key +DD_APP_KEY: Your Datadog Application key +DD_SITE: Your Datadog site (e.g., us5.datadoghq.com) +LLM_API_KEY: API key for LLM service +LLM_BASE_URL: Base URL for LLM service (optional) +``` + +**Note**: `GITHUB_TOKEN` is automatically provided by GitHub Actions. + +### Installation + +1. Copy `workflow.yml` to your repository's `.github/workflows/` directory (e.g., `.github/workflows/datadog-debugging.yml`) +2. Configure the required secrets in repository Settings → Secrets and variables → Actions +3. Optionally, customize the workflow inputs and defaults in the YAML file + +**Note**: The workflow automatically downloads the latest version of `datadog_debugging.py` and `debug_prompt.jinja` from the SDK repository at runtime. No need to copy these files to your repository unless you want to customize them. + +## Usage + +### Via GitHub Actions UI + +1. Go to the **Actions** tab in your repository +2. Select **Datadog Error Debugging** workflow +3. Click **Run workflow** +4. 
Configure inputs: + - **Query Type**: Choose `log-query` or `log-error-id` (default: `log-query`) + - **Datadog Query**: + - For `log-query`: Search query (default: `service:deploy ClientDisconnect`) + - For `log-error-id`: Error tracking ID (e.g., `2adba034-ab5a-11f0-b04e-da7ad0900000`) + - **Repository List**: Comma-separated repos to analyze (default: `OpenHands/OpenHands,All-Hands-AI/infra`) + - **Issue Repository**: Where to create issues (default: `All-Hands-AI/infra`) + - **Parent Issue**: Optional parent issue URL for tracking + - **Issue Prefix**: Prefix for issue titles (default: `DataDog Error: `) + - **LLM Model**: Model to use (default: `openhands/claude-sonnet-4-5-20250929`) +5. Click **Run workflow** + +### Via GitHub CLI + +**Search for errors matching a query:** +```bash +gh workflow run datadog-debugging.yml \ + -f query_type="log-query" \ + -f datadog_query="service:deploy ClientDisconnect" \ + -f repo_list="OpenHands/OpenHands,All-Hands-AI/infra" \ + -f issue_repo="All-Hands-AI/infra" +``` + +**Debug a specific error by ID:** +```bash +gh workflow run datadog-debugging.yml \ + -f query_type="log-error-id" \ + -f datadog_query="2adba034-ab5a-11f0-b04e-da7ad0900000" \ + -f repo_list="OpenHands/OpenHands,All-Hands-AI/infra,All-Hands-AI/deploy" \ + -f issue_repo="All-Hands-AI/infra" +``` + +## Example + +### Input (Search Query) +```yaml +query_type: "log-query" +datadog_query: "service:deploy ClientDisconnect" +repo_list: "OpenHands/OpenHands,All-Hands-AI/infra,All-Hands-AI/deploy" +issue_repo: "All-Hands-AI/infra" +issue_parent: "https://github.com/All-Hands-AI/infra/issues/672" +``` + +### Input (Specific Error ID) +```yaml +query_type: "log-error-id" +datadog_query: "2adba034-ab5a-11f0-b04e-da7ad0900000" +repo_list: "OpenHands/OpenHands,All-Hands-AI/infra,All-Hands-AI/deploy" +issue_repo: "All-Hands-AI/infra" +issue_parent: "https://github.com/All-Hands-AI/infra/issues/672" +``` + +### Output +- **Console**: Progress logs showing error fetching, repository cloning, and agent analysis +- **GitHub Issue**: Created or updated with error details +- **GitHub Comment**: AI-generated analysis with root cause and recommendations +- **Artifacts**: Debugging data and logs saved for 7 days + +### Real Example + +See a real run with production data: +- Error: `starlette.requests.ClientDisconnect` (1,526 occurrences) +- Issue: https://github.com/All-Hands-AI/infra/issues/703 +- AI Analysis: https://github.com/All-Hands-AI/infra/issues/703#issuecomment-3480707049 + +The agent identified: +- Error locations in `github.py` and `gitlab.py` +- Root cause: Unhandled `ClientDisconnect` exceptions +- Recommendations: Add proper error handling for client disconnections + +## Configuration + +### Datadog Query Examples + +```yaml +# ClientDisconnect errors +service:deploy ClientDisconnect + +# Server errors (5xx) +service:deploy http.status_code:5* + +# Database errors +service:deploy (database OR postgresql) status:error + +# Authentication errors +service:deploy (authentication OR authorization) status:error + +# Rate limit errors +service:deploy rate_limit status:error +``` + +### Repository List Format + +Comma-separated list of `owner/repo`: +``` +OpenHands/OpenHands,All-Hands-AI/infra,All-Hands-AI/deploy +``` + +### LLM Model Options + +- `openhands/claude-sonnet-4-5-20250929` - Best quality (default) +- `openhands/claude-haiku-4-5-20251001` - Faster, cheaper +- `anthropic/claude-3-5-sonnet-20241022` - Alternative + +## Workflow Details + +### Inputs + +| Input | Type | Required | 
Default | Description | +|-------|------|----------|---------|-------------| +| `datadog_query` | string | Yes | `service:deploy ClientDisconnect` | Datadog query to search for errors | +| `repo_list` | string | Yes | `OpenHands/OpenHands,All-Hands-AI/infra` | Comma-separated list of repositories | +| `issue_repo` | string | Yes | `All-Hands-AI/infra` | Repository to create/update issues in | +| `issue_parent` | string | No | - | Parent GitHub issue URL for tracking | +| `issue_prefix` | string | No | `DataDog Error: ` | Prefix for issue titles | +| `max_errors` | string | No | `5` | Maximum number of errors to fetch | +| `llm_model` | string | No | `openhands/claude-sonnet-4-5-20250929` | LLM model to use | + +### Outputs + +- **GitHub Issues**: Created or updated with error details +- **GitHub Comments**: AI analysis posted to issues +- **Artifacts**: Debugging data and logs (retained for 7 days) + +### Permissions + +```yaml +permissions: + contents: read # Clone repositories + issues: write # Create/update issues and comments +``` + +## Customization + +### For Production Use + +Consider creating a separate configuration repository with: +- Scheduled runs (daily for critical, weekly for comprehensive) +- Predefined error query categories +- Repository group definitions +- Environment-specific settings + +See the All-Hands-AI/infra example for a production-ready implementation. + +### Adding Scheduled Runs + +Add to the workflow's `on:` section: + +```yaml +on: + workflow_dispatch: + # ... existing inputs ... + + schedule: + # Daily at 09:00 UTC for critical errors + - cron: '0 9 * * *' + # Weekly on Monday at 09:00 UTC for full scan + - cron: '0 9 * * 1' +``` + +### Matrix Strategy + +Run multiple queries in parallel: + +```yaml +jobs: + debug-errors: + strategy: + matrix: + query: + - "service:deploy ClientDisconnect" + - "service:deploy http.status_code:5*" + - "service:deploy database status:error" + fail-fast: false +``` + +## Troubleshooting + +### Workflow Fails to Start +- Verify all required secrets are configured +- Check `GITHUB_TOKEN` has necessary permissions +- Review workflow syntax with `yamllint` + +### No Issues Created +- Verify issue repository exists and is accessible +- Check `GITHUB_TOKEN` has `issues: write` permission +- Review workflow logs for API errors + +### Agent Analysis Incomplete +- Increase workflow timeout if needed +- Check `LLM_API_KEY` is valid and has quota +- Try a different LLM model +- Reduce number of repositories to analyze + +### Repository Clone Failures +- Verify repository names use `owner/repo` format +- Check `GITHUB_TOKEN` has access to private repos +- Ensure repositories exist and are accessible + +## Related Examples + +- **Basic Action**: `examples/03_github_workflows/01_basic_action/` - Simple workflow example +- **PR Review**: `examples/03_github_workflows/02_pr_review/` - PR automation example +- **TODO Management**: `examples/03_github_workflows/03_todo_management/` - Automated TODO tracking + +## Benefits + +1. **Automated Debugging**: AI analyzes code without manual intervention +2. **Reduced MTTR**: Faster root cause identification +3. **Context-Aware**: Multi-repo analysis for complete picture +4. **No Duplicates**: Smart issue tracking prevents clutter +5. **Actionable Insights**: Clear recommendations for fixes +6. 
**Scalable**: Easy to add new error categories
+
+## Learn More
+
+- [Datadog API Documentation](https://docs.datadoghq.com/api/)
+- [GitHub Actions Documentation](https://docs.github.com/en/actions)
+- [OpenHands SDK Documentation](https://github.com/OpenHands/software-agent-sdk)
diff --git a/examples/03_github_workflows/04_datadog_debugging/datadog_debugging.py b/examples/03_github_workflows/04_datadog_debugging/datadog_debugging.py
new file mode 100644
index 0000000000..8f97db053d
--- /dev/null
+++ b/examples/03_github_workflows/04_datadog_debugging/datadog_debugging.py
@@ -0,0 +1,748 @@
+#!/usr/bin/env python3
+"""
+Datadog Debugging Example
+
+This example demonstrates how to use the OpenHands agent to debug errors
+logged in Datadog.
+The agent will:
+1. Query Datadog logs to understand the error using curl commands
+2. Clone relevant GitHub repositories using git commands
+3. Analyze the codebase to identify potential causes
+4. Attempt to reproduce the error
+5. Optionally create a draft PR with a fix
+
+Usage:
+ python datadog_debugging.py --query "status:error service:deploy" \\
+ --repos "All-Hands-AI/OpenHands,All-Hands-AI/deploy"
+
+Environment Variables Required:
+ - DD_API_KEY: Your Datadog API key
+ - DD_APP_KEY: Your Datadog application key
+ - DD_SITE: (optional) Datadog site (e.g., datadoghq.com, datadoghq.eu)
+ - GITHUB_TOKEN: Your GitHub personal access token
+ - LLM_API_KEY: API key for the LLM service
+"""
+
+import argparse
+import json
+import os
+import sys
+from datetime import datetime, timedelta
+from pathlib import Path
+
+import requests
+from jinja2 import Environment, FileSystemLoader
+from pydantic import SecretStr
+
+from openhands.sdk import (
+ LLM,
+ Agent,
+ Conversation,
+ Event,
+ LLMConvertibleEvent,
+ Message,
+ TextContent,
+ get_logger,
+)
+from openhands.sdk.tool import Tool, register_tool
+from openhands.tools.execute_bash import BashTool
+from openhands.tools.file_editor import FileEditorTool
+from openhands.tools.task_tracker import TaskTrackerTool
+
+
+logger = get_logger(__name__)
+
+
+def validate_environment():
+ """Validate that all required environment variables are set."""
+ required_vars = [
+ "DD_API_KEY",
+ "DD_APP_KEY",
+ "GITHUB_TOKEN",
+ "LLM_API_KEY",
+ ]
+
+ missing_vars = []
+ for var in required_vars:
+ if not os.getenv(var):
+ missing_vars.append(var)
+
+ if missing_vars:
+ print(f"❌ Missing required environment variables: {', '.join(missing_vars)}")
+ print("\nPlease set the following environment variables:")
+ for var in missing_vars:
+ print(f" export {var}=your_key_here")
+ return False
+
+ return True
+
+
+def fetch_datadog_errors(
+ query: str, working_dir: Path, query_type: str = "log-query", limit: int = 5
+) -> Path:
+ """
+ Fetch error examples from Datadog Error Tracking and save to a JSON file.
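+
+ Illustrative calls (values borrowed from the README defaults):
+ fetch_datadog_errors("service:deploy ClientDisconnect", Path("/tmp/dd"))
+ fetch_datadog_errors("2adba034-ab5a-11f0-b04e-da7ad0900000",
+ Path("/tmp/dd"), query_type="log-error-id")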
+ + Args: + query: Datadog query string (search query or error tracking ID) + working_dir: Directory to save the error examples + query_type: Type of query - "log-query" or "log-error-id" + limit: Maximum number of error examples to fetch (default: 5) + + Returns: + Path to the JSON file containing error examples + """ + dd_api_key = os.getenv("DD_API_KEY") + dd_app_key = os.getenv("DD_APP_KEY") + dd_site = os.getenv("DD_SITE", "datadoghq.com") + + error_examples = [] + + if query_type == "log-error-id": + # Fetch specific error by ID using GET endpoint + api_url = f"https://api.{dd_site}/api/v2/error-tracking/issues/{query}" + + print("📡 Fetching specific error from Datadog...") + print(f" Error ID: {query}") + print(f" API: {api_url}") + + headers = { + "DD-API-KEY": dd_api_key, + "DD-APPLICATION-KEY": dd_app_key, + } + + try: + response = requests.get(api_url, headers=headers, timeout=30) + response.raise_for_status() + except requests.exceptions.Timeout: + print("❌ Error: Request to Datadog API timed out") + sys.exit(1) + except requests.exceptions.RequestException as e: + print(f"❌ Error fetching from Datadog API: {e}") + sys.exit(1) + + try: + response_data = response.json() + except json.JSONDecodeError as e: + print(f"❌ Error parsing Datadog API response: {e}") + print(f" Response: {response.text[:500]}") + sys.exit(1) + + # Check for API errors + if "errors" in response_data: + print(f"❌ Datadog API error: {response_data['errors']}") + sys.exit(1) + + # Extract error details from GET response + data = response_data.get("data", {}) + attrs = data.get("attributes", {}) + + error_example = { + "example_number": 1, + "issue_id": query, + "service": attrs.get("service"), + "error_type": attrs.get("error_type"), + "error_message": attrs.get("error_message", ""), + "file_path": attrs.get("file_path"), + "function_name": attrs.get("function_name"), + "first_seen": attrs.get("first_seen"), + "last_seen": attrs.get("last_seen"), + "state": attrs.get("state"), + "platform": attrs.get("platform"), + "languages": attrs.get("languages", []), + } + error_examples.append(error_example) + + else: # log-query + # Use Error Tracking Search API + api_url = f"https://api.{dd_site}/api/v2/error-tracking/issues/search" + + # Calculate timestamps (30 days back) + now = int(datetime.now().timestamp() * 1000) + thirty_days_ago = int((datetime.now() - timedelta(days=30)).timestamp() * 1000) + + # Build the request body for Error Tracking API + request_body = { + "data": { + "attributes": { + "query": query, + "from": thirty_days_ago, + "to": now, + "track": "logs", # Track errors from logs + }, + "type": "search_request", + } + } + + print(f"📡 Fetching up to {limit} error tracking issues from Datadog...") + print(f" Query: {query}") + print(f" API: {api_url}") + + # Add include parameter to get full issue details + params = {"include": "issue"} + headers = { + "Content-Type": "application/json", + "DD-API-KEY": dd_api_key, + "DD-APPLICATION-KEY": dd_app_key, + } + + try: + response = requests.post( + api_url, params=params, headers=headers, json=request_body, timeout=30 + ) + response.raise_for_status() + except requests.exceptions.Timeout: + print("❌ Error: Request to Datadog API timed out") + sys.exit(1) + except requests.exceptions.RequestException as e: + print(f"❌ Error fetching from Datadog API: {e}") + sys.exit(1) + + try: + response_data = response.json() + except json.JSONDecodeError as e: + print(f"❌ Error parsing Datadog API response: {e}") + print(f" Response: {response.text[:500]}") + 
sys.exit(1) + + # Check for API errors + if "errors" in response_data: + print(f"❌ Datadog API error: {response_data['errors']}") + sys.exit(1) + + # Extract and format error tracking issues from search results + search_results = response_data.get("data", []) + included_details = { + item["id"]: item for item in response_data.get("included", []) + } + + if search_results: + for idx, search_result in enumerate(search_results[:limit], 1): + issue_id = search_result.get("id") + search_attrs = search_result.get("attributes", {}) + + # Get detailed issue info from included section + issue_details = included_details.get(issue_id, {}) + issue_attrs = issue_details.get("attributes", {}) + + error_example = { + "example_number": idx, + "issue_id": issue_id, + "total_count": search_attrs.get("total_count"), + "impacted_users": search_attrs.get("impacted_users"), + "impacted_sessions": search_attrs.get("impacted_sessions"), + "service": issue_attrs.get("service"), + "error_type": issue_attrs.get("error_type"), + "error_message": issue_attrs.get("error_message", ""), + "file_path": issue_attrs.get("file_path"), + "function_name": issue_attrs.get("function_name"), + "first_seen": issue_attrs.get("first_seen"), + "last_seen": issue_attrs.get("last_seen"), + "state": issue_attrs.get("state"), + "platform": issue_attrs.get("platform"), + "languages": issue_attrs.get("languages", []), + } + error_examples.append(error_example) + + # Save to file + errors_file = working_dir / "datadog_errors.json" + with open(errors_file, "w") as f: + json.dump( + { + "query": query, + "fetch_time": "now", + "total_examples": len(error_examples), + "examples": error_examples, + }, + f, + indent=2, + ) + + print(f"✅ Fetched {len(error_examples)} error examples") + print(f"📄 Saved to: {errors_file}") + return errors_file + + +def create_unique_identifier(query: str, errors_data: dict) -> str: + """ + Create a unique identifier for the error based on query or issue ID. + + Args: + query: The Datadog query string + errors_data: The parsed error data from datadog_errors.json + + Returns: + Unique identifier string + """ + # Check if we have a specific issue ID + examples = errors_data.get("examples", []) + if examples and examples[0].get("issue_id"): + issue_id = examples[0]["issue_id"] + return f"error-id: {issue_id}" + else: + # Use query as identifier + return f"query: {query}" + + +def search_existing_issue( + issue_repo: str, identifier: str, github_token: str +) -> int | None: + """ + Search for existing GitHub issues containing the identifier. 
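+
+ The search uses an exact-phrase GitHub search query, e.g. (hypothetical):
+ repo:All-Hands-AI/infra is:issue "error-id: 2adba034-ab5a-11f0-b04e-da7ad0900000"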
+ + Args: + issue_repo: Repository in format 'owner/repo' + identifier: Unique identifier to search for + github_token: GitHub API token + + Returns: + Issue number if found, None otherwise + """ + print(f"🔍 Searching for existing issue with identifier: {identifier}") + + # Search issues in the repository + search_query = f'repo:{issue_repo} is:issue "{identifier}"' + url = "https://api.github.com/search/issues" + headers = { + "Authorization": f"Bearer {github_token}", + "Accept": "application/vnd.github+json", + } + params = {"q": search_query} + + try: + response = requests.get(url, headers=headers, params=params, timeout=30) + response.raise_for_status() + data = response.json() + items = data.get("items", []) + if items: + # Sort by created_at to get the oldest issue (first created) + items_sorted = sorted(items, key=lambda x: x["created_at"]) + issue_number = items_sorted[0]["number"] + print(f"✅ Found existing issue #{issue_number} (oldest of {len(items)})") + return issue_number + else: + print("❌ No existing issue found") + return None + except ( + requests.exceptions.RequestException, + json.JSONDecodeError, + KeyError, + ) as e: + print(f"⚠️ Error searching for issues: {e}") + return None + + +def create_github_issue( + issue_repo: str, + title: str, + body: str, + github_token: str, +) -> int: + """ + Create a new GitHub issue. + + Args: + issue_repo: Repository in format 'owner/repo' + title: Issue title + body: Issue body content + github_token: GitHub API token + + Returns: + Created issue number + """ + print(f"📝 Creating new issue: {title}") + + url = f"https://api.github.com/repos/{issue_repo}/issues" + + headers = { + "Authorization": f"Bearer {github_token}", + "Accept": "application/vnd.github+json", + "Content-Type": "application/json", + } + payload = {"title": title, "body": body} + + try: + response = requests.post(url, headers=headers, json=payload, timeout=30) + response.raise_for_status() + except requests.exceptions.RequestException as e: + print(f"❌ Error creating issue: {e}") + if hasattr(e, "response") and e.response: + print(f"Response: {e.response.text[:500]}") + sys.exit(1) + + try: + data = response.json() + issue_number = data["number"] + issue_url = data["html_url"] + print(f"✅ Created issue #{issue_number}: {issue_url}") + return issue_number + except (json.JSONDecodeError, KeyError) as e: + print(f"❌ Error parsing response: {e}") + print(f"Response: {response.text[:500]}") + sys.exit(1) + + +def format_issue_body( + errors_data: dict, + identifier: str, + parent_issue_url: str | None, +) -> str: + """ + Format the GitHub issue body with error details. 
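+
+ The body is assembled in order: parent/identifier/query metadata, an error
+ summary, the raw error message, the full JSON inside a collapsible
+ <details> section, and a footer noting the automated agent.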
+ + Args: + errors_data: The parsed error data + identifier: Unique identifier + parent_issue_url: Optional parent issue URL + + Returns: + Formatted issue body + """ + examples = errors_data.get("examples", []) + query = errors_data.get("query", "") + + body_parts = [] + + # Add parent issue reference if provided + if parent_issue_url: + body_parts.append(f"**Parent Issue:** {parent_issue_url}\n") + + # Add identifier for searchability + body_parts.append(f"**Identifier:** `{identifier}`\n") + + # Add query info + body_parts.append(f"**Query:** `{query}`\n") + + # Add error summary + if examples: + first_example = examples[0] + body_parts.append("## Error Summary\n") + + if first_example.get("issue_id"): + body_parts.append(f"- **Issue ID:** `{first_example['issue_id']}`") + if first_example.get("total_count"): + body_parts.append( + f"- **Total Occurrences:** {first_example['total_count']}" + ) + if first_example.get("error_type"): + body_parts.append(f"- **Error Type:** `{first_example['error_type']}`") + if first_example.get("service"): + body_parts.append(f"- **Service:** `{first_example['service']}`") + if first_example.get("file_path"): + body_parts.append(f"- **File:** `{first_example['file_path']}`") + if first_example.get("function_name"): + body_parts.append(f"- **Function:** `{first_example['function_name']}`") + if first_example.get("state"): + body_parts.append(f"- **State:** {first_example['state']}") + + body_parts.append("") + + # Add error message if available + if first_example.get("error_message"): + body_parts.append("## Error Message\n") + body_parts.append("```") + body_parts.append(first_example["error_message"]) + body_parts.append("```\n") + + # Add note about full data + body_parts.append("## Full Error Data\n") + body_parts.append( + "The complete error tracking data has been saved and will be analyzed " + "by the debugging agent.\n" + ) + + # Add JSON data as collapsible section + body_parts.append("
") + body_parts.append("View Full Error Data (JSON)\n") + body_parts.append("```json") + body_parts.append(json.dumps(errors_data, indent=2)) + body_parts.append("```") + body_parts.append("
\n") + + body_parts.append("---") + body_parts.append( + "*This issue is being tracked by an automated debugging agent. " + "Analysis findings will be posted as comments below.*" + ) + + return "\n".join(body_parts) + + +def setup_github_issue( + query: str, + errors_file: Path, + issue_repo: str, + issue_prefix: str, + issue_parent: str | None, +) -> tuple[int, str]: + """ + Create or find GitHub issue for tracking debugging progress. + + Args: + query: The Datadog query + errors_file: Path to the errors JSON file + issue_repo: GitHub repository for issues + issue_prefix: Prefix for issue titles + issue_parent: Optional parent issue URL + + Returns: + Tuple of (issue_number, issue_url) + """ + github_token = os.getenv("GITHUB_TOKEN") + if not github_token: + print("❌ GITHUB_TOKEN environment variable not set") + sys.exit(1) + + # Load error data + with open(errors_file) as f: + errors_data = json.load(f) + + # Create unique identifier + identifier = create_unique_identifier(query, errors_data) + + # Search for existing issue + issue_number = search_existing_issue(issue_repo, identifier, github_token) + + if issue_number: + # Return existing issue + issue_url = f"https://github.com/{issue_repo}/issues/{issue_number}" + return issue_number, issue_url + + # Create new issue + # Determine title from error data + examples = errors_data.get("examples", []) + if examples and examples[0].get("error_type"): + error_name = examples[0]["error_type"] + else: + # Use query as fallback + error_name = query[:50] # Limit length + + title = f"{issue_prefix}{error_name}" + + # Format issue body + body = format_issue_body(errors_data, identifier, issue_parent) + + # Create issue + issue_number = create_github_issue(issue_repo, title, body, github_token) + issue_url = f"https://github.com/{issue_repo}/issues/{issue_number}" + + return issue_number, issue_url + + +def create_debugging_prompt( + query: str, repos: list[str], errors_file: Path, issue_url: str +) -> str: + """Create the debugging prompt for the agent.""" + repos_list = "\n".join(f"- {repo}" for repo in repos) + dd_site = os.getenv("DD_SITE", "datadoghq.com") + error_tracking_url = f"https://api.{dd_site}/api/v2/error-tracking/issues/search" + logs_url = f"https://api.{dd_site}/api/v2/logs/events/search" + + # Load Jinja2 template + template_dir = Path(__file__).parent + env = Environment(loader=FileSystemLoader(template_dir)) + template = env.get_template("debug_prompt.jinja") + + # Render template with context + prompt = template.render( + issue_url=issue_url, + errors_file=errors_file, + query=query, + error_tracking_url=error_tracking_url, + logs_url=logs_url, + repos_list=repos_list, + ) + + return prompt + + +def main(): + """Main function to run the Datadog debugging example.""" + parser = argparse.ArgumentParser( + description="Debug errors from Datadog logs using OpenHands agent", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "--query-type", + choices=["log-query", "log-error-id"], + default="log-query", + help=( + "Type of query: 'log-query' for search queries " + "(e.g., 'service:deploy ClientDisconnect'), " + "'log-error-id' for specific error tracking ID " + "(e.g., '2adba034-ab5a-11f0-b04e-da7ad0900000')" + ), + ) + parser.add_argument( + "--query", + required=True, + help=( + "Datadog query string. For 'log-query': search query like " + "'status:error service:deploy'. 
For 'log-error-id': " + "specific error tracking ID" + ), + ) + parser.add_argument( + "--repos", + required=True, + help="Comma-separated list of GitHub repositories to analyze " + "(e.g., 'All-Hands-AI/OpenHands,All-Hands-AI/deploy')", + ) + parser.add_argument( + "--working-dir", + default="./datadog_debug_workspace", + help="Working directory for cloning repos and analysis " + "(default: ./datadog_debug_workspace)", + ) + parser.add_argument( + "--issue-repo", + required=True, + help="GitHub repository for creating/updating issues " + "(e.g., 'All-Hands-AI/infra')", + ) + parser.add_argument( + "--issue-parent", + help="Parent issue URL to reference (e.g., " + "'https://github.com/All-Hands-AI/infra/issues/672')", + ) + parser.add_argument( + "--issue-prefix", + default="", + help="Prefix to add to issue titles (e.g., 'DataDog Error Bash: ')", + ) + + args = parser.parse_args() + + # Validate environment + if not validate_environment(): + sys.exit(1) + + # Parse repositories + repos = [repo.strip() for repo in args.repos.split(",")] + + # Create working directory + working_dir = Path(args.working_dir).resolve() + working_dir.mkdir(exist_ok=True) + + print("🔍 Starting Datadog debugging session") + print(f"📊 Query: {args.query}") + print(f"📁 Repositories: {', '.join(repos)}") + print(f"🌍 Datadog site: {os.getenv('DD_SITE', 'datadoghq.com')}") + print(f"💼 Working directory: {working_dir}") + print() + + # Fetch error examples from Datadog + errors_file = fetch_datadog_errors(args.query, working_dir, args.query_type) + print() + + # Setup GitHub issue for tracking + print("📋 Setting up GitHub issue for tracking...") + issue_number, issue_url = setup_github_issue( + args.query, + errors_file, + args.issue_repo, + args.issue_prefix, + args.issue_parent, + ) + print(f"📌 Tracking issue: {issue_url}") + print() + + # Configure LLM + api_key = os.getenv("LLM_API_KEY") + if not api_key: + print("❌ LLM_API_KEY environment variable is required") + sys.exit(1) + + # Get LLM configuration from environment + model = os.getenv("LLM_MODEL", "openhands/claude-sonnet-4-5-20250929") + base_url = os.getenv("LLM_BASE_URL") + + llm = LLM( + model=model, + base_url=base_url, + api_key=SecretStr(api_key), + ) + + # Run debugging session + run_debugging_session(llm, working_dir, args.query, repos, errors_file, issue_url) + + +def run_debugging_session( + llm: LLM, + working_dir: Path, + query: str, + repos: list[str], + errors_file: Path, + issue_url: str, +): + """Run the debugging session with the given configuration.""" + # Register and set up tools + register_tool("BashTool", BashTool) + register_tool("FileEditorTool", FileEditorTool) + register_tool("TaskTrackerTool", TaskTrackerTool) + + tools = [ + Tool(name="BashTool"), + Tool(name="FileEditorTool"), + Tool(name="TaskTrackerTool"), + ] + + # Create agent + agent = Agent(llm=llm, tools=tools) + + # Collect LLM messages for debugging + llm_messages = [] + + def conversation_callback(event: Event): + if isinstance(event, LLMConvertibleEvent): + llm_messages.append(event.to_llm_message()) + + # Start conversation with local workspace + conversation = Conversation( + agent=agent, workspace=str(working_dir), callbacks=[conversation_callback] + ) + + # Send the debugging task + debugging_prompt = create_debugging_prompt(query, repos, errors_file, issue_url) + + conversation.send_message( + message=Message( + role="user", + content=[TextContent(text=debugging_prompt)], + ) + ) + + print("🤖 Starting debugging analysis...") + try: + conversation.run() + + 
print("\n" + "=" * 80) + print("🎯 Debugging session completed!") + print(f"📁 Results saved in: {working_dir}") + print(f"💬 Total LLM messages: {len(llm_messages)}") + + # Show summary of what was accomplished + print("\n📋 Session Summary:") + print("- Queried Datadog logs for error analysis") + print("- Cloned and analyzed relevant repositories") + print("- Investigated potential root causes") + print("- Attempted error reproduction") + + # Check for cloned repositories + if working_dir.exists(): + cloned_repos = [ + d for d in working_dir.iterdir() if d.is_dir() and (d / ".git").exists() + ] + if cloned_repos: + print( + f"- Cloned repositories: {', '.join(d.name for d in cloned_repos)}" + ) + finally: + # Clean up conversation + logger.info("Closing conversation...") + conversation.close() + + +if __name__ == "__main__": + main() diff --git a/examples/03_github_workflows/04_datadog_debugging/debug_prompt.jinja b/examples/03_github_workflows/04_datadog_debugging/debug_prompt.jinja new file mode 100644 index 0000000000..b3e463b225 --- /dev/null +++ b/examples/03_github_workflows/04_datadog_debugging/debug_prompt.jinja @@ -0,0 +1,127 @@ +Your task is to debug an error from Datadog Error Tracking to find out why it is happening. + +## GitHub Issue for Tracking + +A GitHub issue has been created to track this investigation: {{ issue_url }} + +**IMPORTANT**: As you make progress in your investigation, post your findings as comments on this GitHub issue using curl commands: + +```bash +curl -X POST \ + 'https://api.github.com/repos/{REPO}/issues/{NUMBER}/comments' \ + -H 'Authorization: Bearer $GITHUB_TOKEN' \ + -H 'Accept: application/vnd.github+json' \ + -H 'Content-Type: application/json' \ + -d '{"body": "Your finding here"}' +``` + +Post updates when you: +- Complete analyzing the error data +- Find relevant code in the repositories +- Identify the root cause +- Attempt a reproduction +- Make any significant discovery + +## Error Tracking Issues + +I have already fetched error tracking issues and saved them to: `{{ errors_file }}` + +This JSON file contains: +- `query`: The Datadog query used to fetch these errors +- `total_examples`: Number of error tracking issues in the file +- `examples`: Array of error tracking issues, where each has: + - `issue_id`: Unique identifier for the aggregated error issue + - `total_count`: Total number of error occurrences + - `impacted_users`: Number of users affected + - `service`: Service name where errors occurred + - `error_type`: Type of error (e.g., exception class) + - `error_message`: Error message text + - `file_path`: Source file where error occurred + - `function_name`: Function where error occurred + - `first_seen`: Timestamp when first seen (milliseconds) + - `last_seen`: Timestamp when last seen (milliseconds) + - `state`: Issue state (OPEN, ACKNOWLEDGED, RESOLVED, etc.) + +**First, read the GitHub issue** to see the error summary, then read `{{ errors_file }}` to understand the error patterns. Error Tracking aggregates similar errors together, so each issue may represent many occurrences. + +## Additional Context + +The original Datadog query was: `{{ query }}` + +If you need more details, you can use Datadog APIs via curl commands with $DD_API_KEY and $DD_APP_KEY environment variables. 
+ 
+To search for more error tracking issues (`from`/`to` are epoch timestamps in milliseconds):
+```bash
+curl -X POST '{{ error_tracking_url }}' \
+ -H 'Content-Type: application/json' \
+ -H 'DD-API-KEY: $DD_API_KEY' \
+ -H 'DD-APPLICATION-KEY: $DD_APP_KEY' \
+ -d '{"data": {"attributes": {"query": "service:YOUR_SERVICE", "from": <FROM_TIMESTAMP_MS>, "to": <TO_TIMESTAMP_MS>, "track": "logs"}, "type": "search_request"}}'
+```
+
+To query individual log entries, use the Logs API:
+```bash
+curl -X POST '{{ logs_url }}' \
+ -H 'Content-Type: application/json' \
+ -H 'DD-API-KEY: $DD_API_KEY' \
+ -H 'DD-APPLICATION-KEY: $DD_APP_KEY' \
+ -d '{
+ "filter": {
+ "query": "YOUR_QUERY_HERE",
+ "from": "now-1d",
+ "to": "now"
+ },
+ "sort": "timestamp",
+ "page": {
+ "limit": 10
+ }
+ }'
+```
+
+The Datadog query syntax supports:
+- status:error - Find error logs
+- service:my-service - Filter by service
+- "exact phrase" - Search for exact text
+- -(status:info OR status:debug) - Exclude certain statuses
+- Use time ranges to focus on recent issues
+
+The error class that I would like you to debug is characterized by this datadog query:
+{{ query }}
+
+To clone the GitHub repositories, use git with authentication:
+```bash
+git clone https://$GITHUB_TOKEN@github.com/OWNER/REPO.git
+```
+
+The github repos that you should clone (using GITHUB_TOKEN) are the following:
+{{ repos_list }}
+
+## Debugging Steps
+
+Follow these steps systematically:
+
+1. **Read the error file** - Start by reading `{{ errors_file }}` to understand the error patterns. Examine all examples to identify:
+ - Common error messages
+ - Stack traces and their origins
+ - Affected services
+ - Timestamps (when did errors start?)
+
+2. **Analyze the timeline** - Check when the error class started occurring/becoming frequent. Look at the timestamps in the error examples. This helps identify what code changes or deployment may have caused the issue. Code changed during the release cycle before the error occurred will be most suspicious.
+
+3. **Clone repositories** - Clone the relevant repositories using:
+ ```bash
+ git clone https://$GITHUB_TOKEN@github.com/OWNER/REPO.git
+ ```
+
+4. **Investigate the codebase** - Carefully read the code related to the error. Look at:
+ - Files mentioned in stack traces
+ - Recent commits (use git log)
+ - Related code paths
+
+5. **Develop hypotheses** - Think of 5 possible root causes and write sample code to test each hypothesis. Try to reproduce the error.
+
+6. **Create fix or summarize** - Based on your findings:
+ - If reproducible: Create a fix and optionally open a draft PR
+ - If not reproducible: Summarize your investigation, findings, and recommendations
+
+**Important**: Use the task_tracker tool to organize your work and keep track of your progress through these steps.
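For reference, here is a minimal sketch of how this template is rendered, mirroring the `create_debugging_prompt()` helper in `datadog_debugging.py` above. It assumes the template sits next to the script, and every value passed to `render()` is an illustrative placeholder rather than output from a real run:

```python
# A sketch of rendering debug_prompt.jinja, following create_debugging_prompt().
from pathlib import Path

from jinja2 import Environment, FileSystemLoader

template_dir = Path(__file__).parent  # assumes the template is co-located
env = Environment(loader=FileSystemLoader(template_dir))
template = env.get_template("debug_prompt.jinja")

prompt = template.render(
    issue_url="https://github.com/example-org/example-repo/issues/1",  # placeholder
    errors_file=Path("/tmp/datadog-debug/datadog_errors.json"),  # placeholder path
    query="service:deploy ClientDisconnect",
    error_tracking_url="https://api.datadoghq.com/api/v2/error-tracking/issues/search",
    logs_url="https://api.datadoghq.com/api/v2/logs/events/search",
    repos_list="- OpenHands/OpenHands\n- All-Hands-AI/infra",
)
print(prompt[:400])  # preview the first part of the rendered prompt
```

Rendering with keyword arguments keeps the template self-documenting: each `{{ ... }}` slot in `debug_prompt.jinja` corresponds to one named parameter above.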
diff --git a/examples/03_github_workflows/04_datadog_debugging/workflow.yml b/examples/03_github_workflows/04_datadog_debugging/workflow.yml new file mode 100644 index 0000000000..757e48d40f --- /dev/null +++ b/examples/03_github_workflows/04_datadog_debugging/workflow.yml @@ -0,0 +1,110 @@ +--- +name: Datadog Error Debugging + +on: + workflow_dispatch: + inputs: + query_type: + description: 'Query type: log-query (search) or log-error-id (specific ID)' + required: true + type: choice + options: + - log-query + - log-error-id + default: log-query + datadog_query: + description: >- + Datadog query (search query for log-query mode, + or error tracking ID for log-error-id mode) + required: true + default: service:deploy ClientDisconnect + repo_list: + description: Comma-separated list of repositories to clone (owner/repo) + required: true + default: OpenHands/OpenHands,All-Hands-AI/infra + issue_repo: + description: Repository to create/update issues in (owner/repo) + required: true + default: All-Hands-AI/infra + issue_parent: + description: Parent GitHub issue URL for tracking + required: false + default: https://github.com/All-Hands-AI/infra/issues/672 + issue_prefix: + description: Prefix for issue titles + required: false + default: 'DataDog Error: ' + +permissions: + contents: read + issues: write + +jobs: + debug-datadog-errors: + runs-on: ubuntu-latest + timeout-minutes: 30 + env: + # URLs to download script and template from the SDK repository + SCRIPT_URL: + https://raw.githubusercontent.com/OpenHands/software-agent-sdk/main/examples/03_github_workflows/04_datadog_debugging/datadog_debugging.py + TEMPLATE_URL: + https://raw.githubusercontent.com/OpenHands/software-agent-sdk/main/examples/03_github_workflows/04_datadog_debugging/debug_prompt.jinja + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install uv + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + + - name: Install OpenHands dependencies + run: | + # Install OpenHands SDK and tools from git repository + uv pip install --system "openhands-sdk @ git+https://github.com/OpenHands/software-agent-sdk.git@main#subdirectory=openhands-sdk" + uv pip install --system "openhands-tools @ git+https://github.com/OpenHands/software-agent-sdk.git@main#subdirectory=openhands-tools" + # Install additional dependencies for the datadog script + uv pip install --system requests jinja2 + + - name: Download debugging script and template + run: | + mkdir -p /tmp/datadog-debug-script + echo "Downloading script from: $SCRIPT_URL" + curl -sSL "$SCRIPT_URL" -o /tmp/datadog-debug-script/datadog_debugging.py + echo "Downloading template from: $TEMPLATE_URL" + curl -sSL "$TEMPLATE_URL" -o /tmp/datadog-debug-script/debug_prompt.jinja + + - name: Run Datadog Debugging Script + env: + DD_API_KEY: ${{ secrets.DD_API_KEY }} + DD_APP_KEY: ${{ secrets.DD_APP_KEY }} + DD_SITE: ${{ secrets.DD_SITE }} + LLM_API_KEY: ${{ secrets.LLM_API_KEY }} + LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }} + LLM_MODEL: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PYTHONPATH: '' + run: | + mkdir -p /tmp/datadog-debug + cd /tmp/datadog-debug-script + python datadog_debugging.py \ + --query-type "${{ inputs.query_type }}" \ + --query "${{ inputs.datadog_query }}" \ + --repos "${{ inputs.repo_list }}" \ + --working-dir "/tmp/datadog-debug" \ + --issue-repo "${{ inputs.issue_repo }}" \ + --issue-parent "${{ inputs.issue_parent }}" \ + --issue-prefix "${{ 
inputs.issue_prefix }}" + + - name: Upload debugging artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: datadog-debugging-artifacts + path: /tmp/datadog-debug/ + retention-days: 7 diff --git a/openhands-agent-server/openhands/agent_server/conversation_router.py b/openhands-agent-server/openhands/agent_server/conversation_router.py index a1287b9166..6cb3b200a9 100644 --- a/openhands-agent-server/openhands/agent_server/conversation_router.py +++ b/openhands-agent-server/openhands/agent_server/conversation_router.py @@ -22,7 +22,7 @@ UpdateSecretsRequest, ) from openhands.sdk import LLM, Agent, TextContent, Tool -from openhands.sdk.conversation.state import AgentExecutionStatus +from openhands.sdk.conversation.state import ConversationExecutionStatus from openhands.sdk.workspace import LocalWorkspace @@ -66,8 +66,8 @@ async def search_conversations( Query(title="The max number of results in the page", gt=0, lte=100), ] = 100, status: Annotated[ - AgentExecutionStatus | None, - Query(title="Optional filter by agent execution status"), + ConversationExecutionStatus | None, + Query(title="Optional filter by conversation execution status"), ] = None, sort_order: Annotated[ ConversationSortOrder, @@ -86,8 +86,8 @@ async def search_conversations( @conversation_router.get("/count") async def count_conversations( status: Annotated[ - AgentExecutionStatus | None, - Query(title="Optional filter by agent execution status"), + ConversationExecutionStatus | None, + Query(title="Optional filter by conversation execution status"), ] = None, conversation_service: ConversationService = Depends(get_conversation_service), ) -> int: @@ -213,7 +213,7 @@ async def update_conversation_secrets( # Strings are valid SecretValue (SecretValue = str | SecretProvider) from typing import cast - from openhands.sdk.conversation.secrets_manager import SecretValue + from openhands.sdk.conversation.secret_registry import SecretValue secrets = cast(dict[str, SecretValue], request.secrets) await event_service.update_secrets(secrets) diff --git a/openhands-agent-server/openhands/agent_server/conversation_service.py b/openhands-agent-server/openhands/agent_server/conversation_service.py index 2d94a26b69..611f0dbc51 100644 --- a/openhands-agent-server/openhands/agent_server/conversation_service.py +++ b/openhands-agent-server/openhands/agent_server/conversation_service.py @@ -21,8 +21,10 @@ from openhands.agent_server.server_details_router import update_last_execution_time from openhands.agent_server.utils import utc_now from openhands.sdk import LLM, Event, Message -from openhands.sdk.conversation.state import AgentExecutionStatus, ConversationState -from openhands.sdk.utils.cipher import Cipher +from openhands.sdk.conversation.state import ( + ConversationExecutionStatus, + ConversationState, +) logger = logging.getLogger(__name__) @@ -50,7 +52,6 @@ class ConversationService: conversations_dir: Path = field() webhook_specs: list[WebhookSpec] = field(default_factory=list) session_api_key: str | None = field(default=None) - cipher: Cipher | None = field(default=None) _event_services: dict[UUID, EventService] | None = field(default=None, init=False) _conversation_webhook_subscribers: list["ConversationWebhookSubscriber"] = field( default_factory=list, init=False @@ -69,7 +70,7 @@ async def search_conversations( self, page_id: str | None = None, limit: int = 100, - agent_status: AgentExecutionStatus | None = None, + execution_status: ConversationExecutionStatus | None = None, sort_order: ConversationSortOrder 
= ConversationSortOrder.CREATED_AT_DESC, ) -> ConversationPage: if self._event_services is None: @@ -82,8 +83,8 @@ async def search_conversations( conversation_info = _compose_conversation_info(event_service.stored, state) # Apply status filter if provided if ( - agent_status is not None - and conversation_info.agent_status != agent_status + execution_status is not None + and conversation_info.execution_status != execution_status ): continue @@ -124,7 +125,7 @@ async def search_conversations( async def count_conversations( self, - agent_status: AgentExecutionStatus | None = None, + execution_status: ConversationExecutionStatus | None = None, ) -> int: """Count conversations matching the given filters.""" if self._event_services is None: @@ -135,7 +136,10 @@ async def count_conversations( state = await event_service.get_state() # Apply status filter if provided - if agent_status is not None and state.agent_status != agent_status: + if ( + execution_status is not None + and state.execution_status != execution_status + ): continue count += 1 @@ -300,12 +304,7 @@ async def __aenter__(self): if not meta_file.exists(): continue json_str = meta_file.read_text() - stored = StoredConversation.model_validate_json( - json_str, - context={ - "cipher": self.cipher, - }, - ) + stored = StoredConversation.model_validate_json(json_str) await self._start_event_service(stored) except Exception: logger.exception( @@ -344,7 +343,6 @@ def get_instance(cls, config: Config) -> "ConversationService": session_api_key=( config.session_api_keys[0] if config.session_api_keys else None ), - cipher=config.cipher, ) async def _start_event_service(self, stored: StoredConversation) -> EventService: @@ -355,7 +353,6 @@ async def _start_event_service(self, stored: StoredConversation) -> EventService event_service = EventService( stored=stored, conversations_dir=self.conversations_dir, - cipher=self.cipher, ) # Create subscribers... 
await event_service.subscribe_to_events(_EventSubscriber(service=event_service)) diff --git a/openhands-agent-server/openhands/agent_server/docker/Dockerfile b/openhands-agent-server/openhands/agent_server/docker/Dockerfile index 3ee35a1e41..e3a1635998 100644 --- a/openhands-agent-server/openhands/agent_server/docker/Dockerfile +++ b/openhands-agent-server/openhands/agent_server/docker/Dockerfile @@ -181,7 +181,7 @@ COPY --chown=${USERNAME}:${USERNAME} openhands-agent-server/openhands/agent_serv USER ${USERNAME} WORKDIR / -ENV OH_ENABLE_VNC=true +ENV OH_ENABLE_VNC=false ENV LOG_JSON=true EXPOSE ${PORT} ${NOVNC_PORT} diff --git a/openhands-agent-server/openhands/agent_server/docker/build.py b/openhands-agent-server/openhands/agent_server/docker/build.py index 1cc9cd4d1c..b740d3071d 100755 --- a/openhands-agent-server/openhands/agent_server/docker/build.py +++ b/openhands-agent-server/openhands/agent_server/docker/build.py @@ -258,6 +258,10 @@ class BuildOptions(BaseModel): push: bool | None = Field( default=None, description="None=auto (CI push, local load)" ) + arch: str | None = Field( + default=None, + description="Architecture suffix (e.g., 'amd64', 'arm64') to append to tags", + ) @field_validator("target") @classmethod @@ -295,12 +299,14 @@ def cache_tags(self) -> tuple[str, str]: @property def all_tags(self) -> list[str]: tags: list[str] = [] + arch_suffix = f"-{self.arch}" if self.arch else "" + for t in self.custom_tag_list: - tags.append(f"{self.image}:{SHORT_SHA}-{t}") + tags.append(f"{self.image}:{SHORT_SHA}-{t}{arch_suffix}") if GIT_REF in ("main", "refs/heads/main"): for t in self.custom_tag_list: - tags.append(f"{self.image}:main-{t}") - tags.append(f"{self.image}:{self.versioned_tag}") + tags.append(f"{self.image}:main-{t}{arch_suffix}") + tags.append(f"{self.image}:{self.versioned_tag}{arch_suffix}") if self.is_dev: tags = [f"{t}-dev" for t in tags] return tags @@ -324,6 +330,7 @@ def _extract_tarball(tarball: Path, dest: Path) -> None: def _make_build_context(sdk_project_root: Path) -> Path: + dockerfile_path = _get_dockerfile_path(sdk_project_root) tmp_root = Path(tempfile.mkdtemp(prefix="agent-build-", dir=None)).resolve() sdist_dir = Path(tempfile.mkdtemp(prefix="agent-sdist-", dir=None)).resolve() try: @@ -349,6 +356,8 @@ def _make_build_context(sdk_project_root: Path) -> Path: "Expected single folder in sdist" ) tmp_root = entries[0].resolve() + # copy Dockerfile into place + shutil.copy2(dockerfile_path, tmp_root / "Dockerfile") logger.debug(f"[build] Clean context ready at {tmp_root}") return tmp_root except Exception: @@ -379,13 +388,9 @@ def _default_local_cache_dir() -> Path: return Path(xdg) / "openhands" / "buildx-cache" -# --- single entry point --- - - -def build(opts: BuildOptions) -> list[str]: - """Single entry point for building the agent-server image.""" +def _get_dockerfile_path(sdk_project_root: Path) -> Path: dockerfile_path = ( - opts.sdk_project_root + sdk_project_root / "openhands-agent-server" / "openhands" / "agent_server" @@ -394,7 +399,15 @@ def build(opts: BuildOptions) -> list[str]: ) if not dockerfile_path.exists(): raise FileNotFoundError(f"Dockerfile not found at {dockerfile_path}") + return dockerfile_path + +# --- single entry point --- + + +def build(opts: BuildOptions) -> list[str]: + """Single entry point for building the agent-server image.""" + dockerfile_path = _get_dockerfile_path(opts.sdk_project_root) push = opts.push if push is None: push = IN_CI @@ -536,6 +549,13 @@ def main(argv: list[str]) -> int: default=_env("PLATFORMS", 
"linux/amd64,linux/arm64"), help="Comma-separated platforms (default from $PLATFORMS).", ) + parser.add_argument( + "--arch", + default=_env("ARCH", ""), + help=( + "Architecture suffix for tags (e.g., 'amd64', 'arm64', default from $ARCH)." + ), + ) group = parser.add_mutually_exclusive_group() group.add_argument( "--push", @@ -574,7 +594,31 @@ def main(argv: list[str]) -> int: if args.build_ctx_only: ctx = _make_build_context(sdk_project_root) logger.info(f"[build] Clean build context (kept for debugging): {ctx}") - # Print path to stdout so other tooling can capture it + + # Create BuildOptions to generate tags + opts = BuildOptions( + base_image=args.base_image, + custom_tags=args.custom_tags, + image=args.image, + target=args.target, # type: ignore + platforms=[p.strip() for p in args.platforms.split(",") if p.strip()], # type: ignore + push=None, # Not relevant for build-ctx-only + sdk_project_root=sdk_project_root, + arch=args.arch or None, + ) + + # If running in GitHub Actions, write outputs directly to GITHUB_OUTPUT + github_output = os.environ.get("GITHUB_OUTPUT") + if github_output: + with open(github_output, "a") as fh: + fh.write(f"build_context={ctx}\n") + fh.write(f"dockerfile={ctx / 'Dockerfile'}\n") + fh.write(f"tags_csv={','.join(opts.all_tags)}\n") + fh.write(f"versioned_tag={opts.versioned_tag}\n") + fh.write(f"base_image_slug={opts.base_image_slug}\n") + logger.info("[build] Wrote outputs to $GITHUB_OUTPUT") + + # Also print to stdout for debugging/local use print(str(ctx)) return 0 @@ -602,6 +646,7 @@ def main(argv: list[str]) -> int: platforms=[p.strip() for p in args.platforms.split(",") if p.strip()], # type: ignore push=push, sdk_project_root=sdk_project_root, + arch=args.arch or None, ) tags = build(opts) diff --git a/openhands-agent-server/openhands/agent_server/event_router.py b/openhands-agent-server/openhands/agent_server/event_router.py index 4247d41d45..b42bce9885 100644 --- a/openhands-agent-server/openhands/agent_server/event_router.py +++ b/openhands-agent-server/openhands/agent_server/event_router.py @@ -3,6 +3,7 @@ """ import logging +from datetime import datetime from typing import Annotated from fastapi import ( @@ -31,9 +32,31 @@ ) logger = logging.getLogger(__name__) + # Read methods +def normalize_datetime_to_server_timezone(dt: datetime) -> datetime: + """ + Normalize datetime to server timezone for consistent comparison. + + If the datetime has timezone info, convert to server native timezone. + If it's naive (no timezone), assume it's already in server timezone. 
+ + Args: + dt: Input datetime (may be timezone-aware or naive) + + Returns: + Datetime in server native timezone (timezone-aware) + """ + if dt.tzinfo is not None: + # Timezone-aware: convert to server native timezone + return dt.astimezone(None) + else: + # Naive datetime: assume it's already in server timezone + return dt + + @event_router.get("/search", responses={404: {"description": "Conversation not found"}}) async def search_conversation_events( page_id: Annotated[ @@ -54,12 +77,33 @@ async def search_conversation_events( EventSortOrder, Query(title="Sort order for events"), ] = EventSortOrder.TIMESTAMP, + timestamp__gte: Annotated[ + datetime | None, + Query(title="Filter: event timestamp >= this datetime"), + ] = None, + timestamp__lt: Annotated[ + datetime | None, + Query(title="Filter: event timestamp < this datetime"), + ] = None, event_service: EventService = Depends(get_event_service), ) -> EventPage: """Search / List local events""" assert limit > 0 assert limit <= 100 - return await event_service.search_events(page_id, limit, kind, sort_order) + + # Normalize timezone-aware datetimes to server timezone + normalized_gte = ( + normalize_datetime_to_server_timezone(timestamp__gte) + if timestamp__gte + else None + ) + normalized_lt = ( + normalize_datetime_to_server_timezone(timestamp__lt) if timestamp__lt else None + ) + + return await event_service.search_events( + page_id, limit, kind, sort_order, normalized_gte, normalized_lt + ) @event_router.get("/count", responses={404: {"description": "Conversation not found"}}) @@ -70,10 +114,29 @@ async def count_conversation_events( title="Optional filter by event kind/type (e.g., ActionEvent, MessageEvent)" ), ] = None, + timestamp__gte: Annotated[ + datetime | None, + Query(title="Filter: event timestamp >= this datetime"), + ] = None, + timestamp__lt: Annotated[ + datetime | None, + Query(title="Filter: event timestamp < this datetime"), + ] = None, event_service: EventService = Depends(get_event_service), ) -> int: """Count local events matching the given filters""" - count = await event_service.count_events(kind) + # Normalize timezone-aware datetimes to server timezone + normalized_gte = ( + normalize_datetime_to_server_timezone(timestamp__gte) + if timestamp__gte + else None + ) + normalized_lt = ( + normalize_datetime_to_server_timezone(timestamp__lt) if timestamp__lt else None + ) + + count = await event_service.count_events(kind, normalized_gte, normalized_lt) + return count diff --git a/openhands-agent-server/openhands/agent_server/event_service.py b/openhands-agent-server/openhands/agent_server/event_service.py index 81a8f43727..0247b3e0e5 100644 --- a/openhands-agent-server/openhands/agent_server/event_service.py +++ b/openhands-agent-server/openhands/agent_server/event_service.py @@ -1,5 +1,6 @@ import asyncio from dataclasses import dataclass, field +from datetime import datetime from pathlib import Path from uuid import UUID @@ -13,8 +14,11 @@ from openhands.agent_server.utils import utc_now from openhands.sdk import LLM, Agent, Event, Message, get_logger from openhands.sdk.conversation.impl.local_conversation import LocalConversation -from openhands.sdk.conversation.secrets_manager import SecretValue -from openhands.sdk.conversation.state import AgentExecutionStatus, ConversationState +from openhands.sdk.conversation.secret_registry import SecretValue +from openhands.sdk.conversation.state import ( + ConversationExecutionStatus, + ConversationState, +) from openhands.sdk.event.conversation_state import 
ConversationStateUpdateEvent from openhands.sdk.security.confirmation_policy import ConfirmationPolicyBase from openhands.sdk.utils.async_utils import AsyncCallbackWrapper @@ -82,10 +86,16 @@ async def search_events( limit: int = 100, kind: str | None = None, sort_order: EventSortOrder = EventSortOrder.TIMESTAMP, + timestamp__gte: datetime | None = None, + timestamp__lt: datetime | None = None, ) -> EventPage: if not self._conversation: raise ValueError("inactive_service") + # Convert datetime to ISO string for comparison (ISO strings sort + # chronologically once normalized to the same timezone) + timestamp_gte_str = timestamp__gte.isoformat() if timestamp__gte else None + timestamp_lt_str = timestamp__lt.isoformat() if timestamp__lt else None + # Collect all events all_events = [] with self._conversation._state as state: @@ -97,6 +107,16 @@ async def search_events( != kind ): continue + + # Apply timestamp filters if provided (ISO string comparison) + if ( + timestamp_gte_str is not None + and event.timestamp < timestamp_gte_str + ): + continue + if timestamp_lt_str is not None and event.timestamp >= timestamp_lt_str: + continue + all_events.append(event) # Sort events based on sort_order @@ -131,11 +151,17 @@ async def search_events( async def count_events( self, kind: str | None = None, + timestamp__gte: datetime | None = None, + timestamp__lt: datetime | None = None, ) -> int: """Count events matching the given filters.""" if not self._conversation: raise ValueError("inactive_service") + # Convert datetime to ISO string for comparison (ISO strings sort + # chronologically once normalized to the same timezone) + timestamp_gte_str = timestamp__gte.isoformat() if timestamp__gte else None + timestamp_lt_str = timestamp__lt.isoformat() if timestamp__lt else None + count = 0 with self._conversation._state as state: for event in state.events: @@ -146,6 +172,16 @@ async def count_events( != kind ): continue + + # Apply timestamp filters if provided (ISO string comparison) + if ( + timestamp_gte_str is not None + and event.timestamp < timestamp_gte_str + ): + continue + if timestamp_lt_str is not None and event.timestamp >= timestamp_lt_str: + continue + count += 1 return count @@ -165,7 +201,7 @@ async def send_message(self, message: Message, run: bool = False): await loop.run_in_executor(None, self._conversation.send_message, message) if run: with self._conversation.state as state: - run = state.agent_status != AgentExecutionStatus.RUNNING + run = state.execution_status != ConversationExecutionStatus.RUNNING if run: loop.run_in_executor(None, self._conversation.run) diff --git a/openhands-agent-server/openhands/agent_server/file_router.py b/openhands-agent-server/openhands/agent_server/file_router.py index da28e69982..fc93220d10 100644 --- a/openhands-agent-server/openhands/agent_server/file_router.py +++ b/openhands-agent-server/openhands/agent_server/file_router.py @@ -33,6 +33,7 @@ async def upload_file( file: UploadFile = File(...), ) -> Success: """Upload a file to the workspace.""" + logger.info(f"Uploading file: {path}") try: target_path = Path(path) if not target_path.is_absolute(): @@ -65,6 +66,7 @@ async def download_file( path: Annotated[str, FastApiPath(description="Absolute file path.")], ) -> FileResponse: """Download a file from the workspace.""" + logger.info(f"Downloading file: {path}") try: target_path = Path(path) if not target_path.is_absolute(): diff --git a/openhands-agent-server/openhands/agent_server/models.py b/openhands-agent-server/openhands/agent_server/models.py index c7ffeccc89..a19080f5e9 100644 ---
a/openhands-agent-server/openhands/agent_server/models.py +++ b/openhands-agent-server/openhands/agent_server/models.py @@ -2,14 +2,17 @@ from datetime import datetime from enum import Enum from typing import Literal -from uuid import UUID, uuid4 +from uuid import uuid4 from pydantic import BaseModel, Field -from openhands.agent_server.utils import utc_now +from openhands.agent_server.utils import OpenHandsUUID, utc_now from openhands.sdk import LLM, AgentBase, Event, ImageContent, Message, TextContent from openhands.sdk.conversation.secret_source import SecretSource -from openhands.sdk.conversation.state import AgentExecutionStatus, ConversationState +from openhands.sdk.conversation.state import ( + ConversationExecutionStatus, + ConversationState, +) from openhands.sdk.llm.utils.metrics import MetricsSnapshot from openhands.sdk.security.confirmation_policy import ( ConfirmationPolicyBase, @@ -64,7 +67,7 @@ class StartConversationRequest(BaseModel): ..., description="Working directory for agent operations and tool execution", ) - conversation_id: UUID | None = Field( + conversation_id: OpenHandsUUID | None = Field( default=None, description=( "Optional conversation ID. If not provided, a random UUID will be " @@ -99,7 +102,7 @@ class StartConversationRequest(BaseModel): class StoredConversation(StartConversationRequest): """Stored details about a conversation""" - id: UUID + id: OpenHandsUUID title: str | None = Field( default=None, description="User-defined title for the conversation" ) @@ -129,7 +132,7 @@ class ConversationPage(BaseModel): class ConversationResponse(BaseModel): conversation_id: str - state: AgentExecutionStatus + state: ConversationExecutionStatus class ConfirmationResponseRequest(BaseModel): @@ -190,7 +193,7 @@ class GenerateTitleResponse(BaseModel): class BashEventBase(DiscriminatedUnionMixin, ABC): """Base class for all bash event types""" - id: UUID = Field(default_factory=uuid4) + id: OpenHandsUUID = Field(default_factory=uuid4) timestamp: datetime = Field(default_factory=utc_now) @@ -213,7 +216,7 @@ class BashOutput(BashEventBase): depending on how large the output is. 
""" - command_id: UUID + command_id: OpenHandsUUID order: int = Field( default=0, description="The order for this output, sequentially starting with 0" ) diff --git a/openhands-agent-server/openhands/agent_server/utils.py b/openhands-agent-server/openhands/agent_server/utils.py index 96e1403fc3..a32e0fff3e 100644 --- a/openhands-agent-server/openhands/agent_server/utils.py +++ b/openhands-agent-server/openhands/agent_server/utils.py @@ -1,4 +1,8 @@ from datetime import UTC, datetime +from typing import Annotated +from uuid import UUID + +from pydantic import PlainSerializer def utc_now(): @@ -177,3 +181,11 @@ def _patched_openapi(self): except (ImportError, AttributeError): # FastAPI not available or internal API changed pass + + +def _uuid_to_hex(uuid_obj: UUID) -> str: + """Converts a UUID object to a hex string without hyphens.""" + return uuid_obj.hex + + +OpenHandsUUID = Annotated[UUID, PlainSerializer(_uuid_to_hex, when_used="json")] diff --git a/openhands-agent-server/openhands/agent_server/vscode_extensions/openhands-settings/extension.js b/openhands-agent-server/openhands/agent_server/vscode_extensions/openhands-settings/extension.js index d29e44cbca..4705f62595 100644 --- a/openhands-agent-server/openhands/agent_server/vscode_extensions/openhands-settings/extension.js +++ b/openhands-agent-server/openhands/agent_server/vscode_extensions/openhands-settings/extension.js @@ -14,6 +14,7 @@ function activate(context) { config.update('telemetry.telemetryLevel', 'off', target); config.update('extensions.autoCheckUpdates', false, target); config.update('extensions.autoUpdate', false, target); + config.update('chat.commandCenter.enabled', false, target); } function deactivate() {} diff --git a/openhands-agent-server/pyproject.toml b/openhands-agent-server/pyproject.toml index cc7950181c..72739f5061 100644 --- a/openhands-agent-server/pyproject.toml +++ b/openhands-agent-server/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "openhands-agent-server" -version = "1.0.0a5" +version = "1.0.0a6" description = "OpenHands Agent Server - REST/WebSocket interface for OpenHands AI Agent" requires-python = ">=3.12" diff --git a/openhands-sdk/openhands/sdk/__init__.py b/openhands-sdk/openhands/sdk/__init__.py index 07ec5f48a1..a07bf60c25 100644 --- a/openhands-sdk/openhands/sdk/__init__.py +++ b/openhands-sdk/openhands/sdk/__init__.py @@ -37,7 +37,6 @@ Action, Observation, Tool, - ToolBase, ToolDefinition, list_registered_tools, register_tool, @@ -67,7 +66,6 @@ "RedactedThinkingBlock", "Tool", "ToolDefinition", - "ToolBase", "AgentBase", "Agent", "Action", diff --git a/openhands-sdk/openhands/sdk/agent/agent.py b/openhands-sdk/openhands/sdk/agent/agent.py index 92a7aded2b..83ef39ae65 100644 --- a/openhands-sdk/openhands/sdk/agent/agent.py +++ b/openhands-sdk/openhands/sdk/agent/agent.py @@ -10,7 +10,7 @@ ConversationState, LocalConversation, ) -from openhands.sdk.conversation.state import AgentExecutionStatus +from openhands.sdk.conversation.state import ConversationExecutionStatus from openhands.sdk.event import ( ActionEvent, AgentErrorEvent, @@ -28,22 +28,48 @@ TextContent, ThinkingBlock, ) -from openhands.sdk.llm.exceptions import FunctionCallValidationError +from openhands.sdk.llm.exceptions import ( + FunctionCallValidationError, + LLMContextWindowExceedError, +) from openhands.sdk.logger import get_logger +from openhands.sdk.observability.laminar import ( + maybe_init_laminar, + observe, + should_enable_observability, +) +from openhands.sdk.observability.utils import extract_action_name from 
openhands.sdk.security.confirmation_policy import NeverConfirm from openhands.sdk.security.llm_analyzer import LLMSecurityAnalyzer from openhands.sdk.tool import ( Action, - FinishTool, Observation, ) -from openhands.sdk.tool.builtins import FinishAction, ThinkAction +from openhands.sdk.tool.builtins import ( + FinishAction, + FinishTool, + ThinkAction, +) logger = get_logger(__name__) +maybe_init_laminar() class Agent(AgentBase): + """Main agent implementation for OpenHands. + + The Agent class provides the core functionality for running AI agents that can + interact with tools, process messages, and execute actions. It inherits from + AgentBase and implements the agent execution logic. + + Example: + >>> from openhands.sdk import LLM, Agent, Tool + >>> from pydantic import SecretStr + >>> llm = LLM(model="claude-sonnet-4-20250514", api_key=SecretStr("key")) + >>> tools = [Tool(name="BashTool"), Tool(name="FileEditorTool")] + >>> agent = Agent(llm=llm, tools=tools) + """ + @property def _add_security_risk_prediction(self) -> bool: return isinstance(self.security_analyzer, LLMSecurityAnalyzer) @@ -95,6 +121,7 @@ def _execute_actions( for action_event in action_events: self._execute_action_event(conversation, action_event, on_event=on_event) + @observe(name="agent.step", ignore_inputs=["state", "on_event"]) def step( self, conversation: LocalConversation, @@ -148,13 +175,13 @@ def step( include=None, store=False, add_security_risk_prediction=self._add_security_risk_prediction, - metadata=self.llm.metadata, + extra_body=self.llm.litellm_extra_body, ) else: llm_response = self.llm.completion( messages=_messages, tools=list(self.tools_map.values()), - extra_body={"metadata": self.llm.metadata}, + extra_body=self.llm.litellm_extra_body, add_security_risk_prediction=self._add_security_risk_prediction, ) except FunctionCallValidationError as e: @@ -168,22 +195,19 @@ def step( ) on_event(error_message) return - except Exception as e: - # If there is a condenser registered and the exception is a context window - # exceeded, we can recover by triggering a condensation request. + except LLMContextWindowExceedError: + # If a condenser is available and handles requests, trigger condensation if ( self.condenser is not None and self.condenser.handles_condensation_requests() - and self.llm.is_context_window_exceeded_exception(e) ): logger.warning( "LLM raised context window exceeded error, triggering condensation" ) on_event(CondensationRequest()) return - # If the error isn't recoverable, keep propagating it up the stack. - else: - raise e + # No condenser that can handle it; re-raise for the client to handle + raise # LLMResponse already contains the converted message and metrics snapshot message: Message = llm_response.message @@ -228,10 +252,11 @@ def step( else: logger.info("LLM produced a message response - awaits user input") - state.agent_status = AgentExecutionStatus.FINISHED + state.execution_status = ConversationExecutionStatus.FINISHED msg_event = MessageEvent( source="agent", llm_message=message, + llm_response_id=llm_response.id, ) on_event(msg_event) @@ -271,7 +296,9 @@ def _requires_user_confirmation( # Grab the confirmation policy from the state and pass in the risks.
if any(state.confirmation_policy.should_confirm(risk) for risk in risks): - state.agent_status = AgentExecutionStatus.WAITING_FOR_CONFIRMATION + state.execution_status = ( + ConversationExecutionStatus.WAITING_FOR_CONFIRMATION + ) return True return False @@ -383,6 +410,7 @@ def _get_action_event( on_event(action_event) return action_event + @observe(ignore_inputs=["state", "on_event"]) def _execute_action_event( self, conversation: LocalConversation, @@ -403,7 +431,13 @@ def _execute_action_event( ) # Execute actions! - observation: Observation = tool(action_event.action, conversation) + if should_enable_observability(): + tool_name = extract_action_name(action_event) + observation: Observation = observe(name=tool_name, span_type="TOOL")(tool)( + action_event.action, conversation + ) + else: + observation = tool(action_event.action, conversation) assert isinstance(observation, Observation), ( f"Tool '{tool.name}' executor must return an Observation" ) @@ -418,5 +452,5 @@ def _execute_action_event( # Set conversation state if tool.name == FinishTool.name: - state.agent_status = AgentExecutionStatus.FINISHED + state.execution_status = ConversationExecutionStatus.FINISHED return obs_event diff --git a/openhands-sdk/openhands/sdk/agent/base.py b/openhands-sdk/openhands/sdk/agent/base.py index c413d15ba0..999559e5fe 100644 --- a/openhands-sdk/openhands/sdk/agent/base.py +++ b/openhands-sdk/openhands/sdk/agent/base.py @@ -28,8 +28,11 @@ class AgentBase(DiscriminatedUnionMixin, ABC): - """Abstract base class for agents. + """Abstract base class for OpenHands agents. + Agents are stateless and should be fully defined by their configuration. + This base class provides the common interface and functionality that all + agent implementations must follow. """ model_config = ConfigDict( @@ -220,7 +223,9 @@ def _initialize(self, state: "ConversationState"): ) # Always include built-in tools; not subject to filtering - tools.extend(BUILT_IN_TOOLS) + # Instantiate built-in tools using their .create() method + for tool_class in BUILT_IN_TOOLS: + tools.extend(tool_class.create(state)) # Check tool types for tool in tools: @@ -252,7 +257,7 @@ def step( 2. Executing the tool 3. Updating the conversation state with LLM calls (role="assistant") and tool results (role="tool") - 4.1 If conversation is finished, set state.agent_status to FINISHED + 4.1 If conversation is finished, set state.execution_status to FINISHED 4.2 Otherwise, just return, Conversation will kick off the next step NOTE: state will be mutated in-place. 
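To make the agent_status → execution_status rename concrete for SDK users, here is a minimal usage sketch; the model name, workspace path, and message text are placeholders, and it assumes only the Conversation/Agent API shown in the docstrings above:

# Sketch: driving a conversation and reading the renamed status field.
# Model name, workspace path, and message text are placeholders.
from pydantic import SecretStr

from openhands.sdk import LLM, Agent, Conversation
from openhands.sdk.conversation.state import ConversationExecutionStatus

llm = LLM(model="claude-sonnet-4-20250514", api_key=SecretStr("key"))
agent = Agent(llm=llm, tools=[])
conversation = Conversation(agent=agent, workspace="./workspace")

conversation.send_message("Summarize the repository layout.")
conversation.run()  # steps the agent until FINISHED, PAUSED, or STUCK

# Code that previously compared state.agent_status against AgentExecutionStatus
# should now compare state.execution_status against ConversationExecutionStatus.
if conversation.state.execution_status == ConversationExecutionStatus.FINISHED:
    print("agent finished the task")

Only the names change here; the run() loop semantics are untouched, except that ERROR becomes a resumable state (see local_conversation.py below).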
diff --git a/openhands-sdk/openhands/sdk/context/__init__.py b/openhands-sdk/openhands/sdk/context/__init__.py index 8ee1d9df5a..a2944de815 100644 --- a/openhands-sdk/openhands/sdk/context/__init__.py +++ b/openhands-sdk/openhands/sdk/context/__init__.py @@ -8,6 +8,7 @@ SkillValidationError, TaskTrigger, load_skills_from_dir, + load_user_skills, ) @@ -19,6 +20,7 @@ "TaskTrigger", "SkillKnowledge", "load_skills_from_dir", + "load_user_skills", "render_template", "SkillValidationError", ] diff --git a/openhands-sdk/openhands/sdk/context/agent_context.py b/openhands-sdk/openhands/sdk/context/agent_context.py index f2d5962f1b..9396fcc64b 100644 --- a/openhands-sdk/openhands/sdk/context/agent_context.py +++ b/openhands-sdk/openhands/sdk/context/agent_context.py @@ -1,11 +1,12 @@ import pathlib -from pydantic import BaseModel, Field, field_validator +from pydantic import BaseModel, Field, field_validator, model_validator from openhands.sdk.context.prompts import render_template from openhands.sdk.context.skills import ( Skill, SkillKnowledge, + load_user_skills, ) from openhands.sdk.llm import Message, TextContent from openhands.sdk.logger import get_logger @@ -48,6 +49,13 @@ class AgentContext(BaseModel): user_message_suffix: str | None = Field( default=None, description="Optional suffix to append to the user's message." ) + load_user_skills: bool = Field( + default=False, + description=( + "Whether to automatically load user skills from ~/.openhands/skills/ " + "and ~/.openhands/microagents/ (for backward compatibility). " + ), + ) @field_validator("skills") @classmethod @@ -62,6 +70,29 @@ def _validate_skills(cls, v: list[Skill], _info): seen_names.add(skill.name) return v + @model_validator(mode="after") + def _load_user_skills(self): + """Load user skills from home directory if enabled.""" + if not self.load_user_skills: + return self + + try: + user_skills = load_user_skills() + # Merge user skills with explicit skills, avoiding duplicates + existing_names = {skill.name for skill in self.skills} + for user_skill in user_skills: + if user_skill.name not in existing_names: + self.skills.append(user_skill) + else: + logger.warning( + f"Skipping user skill '{user_skill.name}' " + f"(already in explicit skills)" + ) + except Exception as e: + logger.warning(f"Failed to load user skills: {str(e)}") + + return self + def get_system_message_suffix(self) -> str | None: """Get the system message with repo skill content and custom suffix. 
diff --git a/openhands-sdk/openhands/sdk/context/condenser/llm_summarizing_condenser.py b/openhands-sdk/openhands/sdk/context/condenser/llm_summarizing_condenser.py index a5d518494f..2ba289aad4 100644 --- a/openhands-sdk/openhands/sdk/context/condenser/llm_summarizing_condenser.py +++ b/openhands-sdk/openhands/sdk/context/condenser/llm_summarizing_condenser.py @@ -8,6 +8,7 @@ from openhands.sdk.event.condenser import Condensation from openhands.sdk.event.llm_convertible import MessageEvent from openhands.sdk.llm import LLM, Message, TextContent +from openhands.sdk.observability.laminar import observe class LLMSummarizingCondenser(RollingCondenser): @@ -33,6 +34,7 @@ def should_condense(self, view: View) -> bool: return True return len(view) > self.max_size + @observe(ignore_inputs=["view"]) def get_condensation(self, view: View) -> Condensation: head = view[: self.keep_first] target_size = self.max_size // 2 @@ -65,7 +67,7 @@ def get_condensation(self, view: View) -> Condensation: llm_response = self.llm.completion( messages=messages, - extra_body={"metadata": self.llm.metadata}, + extra_body=self.llm.litellm_extra_body, ) # Extract summary from the LLMResponse message summary = None @@ -78,4 +80,5 @@ def get_condensation(self, view: View) -> Condensation: forgotten_event_ids=[event.id for event in forgotten_events], summary=summary, summary_offset=self.keep_first, + llm_response_id=llm_response.id, ) diff --git a/openhands-sdk/openhands/sdk/context/skills/__init__.py b/openhands-sdk/openhands/sdk/context/skills/__init__.py index fba276ce92..c8c2b7eefb 100644 --- a/openhands-sdk/openhands/sdk/context/skills/__init__.py +++ b/openhands-sdk/openhands/sdk/context/skills/__init__.py @@ -1,5 +1,9 @@ from openhands.sdk.context.skills.exceptions import SkillValidationError -from openhands.sdk.context.skills.skill import Skill, load_skills_from_dir +from openhands.sdk.context.skills.skill import ( + Skill, + load_skills_from_dir, + load_user_skills, +) from openhands.sdk.context.skills.trigger import ( BaseTrigger, KeywordTrigger, @@ -15,5 +19,6 @@ "TaskTrigger", "SkillKnowledge", "load_skills_from_dir", + "load_user_skills", "SkillValidationError", ] diff --git a/openhands-sdk/openhands/sdk/context/skills/skill.py b/openhands-sdk/openhands/sdk/context/skills/skill.py index 4ec8ab4d80..8f6dadd41d 100644 --- a/openhands-sdk/openhands/sdk/context/skills/skill.py +++ b/openhands-sdk/openhands/sdk/context/skills/skill.py @@ -307,3 +307,53 @@ def load_skills_from_dir( f"{[*repo_skills.keys(), *knowledge_skills.keys()]}" ) return repo_skills, knowledge_skills + + +# Default user skills directories (in order of priority) +USER_SKILLS_DIRS = [ + Path.home() / ".openhands" / "skills", + Path.home() / ".openhands" / "microagents", # Legacy support +] + + +def load_user_skills() -> list[Skill]: + """Load skills from user's home directory. + + Searches for skills in ~/.openhands/skills/ and ~/.openhands/microagents/ + (legacy). Skills from both directories are merged, with skills/ taking + precedence for duplicate names. + + Returns: + List of Skill objects loaded from user directories. + Returns empty list if no skills found or loading fails. 
+ """ + all_skills = [] + seen_names = set() + + for skills_dir in USER_SKILLS_DIRS: + if not skills_dir.exists(): + logger.debug(f"User skills directory does not exist: {skills_dir}") + continue + + try: + logger.debug(f"Loading user skills from {skills_dir}") + repo_skills, knowledge_skills = load_skills_from_dir(skills_dir) + + # Merge repo and knowledge skills + for skills_dict in [repo_skills, knowledge_skills]: + for name, skill in skills_dict.items(): + if name not in seen_names: + all_skills.append(skill) + seen_names.add(name) + else: + logger.warning( + f"Skipping duplicate skill '{name}' from {skills_dir}" + ) + + except Exception as e: + logger.warning(f"Failed to load user skills from {skills_dir}: {str(e)}") + + logger.debug( + f"Loaded {len(all_skills)} user skills: {[s.name for s in all_skills]}" + ) + return all_skills diff --git a/openhands-sdk/openhands/sdk/conversation/__init__.py b/openhands-sdk/openhands/sdk/conversation/__init__.py index fe1dff2fbb..d89d3040d5 100644 --- a/openhands-sdk/openhands/sdk/conversation/__init__.py +++ b/openhands-sdk/openhands/sdk/conversation/__init__.py @@ -5,7 +5,7 @@ from openhands.sdk.conversation.impl.local_conversation import LocalConversation from openhands.sdk.conversation.impl.remote_conversation import RemoteConversation from openhands.sdk.conversation.response_utils import get_agent_final_response -from openhands.sdk.conversation.secrets_manager import SecretsManager +from openhands.sdk.conversation.secret_registry import SecretRegistry from openhands.sdk.conversation.state import ConversationState from openhands.sdk.conversation.stuck_detector import StuckDetector from openhands.sdk.conversation.types import ConversationCallbackType @@ -18,7 +18,7 @@ "ConversationState", "ConversationCallbackType", "ConversationVisualizer", - "SecretsManager", + "SecretRegistry", "StuckDetector", "EventLog", "LocalConversation", diff --git a/openhands-sdk/openhands/sdk/conversation/base.py b/openhands-sdk/openhands/sdk/conversation/base.py index 4c0622058e..2644de1f98 100644 --- a/openhands-sdk/openhands/sdk/conversation/base.py +++ b/openhands-sdk/openhands/sdk/conversation/base.py @@ -5,7 +5,7 @@ from openhands.sdk.conversation.conversation_stats import ConversationStats from openhands.sdk.conversation.events_list_base import EventsListBase -from openhands.sdk.conversation.secrets_manager import SecretValue +from openhands.sdk.conversation.secret_registry import SecretValue from openhands.sdk.conversation.types import ConversationCallbackType, ConversationID from openhands.sdk.llm.llm import LLM from openhands.sdk.llm.message import Message @@ -18,7 +18,7 @@ if TYPE_CHECKING: from openhands.sdk.agent.base import AgentBase - from openhands.sdk.conversation.state import AgentExecutionStatus + from openhands.sdk.conversation.state import ConversationExecutionStatus class ConversationStateProtocol(Protocol): @@ -35,8 +35,8 @@ def events(self) -> EventsListBase: ... @property - def agent_status(self) -> "AgentExecutionStatus": - """The current agent execution status.""" + def execution_status(self) -> "ConversationExecutionStatus": + """The current conversation execution status.""" ... @property @@ -69,6 +69,13 @@ def agent(self) -> "AgentBase": class BaseConversation(ABC): + """Abstract base class for conversation implementations. + + This class defines the interface that all conversation implementations must follow. 
+ Conversations manage the interaction between users and agents, handling message + exchange, execution control, and state management. + """ + @property @abstractmethod def id(self) -> ConversationID: ... @@ -82,13 +89,23 @@ def state(self) -> ConversationStateProtocol: ... def conversation_stats(self) -> ConversationStats: ... @abstractmethod - def send_message(self, message: str | Message) -> None: ... + def send_message(self, message: str | Message) -> None: + """Send a message to the agent.""" + ... @abstractmethod - def run(self) -> None: ... + def run(self) -> None: + """Execute the agent to process messages and perform actions. + + This method runs the agent until it finishes processing the current + message or reaches the maximum iteration limit. + """ + ... @abstractmethod - def set_confirmation_policy(self, policy: ConfirmationPolicyBase) -> None: ... + def set_confirmation_policy(self, policy: ConfirmationPolicyBase) -> None: + """Set the confirmation policy for the conversation.""" + ... @property def confirmation_policy_active(self) -> bool: diff --git a/openhands-sdk/openhands/sdk/conversation/conversation.py b/openhands-sdk/openhands/sdk/conversation/conversation.py index f4c3c2152b..b4e5a25ab6 100644 --- a/openhands-sdk/openhands/sdk/conversation/conversation.py +++ b/openhands-sdk/openhands/sdk/conversation/conversation.py @@ -2,7 +2,7 @@ from openhands.sdk.agent.base import AgentBase from openhands.sdk.conversation.base import BaseConversation -from openhands.sdk.conversation.secrets_manager import SecretValue +from openhands.sdk.conversation.secret_registry import SecretValue from openhands.sdk.conversation.types import ConversationCallbackType, ConversationID from openhands.sdk.logger import get_logger from openhands.sdk.workspace import LocalWorkspace, RemoteWorkspace @@ -16,11 +16,23 @@ class Conversation: - """Factory entrypoint that returns a LocalConversation or RemoteConversation. + """Factory class for creating conversation instances with OpenHands agents. - Usage: - - Conversation(agent=...) -> LocalConversation - - Conversation(agent=..., host="http://...") -> RemoteConversation + This factory automatically creates either a LocalConversation or RemoteConversation + based on the workspace type provided. LocalConversation runs the agent locally, + while RemoteConversation connects to a remote agent server. + + Returns: + LocalConversation if workspace is local, RemoteConversation if workspace + is remote. 
+ + Example: + >>> from openhands.sdk import LLM, Agent, Conversation + >>> from pydantic import SecretStr + >>> llm = LLM(model="claude-sonnet-4-20250514", api_key=SecretStr("key")) + >>> agent = Agent(llm=llm, tools=[]) + >>> conversation = Conversation(agent=agent, workspace="./workspace") + >>> conversation.send_message("Hello!") + >>> conversation.run() """ @overload diff --git a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py index 17d239bf0e..0cc49518f5 100644 --- a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py +++ b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py @@ -1,3 +1,4 @@ +import atexit import uuid from collections.abc import Mapping from pathlib import Path @@ -5,8 +6,11 @@ from openhands.sdk.agent.base import AgentBase from openhands.sdk.conversation.base import BaseConversation from openhands.sdk.conversation.exceptions import ConversationRunError -from openhands.sdk.conversation.secrets_manager import SecretValue -from openhands.sdk.conversation.state import AgentExecutionStatus, ConversationState +from openhands.sdk.conversation.secret_registry import SecretValue +from openhands.sdk.conversation.state import ( + ConversationExecutionStatus, + ConversationState, +) from openhands.sdk.conversation.stuck_detector import StuckDetector from openhands.sdk.conversation.title_utils import generate_conversation_title from openhands.sdk.conversation.types import ConversationCallbackType, ConversationID @@ -22,6 +26,12 @@ from openhands.sdk.llm import LLM, Message, TextContent from openhands.sdk.llm.llm_registry import LLMRegistry from openhands.sdk.logger import get_logger +from openhands.sdk.observability.laminar import ( + end_active_span, + observe, + should_enable_observability, + start_active_span, +) from openhands.sdk.security.confirmation_policy import ( ConfirmationPolicyBase, ) @@ -40,6 +50,7 @@ class LocalConversation(BaseConversation): max_iteration_per_run: int _stuck_detector: StuckDetector | None llm_registry: LLMRegistry + _cleanup_initiated: bool def __init__( self, @@ -133,6 +144,11 @@ def _default_callback(e): secret_values: dict[str, SecretValue] = {k: v for k, v in secrets.items()} self.update_secrets(secret_values) + self._cleanup_initiated = False + atexit.register(self.close) + if should_enable_observability(): + start_active_span("conversation", session_id=str(desired_id)) + @property def id(self) -> ConversationID: """Get the unique ID of the conversation.""" @@ -158,6 +174,7 @@ def stuck_detector(self) -> StuckDetector | None: """Get the stuck detector instance if enabled.""" return self._stuck_detector + @observe(name="conversation.send_message") def send_message(self, message: str | Message) -> None: """Send a message to the agent. @@ -173,9 +190,9 @@ def send_message(self, message: str | Message) -> None: "Only user messages are allowed to be sent to the agent." ) with self._state: - if self._state.agent_status == AgentExecutionStatus.FINISHED: - self._state.agent_status = ( - AgentExecutionStatus.IDLE + if self._state.execution_status == ConversationExecutionStatus.FINISHED: + self._state.execution_status = ( + ConversationExecutionStatus.IDLE ) # now we have a new message # TODO: We should add test cases for all these scenarios @@ -209,6 +226,7 @@ def send_message(self, message: str | Message) -> None: ) self._on_event(user_msg_event) + @observe(name="conversation.run") def run(self) -> None: """Runs the conversation until the agent finishes.
@@ -223,11 +241,12 @@ def run(self) -> None: """ with self._state: - if self._state.agent_status in [ - AgentExecutionStatus.IDLE, - AgentExecutionStatus.PAUSED, + if self._state.execution_status in [ + ConversationExecutionStatus.IDLE, + ConversationExecutionStatus.PAUSED, + ConversationExecutionStatus.ERROR, ]: - self._state.agent_status = AgentExecutionStatus.RUNNING + self._state.execution_status = ConversationExecutionStatus.RUNNING iteration = 0 try: @@ -237,10 +256,10 @@ def run(self) -> None: # Pause attempts to acquire the state lock # Before value can be modified step can be taken # Ensure step conditions are checked when lock is already acquired - if self._state.agent_status in [ - AgentExecutionStatus.FINISHED, - AgentExecutionStatus.PAUSED, - AgentExecutionStatus.STUCK, + if self._state.execution_status in [ + ConversationExecutionStatus.FINISHED, + ConversationExecutionStatus.PAUSED, + ConversationExecutionStatus.STUCK, ]: break @@ -250,15 +269,19 @@ def run(self) -> None: if is_stuck: logger.warning("Stuck pattern detected.") - self._state.agent_status = AgentExecutionStatus.STUCK + self._state.execution_status = ( + ConversationExecutionStatus.STUCK + ) continue # clear the flag before calling agent.step() (user approved) if ( - self._state.agent_status - == AgentExecutionStatus.WAITING_FOR_CONFIRMATION + self._state.execution_status + == ConversationExecutionStatus.WAITING_FOR_CONFIRMATION ): - self._state.agent_status = AgentExecutionStatus.RUNNING + self._state.execution_status = ( + ConversationExecutionStatus.RUNNING + ) # step must mutate the SAME state object self.agent.step(self, on_event=self._on_event) @@ -273,14 +296,17 @@ def run(self) -> None: # 4. Run loop continues to next iteration and processes the message # 5. Without this design, concurrent messages would be lost if ( - self.state.agent_status - == AgentExecutionStatus.WAITING_FOR_CONFIRMATION + self.state.execution_status + == ConversationExecutionStatus.WAITING_FOR_CONFIRMATION or iteration >= self.max_iteration_per_run ): break except Exception as e: + self._state.execution_status = ConversationExecutionStatus.ERROR # Re-raise with conversation id for better UX; include original traceback raise ConversationRunError(self._state.id, e) from e + finally: + end_active_span() def set_confirmation_policy(self, policy: ConfirmationPolicyBase) -> None: """Set the confirmation policy and store it in conversation state.""" @@ -299,10 +325,10 @@ def reject_pending_actions(self, reason: str = "User rejected the action") -> No with self._state: # Always clear the agent_waiting_for_confirmation flag if ( - self._state.agent_status - == AgentExecutionStatus.WAITING_FOR_CONFIRMATION + self._state.execution_status + == ConversationExecutionStatus.WAITING_FOR_CONFIRMATION ): - self._state.agent_status = AgentExecutionStatus.IDLE + self._state.execution_status = ConversationExecutionStatus.IDLE if not pending_actions: logger.warning("No pending actions to reject") @@ -330,16 +356,16 @@ def pause(self) -> None: effect until the current LLM call completes. 
""" - if self._state.agent_status == AgentExecutionStatus.PAUSED: + if self._state.execution_status == ConversationExecutionStatus.PAUSED: return with self._state: # Only pause when running or idle if ( - self._state.agent_status == AgentExecutionStatus.IDLE - or self._state.agent_status == AgentExecutionStatus.RUNNING + self._state.execution_status == ConversationExecutionStatus.IDLE + or self._state.execution_status == ConversationExecutionStatus.RUNNING ): - self._state.agent_status = AgentExecutionStatus.PAUSED + self._state.execution_status = ConversationExecutionStatus.PAUSED pause_event = PauseEvent() self._on_event(pause_event) logger.info("Agent execution pause requested") @@ -353,13 +379,17 @@ def update_secrets(self, secrets: Mapping[str, SecretValue]) -> None: when a command references the secret key. """ - secrets_manager = self._state.secrets_manager - secrets_manager.update_secrets(secrets) + secret_registry = self._state.secret_registry + secret_registry.update_secrets(secrets) logger.info(f"Added {len(secrets)} secrets to conversation") def close(self) -> None: """Close the conversation and clean up all tool executors.""" + if self._cleanup_initiated: + return + self._cleanup_initiated = True logger.debug("Closing conversation and cleaning up tool executors") + end_active_span() for tool in self.agent.tools_map.values(): try: executable_tool = tool.as_executable() @@ -370,6 +400,7 @@ def close(self) -> None: except Exception as e: logger.warning(f"Error closing executor for tool '{tool.name}': {e}") + @observe(name="conversation.generate_title", ignore_inputs=["llm"]) def generate_title(self, llm: LLM | None = None, max_length: int = 50) -> str: """Generate a title for the conversation based on the first user message. diff --git a/openhands-sdk/openhands/sdk/conversation/impl/remote_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/remote_conversation.py index 7acd064558..075713f540 100644 --- a/openhands-sdk/openhands/sdk/conversation/impl/remote_conversation.py +++ b/openhands-sdk/openhands/sdk/conversation/impl/remote_conversation.py @@ -14,8 +14,8 @@ from openhands.sdk.conversation.conversation_stats import ConversationStats from openhands.sdk.conversation.events_list_base import EventsListBase from openhands.sdk.conversation.exceptions import ConversationRunError -from openhands.sdk.conversation.secrets_manager import SecretValue -from openhands.sdk.conversation.state import AgentExecutionStatus +from openhands.sdk.conversation.secret_registry import SecretValue +from openhands.sdk.conversation.state import ConversationExecutionStatus from openhands.sdk.conversation.types import ConversationCallbackType, ConversationID from openhands.sdk.conversation.visualizer import ( ConversationVisualizer, @@ -28,6 +28,12 @@ ) from openhands.sdk.llm import LLM, Message, TextContent from openhands.sdk.logger import get_logger +from openhands.sdk.observability.laminar import ( + end_active_span, + observe, + should_enable_observability, + start_active_span, +) from openhands.sdk.security.confirmation_policy import ( ConfirmationPolicyBase, ) @@ -309,26 +315,26 @@ def id(self) -> ConversationID: return uuid.UUID(self._conversation_id) @property - def agent_status(self) -> AgentExecutionStatus: - """The current agent execution status.""" + def execution_status(self) -> ConversationExecutionStatus: + """The current conversation execution status.""" info = self._get_conversation_info() - status_str = info.get("agent_status", None) + status_str = 
info.get("execution_status") if status_str is None: raise RuntimeError( - "agent_status missing in conversation info: " + str(info) + "execution_status missing in conversation info: " + str(info) ) - return AgentExecutionStatus(status_str) + return ConversationExecutionStatus(status_str) - @agent_status.setter - def agent_status(self, value: AgentExecutionStatus) -> None: - """Set agent status is No-OP for RemoteConversation. + @execution_status.setter + def execution_status(self, value: ConversationExecutionStatus) -> None: + """Set execution status is No-OP for RemoteConversation. - # For remote conversations, agent status is managed server-side + # For remote conversations, execution status is managed server-side # This setter is provided for test compatibility but doesn't actually change remote state # noqa: E501 """ # noqa: E501 raise NotImplementedError( - f"Setting agent_status on RemoteState has no effect. " - f"Remote agent status is managed server-side. Attempted to set: {value}" + f"Setting execution_status on RemoteState has no effect. " + f"Remote execution status is managed server-side. Attempted to set: {value}" ) @property @@ -507,6 +513,9 @@ def __init__( secret_values: dict[str, SecretValue] = {k: v for k, v in secrets.items()} self.update_secrets(secret_values) + if should_enable_observability(): + start_active_span("conversation", session_id=str(self._id)) + @property def id(self) -> ConversationID: return self._id @@ -532,6 +541,7 @@ def stuck_detector(self): " since it would be handled server-side." ) + @observe(name="conversation.send_message") def send_message(self, message: str | Message) -> None: if isinstance(message, str): message = Message(role="user", content=[TextContent(text=message)]) @@ -547,6 +557,7 @@ def send_message(self, message: str | Message) -> None: self._client, "POST", f"/api/conversations/{self._id}/events", json=payload ) + @observe(name="conversation.run") def run(self) -> None: # Trigger a run on the server using the dedicated run endpoint. # Let the server tell us if it's already running (409), avoiding an extra GET. @@ -604,6 +615,7 @@ def update_secrets(self, secrets: Mapping[str, SecretValue]) -> None: self._client, "POST", f"/api/conversations/{self._id}/secrets", json=payload ) + @observe(name="conversation.generate_title", ignore_inputs=["llm"]) def generate_title(self, llm: LLM | None = None, max_length: int = 50) -> str: """Generate a title for the conversation based on the first user message. 
@@ -641,6 +653,8 @@ def close(self) -> None: except Exception: pass + end_active_span() + try: self._client.close() except Exception: diff --git a/openhands-sdk/openhands/sdk/conversation/response_utils.py b/openhands-sdk/openhands/sdk/conversation/response_utils.py index 63991fa9ac..ac1891f244 100644 --- a/openhands-sdk/openhands/sdk/conversation/response_utils.py +++ b/openhands-sdk/openhands/sdk/conversation/response_utils.py @@ -5,7 +5,7 @@ from openhands.sdk.event import ActionEvent, MessageEvent from openhands.sdk.event.base import Event from openhands.sdk.llm.message import content_to_str -from openhands.sdk.tool.builtins.finish import FinishAction +from openhands.sdk.tool.builtins.finish import FinishAction, FinishTool def get_agent_final_response(events: Sequence[Event]) -> str: @@ -27,7 +27,7 @@ def get_agent_final_response(events: Sequence[Event]) -> str: if ( isinstance(event, ActionEvent) and event.source == "agent" - and event.tool_name == "finish" + and event.tool_name == FinishTool.name ): # Extract message from finish tool call if event.action is not None and isinstance(event.action, FinishAction): diff --git a/openhands-sdk/openhands/sdk/conversation/secrets_manager.py b/openhands-sdk/openhands/sdk/conversation/secret_registry.py similarity index 97% rename from openhands-sdk/openhands/sdk/conversation/secrets_manager.py rename to openhands-sdk/openhands/sdk/conversation/secret_registry.py index d24c4e34bd..34c5ec4f9b 100644 --- a/openhands-sdk/openhands/sdk/conversation/secrets_manager.py +++ b/openhands-sdk/openhands/sdk/conversation/secret_registry.py @@ -17,10 +17,10 @@ SecretValue = str | SecretSource -class SecretsManager(OpenHandsModel): +class SecretRegistry(OpenHandsModel): """Manages secrets and injects them into bash commands when needed. - The secrets manager stores a mapping of secret keys to SecretSources + The secret registry stores a mapping of secret keys to SecretSources that retrieve the actual secret values. When a bash command is about to be executed, it scans the command for any secret keys and injects the corresponding environment variables. 
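Because the SecretsManager → SecretRegistry rename touches the public API, a minimal caller-side sketch may help; the setup mirrors the Conversation docstring example above, and the token value is a placeholder:

# Sketch: registering secrets under the renamed API. The registry injects
# GITHUB_TOKEN as an environment variable only when an executed bash command
# references that key (per the SecretRegistry docstring above).
from pydantic import SecretStr

from openhands.sdk import LLM, Agent, Conversation

llm = LLM(model="claude-sonnet-4-20250514", api_key=SecretStr("key"))
agent = Agent(llm=llm, tools=[])
conversation = Conversation(agent=agent, workspace="./workspace")

# SecretValue is `str | SecretSource`; a plain string is the simplest form.
conversation.update_secrets({"GITHUB_TOKEN": "ghp_placeholder"})

Persisted conversation states that still serialize the old secrets_manager field continue to validate, thanks to the AliasChoices validation alias added in state.py below.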
diff --git a/openhands-sdk/openhands/sdk/conversation/state.py b/openhands-sdk/openhands/sdk/conversation/state.py index 6d48dbfbd5..ddfdd0a793 100644 --- a/openhands-sdk/openhands/sdk/conversation/state.py +++ b/openhands-sdk/openhands/sdk/conversation/state.py @@ -4,14 +4,14 @@ from enum import Enum from typing import Any, Self -from pydantic import Field, PrivateAttr +from pydantic import AliasChoices, Field, PrivateAttr from openhands.sdk.agent.base import AgentBase from openhands.sdk.conversation.conversation_stats import ConversationStats from openhands.sdk.conversation.event_store import EventLog from openhands.sdk.conversation.fifo_lock import FIFOLock from openhands.sdk.conversation.persistence_const import BASE_STATE, EVENTS_DIR -from openhands.sdk.conversation.secrets_manager import SecretsManager +from openhands.sdk.conversation.secret_registry import SecretRegistry from openhands.sdk.conversation.types import ConversationCallbackType, ConversationID from openhands.sdk.event import ActionEvent, ObservationEvent, UserRejectObservation from openhands.sdk.event.base import Event @@ -28,18 +28,18 @@ logger = get_logger(__name__) -class AgentExecutionStatus(str, Enum): - """Enum representing the current execution state of the agent.""" +class ConversationExecutionStatus(str, Enum): + """Enum representing the current execution state of the conversation.""" - IDLE = "idle" # Agent is ready to receive tasks - RUNNING = "running" # Agent is actively processing - PAUSED = "paused" # Agent execution is paused by user + IDLE = "idle" # Conversation is ready to receive tasks + RUNNING = "running" # Conversation is actively processing + PAUSED = "paused" # Conversation execution is paused by user WAITING_FOR_CONFIRMATION = ( - "waiting_for_confirmation" # Agent is waiting for user confirmation + "waiting_for_confirmation" # Conversation is waiting for user confirmation ) - FINISHED = "finished" # Agent has completed the current task - ERROR = "error" # Agent encountered an error (optional for future use) - STUCK = "stuck" # Agent is stuck in a loop or unable to proceed + FINISHED = "finished" # Conversation has completed the current task + ERROR = "error" # Conversation encountered an error while running + STUCK = "stuck" # Conversation is stuck in a loop or unable to proceed class ConversationState(OpenHandsModel): @@ -77,7 +77,9 @@ class ConversationState(OpenHandsModel): ) # Enum-based state management - agent_status: AgentExecutionStatus = Field(default=AgentExecutionStatus.IDLE) + execution_status: ConversationExecutionStatus = Field( + default=ConversationExecutionStatus.IDLE + ) confirmation_policy: ConfirmationPolicyBase = NeverConfirm() activated_knowledge_skills: list[str] = Field( @@ -91,10 +93,12 @@ class ConversationState(OpenHandsModel): ) description="Conversation statistics for tracking LLM metrics", ) - # Secrets manager for handling sensitive data (changed from private attribute) - secrets_manager: SecretsManager = Field( - default_factory=SecretsManager, - description="Manager for handling secrets and sensitive data", + # Secret registry for handling sensitive data + secret_registry: SecretRegistry = Field( + default_factory=SecretRegistry, + description="Registry for handling secrets and sensitive data", + validation_alias=AliasChoices("secret_registry", "secrets_manager"), + serialization_alias="secret_registry", ) # ===== Private attrs (NOT Fields) ===== diff --git a/openhands-sdk/openhands/sdk/conversation/visualizer.py
b/openhands-sdk/openhands/sdk/conversation/visualizer.py index d4c70168f1..1029344e0c 100644 --- a/openhands-sdk/openhands/sdk/conversation/visualizer.py +++ b/openhands-sdk/openhands/sdk/conversation/visualizer.py @@ -199,15 +199,24 @@ def _create_event_panel(self, event: Event) -> Panel | None: } role_color = role_colors.get(event.llm_message.role, "white") - # Use "to" for user messages (user sending to agent) - # and "from" for assistant messages - direction = "to" if event.llm_message.role == "user" else "from" - title_text = f"[bold {role_color}]Message {direction} " - if self._name_for_visualization: - title_text += f"{self._name_for_visualization} Agent" + # "User Message To [Name] Agent" for user + # "Message from [Name] Agent" for agent + agent_name = ( + f"{self._name_for_visualization} " + if self._name_for_visualization + else "" + ) + + if event.llm_message.role == "user": + title_text = ( + f"[bold {role_color}]User Message to " + f"{agent_name}Agent[/bold {role_color}]" + ) else: - title_text += event.source.capitalize() - title_text += f"[/bold {role_color}]" + title_text = ( + f"[bold {role_color}]Message from " + f"{agent_name}Agent[/bold {role_color}]" + ) return Panel( content, title=title_text, @@ -285,14 +294,14 @@ def _format_metrics_subtitle(self) -> str | None: def abbr(n: int | float) -> str: n = int(n or 0) if n >= 1_000_000_000: - s = f"{n / 1_000_000_000:.2f}B" + val, suffix = n / 1_000_000_000, "B" elif n >= 1_000_000: - s = f"{n / 1_000_000:.2f}M" + val, suffix = n / 1_000_000, "M" elif n >= 1_000: - s = f"{n / 1_000:.2f}K" + val, suffix = n / 1_000, "K" else: return str(n) - return s.replace(".0", "") + return f"{val:.2f}".rstrip("0").rstrip(".") + suffix input_tokens = abbr(usage.prompt_tokens or 0) output_tokens = abbr(usage.completion_tokens or 0) @@ -304,7 +313,7 @@ def abbr(n: int | float) -> str: reasoning_tokens = usage.reasoning_tokens or 0 # Cost - cost_str = f"{cost:.4f}" if cost > 0 else "$0.00" + cost_str = f"{cost:.4f}" if cost > 0 else "0.00" # Build with fixed color scheme parts: list[str] = [] diff --git a/openhands-sdk/openhands/sdk/event/base.py b/openhands-sdk/openhands/sdk/event/base.py index 106f729988..9f0f3024e9 100644 --- a/openhands-sdk/openhands/sdk/event/base.py +++ b/openhands-sdk/openhands/sdk/event/base.py @@ -108,7 +108,8 @@ def events_to_messages(events: list["LLMConvertibleEvent"]) -> list[Message]: # Look ahead for related events j = i + 1 while j < len(events) and isinstance(events[j], ActionEvent): - event = events[j] # Now type checker knows this is ActionEvent + event = events[j] + assert isinstance(event, ActionEvent) # for type checker if event.llm_response_id != response_id: break batch_events.append(event) diff --git a/openhands-sdk/openhands/sdk/event/condenser.py b/openhands-sdk/openhands/sdk/event/condenser.py index ceda5eb53a..eb4b98e054 100644 --- a/openhands-sdk/openhands/sdk/event/condenser.py +++ b/openhands-sdk/openhands/sdk/event/condenser.py @@ -25,6 +25,11 @@ class Condensation(Event): description="An optional offset to the start of the resulting view" " indicating where the summary should be inserted.", ) + llm_response_id: EventID = Field( + description=( + "Completion or Response ID of the LLM response that generated this event" + ), + ) source: SourceType = "environment" diff --git a/openhands-sdk/openhands/sdk/event/llm_convertible/action.py b/openhands-sdk/openhands/sdk/event/llm_convertible/action.py index 507bf56f85..d2bd9ada30 100644 --- 
a/openhands-sdk/openhands/sdk/event/llm_convertible/action.py +++ b/openhands-sdk/openhands/sdk/event/llm_convertible/action.py @@ -3,8 +3,8 @@ from pydantic import Field from rich.text import Text -from openhands.sdk.event.base import N_CHAR_PREVIEW, LLMConvertibleEvent -from openhands.sdk.event.types import EventID, SourceType, ToolCallID +from openhands.sdk.event.base import N_CHAR_PREVIEW, EventID, LLMConvertibleEvent +from openhands.sdk.event.types import SourceType, ToolCallID from openhands.sdk.llm import ( Message, MessageToolCall, @@ -52,11 +52,11 @@ class ActionEvent(LLMConvertibleEvent): ), ) llm_response_id: EventID = Field( - ..., description=( - "Groups related actions from same LLM response. This helps in tracking " - "and managing results of parallel function calling from the same LLM " - "response." + "Completion or Response ID of the LLM response that generated this event. " + "E.g., can be used to group related actions from the same LLM response. " + "This helps in tracking and managing results of parallel function calling " + "from the same LLM response." ), ) diff --git a/openhands-sdk/openhands/sdk/event/llm_convertible/message.py b/openhands-sdk/openhands/sdk/event/llm_convertible/message.py index 24b489d205..861a34e095 100644 --- a/openhands-sdk/openhands/sdk/event/llm_convertible/message.py +++ b/openhands-sdk/openhands/sdk/event/llm_convertible/message.py @@ -5,7 +5,7 @@ from pydantic import ConfigDict, Field from rich.text import Text -from openhands.sdk.event.base import N_CHAR_PREVIEW, LLMConvertibleEvent +from openhands.sdk.event.base import N_CHAR_PREVIEW, EventID, LLMConvertibleEvent from openhands.sdk.event.types import SourceType from openhands.sdk.llm import ( ImageContent, @@ -28,6 +28,13 @@ class MessageEvent(LLMConvertibleEvent): llm_message: Message = Field( ..., description="The exact LLM message for this message event" ) + llm_response_id: EventID | None = Field( + default=None, + description=( + "Completion or Response ID of the LLM response that generated this event. " + "If the source != 'agent', this field is None." + ), + ) # context extensions stuff / skill can go here activated_skills: list[str] = Field( diff --git a/openhands-sdk/openhands/sdk/io/local.py b/openhands-sdk/openhands/sdk/io/local.py index 00d0de09db..40a192b003 100644 --- a/openhands-sdk/openhands/sdk/io/local.py +++ b/openhands-sdk/openhands/sdk/io/local.py @@ -2,6 +2,7 @@ import shutil from openhands.sdk.logger import get_logger +from openhands.sdk.observability.laminar import observe from .base import FileStore @@ -33,6 +34,7 @@ def get_full_path(self, path: str) -> str: raise ValueError(f"path escapes filestore root: {path}") return full + @observe(name="LocalFileStore.write", span_type="TOOL") def write(self, path: str, contents: str | bytes) -> None: full_path = self.get_full_path(path) os.makedirs(os.path.dirname(full_path), exist_ok=True) @@ -48,6 +50,7 @@ def read(self, path: str) -> str: with open(full_path, encoding="utf-8") as f: return f.read() + @observe(name="LocalFileStore.list", span_type="TOOL") def list(self, path: str) -> list[str]: full_path = self.get_full_path(path) if not os.path.exists(full_path): @@ -62,6 +65,7 @@ def list(self, path: str) -> list[str]: files = [f + "/" if os.path.isdir(self.get_full_path(f)) else f for f in files] return files + @observe(name="LocalFileStore.delete", span_type="TOOL") def delete(self, path: str) -> None: try: full_path = self.get_full_path(path) diff --git a/openhands-sdk/openhands/sdk/llm/exceptions.py
b/openhands-sdk/openhands/sdk/llm/exceptions.py deleted file mode 100644 index dc6c37ef4a..0000000000 --- a/openhands-sdk/openhands/sdk/llm/exceptions.py +++ /dev/null @@ -1,110 +0,0 @@ -class LLMError(Exception): - """Base class for all LLM-related exceptions.""" - - message: str - - def __init__(self, message: str) -> None: - super().__init__(message) - self.message = message - - def __str__(self) -> str: - return self.message - - -class LLMMalformedActionError(LLMError): - """Exception raised when the LLM response is malformed or does not conform to the expected format.""" # noqa: E501 - - def __init__(self, message: str = "Malformed response") -> None: - super().__init__(message) - - -class LLMNoActionError(LLMError): - """Exception raised when the LLM response does not include an action.""" - - def __init__(self, message: str = "Agent must return an action") -> None: - super().__init__(message) - - -class LLMResponseError(LLMError): - """Exception raised when the LLM response does not include an action or the action is not of the expected type.""" # noqa: E501 - - def __init__( - self, message: str = "Failed to retrieve action from LLM response" - ) -> None: - super().__init__(message) - - -class LLMNoResponseError(LLMError): - """Exception raised when the LLM does not return a response, typically seen in - Gemini models. - - This exception should be retried - Typically, after retry with a non-zero temperature, the LLM will return a response - """ - - def __init__( - self, - message: str = "LLM did not return a response. This is only seen in Gemini models so far.", # noqa: E501 - ) -> None: - super().__init__(message) - - -class LLMContextWindowExceedError(LLMError): - def __init__( - self, - message: str = "Conversation history longer than LLM context window limit. Consider turning on enable_history_truncation config to avoid this error", # noqa: E501 - ) -> None: - super().__init__(message) - - -# ============================================ -# LLM function calling Exceptions -# ============================================ - - -class FunctionCallConversionError(LLMError): - """Exception raised when FunctionCallingConverter failed to convert a non-function - call message to a function call message. - - This typically happens when there's a malformed message (e.g., missing - tags). But not due to LLM output. - """ - - def __init__(self, message: str) -> None: - super().__init__(message) - - -class FunctionCallValidationError(LLMError): - """Exception raised when FunctionCallingConverter failed to validate a function - call message. - - This typically happens when the LLM outputs unrecognized function call / - parameter names / values. - """ - - def __init__(self, message: str) -> None: - super().__init__(message) - - -class FunctionCallNotExistsError(LLMError): - """Exception raised when an LLM call a tool that is not registered.""" - - def __init__(self, message: str) -> None: - super().__init__(message) - - -# ============================================ -# Other Exceptions -# ============================================ - - -class UserCancelledError(Exception): - def __init__(self, message: str = "User cancelled the request") -> None: - super().__init__(message) - - -class OperationCancelled(Exception): - """Exception raised when an operation is cancelled (e.g. 
by a keyboard interrupt).""" # noqa: E501 - - def __init__(self, message: str = "Operation was cancelled") -> None: - super().__init__(message) diff --git a/openhands-sdk/openhands/sdk/llm/exceptions/__init__.py b/openhands-sdk/openhands/sdk/llm/exceptions/__init__.py new file mode 100644 index 0000000000..f933c02015 --- /dev/null +++ b/openhands-sdk/openhands/sdk/llm/exceptions/__init__.py @@ -0,0 +1,45 @@ +from .classifier import is_context_window_exceeded, looks_like_auth_error +from .mapping import map_provider_exception +from .types import ( + FunctionCallConversionError, + FunctionCallNotExistsError, + FunctionCallValidationError, + LLMAuthenticationError, + LLMBadRequestError, + LLMContextWindowExceedError, + LLMError, + LLMMalformedActionError, + LLMNoActionError, + LLMNoResponseError, + LLMRateLimitError, + LLMResponseError, + LLMServiceUnavailableError, + LLMTimeoutError, + OperationCancelled, + UserCancelledError, +) + + +__all__ = [ + # Types + "LLMError", + "LLMMalformedActionError", + "LLMNoActionError", + "LLMResponseError", + "FunctionCallConversionError", + "FunctionCallValidationError", + "FunctionCallNotExistsError", + "LLMNoResponseError", + "LLMContextWindowExceedError", + "LLMAuthenticationError", + "LLMRateLimitError", + "LLMTimeoutError", + "LLMServiceUnavailableError", + "LLMBadRequestError", + "UserCancelledError", + "OperationCancelled", + # Helpers + "is_context_window_exceeded", + "looks_like_auth_error", + "map_provider_exception", +] diff --git a/openhands-sdk/openhands/sdk/llm/exceptions/classifier.py b/openhands-sdk/openhands/sdk/llm/exceptions/classifier.py new file mode 100644 index 0000000000..7f49bd39b3 --- /dev/null +++ b/openhands-sdk/openhands/sdk/llm/exceptions/classifier.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from litellm.exceptions import BadRequestError, ContextWindowExceededError, OpenAIError + +from .types import LLMContextWindowExceedError + + +# Minimal, provider-agnostic context-window detection +LONG_PROMPT_PATTERNS: list[str] = [ + "contextwindowexceedederror", + "prompt is too long", + "input length and `max_tokens` exceed context limit", + "please reduce the length of", + "the request exceeds the available context size", + "context length exceeded", +] + + +def is_context_window_exceeded(exception: Exception) -> bool: + if isinstance(exception, (ContextWindowExceededError, LLMContextWindowExceedError)): + return True + + if not isinstance(exception, (BadRequestError, OpenAIError)): + return False + + s = str(exception).lower() + return any(p in s for p in LONG_PROMPT_PATTERNS) + + +AUTH_PATTERNS: list[str] = [ + "invalid api key", + "unauthorized", + "missing api key", + "invalid authentication", + "access denied", +] + + +def looks_like_auth_error(exception: Exception) -> bool: + if not isinstance(exception, (BadRequestError, OpenAIError)): + return False + s = str(exception).lower() + if any(p in s for p in AUTH_PATTERNS): + return True + # Some providers include explicit status codes in message text + for code in ("status 401", "status 403"): + if code in s: + return True + return False diff --git a/openhands-sdk/openhands/sdk/llm/exceptions/mapping.py b/openhands-sdk/openhands/sdk/llm/exceptions/mapping.py new file mode 100644 index 0000000000..8510eaaa57 --- /dev/null +++ b/openhands-sdk/openhands/sdk/llm/exceptions/mapping.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from litellm.exceptions import ( + APIConnectionError, + BadRequestError, + InternalServerError, + RateLimitError, + 
ServiceUnavailableError, + Timeout as LiteLLMTimeout, +) + +from .classifier import is_context_window_exceeded, looks_like_auth_error +from .types import ( + LLMAuthenticationError, + LLMBadRequestError, + LLMContextWindowExceedError, + LLMRateLimitError, + LLMServiceUnavailableError, + LLMTimeoutError, +) + + +def map_provider_exception(exception: Exception) -> Exception: + """ + Map provider/LiteLLM exceptions to SDK-typed exceptions. + + Returns original exception if no mapping applies. + """ + # Context window exceeded first (highest priority) + if is_context_window_exceeded(exception): + return LLMContextWindowExceedError(str(exception)) + + # Auth-like errors often appear as BadRequest/OpenAIError with specific text + if looks_like_auth_error(exception): + return LLMAuthenticationError(str(exception)) + + if isinstance(exception, RateLimitError): + return LLMRateLimitError(str(exception)) + + if isinstance(exception, LiteLLMTimeout): + return LLMTimeoutError(str(exception)) + + # Connectivity and service-side availability issues → service unavailable + if isinstance( + exception, (APIConnectionError, ServiceUnavailableError, InternalServerError) + ): + return LLMServiceUnavailableError(str(exception)) + + # Generic client-side 4xx errors + if isinstance(exception, BadRequestError): + return LLMBadRequestError(str(exception)) + + # Unknown: let caller re-raise original + return exception diff --git a/openhands-sdk/openhands/sdk/llm/exceptions/types.py b/openhands-sdk/openhands/sdk/llm/exceptions/types.py new file mode 100644 index 0000000000..e6d0522da9 --- /dev/null +++ b/openhands-sdk/openhands/sdk/llm/exceptions/types.py @@ -0,0 +1,101 @@ +class LLMError(Exception): + message: str + + def __init__(self, message: str) -> None: + super().__init__(message) + self.message = message + + def __str__(self) -> str: + return self.message + + +# General response parsing/validation errors +class LLMMalformedActionError(LLMError): + def __init__(self, message: str = "Malformed response") -> None: + super().__init__(message) + + +class LLMNoActionError(LLMError): + def __init__(self, message: str = "Agent must return an action") -> None: + super().__init__(message) + + +class LLMResponseError(LLMError): + def __init__( + self, message: str = "Failed to retrieve action from LLM response" + ) -> None: + super().__init__(message) + + +# Function-calling conversion/validation +class FunctionCallConversionError(LLMError): + def __init__(self, message: str) -> None: + super().__init__(message) + + +class FunctionCallValidationError(LLMError): + def __init__(self, message: str) -> None: + super().__init__(message) + + +class FunctionCallNotExistsError(LLMError): + def __init__(self, message: str) -> None: + super().__init__(message) + + +# Provider/transport related +class LLMNoResponseError(LLMError): + def __init__( + self, + message: str = ( + "LLM did not return a response. This is only seen in Gemini models so far." + ), + ) -> None: + super().__init__(message) + + +class LLMContextWindowExceedError(LLMError): + def __init__( + self, + message: str = ( + "Conversation history longer than LLM context window limit. " + "Consider enabling a condenser or shortening inputs." 
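Note: the mapping helper above is meant to be wrapped around the transport call, not invoked by user code directly. A minimal sketch of the intended call-site pattern, mirroring the `except` blocks added to `llm.py` later in this diff (`do_completion` is a hypothetical stand-in for the actual transport call):

from openhands.sdk.llm.exceptions import map_provider_exception


def call_with_typed_errors(do_completion):
    """Run a provider call, re-raising provider errors as SDK-typed ones."""
    try:
        return do_completion()
    except Exception as e:
        mapped = map_provider_exception(e)
        if mapped is not e:
            # Keep the original provider exception as the cause for debugging.
            raise mapped from e
        raise  # Unknown error type: propagate unchanged.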
+ ), + ) -> None: + super().__init__(message) + + +class LLMAuthenticationError(LLMError): + def __init__(self, message: str = "Invalid or missing API credentials") -> None: + super().__init__(message) + + +class LLMRateLimitError(LLMError): + def __init__(self, message: str = "Rate limit exceeded") -> None: + super().__init__(message) + + +class LLMTimeoutError(LLMError): + def __init__(self, message: str = "LLM request timed out") -> None: + super().__init__(message) + + +class LLMServiceUnavailableError(LLMError): + def __init__(self, message: str = "LLM service unavailable") -> None: + super().__init__(message) + + +class LLMBadRequestError(LLMError): + def __init__(self, message: str = "Bad request to LLM provider") -> None: + super().__init__(message) + + +# Other +class UserCancelledError(Exception): + def __init__(self, message: str = "User cancelled the request") -> None: + super().__init__(message) + + +class OperationCancelled(Exception): + def __init__(self, message: str = "Operation was cancelled") -> None: + super().__init__(message) diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py index 337a0c7fd0..bbdd3a6124 100644 --- a/openhands-sdk/openhands/sdk/llm/llm.py +++ b/openhands-sdk/openhands/sdk/llm/llm.py @@ -26,7 +26,7 @@ if TYPE_CHECKING: # type hints only, avoid runtime import cycle - from openhands.sdk.tool.tool import ToolBase + from openhands.sdk.tool.tool import ToolDefinition from openhands.sdk.utils.pydantic_diff import pretty_pydantic_diff @@ -44,10 +44,7 @@ ) from litellm.exceptions import ( APIConnectionError, - BadRequestError, - ContextWindowExceededError, InternalServerError, - OpenAIError, RateLimitError, ServiceUnavailableError, Timeout as LiteLLMTimeout, @@ -62,7 +59,10 @@ token_counter, ) -from openhands.sdk.llm.exceptions import LLMNoResponseError +from openhands.sdk.llm.exceptions import ( + LLMNoResponseError, + map_provider_exception, +) # OpenHands utilities from openhands.sdk.llm.llm_response import LLMResponse @@ -101,7 +101,23 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin): - """Refactored LLM: simple `completion()`, centralized Telemetry, tiny helpers.""" + """Language model interface for OpenHands agents. + + The LLM class provides a unified interface for interacting with various + language models through the litellm library. It handles model configuration, + API authentication, + retry logic, and tool calling capabilities. + + Example: + >>> from openhands.sdk import LLM + >>> from pydantic import SecretStr + >>> llm = LLM( + ... model="claude-sonnet-4-20250514", + ... api_key=SecretStr("your-api-key"), + ... usage_id="my-agent" + ... ) + >>> # Use with agent or conversation + """ # ========================================================================= # Config fields @@ -150,6 +166,10 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin): ge=1, description="The maximum number of output tokens. This is sent to the LLM.", ) + extra_headers: dict[str, str] | None = Field( + default=None, + description="Optional HTTP headers to forward to LiteLLM requests.", + ) input_cost_per_token: float | None = Field( default=None, ge=0, @@ -188,10 +208,9 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin): custom_tokenizer: str | None = Field( default=None, description="A custom tokenizer to use for token counting." ) - native_tool_calling: bool | None = Field( - default=None, - description="Whether to use native tool calling " - "if supported by the model. 
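With the typed hierarchy in place, application code can branch on error categories instead of pattern-matching provider messages. A short sketch, reusing the `llm` and `messages` objects from the docstring examples in this diff (the handler bodies are placeholders):

from openhands.sdk.llm.exceptions import (
    LLMContextWindowExceedError,
    LLMRateLimitError,
    LLMServiceUnavailableError,
)

try:
    response = llm.completion(messages)
except LLMContextWindowExceedError:
    ...  # e.g. condense or truncate history, then retry
except LLMRateLimitError:
    ...  # back off and retry later
except LLMServiceUnavailableError:
    ...  # transient provider/connectivity failure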
Can be True, False, or not set.", + native_tool_calling: bool = Field( + default=True, + description="Whether to use native tool calling.", ) reasoning_effort: Literal["low", "medium", "high", "none"] | None = Field( default=None, @@ -227,10 +246,12 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin): "telemetry, and spend tracking." ), ) - metadata: dict[str, Any] = Field( + litellm_extra_body: dict[str, Any] = Field( default_factory=dict, description=( - "Additional metadata for the LLM instance. " + "Additional key-value pairs to pass to litellm's extra_body parameter. " + "This is useful for custom inference clusters that need additional " + "metadata for logging, tracking, or routing purposes. " "Example structure: " "{'trace_version': '1.0.0', 'tags': ['model:gpt-4', 'agent:my-agent'], " "'session_id': 'session-123', 'trace_user_id': 'user-456'}" @@ -258,7 +279,6 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin): # Runtime-only private attrs _model_info: Any = PrivateAttr(default=None) _tokenizer: Any = PrivateAttr(default=None) - _function_calling_active: bool = PrivateAttr(default=False) _telemetry: Telemetry | None = PrivateAttr(default=None) model_config: ClassVar[ConfigDict] = ConfigDict( @@ -390,6 +410,15 @@ def service_id(self, value: str) -> None: @property def metrics(self) -> Metrics: + """Get usage metrics for this LLM instance. + + Returns: + Metrics object containing token usage, costs, and other statistics. + + Example: + >>> cost = llm.metrics.accumulated_cost + >>> print(f"Total cost: ${cost}") + """ assert self._metrics is not None, ( "Metrics should be initialized after model validation" ) @@ -402,14 +431,27 @@ def restore_metrics(self, metrics: Metrics) -> None: def completion( self, messages: list[Message], - tools: Sequence[ToolBase] | None = None, + tools: Sequence[ToolDefinition] | None = None, _return_metrics: bool = False, add_security_risk_prediction: bool = False, **kwargs, ) -> LLMResponse: - """Single entry point for LLM completion. + """Generate a completion from the language model. - Normalize → (maybe) mock tools → transport → postprocess. + This is the main method for getting responses from the model via the Completion API. + It handles message formatting, tool calling, and response processing. + + Returns: + LLMResponse containing the model's response and metadata. + + Raises: + ValueError: If streaming is requested (not supported). 
+ + Example: + >>> from openhands.sdk.llm import Message, TextContent + >>> messages = [Message(role="user", content=[TextContent(text="Hello")])] + >>> response = llm.completion(messages) + >>> print(response.content) """ # Check if streaming is requested if kwargs.get("stream", False): @@ -419,7 +461,7 @@ def completion( formatted_messages = self.format_messages_for_llm(messages) # 2) choose function-calling strategy - use_native_fc = self.is_function_calling_active() + use_native_fc = self.native_tool_calling original_fncall_msgs = copy.deepcopy(formatted_messages) # Convert Tool objects to ChatCompletionToolParam once here @@ -461,7 +503,6 @@ def completion( } if tools and not use_native_fc: log_ctx["raw_messages"] = original_fncall_msgs - self._telemetry.on_request(log_ctx=log_ctx) # 5) do the call with retries @self.retry_decorator( @@ -474,6 +515,7 @@ def completion( ) def _one_attempt(**retry_kwargs) -> ModelResponse: assert self._telemetry is not None + self._telemetry.on_request(log_ctx=log_ctx) # Merge retry-modified kwargs (like temperature) with call_kwargs final_kwargs = {**call_kwargs, **retry_kwargs} resp = self._transport_call(messages=formatted_messages, **final_kwargs) @@ -515,6 +557,9 @@ def _one_attempt(**retry_kwargs) -> ModelResponse: ) except Exception as e: self._telemetry.on_error(e) + mapped = map_provider_exception(e) + if mapped is not e: + raise mapped from e raise # ========================================================================= @@ -523,7 +568,7 @@ def _one_attempt(**retry_kwargs) -> ModelResponse: def responses( self, messages: list[Message], - tools: Sequence[ToolBase] | None = None, + tools: Sequence[ToolDefinition] | None = None, include: list[str] | None = None, store: bool | None = None, _return_metrics: bool = False, @@ -634,6 +679,9 @@ def _one_attempt(**retry_kwargs) -> ResponsesAPIResponse: ) except Exception as e: self._telemetry.on_error(e) + mapped = map_provider_exception(e) + if mapped is not e: + raise mapped from e raise # ========================================================================= @@ -666,7 +714,7 @@ def _transport_call( ret = litellm_completion( model=self.model, api_key=self.api_key.get_secret_value() if self.api_key else None, - base_url=self.base_url, + api_base=self.base_url, api_version=self.api_version, timeout=self.timeout, drop_params=self.drop_params, @@ -708,11 +756,12 @@ def _init_model_info_and_caps(self) -> None: if not base_url.startswith(("http://", "https://")): base_url = "http://" + base_url try: + headers = {} api_key = self.api_key.get_secret_value() if self.api_key else "" - response = httpx.get( - f"{base_url}/v1/model/info", - headers={"Authorization": f"Bearer {api_key}"}, - ) + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + + response = httpx.get(f"{base_url}/v1/model/info", headers=headers) data = response.json().get("data", []) current = next( ( @@ -729,7 +778,11 @@ def _init_model_info_and_caps(self) -> None: f"Got model info from litellm proxy: {self._model_info}" ) except Exception as e: - logger.debug(f"Error fetching model info from proxy: {e}") + logger.debug( + f"Error fetching model info from proxy: {e}", + exc_info=True, + stack_info=True, + ) # Fallbacks: try base name variants if not self._model_info: @@ -769,15 +822,6 @@ def _init_model_info_and_caps(self) -> None: elif isinstance(self._model_info.get("max_tokens"), int): self.max_output_tokens = self._model_info.get("max_tokens") - # Function-calling capabilities - feats = get_features(self.model) - 
logger.debug(f"Model features for {self.model}: {feats}") - self._function_calling_active = ( - self.native_tool_calling - if self.native_tool_calling is not None - else feats.supports_function_calling - ) - def vision_is_active(self) -> bool: with warnings.catch_warnings(): warnings.simplefilter("ignore") @@ -818,12 +862,6 @@ def is_caching_prompt_active(self) -> bool: # only Anthropic models need explicit caching breakpoints return self.caching_prompt and get_features(self.model).supports_prompt_cache - def is_function_calling_active(self) -> bool: - """Returns whether function calling is supported - and enabled for this LLM instance. - """ - return bool(self._function_calling_active) - def uses_responses_api(self) -> bool: """Whether this model uses the OpenAI Responses API path.""" @@ -864,11 +902,10 @@ def format_messages_for_llm(self, messages: list[Message]) -> list[dict]: for message in messages: message.cache_enabled = self.is_caching_prompt_active() message.vision_enabled = self.vision_is_active() - message.function_calling_enabled = self.is_function_calling_active() - if "deepseek" in self.model or ( - "kimi-k2-instruct" in self.model and "groq" in self.model - ): - message.force_string_serializer = True + message.function_calling_enabled = self.native_tool_calling + message.force_string_serializer = get_features( + self.model + ).force_string_serializer formatted_messages = [message.to_chat_dict() for message in messages] @@ -1032,51 +1069,3 @@ def resolve_diff_from_deserialized(self, persisted: LLM) -> LLM: f"Diff: {pretty_pydantic_diff(self, reconciled)}" ) return reconciled - - @staticmethod - def is_context_window_exceeded_exception(exception: Exception) -> bool: - """Check if the exception indicates a context window exceeded error. - - Context window exceeded errors vary by provider, and LiteLLM does not do a - consistent job of identifying and wrapping them. - """ - # A context window exceeded error from litellm is the best signal we have. - if isinstance(exception, ContextWindowExceededError): - return True - - # But with certain providers the exception might be a bad request or generic - # OpenAI error, and we have to use the content of the error to figure out what - # is wrong. - if not isinstance(exception, (BadRequestError, OpenAIError)): - return False - - # Not all BadRequestError or OpenAIError are context window exceeded errors, so - # we need to check the message content for known patterns. - error_string = str(exception).lower() - - known_exception_patterns: list[str] = [ - "contextwindowexceedederror", - "prompt is too long", - "input length and `max_tokens` exceed context limit", - "please reduce the length of", - "the request exceeds the available context size", - "context length exceeded", - ] - - if any(pattern in error_string for pattern in known_exception_patterns): - return True - - # A special case for SambaNova, where multiple patterns are needed - # simultaneously. - samba_nova_patterns: list[str] = [ - "sambanovaexception", - "maximum context length", - ] - - if all(pattern in error_string for pattern in samba_nova_patterns): - return True - - # If we've made it this far and haven't managed to positively ID it as a context - # window exceeded error, we'll have to assume it's not and rely on the call-site - # context to handle it appropriately. 
- return False diff --git a/openhands-sdk/openhands/sdk/llm/mixins/fn_call_converter.py b/openhands-sdk/openhands/sdk/llm/mixins/fn_call_converter.py index a3c4fe3faf..f6af281d96 100644 --- a/openhands-sdk/openhands/sdk/llm/mixins/fn_call_converter.py +++ b/openhands-sdk/openhands/sdk/llm/mixins/fn_call_converter.py @@ -33,8 +33,8 @@ class TextPart(TypedDict): Content = str | list[TextPart] -EXECUTE_BASH_TOOL_NAME = "execute_bash" -STR_REPLACE_EDITOR_TOOL_NAME = "str_replace_editor" +EXECUTE_BASH_TOOL_NAME = "bash" +STR_REPLACE_EDITOR_TOOL_NAME = "file_editor" BROWSER_TOOL_NAME = "browser" FINISH_TOOL_NAME = "finish" LLM_BASED_EDIT_TOOL_NAME = "edit_file" @@ -81,7 +81,7 @@ def refine_prompt(prompt: str) -> str: # Example snippets for each tool TOOL_EXAMPLES = { - "execute_bash": { + "bash": { "check_dir": """ ASSISTANT: Sure! Let me first check the current directory: @@ -205,7 +205,7 @@ def refine_prompt(prompt: str) -> str: The server is running on port 5000 with PID 126. You can access the list of numbers in a table format by visiting http://127.0.0.1:5000. """, # noqa: E501 }, - "str_replace_editor": { + "file_editor": { "create_file": """ ASSISTANT: There is no `app.py` file in the current directory. Let me create a Python file `app.py`: @@ -339,9 +339,9 @@ def get_example_for_tools(tools: list[ChatCompletionToolParam]) -> str: if tool["type"] == "function": name = tool["function"]["name"] if name == EXECUTE_BASH_TOOL_NAME: - available_tools.add("execute_bash") + available_tools.add("bash") elif name == STR_REPLACE_EDITOR_TOOL_NAME: - available_tools.add("str_replace_editor") + available_tools.add("file_editor") elif name == BROWSER_TOOL_NAME: available_tools.add("browser") elif name == FINISH_TOOL_NAME: @@ -361,30 +361,30 @@ def get_example_for_tools(tools: list[ChatCompletionToolParam]) -> str: """ # noqa: E501 # Build example based on available tools - if "execute_bash" in available_tools: - example += TOOL_EXAMPLES["execute_bash"]["check_dir"] + if "bash" in available_tools: + example += TOOL_EXAMPLES["bash"]["check_dir"] - if "str_replace_editor" in available_tools: - example += TOOL_EXAMPLES["str_replace_editor"]["create_file"] + if "file_editor" in available_tools: + example += TOOL_EXAMPLES["file_editor"]["create_file"] elif "edit_file" in available_tools: example += TOOL_EXAMPLES["edit_file"]["create_file"] - if "execute_bash" in available_tools: - example += TOOL_EXAMPLES["execute_bash"]["run_server"] + if "bash" in available_tools: + example += TOOL_EXAMPLES["bash"]["run_server"] if "browser" in available_tools: example += TOOL_EXAMPLES["browser"]["view_page"] - if "execute_bash" in available_tools: - example += TOOL_EXAMPLES["execute_bash"]["kill_server"] + if "bash" in available_tools: + example += TOOL_EXAMPLES["bash"]["kill_server"] - if "str_replace_editor" in available_tools: - example += TOOL_EXAMPLES["str_replace_editor"]["edit_file"] + if "file_editor" in available_tools: + example += TOOL_EXAMPLES["file_editor"]["edit_file"] elif "edit_file" in available_tools: example += TOOL_EXAMPLES["edit_file"]["edit_file"] - if "execute_bash" in available_tools: - example += TOOL_EXAMPLES["execute_bash"]["run_server_again"] + if "bash" in available_tools: + example += TOOL_EXAMPLES["bash"]["run_server_again"] if "finish" in available_tools: example += TOOL_EXAMPLES["finish"]["example"] diff --git a/openhands-sdk/openhands/sdk/llm/mixins/non_native_fc.py b/openhands-sdk/openhands/sdk/llm/mixins/non_native_fc.py index 9d9db4acf4..5f4c56e641 100644 --- 
a/openhands-sdk/openhands/sdk/llm/mixins/non_native_fc.py +++ b/openhands-sdk/openhands/sdk/llm/mixins/non_native_fc.py @@ -17,8 +17,7 @@ class _HostSupports(Protocol): model: str disable_stop_word: bool | None - - def is_function_calling_active(self) -> bool: ... + native_tool_calling: bool class NonNativeToolCallingMixin: @@ -27,13 +26,13 @@ class NonNativeToolCallingMixin: Host requirements: - self.model: str - self.disable_stop_word: bool | None - - self.is_function_calling_active() -> bool + - self.native_tool_calling -> bool """ def should_mock_tool_calls( self: _HostSupports, tools: list[ChatCompletionToolParam] | None ) -> bool: - return bool(tools) and not self.is_function_calling_active() + return bool(tools) and not self.native_tool_calling def pre_request_prompt_mock( self: _HostSupports, diff --git a/openhands-sdk/openhands/sdk/llm/options/chat_options.py b/openhands-sdk/openhands/sdk/llm/options/chat_options.py index 0f50921015..174be317f8 100644 --- a/openhands-sdk/openhands/sdk/llm/options/chat_options.py +++ b/openhands-sdk/openhands/sdk/llm/options/chat_options.py @@ -28,6 +28,10 @@ def select_chat_options( if "max_completion_tokens" in out: out["max_tokens"] = out.pop("max_completion_tokens") + # If user didn't set extra_headers, propagate from llm config + if llm.extra_headers is not None and "extra_headers" not in out: + out["extra_headers"] = dict(llm.extra_headers) + # Reasoning-model quirks if get_features(llm.model).supports_reasoning_effort: # Preferred: use reasoning_effort @@ -49,7 +53,12 @@ def select_chat_options( "budget_tokens": llm.extended_thinking_budget, } # Enable interleaved thinking - out["extra_headers"] = {"anthropic-beta": "interleaved-thinking-2025-05-14"} + # Merge default header with any user-provided headers; user wins on conflict + existing = out.get("extra_headers") or {} + out["extra_headers"] = { + "anthropic-beta": "interleaved-thinking-2025-05-14", + **existing, + } # Fix litellm behavior out["max_tokens"] = llm.max_output_tokens # Anthropic models ignore temp/top_p @@ -67,8 +76,12 @@ def select_chat_options( out.pop("tools", None) out.pop("tool_choice", None) + # Pass through litellm_extra_body if provided + if llm.litellm_extra_body: + out["extra_body"] = llm.litellm_extra_body # non litellm proxy special-case: keep `extra_body` off unless model requires it - if "litellm_proxy" not in llm.model: + # or user provided it + elif "litellm_proxy" not in llm.model: out.pop("extra_body", None) return out diff --git a/openhands-sdk/openhands/sdk/llm/options/responses_options.py b/openhands-sdk/openhands/sdk/llm/options/responses_options.py index f820664b79..e695f6491d 100644 --- a/openhands-sdk/openhands/sdk/llm/options/responses_options.py +++ b/openhands-sdk/openhands/sdk/llm/options/responses_options.py @@ -25,6 +25,10 @@ def select_responses_options( out["temperature"] = 1.0 out["tool_choice"] = "auto" + # If user didn't set extra_headers, propagate from llm config + if llm.extra_headers is not None and "extra_headers" not in out: + out["extra_headers"] = dict(llm.extra_headers) + # Store defaults to False (stateless) unless explicitly provided if store is not None: out["store"] = bool(store) @@ -43,4 +47,8 @@ def select_responses_options( effort = llm.reasoning_effort or "high" out["reasoning"] = {"effort": effort, "summary": "detailed"} + # Pass through litellm_extra_body if provided + if llm.litellm_extra_body: + out["extra_body"] = llm.litellm_extra_body + return out diff --git a/openhands-sdk/openhands/sdk/llm/router/base.py 
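Taken together, the new `extra_headers` and `litellm_extra_body` config fields flow from the `LLM` instance into every request through `select_chat_options`/`select_responses_options`. A construction sketch (the model ID and the header/body values are placeholders):

from pydantic import SecretStr

from openhands.sdk import LLM

llm = LLM(
    model="claude-sonnet-4-20250514",  # placeholder model ID
    api_key=SecretStr("your-api-key"),
    usage_id="my-agent",
    # Forwarded to LiteLLM unless the call site already set extra_headers;
    # for extended-thinking models the anthropic-beta header is merged in,
    # with user-provided values winning on conflict.
    extra_headers={"X-Request-Source": "ci"},
    # Passed through as LiteLLM's extra_body, e.g. for custom inference
    # clusters that want routing or tracking metadata.
    litellm_extra_body={"tags": ["agent:my-agent"]},
)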
b/openhands-sdk/openhands/sdk/llm/router/base.py index e6188a8562..cd908255e6 100644 --- a/openhands-sdk/openhands/sdk/llm/router/base.py +++ b/openhands-sdk/openhands/sdk/llm/router/base.py @@ -11,7 +11,7 @@ from openhands.sdk.llm.llm_response import LLMResponse from openhands.sdk.llm.message import Message from openhands.sdk.logger import get_logger -from openhands.sdk.tool.tool import ToolBase +from openhands.sdk.tool.tool import ToolDefinition logger = get_logger(__name__) @@ -49,7 +49,7 @@ def validate_llms_not_empty(cls, v): def completion( self, messages: list[Message], - tools: Sequence[ToolBase] | None = None, + tools: Sequence[ToolDefinition] | None = None, return_metrics: bool = False, add_security_risk_prediction: bool = False, **kwargs, diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py index e9f41c876a..e53d892085 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py +++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py @@ -1,115 +1,37 @@ from dataclasses import dataclass -from fnmatch import fnmatch - - -def normalize_model_name(model: str) -> str: - """Normalize a model string to a canonical, comparable name. - - Strategy: - - Trim whitespace - - Lowercase - - If there is a '/', keep only the basename after the last '/' - (handles prefixes like openrouter/, litellm_proxy/, anthropic/, etc.) - and treat ':' inside that basename as an Ollama-style variant tag to be removed - - There is no provider:model form; providers, when present, use 'provider/model' - - Drop a trailing "-gguf" suffix if present - - If basename starts with a known vendor prefix followed by '.', drop that prefix - (e.g., 'anthropic.claude-*' -> 'claude-*') - """ - raw = (model or "").strip().lower() - if "/" in raw: - name = raw.split("/")[-1] - if ":" in name: - # Drop Ollama-style variant tag in basename - name = name.split(":", 1)[0] - else: - # No '/', keep the whole raw name (we do not support provider:model) - name = raw - - # Drop common vendor prefixes embedded in the basename (bedrock style), once. - # Keep this list small and explicit to avoid accidental over-matching. - vendor_prefixes = { - "anthropic", - "meta", - "cohere", - "mistral", - "ai21", - "amazon", - } - if "." in name: - vendor, rest = name.split(".", 1) - if vendor in vendor_prefixes and rest: - name = rest - - if name.endswith("-gguf"): - name = name[: -len("-gguf")] - return name def model_matches(model: str, patterns: list[str]) -> bool: - """Return True if the model matches any of the glob patterns. + """Return True if any pattern appears as a substring in the raw model name. - If a pattern contains a '/', it is treated as provider-qualified and matched - against the full, lowercased model string (including provider prefix). - Otherwise, it is matched against the normalized basename. 
+ Matching semantics: + - Case-insensitive substring search on full raw model string """ raw = (model or "").strip().lower() - name = normalize_model_name(model) for pat in patterns: - pat_l = pat.lower() - if "/" in pat_l: - if fnmatch(raw, pat_l): - return True - else: - if fnmatch(name, pat_l): - return True + token = pat.strip().lower() + if token in raw: + return True return False @dataclass(frozen=True) class ModelFeatures: - supports_function_calling: bool supports_reasoning_effort: bool supports_extended_thinking: bool supports_prompt_cache: bool supports_stop_words: bool supports_responses_api: bool + force_string_serializer: bool # Pattern tables capturing current behavior. Keep patterns lowercase. -FUNCTION_CALLING_PATTERNS: list[str] = [ - # Anthropic families - "claude-3-7-sonnet*", - "claude-3.7-sonnet*", - "claude-sonnet-3-7-latest", - "claude-3-5-sonnet*", - "claude-3.5-haiku*", - "claude-3-5-haiku*", - "claude-sonnet-4*", - "claude-haiku-4*", - "claude-opus-4*", - # OpenAI families - "gpt-4o*", - "gpt-4.1", - "gpt-5*", - # o-series (keep exact o1 support per existing list) - "o1-2024-12-17", - "o3*", - "o4-mini", - # Google Gemini - "gemini-2.5-pro", - # Others - "kimi-k2-0711-preview", - "kimi-k2-instruct", - "qwen3-coder*", - "qwen3-coder-480b-a35b-instruct", -] REASONING_EFFORT_PATTERNS: list[str] = [ # Mirror main behavior exactly (no unintended expansion) "o1-2024-12-17", - "o1*", # Match all o1 variants including o1-preview - "o3*", # Match all o3 variants + "o1", + "o3", "o3-2025-04-16", "o3-mini-2025-01-31", "o3-mini", @@ -118,51 +40,68 @@ class ModelFeatures: "gemini-2.5-flash", "gemini-2.5-pro", # OpenAI GPT-5 family (includes mini variants) - "gpt-5*", + "gpt-5", ] EXTENDED_THINKING_PATTERNS: list[str] = [ # Anthropic model family # We did not include sonnet 3.7 and 4 here as they don't brings # significant performance improvements for agents - "claude-sonnet-4-5*", - "claude-haiku-4-5*", + "claude-sonnet-4-5", + "claude-haiku-4-5", ] PROMPT_CACHE_PATTERNS: list[str] = [ - "claude-3-7-sonnet*", - "claude-3.7-sonnet*", + "claude-3-7-sonnet", + "claude-3.7-sonnet", "claude-sonnet-3-7-latest", - "claude-3-5-sonnet*", - "claude-3.5-sonnet*", - "claude-3-5-haiku*", - "claude-3.5-haiku*", - "claude-3-haiku-20240307*", - "claude-3-opus-20240229*", - "claude-sonnet-4*", - "claude-opus-4*", + "claude-3-5-sonnet", + "claude-3.5-sonnet", + "claude-3-5-haiku", + "claude-3.5-haiku", + "claude-3-haiku-20240307", + "claude-3-opus-20240229", + "claude-sonnet-4", + "claude-opus-4", + # Anthropic Haiku 4.5 variants (dot and dash) + "claude-haiku-4.5", + "claude-haiku-4-5", ] SUPPORTS_STOP_WORDS_FALSE_PATTERNS: list[str] = [ # o1 family doesn't support stop words - "o1*", + "o1", # grok-4 specific model name (basename) "grok-4-0709", "grok-code-fast-1", # DeepSeek R1 family - "deepseek-r1-0528*", + "deepseek-r1-0528", ] # Models that should use the OpenAI Responses API path by default RESPONSES_API_PATTERNS: list[str] = [ # OpenAI GPT-5 family (includes mini variants) - "gpt-5*", + "gpt-5", + # OpenAI Codex (uses Responses API) + "codex-mini-latest", +] + +# Models that require string serializer for tool messages +# These models don't support structured content format [{"type":"text","text":"..."}] +# and need plain strings instead +# NOTE: model_matches uses case-insensitive substring matching, not globbing. +# Keep these entries as bare substrings without wildcards. 
+FORCE_STRING_SERIALIZER_PATTERNS: list[str] = [ + "deepseek", # e.g., DeepSeek-V3.2-Exp + "glm", # e.g., GLM-4.5 / GLM-4.6 + # Kimi K2-Instruct requires string serialization only on Groq + "groq/kimi-k2-instruct", # explicit provider-prefixed IDs ] def get_features(model: str) -> ModelFeatures: + """Get model features.""" return ModelFeatures( - supports_function_calling=model_matches(model, FUNCTION_CALLING_PATTERNS), supports_reasoning_effort=model_matches(model, REASONING_EFFORT_PATTERNS), supports_extended_thinking=model_matches(model, EXTENDED_THINKING_PATTERNS), supports_prompt_cache=model_matches(model, PROMPT_CACHE_PATTERNS), @@ -170,4 +109,5 @@ def get_features(model: str) -> ModelFeatures: model, SUPPORTS_STOP_WORDS_FALSE_PATTERNS ), supports_responses_api=model_matches(model, RESPONSES_API_PATTERNS), + force_string_serializer=model_matches(model, FORCE_STRING_SERIALIZER_PATTERNS), ) diff --git a/openhands-sdk/openhands/sdk/logger/logger.py b/openhands-sdk/openhands/sdk/logger/logger.py index 3ca3dee3d3..3942906f66 100644 --- a/openhands-sdk/openhands/sdk/logger/logger.py +++ b/openhands-sdk/openhands/sdk/logger/logger.py @@ -160,7 +160,24 @@ def setup_logging( def get_logger(name: str) -> logging.Logger: - """Return a logger for the given module name.""" + """Get a logger instance for the specified module. + + This function returns a configured logger that inherits from the root logger + setup. The logger supports both Rich formatting for human-readable output + and JSON formatting for machine processing, depending on environment configuration. + + Args: + name: The name of the module, typically __name__. + + Returns: + A configured Logger instance. + + Example: + >>> from openhands.sdk.logger import get_logger + >>> logger = get_logger(__name__) + >>> logger.info("This is an info message") + >>> logger.error("This is an error message") + """ logger = logging.getLogger(name) logger.propagate = True return logger diff --git a/openhands-sdk/openhands/sdk/mcp/tool.py b/openhands-sdk/openhands/sdk/mcp/tool.py index 3ee9ff5ff6..0e28b9eda5 100644 --- a/openhands-sdk/openhands/sdk/mcp/tool.py +++ b/openhands-sdk/openhands/sdk/mcp/tool.py @@ -16,6 +16,7 @@ from openhands.sdk.logger import get_logger from openhands.sdk.mcp.client import MCPClient from openhands.sdk.mcp.definition import MCPToolAction, MCPToolObservation +from openhands.sdk.observability.laminar import observe from openhands.sdk.tool import ( Action, Observation, @@ -50,6 +51,7 @@ def __init__(self, tool_name: str, client: MCPClient): self.tool_name = tool_name self.client = client + @observe(name="MCPToolExecutor.call_tool", span_type="TOOL") async def call_tool(self, action: MCPToolAction) -> MCPToolObservation: async with self.client: assert self.client.is_connected(), "MCP client is not connected." 
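Because `model_matches` is now a case-insensitive substring test over the raw model string, provider prefixes and version suffixes match without glob patterns. A few illustrative checks (the model strings are examples, not an exhaustive list):

from openhands.sdk.llm.utils.model_features import get_features, model_matches

# Substring semantics: provider prefixes and suffixes need no wildcards.
assert model_matches("litellm_proxy/anthropic/claude-sonnet-4-5", ["claude-sonnet-4-5"])
assert model_matches("DeepSeek-V3.2-Exp", ["deepseek"])  # case-insensitive

# force_string_serializer replaces the hardcoded model checks that were
# removed from format_messages_for_llm in llm.py.
assert get_features("openrouter/deepseek/deepseek-chat").force_string_serializer
assert not get_features("gpt-5-mini").force_string_serializer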
@@ -119,6 +121,11 @@ class MCPToolDefinition(ToolDefinition[MCPToolAction, MCPToolObservation]): mcp_tool: mcp.types.Tool = Field(description="The MCP tool definition.") + @property + def name(self) -> str: # type: ignore[override] + """Return the MCP tool name instead of the class name.""" + return self.mcp_tool.name + def __call__( self, action: Action, @@ -200,21 +207,17 @@ def create( else None ) - return [ - cls( - name=mcp_tool.name, - description=mcp_tool.description or "No description provided", - action_type=MCPToolAction, - observation_type=MCPToolObservation, - annotations=annotations, - meta=mcp_tool.meta, - executor=MCPToolExecutor( - tool_name=mcp_tool.name, client=mcp_client - ), - # pass-through fields (enabled by **extra in Tool.create) - mcp_tool=mcp_tool, - ) - ] + tool_instance = cls( + description=mcp_tool.description or "No description provided", + action_type=MCPToolAction, + observation_type=MCPToolObservation, + annotations=annotations, + meta=mcp_tool.meta, + executor=MCPToolExecutor(tool_name=mcp_tool.name, client=mcp_client), + # pass-through fields (enabled by **extra in Tool.create) + mcp_tool=mcp_tool, + ) + return [tool_instance] except ValidationError as e: logger.error( f"Validation error creating MCPTool for {mcp_tool.name}: " diff --git a/openhands-sdk/openhands/sdk/mcp/utils.py b/openhands-sdk/openhands/sdk/mcp/utils.py index 5d193b5962..7c3252c37c 100644 --- a/openhands-sdk/openhands/sdk/mcp/utils.py +++ b/openhands-sdk/openhands/sdk/mcp/utils.py @@ -8,7 +8,7 @@ from openhands.sdk.logger import get_logger from openhands.sdk.mcp import MCPClient, MCPToolDefinition -from openhands.sdk.tool.tool import ToolBase +from openhands.sdk.tool.tool import ToolDefinition logger = get_logger(__name__) @@ -30,9 +30,9 @@ async def log_handler(message: LogMessage): logger.log(level, msg, extra=extra) -async def _list_tools(client: MCPClient) -> list[ToolBase]: +async def _list_tools(client: MCPClient) -> list[ToolDefinition]: """List tools from an MCP client.""" - tools: list[ToolBase] = [] + tools: list[ToolDefinition] = [] async with client: assert client.is_connected(), "MCP client is not connected." diff --git a/openhands-sdk/openhands/sdk/observability/__init__.py b/openhands-sdk/openhands/sdk/observability/__init__.py new file mode 100644 index 0000000000..4f4ea48583 --- /dev/null +++ b/openhands-sdk/openhands/sdk/observability/__init__.py @@ -0,0 +1,4 @@ +from openhands.sdk.observability.laminar import maybe_init_laminar, observe + + +__all__ = ["maybe_init_laminar", "observe"] diff --git a/openhands-sdk/openhands/sdk/observability/laminar.py b/openhands-sdk/openhands/sdk/observability/laminar.py new file mode 100644 index 0000000000..830fc0deed --- /dev/null +++ b/openhands-sdk/openhands/sdk/observability/laminar.py @@ -0,0 +1,166 @@ +from collections.abc import Callable +from typing import ( + Any, + Literal, +) + +import litellm +from lmnr import ( + Instruments, + Laminar, + LaminarLiteLLMCallback, + observe as laminar_observe, +) +from opentelemetry import trace + +from openhands.sdk.logger import get_logger +from openhands.sdk.observability.utils import get_env + + +logger = get_logger(__name__) + + +def maybe_init_laminar(): + """Initialize Laminar if the environment variables are set. 
+ + Example configuration: + OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://otel-collector:4317/v1/traces + + # comma separated, key=value url-encoded pairs + OTEL_EXPORTER_OTLP_TRACES_HEADERS="Authorization=Bearer%20,X-Key=" + + # grpc is assumed if not specified + OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf # or grpc/protobuf + # or + OTEL_EXPORTER=otlp_http # or otlp_grpc + """ + if should_enable_observability(): + if _is_otel_backend_laminar(): + Laminar.initialize() + else: + # Do not enable browser session replays for non-laminar backends + Laminar.initialize( + disabled_instruments=[ + Instruments.BROWSER_USE_SESSION, + Instruments.PATCHRIGHT, + Instruments.PLAYWRIGHT, + ], + ) + litellm.callbacks.append(LaminarLiteLLMCallback()) + else: + logger.debug( + "Observability/OTEL environment variables are not set. " + "Skipping Laminar initialization." + ) + + +def observe[**P, R]( + *, + name: str | None = None, + session_id: str | None = None, + user_id: str | None = None, + ignore_input: bool = False, + ignore_output: bool = False, + span_type: Literal["DEFAULT", "LLM", "TOOL"] = "DEFAULT", + ignore_inputs: list[str] | None = None, + input_formatter: Callable[P, str] | None = None, + output_formatter: Callable[[R], str] | None = None, + metadata: dict[str, Any] | None = None, + tags: list[str] | None = None, + preserve_global_context: bool = False, + **kwargs: dict[str, Any], +) -> Callable[[Callable[P, R]], Callable[P, R]]: + def decorator(func: Callable[P, R]) -> Callable[P, R]: + return laminar_observe( + name=name, + session_id=session_id, + user_id=user_id, + ignore_input=ignore_input, + ignore_output=ignore_output, + span_type=span_type, + ignore_inputs=ignore_inputs, + input_formatter=input_formatter, + output_formatter=output_formatter, + metadata=metadata, + tags=tags, + preserve_global_context=preserve_global_context, + **kwargs, + )(func) + + return decorator + + +def should_enable_observability(): + keys = [ + "LMNR_PROJECT_API_KEY", + "OTEL_ENDPOINT", + "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", + "OTEL_EXPORTER_OTLP_ENDPOINT", + ] + if any(get_env(key) for key in keys): + return True + if Laminar.is_initialized(): + return True + return False + + +def _is_otel_backend_laminar(): + """Simple heuristic to check if the OTEL backend is Laminar. + Caveat: This will still be True if another backend uses the same + authentication scheme, and the user uses LMNR_PROJECT_API_KEY + instead of OTEL_HEADERS to authenticate. 
+ """ + key = get_env("LMNR_PROJECT_API_KEY") + return key is not None and key != "" + + +class SpanManager: + """Manages a stack of active spans and their associated tokens.""" + + def __init__(self): + self._stack: list[trace.Span] = [] + + def start_active_span(self, name: str, session_id: str | None = None) -> None: + """Start a new active span and push it to the stack.""" + span = Laminar.start_active_span(name) + if session_id: + Laminar.set_trace_session_id(session_id) + self._stack.append(span) + + def end_active_span(self) -> None: + """End the most recent active span by popping it from the stack.""" + if not self._stack: + logger.warning("Attempted to end active span, but stack is empty") + return + + try: + span = self._stack.pop() + if span and span.is_recording(): + span.end() + except IndexError: + logger.warning("Attempted to end active span, but stack is empty") + return + + +_span_manager: SpanManager | None = None + + +def _get_span_manager() -> SpanManager: + global _span_manager + if _span_manager is None: + _span_manager = SpanManager() + return _span_manager + + +def start_active_span(name: str, session_id: str | None = None) -> None: + """Start a new active span using the global span manager.""" + _get_span_manager().start_active_span(name, session_id) + + +def end_active_span() -> None: + """End the most recent active span using the global span manager.""" + try: + _get_span_manager().end_active_span() + except Exception: + logger.debug("Error ending active span") + pass diff --git a/openhands-sdk/openhands/sdk/observability/utils.py b/openhands-sdk/openhands/sdk/observability/utils.py new file mode 100644 index 0000000000..018a82bdf1 --- /dev/null +++ b/openhands-sdk/openhands/sdk/observability/utils.py @@ -0,0 +1,20 @@ +import os + +from dotenv import dotenv_values + +from openhands.sdk.event import ActionEvent + + +def get_env(key: str) -> str | None: + """Get an environment variable from the environment or the dotenv file.""" + return os.getenv(key) or dotenv_values().get(key) + + +def extract_action_name(action_event: ActionEvent) -> str: + try: + if action_event.action is not None and hasattr(action_event.action, "kind"): + return action_event.action.kind + else: + return action_event.tool_name + except Exception: + return "agent.execute_action" diff --git a/openhands-sdk/openhands/sdk/tool/__init__.py b/openhands-sdk/openhands/sdk/tool/__init__.py index 76381eabcb..de40f06011 100644 --- a/openhands-sdk/openhands/sdk/tool/__init__.py +++ b/openhands-sdk/openhands/sdk/tool/__init__.py @@ -1,5 +1,3 @@ -"""OpenHands runtime package.""" - from openhands.sdk.tool.builtins import BUILT_IN_TOOLS, FinishTool, ThinkTool from openhands.sdk.tool.registry import ( list_registered_tools, @@ -14,7 +12,6 @@ from openhands.sdk.tool.tool import ( ExecutableTool, ToolAnnotations, - ToolBase, ToolDefinition, ToolExecutor, ) @@ -23,7 +20,6 @@ __all__ = [ "Tool", "ToolDefinition", - "ToolBase", "ToolAnnotations", "ToolExecutor", "ExecutableTool", diff --git a/openhands-sdk/openhands/sdk/tool/builtins/finish.py b/openhands-sdk/openhands/sdk/tool/builtins/finish.py index 6d2ac10420..3c40f3b586 100644 --- a/openhands-sdk/openhands/sdk/tool/builtins/finish.py +++ b/openhands-sdk/openhands/sdk/tool/builtins/finish.py @@ -1,5 +1,5 @@ from collections.abc import Sequence -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Self from pydantic import Field from rich.text import Text @@ -16,6 +16,7 @@ if TYPE_CHECKING: from openhands.sdk.conversation.base import 
BaseConversation + from openhands.sdk.conversation.state import ConversationState class FinishAction(Action): @@ -67,17 +68,41 @@ def __call__( return FinishObservation(message=action.message) -FinishTool = ToolDefinition( - name="finish", - action_type=FinishAction, - observation_type=FinishObservation, - description=TOOL_DESCRIPTION, - executor=FinishExecutor(), - annotations=ToolAnnotations( - title="finish", - readOnlyHint=True, - destructiveHint=False, - idempotentHint=True, - openWorldHint=False, - ), -) +class FinishTool(ToolDefinition[FinishAction, FinishObservation]): + """Tool for signaling the completion of a task or conversation.""" + + @classmethod + def create( + cls, + conv_state: "ConversationState | None" = None, # noqa: ARG003 + **params, + ) -> Sequence[Self]: + """Create FinishTool instance. + + Args: + conv_state: Optional conversation state (not used by FinishTool). + **params: Additional parameters (none supported). + + Returns: + A sequence containing a single FinishTool instance. + + Raises: + ValueError: If any parameters are provided. + """ + if params: + raise ValueError("FinishTool doesn't accept parameters") + return [ + cls( + action_type=FinishAction, + observation_type=FinishObservation, + description=TOOL_DESCRIPTION, + executor=FinishExecutor(), + annotations=ToolAnnotations( + title="finish", + readOnlyHint=True, + destructiveHint=False, + idempotentHint=True, + openWorldHint=False, + ), + ) + ] diff --git a/openhands-sdk/openhands/sdk/tool/builtins/think.py b/openhands-sdk/openhands/sdk/tool/builtins/think.py index 01d84d6ece..b28418795b 100644 --- a/openhands-sdk/openhands/sdk/tool/builtins/think.py +++ b/openhands-sdk/openhands/sdk/tool/builtins/think.py @@ -1,5 +1,5 @@ from collections.abc import Sequence -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Self from pydantic import Field from rich.text import Text @@ -16,6 +16,7 @@ if TYPE_CHECKING: from openhands.sdk.conversation.base import BaseConversation + from openhands.sdk.conversation.state import ConversationState class ThinkAction(Action): @@ -83,16 +84,40 @@ def __call__( return ThinkObservation() -ThinkTool = ToolDefinition( - name="think", - description=THINK_DESCRIPTION, - action_type=ThinkAction, - observation_type=ThinkObservation, - executor=ThinkExecutor(), - annotations=ToolAnnotations( - readOnlyHint=True, - destructiveHint=False, - idempotentHint=True, - openWorldHint=False, - ), -) +class ThinkTool(ToolDefinition[ThinkAction, ThinkObservation]): + """Tool for logging thoughts without making changes.""" + + @classmethod + def create( + cls, + conv_state: "ConversationState | None" = None, # noqa: ARG003 + **params, + ) -> Sequence[Self]: + """Create ThinkTool instance. + + Args: + conv_state: Optional conversation state (not used by ThinkTool). + **params: Additional parameters (none supported). + + Returns: + A sequence containing a single ThinkTool instance. + + Raises: + ValueError: If any parameters are provided. 
+ """ + if params: + raise ValueError("ThinkTool doesn't accept parameters") + return [ + cls( + description=THINK_DESCRIPTION, + action_type=ThinkAction, + observation_type=ThinkObservation, + executor=ThinkExecutor(), + annotations=ToolAnnotations( + readOnlyHint=True, + destructiveHint=False, + idempotentHint=True, + openWorldHint=False, + ), + ) + ] diff --git a/openhands-sdk/openhands/sdk/tool/registry.py b/openhands-sdk/openhands/sdk/tool/registry.py index 2a00dc3d6e..0095b2b770 100644 --- a/openhands-sdk/openhands/sdk/tool/registry.py +++ b/openhands-sdk/openhands/sdk/tool/registry.py @@ -3,13 +3,15 @@ from threading import RLock from typing import TYPE_CHECKING, Any +from openhands.sdk.logger import get_logger from openhands.sdk.tool.spec import Tool -from openhands.sdk.tool.tool import ToolBase, ToolDefinition +from openhands.sdk.tool.tool import ToolDefinition if TYPE_CHECKING: from openhands.sdk.conversation.state import ConversationState +logger = get_logger(__name__) # A resolver produces ToolDefinition instances for given params. Resolver = Callable[[dict[str, Any], "ConversationState"], Sequence[ToolDefinition]] @@ -85,7 +87,7 @@ def _is_abstract_method(cls: type, name: str) -> bool: return getattr(attr, "__isabstractmethod__", False) -def _resolver_from_subclass(_name: str, cls: type[ToolBase]) -> Resolver: +def _resolver_from_subclass(_name: str, cls: type[ToolDefinition]) -> Resolver: create = getattr(cls, "create", None) if create is None or not callable(create) or _is_abstract_method(cls, "create"): @@ -115,14 +117,16 @@ def _resolve( def register_tool( name: str, - factory: ToolDefinition | type[ToolBase] | Callable[..., Sequence[ToolDefinition]], + factory: ToolDefinition + | type[ToolDefinition] + | Callable[..., Sequence[ToolDefinition]], ) -> None: if not isinstance(name, str) or not name.strip(): raise ValueError("ToolDefinition name must be a non-empty string") if isinstance(factory, ToolDefinition): resolver = _resolver_from_instance(name, factory) - elif isinstance(factory, type) and issubclass(factory, ToolBase): + elif isinstance(factory, type) and issubclass(factory, ToolDefinition): resolver = _resolver_from_subclass(name, factory) elif callable(factory): resolver = _resolver_from_callable(name, factory) @@ -134,6 +138,9 @@ def register_tool( ) with _LOCK: + # TODO: throw exception when registering duplicate name tools + if name in _REG: + logger.warning(f"Duplicate tool name registerd {name}") _REG[name] = resolver diff --git a/openhands-sdk/openhands/sdk/tool/tool.py b/openhands-sdk/openhands/sdk/tool/tool.py index ff2f2207ae..f4043db979 100644 --- a/openhands-sdk/openhands/sdk/tool/tool.py +++ b/openhands-sdk/openhands/sdk/tool/tool.py @@ -1,6 +1,14 @@ +import re from abc import ABC, abstractmethod from collections.abc import Sequence -from typing import TYPE_CHECKING, Any, ClassVar, Protocol, Self, TypeVar +from typing import ( + TYPE_CHECKING, + Any, + ClassVar, + Protocol, + Self, + TypeVar, +) from litellm import ( ChatCompletionToolParam, @@ -35,6 +43,20 @@ _action_types_with_risk: dict[type, type] = {} +def _camel_to_snake(name: str) -> str: + """Convert CamelCase to snake_case. 
+ + Examples: + BashTool -> bash_tool + FileEditorTool -> file_editor_tool + XMLHttpRequest -> xml_http_request + """ + # Insert underscore before uppercase letters (except the first one) + s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) + # Insert underscore before uppercase letters that follow lowercase letters + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower() + + class ToolAnnotations(BaseModel): """Annotations to provide hints about the tool's behavior. @@ -122,20 +144,51 @@ def __call__( ... -class ToolBase[ActionT, ObservationT](DiscriminatedUnionMixin, ABC): - """Tool that wraps an executor function with input/output validation and schema. +class ToolDefinition[ActionT, ObservationT](DiscriminatedUnionMixin, ABC): + """Base class for all tool implementations. + + This class serves as a base for the discriminated union of all tool types. + All tools must inherit from this class and implement the .create() method for + proper initialization with executors and parameters. + Features: - Normalize input/output schemas (class or dict) into both model+schema. - Validate inputs before execute. - Coerce outputs only if an output model is defined; else return vanilla JSON. - Export MCP tool description. + + Examples: + Simple tool with no parameters: + class FinishTool(ToolDefinition[FinishAction, FinishObservation]): + @classmethod + def create(cls, conv_state=None, **params): + return [cls(name="finish", ..., executor=FinishExecutor())] + + Complex tool with initialization parameters: + class BashTool(ToolDefinition[ExecuteBashAction, ExecuteBashObservation]): + @classmethod + def create(cls, conv_state, **params): + executor = BashExecutor( + working_dir=conv_state.workspace.working_dir, + **params, + ) + return [cls(name="bash", ..., executor=executor)] """ model_config: ClassVar[ConfigDict] = ConfigDict( frozen=True, arbitrary_types_allowed=True ) - name: str + # Automatic tool naming - set by __init_subclass__ + name: ClassVar[str] = "" + + def __init_subclass__(cls, **kwargs): + """Automatically set name from class name when subclass is created.""" + super().__init_subclass__(**kwargs) + # Only set automatically if not explicitly defined in the current class + if "name" not in cls.__dict__: + cls.name = _camel_to_snake(cls.__name__).removesuffix("_tool") + description: str action_type: type[Action] = Field(repr=False) observation_type: type[Observation] | None = Field(default=None, repr=False) @@ -151,15 +204,21 @@ class ToolBase[ActionT, ObservationT](DiscriminatedUnionMixin, ABC): @classmethod @abstractmethod def create(cls, *args, **kwargs) -> Sequence[Self]: - """Create a sequence of Tool instances. Placeholder for subclasses. + """Create a sequence of Tool instances. - This can be overridden in subclasses to provide custom initialization logic - (e.g., typically initializing the executor with parameters). + This method must be implemented by all subclasses to provide custom + initialization logic, typically initializing the executor with parameters + from conv_state and other optional parameters. + + Args: + *args: Variable positional arguments (typically conv_state as first arg). + **kwargs: Optional parameters for tool initialization. Returns: A sequence of Tool instances. Even single tools are returned as a sequence to provide a consistent interface and eliminate union return types. 
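The derivation is worth spelling out, since tools no longer pass `name=` explicitly: the two regexes split CamelCase words, and `__init_subclass__` then strips a trailing `_tool`. A standalone check using the same expressions as `tool.py` above:

import re

def camel_to_snake(name: str) -> str:
    # Same two substitutions as _camel_to_snake in tool.py.
    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()

assert camel_to_snake("XMLHttpRequest") == "xml_http_request"
# Class name -> registered tool name, after removesuffix("_tool"):
assert camel_to_snake("BrowserNavigateTool").removesuffix("_tool") == "browser_navigate"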
""" + raise NotImplementedError("ToolDefinition subclasses must implement .create()") @computed_field(return_type=str, alias="title") @property @@ -366,36 +425,21 @@ def to_responses_tool( @classmethod def resolve_kind(cls, kind: str) -> type: - for subclass in get_known_concrete_subclasses(cls): - if subclass.__name__ == kind: - return subclass - # Fallback to "ToolDefinition" for unknown type - return ToolDefinition - + """Resolve a kind string to its corresponding tool class. -class ToolDefinition[ActionT, ObservationT](ToolBase[ActionT, ObservationT]): - """Concrete tool class that inherits from ToolBase. - - This class serves as a concrete implementation of ToolBase for cases where - you want to create a tool instance directly without implementing a custom - subclass. Built-in tools (like FinishTool, ThinkTool) are instantiated - directly from this class, while more complex tools (like BashTool, - FileEditorTool) inherit from this class and provide their own create() - method implementations. - """ + Args: + kind: The name of the tool class to resolve - @classmethod - def create(cls, *args, **kwargs) -> Sequence[Self]: - """Create a sequence of ToolDefinition instances. + Returns: + The tool class corresponding to the kind - TODO https://github.com/OpenHands/agent-sdk/issues/493 - Refactor this - the ToolDefinition class should not have a concrete create() - implementation. Built-in tools should be refactored to not rely on this - method, and then this should be made abstract with @abstractmethod. + Raises: + ValueError: If the kind is unknown """ - raise NotImplementedError( - "ToolDefinition.create() should be implemented by subclasses" - ) + for subclass in get_known_concrete_subclasses(cls): + if subclass.__name__ == kind: + return subclass + raise ValueError(f"Unknown kind '{kind}' for {cls}") def _create_action_type_with_risk(action_type: type[Schema]) -> type[Schema]: diff --git a/openhands-sdk/openhands/sdk/workspace/base.py b/openhands-sdk/openhands/sdk/workspace/base.py index b3a6a7dfaf..30d787cf8e 100644 --- a/openhands-sdk/openhands/sdk/workspace/base.py +++ b/openhands-sdk/openhands/sdk/workspace/base.py @@ -14,13 +14,16 @@ class BaseWorkspace(DiscriminatedUnionMixin, ABC): - """Abstract base mixin for workspace. + """Abstract base class for workspace implementations. - All workspace implementations support the context manager protocol, - allowing safe resource management: + Workspaces provide a sandboxed environment where agents can execute commands, + read/write files, and perform other operations. All workspace implementations + support the context manager protocol for safe resource management. - with workspace: - workspace.execute_command("echo 'hello'") + Example: + >>> with workspace: + ... result = workspace.execute_command("echo 'hello'") + ... content = workspace.read_file("example.txt") """ working_dir: str = Field( diff --git a/openhands-sdk/openhands/sdk/workspace/local.py b/openhands-sdk/openhands/sdk/workspace/local.py index 1a8ed72d40..b3692f6a9b 100644 --- a/openhands-sdk/openhands/sdk/workspace/local.py +++ b/openhands-sdk/openhands/sdk/workspace/local.py @@ -14,7 +14,18 @@ class LocalWorkspace(BaseWorkspace): - """Mixin providing local workspace operations.""" + """Local workspace implementation that operates on the host filesystem. + + LocalWorkspace provides direct access to the local filesystem and command execution + environment. It's suitable for development and testing scenarios where the agent + should operate directly on the host system. 
+ + Example: + >>> workspace = LocalWorkspace(working_dir="/path/to/project") + >>> with workspace: + ... result = workspace.execute_command("ls -la") + ... content = workspace.read_file("README.md") + """ def execute_command( self, diff --git a/openhands-sdk/openhands/sdk/workspace/remote/base.py b/openhands-sdk/openhands/sdk/workspace/remote/base.py index b852fe18c8..8ab080e500 100644 --- a/openhands-sdk/openhands/sdk/workspace/remote/base.py +++ b/openhands-sdk/openhands/sdk/workspace/remote/base.py @@ -12,7 +12,21 @@ class RemoteWorkspace(RemoteWorkspaceMixin, BaseWorkspace): - """Remote Workspace Implementation.""" + """Remote workspace implementation that connects to an OpenHands agent server. + + RemoteWorkspace provides access to a sandboxed environment running on a remote + OpenHands agent server. This is the recommended approach for production deployments + as it provides better isolation and security. + + Example: + >>> workspace = RemoteWorkspace( + ... host="https://agent-server.example.com", + ... working_dir="/workspace" + ... ) + >>> with workspace: + ... result = workspace.execute_command("ls -la") + ... content = workspace.read_file("README.md") + """ _client: httpx.Client | None = PrivateAttr(default=None) diff --git a/openhands-sdk/openhands/sdk/workspace/remote/remote_workspace_mixin.py b/openhands-sdk/openhands/sdk/workspace/remote/remote_workspace_mixin.py index 6344b69875..714e59585f 100644 --- a/openhands-sdk/openhands/sdk/workspace/remote/remote_workspace_mixin.py +++ b/openhands-sdk/openhands/sdk/workspace/remote/remote_workspace_mixin.py @@ -182,7 +182,7 @@ def _file_upload_generator( # Make HTTP call response: httpx.Response = yield { "method": "POST", - "url": f"{self.host}/api/file/upload", + "url": f"{self.host}/api/file/upload/{destination}", "files": files, "data": data, "headers": self._headers, diff --git a/openhands-sdk/pyproject.toml b/openhands-sdk/pyproject.toml index dad0fabe64..0fe1c25827 100644 --- a/openhands-sdk/pyproject.toml +++ b/openhands-sdk/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "openhands-sdk" -version = "1.0.0a5" +version = "1.0.0a6" description = "OpenHands SDK - Core functionality for building AI agents" requires-python = ">=3.12" @@ -13,6 +13,7 @@ dependencies = [ "python-json-logger>=3.3.0", "tenacity>=9.1.2", "websockets>=12", + "lmnr>=0.7.20" ] [project.optional-dependencies] diff --git a/openhands-tools/openhands/tools/browser_use/__init__.py b/openhands-tools/openhands/tools/browser_use/__init__.py index 7c0d05009f..21227e274a 100644 --- a/openhands-tools/openhands/tools/browser_use/__init__.py +++ b/openhands-tools/openhands/tools/browser_use/__init__.py @@ -23,31 +23,10 @@ BrowserToolSet, BrowserTypeAction, BrowserTypeTool, - browser_click_tool, - browser_close_tab_tool, - browser_get_content_tool, - browser_get_state_tool, - browser_go_back_tool, - browser_list_tabs_tool, - browser_navigate_tool, - browser_scroll_tool, - browser_switch_tab_tool, - browser_type_tool, ) __all__ = [ - # Tool objects - "browser_navigate_tool", - "browser_click_tool", - "browser_type_tool", - "browser_get_state_tool", - "browser_get_content_tool", - "browser_scroll_tool", - "browser_go_back_tool", - "browser_list_tabs_tool", - "browser_switch_tab_tool", - "browser_close_tab_tool", # Tool classes "BrowserNavigateTool", "BrowserClickTool", diff --git a/openhands-tools/openhands/tools/browser_use/definition.py b/openhands-tools/openhands/tools/browser_use/definition.py index 39ce262380..a9a66cfbb7 100644 --- 
a/openhands-tools/openhands/tools/browser_use/definition.py +++ b/openhands-tools/openhands/tools/browser_use/definition.py @@ -11,8 +11,8 @@ Observation, ToolAnnotations, ToolDefinition, + register_tool, ) -from openhands.sdk.tool.tool import ToolBase from openhands.sdk.utils import maybe_truncate @@ -60,10 +60,23 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]: return content +# ============================================ +# Base Browser Action +# ============================================ +class BrowserAction(Action): + """Base class for all browser actions. + + This base class serves as the parent for all browser-related actions, + enabling proper type hierarchy and eliminating the need for union types. + """ + + pass + + # ============================================ # `go_to_url` # ============================================ -class BrowserNavigateAction(Action): +class BrowserNavigateAction(BrowserAction): """Schema for browser navigation.""" url: str = Field(description="The URL to navigate to") @@ -85,20 +98,6 @@ class BrowserNavigateAction(Action): - Open GitHub in new tab: url="https://github.com", new_tab=True """ # noqa: E501 -browser_navigate_tool = ToolDefinition( - name="browser_navigate", - action_type=BrowserNavigateAction, - observation_type=BrowserObservation, - description=BROWSER_NAVIGATE_DESCRIPTION, - annotations=ToolAnnotations( - title="browser_navigate", - readOnlyHint=False, - destructiveHint=False, - idempotentHint=False, - openWorldHint=True, - ), -) - class BrowserNavigateTool(ToolDefinition[BrowserNavigateAction, BrowserObservation]): """Tool for browser navigation.""" @@ -107,11 +106,16 @@ class BrowserNavigateTool(ToolDefinition[BrowserNavigateAction, BrowserObservati def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: return [ cls( - name=browser_navigate_tool.name, description=BROWSER_NAVIGATE_DESCRIPTION, action_type=BrowserNavigateAction, observation_type=BrowserObservation, - annotations=browser_navigate_tool.annotations, + annotations=ToolAnnotations( + title="browser_navigate", + readOnlyHint=False, + destructiveHint=False, + idempotentHint=False, + openWorldHint=True, + ), executor=executor, ) ] @@ -120,7 +124,7 @@ def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: # ============================================ # `browser_click` # ============================================ -class BrowserClickAction(Action): +class BrowserClickAction(BrowserAction): """Schema for clicking elements.""" index: int = Field( @@ -144,20 +148,6 @@ class BrowserClickAction(Action): Important: Only use indices that appear in your current browser_get_state output. 
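The browser tool hunks here and below all apply one refactoring pattern: the module-level `ToolDefinition(...)` singletons are deleted, each action now subclasses the shared `BrowserAction` base, and every tool class constructs its `ToolAnnotations` inside `create()`. A minimal sketch of that shape, using plain `pydantic` models as stand-ins for the SDK's `Action` and `ToolDefinition` types (the names and the `dict`-valued annotations field are simplifications, not the real API):

```python
from collections.abc import Sequence

from pydantic import BaseModel, Field


class BrowserAction(BaseModel):
    """Stand-in for the shared base class all browser actions now inherit."""


class BrowserNavigateAction(BrowserAction):
    url: str = Field(description="The URL to navigate to")
    new_tab: bool = Field(default=False, description="Open in a new tab")


class BrowserNavigateTool(BaseModel):
    """Stand-in for a ToolDefinition subclass: config lives in create()."""

    description: str
    action_type: type[BrowserAction]
    annotations: dict  # simplified stand-in for ToolAnnotations

    @classmethod
    def create(cls, executor: object = None) -> Sequence["BrowserNavigateTool"]:
        # Annotations are built here instead of at module import, so merely
        # importing the module no longer materializes ten singleton tools.
        return [
            cls(
                description="Navigate the browser to a URL.",
                action_type=BrowserNavigateAction,
                annotations={"readOnlyHint": False, "openWorldHint": True},
            )
        ]


(tool,) = BrowserNavigateTool.create()
print(tool.action_type(url="https://example.com"))
```

One practical payoff of the shared base: `isinstance(action, BrowserAction)` now covers every browser action, which is what allows the PR to delete the ten-member union type further down.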
""" # noqa: E501 -browser_click_tool = ToolDefinition( - name="browser_click", - action_type=BrowserClickAction, - observation_type=BrowserObservation, - description=BROWSER_CLICK_DESCRIPTION, - annotations=ToolAnnotations( - title="browser_click", - readOnlyHint=False, - destructiveHint=False, - idempotentHint=False, - openWorldHint=True, - ), -) - class BrowserClickTool(ToolDefinition[BrowserClickAction, BrowserObservation]): """Tool for clicking browser elements.""" @@ -166,11 +156,16 @@ class BrowserClickTool(ToolDefinition[BrowserClickAction, BrowserObservation]): def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: return [ cls( - name=browser_click_tool.name, description=BROWSER_CLICK_DESCRIPTION, action_type=BrowserClickAction, observation_type=BrowserObservation, - annotations=browser_click_tool.annotations, + annotations=ToolAnnotations( + title="browser_click", + readOnlyHint=False, + destructiveHint=False, + idempotentHint=False, + openWorldHint=True, + ), executor=executor, ) ] @@ -179,7 +174,7 @@ def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: # ============================================ # `browser_type` # ============================================ -class BrowserTypeAction(Action): +class BrowserTypeAction(BrowserAction): """Schema for typing text into elements.""" index: int = Field( @@ -200,20 +195,6 @@ class BrowserTypeAction(Action): Important: Only use indices that appear in your current browser_get_state output. """ # noqa: E501 -browser_type_tool = ToolDefinition( - name="browser_type", - action_type=BrowserTypeAction, - observation_type=BrowserObservation, - description=BROWSER_TYPE_DESCRIPTION, - annotations=ToolAnnotations( - title="browser_type", - readOnlyHint=False, - destructiveHint=False, - idempotentHint=False, - openWorldHint=True, - ), -) - class BrowserTypeTool(ToolDefinition[BrowserTypeAction, BrowserObservation]): """Tool for typing text into browser elements.""" @@ -222,11 +203,16 @@ class BrowserTypeTool(ToolDefinition[BrowserTypeAction, BrowserObservation]): def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: return [ cls( - name=browser_type_tool.name, description=BROWSER_TYPE_DESCRIPTION, action_type=BrowserTypeAction, observation_type=BrowserObservation, - annotations=browser_type_tool.annotations, + annotations=ToolAnnotations( + title="browser_type", + readOnlyHint=False, + destructiveHint=False, + idempotentHint=False, + openWorldHint=True, + ), executor=executor, ) ] @@ -235,7 +221,7 @@ def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: # ============================================ # `browser_get_state` # ============================================ -class BrowserGetStateAction(Action): +class BrowserGetStateAction(BrowserAction): """Schema for getting browser state.""" include_screenshot: bool = Field( @@ -253,20 +239,6 @@ class BrowserGetStateAction(Action): - include_screenshot: Whether to include a screenshot (optional, default: False) """ # noqa: E501 -browser_get_state_tool = ToolDefinition( - name="browser_get_state", - action_type=BrowserGetStateAction, - observation_type=BrowserObservation, - description=BROWSER_GET_STATE_DESCRIPTION, - annotations=ToolAnnotations( - title="browser_get_state", - readOnlyHint=True, - destructiveHint=False, - idempotentHint=True, - openWorldHint=True, - ), -) - class BrowserGetStateTool(ToolDefinition[BrowserGetStateAction, BrowserObservation]): """Tool for getting browser state.""" @@ -275,11 +247,16 @@ class 
BrowserGetStateTool(ToolDefinition[BrowserGetStateAction, BrowserObservati def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: return [ cls( - name=browser_get_state_tool.name, description=BROWSER_GET_STATE_DESCRIPTION, action_type=BrowserGetStateAction, observation_type=BrowserObservation, - annotations=browser_get_state_tool.annotations, + annotations=ToolAnnotations( + title="browser_get_state", + readOnlyHint=True, + destructiveHint=False, + idempotentHint=True, + openWorldHint=True, + ), executor=executor, ) ] @@ -288,7 +265,7 @@ def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: # ============================================ # `browser_get_content` # ============================================ -class BrowserGetContentAction(Action): +class BrowserGetContentAction(BrowserAction): """Schema for getting page content in markdown.""" extract_links: bool = Field( @@ -307,20 +284,6 @@ class BrowserGetContentAction(Action): If the content was truncated and you need more information, use start_from_char parameter to continue from where truncation occurred. """ # noqa: E501 -browser_get_content_tool = ToolDefinition( - name="browser_get_content", - action_type=BrowserGetContentAction, - observation_type=BrowserObservation, - description=BROWSER_GET_CONTENT_DESCRIPTION, - annotations=ToolAnnotations( - title="browser_get_content", - readOnlyHint=True, - destructiveHint=False, - idempotentHint=True, - openWorldHint=True, - ), -) - class BrowserGetContentTool( ToolDefinition[BrowserGetContentAction, BrowserObservation] @@ -331,11 +294,16 @@ class BrowserGetContentTool( def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: return [ cls( - name=browser_get_content_tool.name, description=BROWSER_GET_CONTENT_DESCRIPTION, action_type=BrowserGetContentAction, observation_type=BrowserObservation, - annotations=browser_get_content_tool.annotations, + annotations=ToolAnnotations( + title="browser_get_content", + readOnlyHint=True, + destructiveHint=False, + idempotentHint=True, + openWorldHint=True, + ), executor=executor, ) ] @@ -344,7 +312,7 @@ def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: # ============================================ # `browser_scroll` # ============================================ -class BrowserScrollAction(Action): +class BrowserScrollAction(BrowserAction): """Schema for scrolling the page.""" direction: Literal["up", "down"] = Field( @@ -362,20 +330,6 @@ class BrowserScrollAction(Action): - direction: Direction to scroll - "up" or "down" (optional, default: "down") """ # noqa: E501 -browser_scroll_tool = ToolDefinition( - name="browser_scroll", - action_type=BrowserScrollAction, - observation_type=BrowserObservation, - description=BROWSER_SCROLL_DESCRIPTION, - annotations=ToolAnnotations( - title="browser_scroll", - readOnlyHint=False, - destructiveHint=False, - idempotentHint=False, - openWorldHint=True, - ), -) - class BrowserScrollTool(ToolDefinition[BrowserScrollAction, BrowserObservation]): """Tool for scrolling the browser page.""" @@ -384,11 +338,16 @@ class BrowserScrollTool(ToolDefinition[BrowserScrollAction, BrowserObservation]) def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: return [ cls( - name=browser_scroll_tool.name, description=BROWSER_SCROLL_DESCRIPTION, action_type=BrowserScrollAction, observation_type=BrowserObservation, - annotations=browser_scroll_tool.annotations, + annotations=ToolAnnotations( + title="browser_scroll", + readOnlyHint=False, + destructiveHint=False, + 
idempotentHint=False, + openWorldHint=True, + ), executor=executor, ) ] @@ -397,7 +356,7 @@ def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: # ============================================ # `browser_go_back` # ============================================ -class BrowserGoBackAction(Action): +class BrowserGoBackAction(BrowserAction): """Schema for going back in browser history.""" pass @@ -409,20 +368,6 @@ class BrowserGoBackAction(Action): browser's back button. """ # noqa: E501 -browser_go_back_tool = ToolDefinition( - name="browser_go_back", - action_type=BrowserGoBackAction, - observation_type=BrowserObservation, - description=BROWSER_GO_BACK_DESCRIPTION, - annotations=ToolAnnotations( - title="browser_go_back", - readOnlyHint=False, - destructiveHint=False, - idempotentHint=False, - openWorldHint=True, - ), -) - class BrowserGoBackTool(ToolDefinition[BrowserGoBackAction, BrowserObservation]): """Tool for going back in browser history.""" @@ -431,11 +376,16 @@ class BrowserGoBackTool(ToolDefinition[BrowserGoBackAction, BrowserObservation]) def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: return [ cls( - name=browser_go_back_tool.name, description=BROWSER_GO_BACK_DESCRIPTION, action_type=BrowserGoBackAction, observation_type=BrowserObservation, - annotations=browser_go_back_tool.annotations, + annotations=ToolAnnotations( + title="browser_go_back", + readOnlyHint=False, + destructiveHint=False, + idempotentHint=False, + openWorldHint=True, + ), executor=executor, ) ] @@ -444,7 +394,7 @@ def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: # ============================================ # `browser_list_tabs` # ============================================ -class BrowserListTabsAction(Action): +class BrowserListTabsAction(BrowserAction): """Schema for listing browser tabs.""" pass @@ -456,20 +406,6 @@ class BrowserListTabsAction(Action): with browser_switch_tab or browser_close_tab. 
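The annotation hints in these hunks follow a consistent rubric: pure state inspection (`browser_get_state`, `browser_list_tabs`) is `readOnlyHint=True` and `idempotentHint=True`, navigation and input tools are neither, and only `browser_close_tab` is `destructiveHint=True`. A small table-as-code summary, with the values copied from the diff and a frozen dataclass as a simplified stand-in for `ToolAnnotations`:

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class Hints:  # simplified stand-in for ToolAnnotations
    read_only: bool
    destructive: bool
    idempotent: bool
    open_world: bool


# Values as they appear in the hunks above and below.
BROWSER_HINTS = {
    "browser_get_state": Hints(True, False, True, True),
    "browser_list_tabs": Hints(True, False, True, False),
    "browser_navigate": Hints(False, False, False, True),
    "browser_close_tab": Hints(False, True, False, False),
}

# For these tools, only the pure reads are marked idempotent.
assert all(h.idempotent == h.read_only for h in BROWSER_HINTS.values())
```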
""" # noqa: E501 -browser_list_tabs_tool = ToolDefinition( - name="browser_list_tabs", - action_type=BrowserListTabsAction, - observation_type=BrowserObservation, - description=BROWSER_LIST_TABS_DESCRIPTION, - annotations=ToolAnnotations( - title="browser_list_tabs", - readOnlyHint=True, - destructiveHint=False, - idempotentHint=True, - openWorldHint=False, - ), -) - class BrowserListTabsTool(ToolDefinition[BrowserListTabsAction, BrowserObservation]): """Tool for listing browser tabs.""" @@ -478,11 +414,16 @@ class BrowserListTabsTool(ToolDefinition[BrowserListTabsAction, BrowserObservati def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: return [ cls( - name=browser_list_tabs_tool.name, description=BROWSER_LIST_TABS_DESCRIPTION, action_type=BrowserListTabsAction, observation_type=BrowserObservation, - annotations=browser_list_tabs_tool.annotations, + annotations=ToolAnnotations( + title="browser_list_tabs", + readOnlyHint=True, + destructiveHint=False, + idempotentHint=True, + openWorldHint=False, + ), executor=executor, ) ] @@ -491,7 +432,7 @@ def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: # ============================================ # `browser_switch_tab` # ============================================ -class BrowserSwitchTabAction(Action): +class BrowserSwitchTabAction(BrowserAction): """Schema for switching browser tabs.""" tab_id: str = Field( @@ -508,36 +449,24 @@ class BrowserSwitchTabAction(Action): - tab_id: 4 Character Tab ID of the tab to switch to """ -browser_switch_tab_tool = ToolDefinition( - name="browser_switch_tab", - action_type=BrowserSwitchTabAction, - observation_type=BrowserObservation, - description=BROWSER_SWITCH_TAB_DESCRIPTION, - annotations=ToolAnnotations( - title="browser_switch_tab", - readOnlyHint=False, - destructiveHint=False, - idempotentHint=False, - openWorldHint=False, - ), -) - class BrowserSwitchTabTool(ToolDefinition[BrowserSwitchTabAction, BrowserObservation]): """Tool for switching browser tabs.""" - # Override executor to be non-optional for initialized BrowserSwitchTabTool - # instances - @classmethod def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: return [ cls( - name=browser_switch_tab_tool.name, description=BROWSER_SWITCH_TAB_DESCRIPTION, action_type=BrowserSwitchTabAction, observation_type=BrowserObservation, - annotations=browser_switch_tab_tool.annotations, + annotations=ToolAnnotations( + title="browser_switch_tab", + readOnlyHint=False, + destructiveHint=False, + idempotentHint=False, + openWorldHint=False, + ), executor=executor, ) ] @@ -546,7 +475,7 @@ def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: # ============================================ # `browser_close_tab` # ============================================ -class BrowserCloseTabAction(Action): +class BrowserCloseTabAction(BrowserAction): """Schema for closing browser tabs.""" tab_id: str = Field( @@ -562,20 +491,6 @@ class BrowserCloseTabAction(Action): - tab_id: 4 Character Tab ID of the tab to close """ -browser_close_tab_tool = ToolDefinition( - name="browser_close_tab", - action_type=BrowserCloseTabAction, - observation_type=BrowserObservation, - description=BROWSER_CLOSE_TAB_DESCRIPTION, - annotations=ToolAnnotations( - title="browser_close_tab", - readOnlyHint=False, - destructiveHint=True, - idempotentHint=False, - openWorldHint=False, - ), -) - class BrowserCloseTabTool(ToolDefinition[BrowserCloseTabAction, BrowserObservation]): """Tool for closing browser tabs.""" @@ -584,32 +499,22 @@ class 
BrowserCloseTabTool(ToolDefinition[BrowserCloseTabAction, BrowserObservati def create(cls, executor: "BrowserToolExecutor") -> Sequence[Self]: return [ cls( - name=browser_close_tab_tool.name, description=BROWSER_CLOSE_TAB_DESCRIPTION, action_type=BrowserCloseTabAction, observation_type=BrowserObservation, - annotations=browser_close_tab_tool.annotations, + annotations=ToolAnnotations( + title="browser_close_tab", + readOnlyHint=False, + destructiveHint=True, + idempotentHint=False, + openWorldHint=False, + ), executor=executor, ) ] -# Union type for all browser actions -BrowserAction = ( - BrowserNavigateAction - | BrowserClickAction - | BrowserTypeAction - | BrowserGetStateAction - | BrowserGetContentAction - | BrowserScrollAction - | BrowserGoBackAction - | BrowserListTabsAction - | BrowserSwitchTabAction - | BrowserCloseTabAction -) - - -class BrowserToolSet(ToolBase[BrowserAction, BrowserObservation]): +class BrowserToolSet(ToolDefinition[BrowserAction, BrowserObservation]): """A set of all browser tools. This tool set includes all available browser-related tools @@ -623,21 +528,28 @@ class BrowserToolSet(ToolBase[BrowserAction, BrowserObservation]): def create( cls, **executor_config, - ) -> list[ToolBase[BrowserAction, BrowserObservation]]: + ) -> list[ToolDefinition[BrowserAction, BrowserObservation]]: # Import executor only when actually needed to # avoid hanging during module import from openhands.tools.browser_use.impl import BrowserToolExecutor executor = BrowserToolExecutor(**executor_config) - return [ - browser_navigate_tool.set_executor(executor), - browser_click_tool.set_executor(executor), - browser_get_state_tool.set_executor(executor), - browser_get_content_tool.set_executor(executor), - browser_type_tool.set_executor(executor), - browser_scroll_tool.set_executor(executor), - browser_go_back_tool.set_executor(executor), - browser_list_tabs_tool.set_executor(executor), - browser_switch_tab_tool.set_executor(executor), - browser_close_tab_tool.set_executor(executor), - ] + # Each tool.create() returns a Sequence[Self], so we flatten the results + tools: list[ToolDefinition[BrowserAction, BrowserObservation]] = [] + for tool_class in [ + BrowserNavigateTool, + BrowserClickTool, + BrowserGetStateTool, + BrowserGetContentTool, + BrowserTypeTool, + BrowserScrollTool, + BrowserGoBackTool, + BrowserListTabsTool, + BrowserSwitchTabTool, + BrowserCloseTabTool, + ]: + tools.extend(tool_class.create(executor)) + return tools + + +register_tool(BrowserToolSet.name, BrowserToolSet) diff --git a/openhands-tools/openhands/tools/browser_use/impl.py b/openhands-tools/openhands/tools/browser_use/impl.py index 368ed11029..cbe6aebf1b 100644 --- a/openhands-tools/openhands/tools/browser_use/impl.py +++ b/openhands-tools/openhands/tools/browser_use/impl.py @@ -121,6 +121,7 @@ class BrowserToolExecutor(ToolExecutor[BrowserAction, BrowserObservation]): _config: dict[str, Any] _initialized: bool _async_executor: AsyncExecutor + _cleanup_initiated: bool def __init__( self, @@ -169,6 +170,7 @@ def init_logic(): self._initialized = False self._async_executor = AsyncExecutor() + self._cleanup_initiated = False def __call__( self, @@ -331,6 +333,9 @@ async def cleanup(self): def close(self): """Close the browser executor and cleanup resources.""" + if self._cleanup_initiated: + return + self._cleanup_initiated = True try: # Run cleanup in the async executor with a shorter timeout self._async_executor.run_async(self.cleanup, timeout=30.0) diff --git 
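`BrowserToolSet.create()` above now delegates to each tool class and flattens the results instead of calling `set_executor` on module-level singletons. The loop is simple but easy to get wrong, because every `create()` returns a `Sequence`, never a bare tool. A stripped-down sketch of the aggregation, assuming only what the diff shows (each class exposes a `create(executor)` classmethod returning a sequence; `DemoTool` and `build_toolset` are illustrative names):

```python
from collections.abc import Sequence


class DemoTool:
    """Minimal stand-in: create() returns a Sequence, never a bare instance."""

    def __init__(self, executor: object) -> None:
        self.executor = executor

    @classmethod
    def create(cls, executor: object) -> Sequence["DemoTool"]:
        return [cls(executor)]


def build_toolset(tool_classes: Sequence[type], executor: object) -> list:
    tools: list = []
    for tool_class in tool_classes:
        # extend(), not append(): create() may legitimately yield several tools.
        tools.extend(tool_class.create(executor))
    return tools


print(len(build_toolset([DemoTool, DemoTool], executor=object())))  # 2
```

The trailing `register_tool(BrowserToolSet.name, BrowserToolSet)` call is the same import-time self-registration pattern the PR applies to the other tool modules below.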
a/openhands-tools/openhands/tools/delegate/__init__.py b/openhands-tools/openhands/tools/delegate/__init__.py index c6a84b9383..ea9f3f813e 100644 --- a/openhands-tools/openhands/tools/delegate/__init__.py +++ b/openhands-tools/openhands/tools/delegate/__init__.py @@ -4,7 +4,6 @@ DelegateAction, DelegateObservation, DelegateTool, - delegate_tool, ) from openhands.tools.delegate.impl import DelegateExecutor @@ -14,5 +13,4 @@ "DelegateObservation", "DelegateExecutor", "DelegateTool", - "delegate_tool", ] diff --git a/openhands-tools/openhands/tools/delegate/definition.py b/openhands-tools/openhands/tools/delegate/definition.py index 2f8be795a6..36835e8c7b 100644 --- a/openhands-tools/openhands/tools/delegate/definition.py +++ b/openhands-tools/openhands/tools/delegate/definition.py @@ -76,20 +76,6 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]: - Sub-agents work in the same workspace as the main agent: {workspace_path} """ # noqa -delegate_tool = ToolDefinition( - name="delegate", - action_type=DelegateAction, - observation_type=DelegateObservation, - description=TOOL_DESCRIPTION, - annotations=ToolAnnotations( - title="delegate", - readOnlyHint=False, - destructiveHint=False, - idempotentHint=False, - openWorldHint=True, - ), -) - class DelegateTool(ToolDefinition[DelegateAction, DelegateObservation]): """A ToolDefinition subclass that automatically initializes a DelegateExecutor.""" @@ -124,11 +110,16 @@ def create( # Initialize the parent Tool with the executor return [ cls( - name=delegate_tool.name, - description=tool_description, action_type=DelegateAction, observation_type=DelegateObservation, - annotations=delegate_tool.annotations, + description=tool_description, + annotations=ToolAnnotations( + title="delegate", + readOnlyHint=False, + destructiveHint=False, + idempotentHint=False, + openWorldHint=True, + ), executor=executor, ) ] diff --git a/openhands-tools/openhands/tools/execute_bash/__init__.py b/openhands-tools/openhands/tools/execute_bash/__init__.py index b72257fd68..0a631e9d23 100644 --- a/openhands-tools/openhands/tools/execute_bash/__init__.py +++ b/openhands-tools/openhands/tools/execute_bash/__init__.py @@ -3,7 +3,6 @@ BashTool, ExecuteBashAction, ExecuteBashObservation, - execute_bash_tool, ) from openhands.tools.execute_bash.impl import BashExecutor @@ -18,7 +17,6 @@ __all__ = [ # === Core Tool Interface === "BashTool", - "execute_bash_tool", "ExecuteBashAction", "ExecuteBashObservation", "BashExecutor", diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py index 575976c266..19721362ab 100644 --- a/openhands-tools/openhands/tools/execute_bash/definition.py +++ b/openhands-tools/openhands/tools/execute_bash/definition.py @@ -17,6 +17,8 @@ Observation, ToolAnnotations, ToolDefinition, + ToolExecutor, + register_tool, ) from openhands.sdk.utils import maybe_truncate from openhands.tools.execute_bash.constants import ( @@ -217,21 +219,6 @@ def visualize(self) -> Text: """ # noqa -execute_bash_tool = ToolDefinition( - name="execute_bash", - action_type=ExecuteBashAction, - observation_type=ExecuteBashObservation, - description=TOOL_DESCRIPTION, - annotations=ToolAnnotations( - title="execute_bash", - readOnlyHint=False, - destructiveHint=True, - idempotentHint=False, - openWorldHint=True, - ), -) - - class BashTool(ToolDefinition[ExecuteBashAction, ExecuteBashObservation]): """A ToolDefinition subclass that automatically initializes a BashExecutor with auto-detection.""" # 
noqa: E501 @@ -242,6 +229,7 @@ def create( username: str | None = None, no_change_timeout_seconds: int | None = None, terminal_type: Literal["tmux", "subprocess"] | None = None, + executor: ToolExecutor | None = None, ) -> Sequence["BashTool"]: """Initialize BashTool with executor parameters. @@ -265,21 +253,31 @@ def create( raise ValueError(f"working_dir '{working_dir}' is not a valid directory") # Initialize the executor - executor = BashExecutor( - working_dir=working_dir, - username=username, - no_change_timeout_seconds=no_change_timeout_seconds, - terminal_type=terminal_type, - ) + if executor is None: + executor = BashExecutor( + working_dir=working_dir, + username=username, + no_change_timeout_seconds=no_change_timeout_seconds, + terminal_type=terminal_type, + ) # Initialize the parent ToolDefinition with the executor return [ cls( - name=execute_bash_tool.name, - description=TOOL_DESCRIPTION, action_type=ExecuteBashAction, observation_type=ExecuteBashObservation, - annotations=execute_bash_tool.annotations, + description=TOOL_DESCRIPTION, + annotations=ToolAnnotations( + title="bash", + readOnlyHint=False, + destructiveHint=True, + idempotentHint=False, + openWorldHint=True, + ), executor=executor, ) ] + + +# Automatically register the tool when this module is imported +register_tool(BashTool.name, BashTool) diff --git a/openhands-tools/openhands/tools/execute_bash/impl.py b/openhands-tools/openhands/tools/execute_bash/impl.py index 47b44e5e84..911e544ec5 100644 --- a/openhands-tools/openhands/tools/execute_bash/impl.py +++ b/openhands-tools/openhands/tools/execute_bash/impl.py @@ -64,8 +64,8 @@ def _export_envs( env_vars = {} if conversation is not None: try: - secrets_manager = conversation.state.secrets_manager - env_vars = secrets_manager.get_secrets_as_env_vars(action.command) + secret_registry = conversation.state.secret_registry + env_vars = secret_registry.get_secrets_as_env_vars(action.command) except Exception: env_vars = {} @@ -129,7 +129,7 @@ def __call__( if action.reset and action.is_input: raise ValueError("Cannot use reset=True with is_input=True") - if action.reset: + if action.reset or self.session._closed: reset_result = self.reset() # Handle command execution after reset @@ -160,8 +160,8 @@ def __call__( # Apply automatic secrets masking if observation.output and conversation is not None: try: - secrets_manager = conversation.state.secrets_manager - masked_output = secrets_manager.mask_secrets_in_output( + secret_registry = conversation.state.secret_registry + masked_output = secret_registry.mask_secrets_in_output( observation.output ) if masked_output: diff --git a/openhands-tools/openhands/tools/file_editor/__init__.py b/openhands-tools/openhands/tools/file_editor/__init__.py index 961bc2987a..5c5c7b41a4 100644 --- a/openhands-tools/openhands/tools/file_editor/__init__.py +++ b/openhands-tools/openhands/tools/file_editor/__init__.py @@ -2,13 +2,11 @@ FileEditorAction, FileEditorObservation, FileEditorTool, - file_editor_tool, ) from openhands.tools.file_editor.impl import FileEditorExecutor, file_editor __all__ = [ - "file_editor_tool", "FileEditorAction", "FileEditorObservation", "file_editor", diff --git a/openhands-tools/openhands/tools/file_editor/definition.py b/openhands-tools/openhands/tools/file_editor/definition.py index 571baa5660..d2041687d3 100644 --- a/openhands-tools/openhands/tools/file_editor/definition.py +++ b/openhands-tools/openhands/tools/file_editor/definition.py @@ -8,6 +8,7 @@ if TYPE_CHECKING: from openhands.sdk.conversation.state 
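`BashTool.create()` above gains an optional `executor` parameter: callers can inject a preconfigured `ToolExecutor` (useful in tests or for custom terminals), and the default `BashExecutor` is constructed only when none is supplied. A hedged sketch of that build-if-absent guard; `FakeExecutor` and `create_bash_tool` are illustrative names, not the SDK API:

```python
from collections.abc import Sequence


class FakeExecutor:
    """Stand-in for BashExecutor, or any injected ToolExecutor."""

    def __init__(self, working_dir: str = "/tmp") -> None:
        self.working_dir = working_dir


def create_bash_tool(
    working_dir: str,
    executor: FakeExecutor | None = None,
) -> Sequence[tuple[str, FakeExecutor]]:
    # Build the default executor only when the caller did not inject one,
    # mirroring the `if executor is None:` guard added in the diff.
    if executor is None:
        executor = FakeExecutor(working_dir=working_dir)
    return [("BashTool", executor)]


# Tests can now pass a canned executor instead of spawning a real terminal:
(tool,) = create_bash_tool("/workspace", executor=FakeExecutor("/stub"))
```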
import ConversationState + from rich.text import Text from openhands.sdk.llm import ImageContent, TextContent @@ -16,6 +17,7 @@ Observation, ToolAnnotations, ToolDefinition, + register_tool, ) from openhands.tools.file_editor.utils.diff import visualize_diff @@ -187,21 +189,6 @@ def _has_meaningful_diff(self) -> bool: """ # noqa: E501 -file_editor_tool = ToolDefinition( - name="str_replace_editor", - action_type=FileEditorAction, - observation_type=FileEditorObservation, - description=TOOL_DESCRIPTION, - annotations=ToolAnnotations( - title="str_replace_editor", - readOnlyHint=False, - destructiveHint=True, - idempotentHint=False, - openWorldHint=False, - ), -) - - class FileEditorTool(ToolDefinition[FileEditorAction, FileEditorObservation]): """A ToolDefinition subclass that automatically initializes a FileEditorExecutor.""" @@ -236,11 +223,20 @@ def create( # Initialize the parent Tool with the executor return [ cls( - name=file_editor_tool.name, - description=enhanced_description, action_type=FileEditorAction, observation_type=FileEditorObservation, - annotations=file_editor_tool.annotations, + description=enhanced_description, + annotations=ToolAnnotations( + title="file_editor", + readOnlyHint=False, + destructiveHint=True, + idempotentHint=False, + openWorldHint=False, + ), executor=executor, ) ] + + +# Automatically register the tool when this module is imported +register_tool(FileEditorTool.name, FileEditorTool) diff --git a/openhands-tools/openhands/tools/glob/definition.py b/openhands-tools/openhands/tools/glob/definition.py index c18d86425b..4e76ca9e20 100644 --- a/openhands-tools/openhands/tools/glob/definition.py +++ b/openhands-tools/openhands/tools/glob/definition.py @@ -11,7 +11,13 @@ from openhands.sdk.conversation.state import ConversationState from openhands.sdk.llm import ImageContent, TextContent -from openhands.sdk.tool import Action, Observation, ToolAnnotations, ToolDefinition +from openhands.sdk.tool import ( + Action, + Observation, + ToolAnnotations, + ToolDefinition, + register_tool, +) class GlobAction(Action): @@ -117,7 +123,6 @@ def create( # Initialize the parent ToolDefinition with the executor return [ cls( - name="glob", description=enhanced_description, action_type=GlobAction, observation_type=GlobObservation, @@ -131,3 +136,7 @@ def create( executor=executor, ) ] + + +# Automatically register the tool when this module is imported +register_tool(GlobTool.name, GlobTool) diff --git a/openhands-tools/openhands/tools/grep/definition.py b/openhands-tools/openhands/tools/grep/definition.py index 4913fde795..4f378424e9 100644 --- a/openhands-tools/openhands/tools/grep/definition.py +++ b/openhands-tools/openhands/tools/grep/definition.py @@ -11,7 +11,13 @@ from openhands.sdk.conversation.state import ConversationState from openhands.sdk.llm import ImageContent, TextContent -from openhands.sdk.tool import Action, Observation, ToolAnnotations, ToolDefinition +from openhands.sdk.tool import ( + Action, + Observation, + ToolAnnotations, + ToolDefinition, + register_tool, +) class GrepAction(Action): @@ -130,7 +136,6 @@ def create( # Initialize the parent ToolDefinition with the executor return [ cls( - name="grep", description=enhanced_description, action_type=GrepAction, observation_type=GrepObservation, @@ -144,3 +149,7 @@ def create( executor=executor, ) ] + + +# Automatically register the tool when this module is imported +register_tool(GrepTool.name, GrepTool) diff --git a/openhands-tools/openhands/tools/planning_file_editor/definition.py 
b/openhands-tools/openhands/tools/planning_file_editor/definition.py index 2368d82304..8354a50c46 100644 --- a/openhands-tools/openhands/tools/planning_file_editor/definition.py +++ b/openhands-tools/openhands/tools/planning_file_editor/definition.py @@ -11,6 +11,7 @@ from openhands.sdk.tool import ( ToolAnnotations, ToolDefinition, + register_tool, ) from openhands.tools.file_editor.definition import ( TOOL_DESCRIPTION as FILE_EDITOR_TOOL_DESCRIPTION, @@ -100,7 +101,6 @@ def create( return [ cls( - name="planning_file_editor", description=enhanced_description, action_type=PlanningFileEditorAction, observation_type=PlanningFileEditorObservation, @@ -114,3 +114,7 @@ def create( executor=executor, ) ] + + +# Automatically register the tool when this module is imported +register_tool(PlanningFileEditorTool.name, PlanningFileEditorTool) diff --git a/openhands-tools/openhands/tools/preset/default.py b/openhands-tools/openhands/tools/preset/default.py index 0bffe02071..521edb0889 100644 --- a/openhands-tools/openhands/tools/preset/default.py +++ b/openhands-tools/openhands/tools/preset/default.py @@ -8,7 +8,7 @@ from openhands.sdk.llm.llm import LLM from openhands.sdk.logger import get_logger from openhands.sdk.security.llm_analyzer import LLMSecurityAnalyzer -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool logger = get_logger(__name__) @@ -16,22 +16,19 @@ def register_default_tools(enable_browser: bool = True) -> None: """Register the default set of tools.""" + # Tools are now automatically registered when imported from openhands.tools.execute_bash import BashTool from openhands.tools.file_editor import FileEditorTool from openhands.tools.task_tracker import TaskTrackerTool - register_tool("BashTool", BashTool) - logger.debug("Tool: BashTool registered.") - register_tool("FileEditorTool", FileEditorTool) - logger.debug("Tool: FileEditorTool registered.") - register_tool("TaskTrackerTool", TaskTrackerTool) - logger.debug("Tool: TaskTrackerTool registered.") + logger.debug(f"Tool: {BashTool.name} registered.") + logger.debug(f"Tool: {FileEditorTool.name} registered.") + logger.debug(f"Tool: {TaskTrackerTool.name} registered.") if enable_browser: from openhands.tools.browser_use import BrowserToolSet - register_tool("BrowserToolSet", BrowserToolSet) - logger.debug("Tool: BrowserToolSet registered.") + logger.debug(f"Tool: {BrowserToolSet.name} registered.") def get_default_tools( @@ -44,13 +41,20 @@ def get_default_tools( """ register_default_tools(enable_browser=enable_browser) + # Import tools to access their name attributes + from openhands.tools.execute_bash import BashTool + from openhands.tools.file_editor import FileEditorTool + from openhands.tools.task_tracker import TaskTrackerTool + tools = [ - Tool(name="BashTool"), - Tool(name="FileEditorTool"), - Tool(name="TaskTrackerTool"), + Tool(name=BashTool.name), + Tool(name=FileEditorTool.name), + Tool(name=TaskTrackerTool.name), ] if enable_browser: - tools.append(Tool(name="BrowserToolSet")) + from openhands.tools.browser_use import BrowserToolSet + + tools.append(Tool(name=BrowserToolSet.name)) return tools @@ -75,13 +79,6 @@ def get_default_agent( agent = Agent( llm=llm, tools=tools, - mcp_config={ - "mcpServers": { - "fetch": {"command": "uvx", "args": ["mcp-server-fetch"]}, - "repomix": {"command": "npx", "args": ["-y", "repomix@1.4.2", "--mcp"]}, - } - }, - filter_tools_regex="^(?!repomix)(.*)|^repomix.*pack_codebase.*$", system_prompt_kwargs={"cli_mode": cli_mode}, 
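After this round of changes a bare import is enough to register a tool: each definition module ends with `register_tool(<Cls>.name, <Cls>)`, so `register_default_tools()` shrinks to imports plus debug logging, and `Tool(name=BashTool.name)` replaces hard-coded strings so the spec and the registry key cannot drift apart. A toy registry showing why import-time registration works (a simplified stand-in; the real `register_tool` lives in `openhands.sdk.tool`):

```python
# Toy registry, standing in for openhands.sdk.tool.register_tool.
_REGISTRY: dict[str, type] = {}


def register_tool(name: str, tool_class: type) -> None:
    _REGISTRY[name] = tool_class


def resolve(name: str) -> type:
    return _REGISTRY[name]


# In a tool's definition module, the module-level call runs exactly once,
# on first import:
class BashTool:
    name = "BashTool"


register_tool(BashTool.name, BashTool)

# For callers, importing the definition module is all the registration needed.
assert resolve(BashTool.name) is BashTool
```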
condenser=get_default_condenser( llm=llm.model_copy(update={"usage_id": "condenser"}) diff --git a/openhands-tools/openhands/tools/preset/planning.py b/openhands-tools/openhands/tools/preset/planning.py index 59a411dc12..ecb8318663 100644 --- a/openhands-tools/openhands/tools/preset/planning.py +++ b/openhands-tools/openhands/tools/preset/planning.py @@ -4,7 +4,7 @@ from openhands.sdk.context.condenser import LLMSummarizingCondenser from openhands.sdk.llm.llm import LLM from openhands.sdk.logger import get_logger -from openhands.sdk.tool import Tool, register_tool +from openhands.sdk.tool import Tool logger = get_logger(__name__) @@ -90,15 +90,15 @@ def get_plan_headers() -> str: def register_planning_tools() -> None: """Register the planning agent tools.""" - from openhands.tools.glob import GlobTool - from openhands.tools.grep import GrepTool - from openhands.tools.planning_file_editor import PlanningFileEditorTool + # Tools are now automatically registered when imported + from openhands.tools.glob import GlobTool # noqa: F401 + from openhands.tools.grep import GrepTool # noqa: F401 + from openhands.tools.planning_file_editor import ( + PlanningFileEditorTool, # noqa: F401 + ) - register_tool("GlobTool", GlobTool) logger.debug("Tool: GlobTool registered.") - register_tool("GrepTool", GrepTool) logger.debug("Tool: GrepTool registered.") - register_tool("PlanningFileEditorTool", PlanningFileEditorTool) logger.debug("Tool: PlanningFileEditorTool registered.") @@ -112,10 +112,15 @@ def get_planning_tools() -> list[Tool]: """ register_planning_tools() + # Import tools to access their name attributes + from openhands.tools.glob import GlobTool + from openhands.tools.grep import GrepTool + from openhands.tools.planning_file_editor import PlanningFileEditorTool + return [ - Tool(name="GlobTool"), - Tool(name="GrepTool"), - Tool(name="PlanningFileEditorTool"), + Tool(name=GlobTool.name), + Tool(name=GrepTool.name), + Tool(name=PlanningFileEditorTool.name), ] @@ -152,22 +157,9 @@ def get_planning_agent( """ tools = get_planning_tools() - # Add MCP tools that are useful for planning - mcp_config = { - "mcpServers": { - "fetch": {"command": "uvx", "args": ["mcp-server-fetch"]}, - "repomix": {"command": "npx", "args": ["-y", "repomix@1.4.2", "--mcp"]}, - } - } - - # Filter to only read-only MCP tools - filter_tools_regex = "^(?!repomix)(.*)|^repomix.*pack_codebase.*$" - agent = Agent( llm=llm, tools=tools, - mcp_config=mcp_config, - filter_tools_regex=filter_tools_regex, system_prompt_filename="system_prompt_planning.j2", system_prompt_kwargs={"plan_structure": format_plan_structure()}, condenser=get_planning_condenser( diff --git a/openhands-tools/openhands/tools/task_tracker/__init__.py b/openhands-tools/openhands/tools/task_tracker/__init__.py index ca64609f07..c291f43f9c 100644 --- a/openhands-tools/openhands/tools/task_tracker/__init__.py +++ b/openhands-tools/openhands/tools/task_tracker/__init__.py @@ -3,7 +3,6 @@ TaskTrackerExecutor, TaskTrackerObservation, TaskTrackerTool, - task_tracker_tool, ) @@ -12,5 +11,4 @@ "TaskTrackerExecutor", "TaskTrackerObservation", "TaskTrackerTool", - "task_tracker_tool", ] diff --git a/openhands-tools/openhands/tools/task_tracker/definition.py b/openhands-tools/openhands/tools/task_tracker/definition.py index 0f04f3d9ae..350f276fa2 100644 --- a/openhands-tools/openhands/tools/task_tracker/definition.py +++ b/openhands-tools/openhands/tools/task_tracker/definition.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: from openhands.sdk.conversation import LocalConversation from 
openhands.sdk.conversation.state import ConversationState + from rich.text import Text from openhands.sdk import ImageContent, TextContent @@ -19,6 +20,7 @@ ToolAnnotations, ToolDefinition, ToolExecutor, + register_tool, ) @@ -391,20 +393,6 @@ def _save_tasks(self) -> None: systematic approach and ensures comprehensive requirement fulfillment.""" # noqa: E501 -task_tracker_tool = ToolDefinition( - name="task_tracker", - description=TASK_TRACKER_DESCRIPTION, - action_type=TaskTrackerAction, - observation_type=TaskTrackerObservation, - annotations=ToolAnnotations( - readOnlyHint=False, - destructiveHint=False, - idempotentHint=True, - openWorldHint=False, - ), -) - - class TaskTrackerTool(ToolDefinition[TaskTrackerAction, TaskTrackerObservation]): """A ToolDefinition subclass that automatically initializes a TaskTrackerExecutor.""" # noqa: E501 @@ -422,11 +410,19 @@ def create(cls, conv_state: "ConversationState") -> Sequence["TaskTrackerTool"]: # Initialize the parent Tool with the executor return [ cls( - name="task_tracker", description=TASK_TRACKER_DESCRIPTION, action_type=TaskTrackerAction, observation_type=TaskTrackerObservation, - annotations=task_tracker_tool.annotations, + annotations=ToolAnnotations( + readOnlyHint=False, + destructiveHint=False, + idempotentHint=True, + openWorldHint=False, + ), executor=executor, ) ] + + +# Automatically register the tool when this module is imported +register_tool(TaskTrackerTool.name, TaskTrackerTool) diff --git a/openhands-tools/pyproject.toml b/openhands-tools/pyproject.toml index 3bcb2fbe3c..422dac6027 100644 --- a/openhands-tools/pyproject.toml +++ b/openhands-tools/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "openhands-tools" -version = "1.0.0a5" +version = "1.0.0a6" description = "OpenHands Tools - Runtime tools for AI agents" requires-python = ">=3.12" diff --git a/openhands-workspace/pyproject.toml b/openhands-workspace/pyproject.toml index 344e63d584..cb3f251b38 100644 --- a/openhands-workspace/pyproject.toml +++ b/openhands-workspace/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "openhands-workspace" -version = "1.0.0a5" +version = "1.0.0a6" description = "OpenHands Workspace - Docker and container-based workspace implementations" requires-python = ">=3.12" diff --git a/scripts/agent_server_ui/static/app-dev.js b/scripts/agent_server_ui/static/app-dev.js index 324f21a7e6..d487b8257d 100644 --- a/scripts/agent_server_ui/static/app-dev.js +++ b/scripts/agent_server_ui/static/app-dev.js @@ -178,7 +178,7 @@ class OpenHandsWebChat {
${title}
${createdAt} - ${conversation.agent_status} + ${conversation.execution_status}
`; @@ -223,7 +223,7 @@ class OpenHandsWebChat { const conversation = this.conversations.get(conversationId); if (conversation) { this.conversationTitle.textContent = this.getConversationTitle(conversation); - this.updateConversationStatus(conversation.agent_status); + this.updateConversationStatus(conversation.execution_status); this.enableChatControls(); } diff --git a/scripts/agent_server_ui/static/app.js b/scripts/agent_server_ui/static/app.js index 2cbddc2251..02257a0e49 100644 --- a/scripts/agent_server_ui/static/app.js +++ b/scripts/agent_server_ui/static/app.js @@ -198,7 +198,7 @@ class OpenHandsWebChat {
${title}
${createdAt} - ${conversation.agent_status} + ${conversation.execution_status}
`; @@ -243,7 +243,7 @@ class OpenHandsWebChat { const conversation = this.conversations.get(conversationId); if (conversation) { this.conversationTitle.textContent = this.getConversationTitle(conversation); - this.updateConversationStatus(conversation.agent_status); + this.updateConversationStatus(conversation.execution_status); this.enableChatControls(); } diff --git a/tests/agent_server/test_conversation_router.py b/tests/agent_server/test_conversation_router.py new file mode 100644 index 0000000000..55db0c72e7 --- /dev/null +++ b/tests/agent_server/test_conversation_router.py @@ -0,0 +1,1171 @@ +"""Tests for conversation_router.py endpoints.""" + +from unittest.mock import AsyncMock +from uuid import uuid4 + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient +from pydantic import SecretStr + +from openhands.agent_server.conversation_router import conversation_router +from openhands.agent_server.conversation_service import ConversationService +from openhands.agent_server.dependencies import get_conversation_service +from openhands.agent_server.event_service import EventService +from openhands.agent_server.models import ( + ConversationInfo, + ConversationPage, + ConversationSortOrder, + SendMessageRequest, + StartConversationRequest, +) +from openhands.agent_server.utils import utc_now +from openhands.sdk import LLM, Agent, TextContent, Tool +from openhands.sdk.conversation.state import ConversationExecutionStatus +from openhands.sdk.workspace import LocalWorkspace + + +@pytest.fixture +def client(): + """Create a test client for the FastAPI app without authentication.""" + app = FastAPI() + app.include_router(conversation_router, prefix="/api") + return TestClient(app) + + +@pytest.fixture +def sample_conversation_id(): + """Return a sample conversation ID.""" + return uuid4() + + +@pytest.fixture +def sample_conversation_info(): + """Create a sample ConversationInfo for testing.""" + conversation_id = uuid4() + now = utc_now() + return ConversationInfo( + id=conversation_id, + agent=Agent( + llm=LLM( + model="gpt-4o", + api_key=SecretStr("test-key"), + usage_id="test-llm", + ), + tools=[Tool(name="BashTool")], + ), + workspace=LocalWorkspace(working_dir="/tmp/test"), + execution_status=ConversationExecutionStatus.IDLE, + title="Test Conversation", + created_at=now, + updated_at=now, + ) + + +@pytest.fixture +def mock_conversation_service(): + """Create a mock ConversationService for testing.""" + service = AsyncMock(spec=ConversationService) + return service + + +@pytest.fixture +def mock_event_service(): + """Create a mock EventService for testing.""" + service = AsyncMock(spec=EventService) + return service + + +@pytest.fixture +def sample_start_conversation_request(): + """Create a sample StartConversationRequest for testing.""" + return StartConversationRequest( + agent=Agent( + llm=LLM( + model="gpt-4o", + api_key=SecretStr("test-key"), + usage_id="test-llm", + ), + tools=[Tool(name="BashTool")], + ), + workspace=LocalWorkspace(working_dir="/tmp/test"), + initial_message=SendMessageRequest( + role="user", content=[TextContent(text="Hello, world!")] + ), + ) + + +def test_search_conversations_default_params( + client, mock_conversation_service, sample_conversation_info +): + """Test search_conversations endpoint with default parameters.""" + + # Mock the service response + mock_page = ConversationPage(items=[sample_conversation_info], next_page_id=None) + mock_conversation_service.search_conversations.return_value = mock_page + + # Override the 
dependency + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.get("/api/conversations/search") + + assert response.status_code == 200 + data = response.json() + assert "items" in data + assert "next_page_id" in data + assert len(data["items"]) == 1 + assert data["items"][0]["id"] == str(sample_conversation_info.id) + + # Verify service was called with default parameters + mock_conversation_service.search_conversations.assert_called_once_with( + None, 100, None, ConversationSortOrder.CREATED_AT_DESC + ) + finally: + client.app.dependency_overrides.clear() + + +def test_search_conversations_with_all_params( + client, mock_conversation_service, sample_conversation_info +): + """Test search_conversations endpoint with all parameters.""" + + # Mock the service response + mock_page = ConversationPage( + items=[sample_conversation_info], next_page_id="next_page" + ) + mock_conversation_service.search_conversations.return_value = mock_page + + # Override the dependency + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.get( + "/api/conversations/search", + params={ + "page_id": "test_page", + "limit": 50, + "status": ConversationExecutionStatus.IDLE.value, + "sort_order": ConversationSortOrder.UPDATED_AT_DESC.value, + }, + ) + + assert response.status_code == 200 + data = response.json() + assert len(data["items"]) == 1 + assert data["next_page_id"] == "next_page" + + # Verify service was called with correct parameters + mock_conversation_service.search_conversations.assert_called_once_with( + "test_page", + 50, + ConversationExecutionStatus.IDLE, + ConversationSortOrder.UPDATED_AT_DESC, + ) + finally: + client.app.dependency_overrides.clear() + + +def test_search_conversations_limit_validation(client, mock_conversation_service): + """Test search_conversations endpoint with invalid limit values.""" + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + # Test limit too low (gt=0 means > 0, so 0 should fail) + response = client.get("/api/conversations/search", params={"limit": 0}) + assert response.status_code == 422 + + # Test limit too high - endpoint has FastAPI validation (lte=100) and assertion + # The assertion in the endpoint will cause an AssertionError to be raised + with pytest.raises(AssertionError): + response = client.get("/api/conversations/search", params={"limit": 101}) + + # Test valid limit + mock_conversation_service.search_conversations.return_value = ConversationPage( + items=[], next_page_id=None + ) + response = client.get("/api/conversations/search", params={"limit": 50}) + assert response.status_code == 200 + finally: + client.app.dependency_overrides.clear() + + +def test_search_conversations_empty_result(client, mock_conversation_service): + """Test search_conversations endpoint with empty result.""" + + # Mock empty response + mock_page = ConversationPage(items=[], next_page_id=None) + mock_conversation_service.search_conversations.return_value = mock_page + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.get("/api/conversations/search") + + assert response.status_code == 200 + data = response.json() + assert data["items"] == [] + assert data["next_page_id"] is None + finally: + client.app.dependency_overrides.clear() + + +def 
test_count_conversations_no_filter(client, mock_conversation_service): + """Test count_conversations endpoint without status filter.""" + + # Mock the service response + mock_conversation_service.count_conversations.return_value = 5 + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.get("/api/conversations/count") + + assert response.status_code == 200 + assert response.json() == 5 + + # Verify service was called with no status filter + mock_conversation_service.count_conversations.assert_called_once_with(None) + finally: + client.app.dependency_overrides.clear() + + +def test_count_conversations_with_status_filter(client, mock_conversation_service): + """Test count_conversations endpoint with status filter.""" + + # Mock the service response + mock_conversation_service.count_conversations.return_value = 3 + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.get( + "/api/conversations/count", + params={"status": ConversationExecutionStatus.RUNNING.value}, + ) + + assert response.status_code == 200 + assert response.json() == 3 + + # Verify service was called with status filter + mock_conversation_service.count_conversations.assert_called_once_with( + ConversationExecutionStatus.RUNNING + ) + finally: + client.app.dependency_overrides.clear() + + +def test_count_conversations_zero_result(client, mock_conversation_service): + """Test count_conversations endpoint with zero result.""" + + # Mock zero count response + mock_conversation_service.count_conversations.return_value = 0 + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.get("/api/conversations/count") + + assert response.status_code == 200 + assert response.json() == 0 + finally: + client.app.dependency_overrides.clear() + + +def test_get_conversation_success( + client, mock_conversation_service, sample_conversation_info, sample_conversation_id +): + """Test get_conversation endpoint with existing conversation.""" + + # Mock the service response + mock_conversation_service.get_conversation.return_value = sample_conversation_info + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.get(f"/api/conversations/{sample_conversation_id}") + + assert response.status_code == 200 + data = response.json() + assert data["id"] == str(sample_conversation_info.id) + assert data["title"] == sample_conversation_info.title + + # Verify service was called with correct conversation ID + mock_conversation_service.get_conversation.assert_called_once_with( + sample_conversation_id + ) + finally: + client.app.dependency_overrides.clear() + + +def test_get_conversation_not_found( + client, mock_conversation_service, sample_conversation_id +): + """Test get_conversation endpoint with non-existent conversation.""" + + # Mock the service to return None (conversation not found) + mock_conversation_service.get_conversation.return_value = None + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.get(f"/api/conversations/{sample_conversation_id}") + + assert response.status_code == 404 + + # Verify service was called with correct conversation ID + mock_conversation_service.get_conversation.assert_called_once_with( + sample_conversation_id + ) + finally: + 
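The test functions on either side all follow the same fixture dance: swap the real `ConversationService` for an `AsyncMock` via `app.dependency_overrides`, exercise the route, and clear the override in a `finally` block so state never leaks between tests. The skeleton, reduced to its essentials; the route and service below are placeholders for illustration, not the real router:

```python
from unittest.mock import AsyncMock

from fastapi import Depends, FastAPI
from fastapi.testclient import TestClient


class ConversationService:  # placeholder for the real service
    async def count_conversations(self) -> int: ...


def get_conversation_service() -> ConversationService:
    raise NotImplementedError  # wired up for real in the agent server


app = FastAPI()


@app.get("/api/conversations/count")
async def count(service: ConversationService = Depends(get_conversation_service)):
    return await service.count_conversations()


def test_count() -> None:
    service = AsyncMock(spec=ConversationService)
    service.count_conversations.return_value = 5
    # Override the dependency for just this test...
    app.dependency_overrides[get_conversation_service] = lambda: service
    try:
        assert TestClient(app).get("/api/conversations/count").json() == 5
    finally:
        # ...and always clear it, or the mock leaks into the next test.
        app.dependency_overrides.clear()
```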
client.app.dependency_overrides.clear() + + +def test_get_conversation_invalid_uuid(client, mock_conversation_service): + """Test get_conversation endpoint with invalid UUID.""" + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.get("/api/conversations/invalid-uuid") + + assert response.status_code == 422 # Validation error for invalid UUID + finally: + client.app.dependency_overrides.clear() + + +def test_batch_get_conversations_success( + client, mock_conversation_service, sample_conversation_info +): + """Test batch_get_conversations endpoint with valid IDs.""" + + # Create additional conversation info for testing + conversation_id_1 = uuid4() + conversation_id_2 = uuid4() + + # Mock the service response - return one found, one None + mock_conversation_service.batch_get_conversations.return_value = [ + sample_conversation_info, + None, + ] + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.get( + "/api/conversations", + params={"ids": [str(conversation_id_1), str(conversation_id_2)]}, + ) + + assert response.status_code == 200 + data = response.json() + assert len(data) == 2 + assert data[0]["id"] == str(sample_conversation_info.id) + assert data[1] is None + + # Verify service was called with correct IDs + mock_conversation_service.batch_get_conversations.assert_called_once_with( + [conversation_id_1, conversation_id_2] + ) + finally: + client.app.dependency_overrides.clear() + + +def test_batch_get_conversations_empty_list(client, mock_conversation_service): + """Test batch_get_conversations endpoint with empty ID list.""" + + # Mock empty response + mock_conversation_service.batch_get_conversations.return_value = [] + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + # FastAPI requires at least one value for query parameters that expect a list + # So we'll test with a single valid UUID instead + test_id = str(uuid4()) + mock_conversation_service.batch_get_conversations.return_value = [None] + + response = client.get("/api/conversations", params={"ids": [test_id]}) + + assert response.status_code == 200 + data = response.json() + assert data == [None] + + # Verify service was called + mock_conversation_service.batch_get_conversations.assert_called_once() + finally: + client.app.dependency_overrides.clear() + + +def test_batch_get_conversations_too_many_ids(client, mock_conversation_service): + """Test batch_get_conversations endpoint with too many IDs.""" + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + # The assertion is len(ids) < 100, so 100 should fail with AssertionError + many_ids = [str(uuid4()) for _ in range(100)] + with pytest.raises(AssertionError): + response = client.get("/api/conversations", params={"ids": many_ids}) + + # Test with 99 IDs (should work) + mock_conversation_service.batch_get_conversations.return_value = [None] * 99 + valid_ids = [str(uuid4()) for _ in range(99)] + response = client.get("/api/conversations", params={"ids": valid_ids}) + assert response.status_code == 200 + finally: + client.app.dependency_overrides.clear() + + +def test_batch_get_conversations_invalid_uuid(client, mock_conversation_service): + """Test batch_get_conversations endpoint with invalid UUID.""" + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: 
mock_conversation_service + ) + + try: + response = client.get("/api/conversations", params={"ids": ["invalid-uuid"]}) + + assert response.status_code == 422 # Validation error for invalid UUID + finally: + client.app.dependency_overrides.clear() + + +def test_start_conversation_new( + client, mock_conversation_service, sample_conversation_info +): + """Test start_conversation endpoint creating a new conversation.""" + + # Mock the service response - new conversation created + mock_conversation_service.start_conversation.return_value = ( + sample_conversation_info, + True, + ) + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + # Create request data with proper serialization + request_data = { + "agent": { + "llm": { + "model": "gpt-4o", + "api_key": "test-key", + "usage_id": "test-llm", + }, + "tools": [{"name": "BashTool"}], + }, + "workspace": {"working_dir": "/tmp/test"}, + "initial_message": { + "role": "user", + "content": [{"type": "text", "text": "Hello, world!"}], + }, + } + + response = client.post("/api/conversations", json=request_data) + + assert response.status_code == 201 # Created + data = response.json() + assert data["id"] == str(sample_conversation_info.id) + assert data["title"] == sample_conversation_info.title + + # Verify service was called + mock_conversation_service.start_conversation.assert_called_once() + finally: + client.app.dependency_overrides.clear() + + +def test_start_conversation_existing( + client, mock_conversation_service, sample_conversation_info +): + """Test start_conversation endpoint with existing conversation.""" + + # Mock the service response - existing conversation returned + mock_conversation_service.start_conversation.return_value = ( + sample_conversation_info, + False, + ) + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + # Create request data with proper serialization + request_data = { + "agent": { + "llm": { + "model": "gpt-4o", + "api_key": "test-key", + "usage_id": "test-llm", + }, + "tools": [{"name": "BashTool"}], + }, + "workspace": {"working_dir": "/tmp/test"}, + } + + response = client.post("/api/conversations", json=request_data) + + assert response.status_code == 200 # OK (existing) + data = response.json() + assert data["id"] == str(sample_conversation_info.id) + + # Verify service was called + mock_conversation_service.start_conversation.assert_called_once() + finally: + client.app.dependency_overrides.clear() + + +def test_start_conversation_invalid_request(client, mock_conversation_service): + """Test start_conversation endpoint with invalid request data.""" + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + # Test with missing required fields + invalid_request = {"invalid": "data"} + + response = client.post("/api/conversations", json=invalid_request) + + assert response.status_code == 422 # Validation error + finally: + client.app.dependency_overrides.clear() + + +def test_start_conversation_minimal_request( + client, mock_conversation_service, sample_conversation_info +): + """Test start_conversation endpoint with minimal valid request.""" + + # Mock the service response + mock_conversation_service.start_conversation.return_value = ( + sample_conversation_info, + True, + ) + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + # Create minimal valid request + 
minimal_request = { + "agent": { + "llm": { + "model": "gpt-4o", + "api_key": "test-key", + "usage_id": "test-llm", + }, + "tools": [{"name": "BashTool"}], + }, + "workspace": {"working_dir": "/tmp/test"}, + } + + response = client.post("/api/conversations", json=minimal_request) + + assert response.status_code == 201 + data = response.json() + assert data["id"] == str(sample_conversation_info.id) + finally: + client.app.dependency_overrides.clear() + + +def test_pause_conversation_success( + client, mock_conversation_service, sample_conversation_id +): + """Test pause_conversation endpoint with successful pause.""" + + # Mock the service response - pause successful + mock_conversation_service.pause_conversation.return_value = True + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.post(f"/api/conversations/{sample_conversation_id}/pause") + + assert response.status_code == 200 + data = response.json() + assert data["success"] is True + + # Verify service was called with correct conversation ID + mock_conversation_service.pause_conversation.assert_called_once_with( + sample_conversation_id + ) + finally: + client.app.dependency_overrides.clear() + + +def test_pause_conversation_failure( + client, mock_conversation_service, sample_conversation_id +): + """Test pause_conversation endpoint with pause failure.""" + + # Mock the service response - pause failed + mock_conversation_service.pause_conversation.return_value = False + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.post(f"/api/conversations/{sample_conversation_id}/pause") + + assert response.status_code == 400 # Bad Request + + # Verify service was called + mock_conversation_service.pause_conversation.assert_called_once_with( + sample_conversation_id + ) + finally: + client.app.dependency_overrides.clear() + + +def test_delete_conversation_success( + client, mock_conversation_service, sample_conversation_id +): + """Test delete_conversation endpoint with successful deletion.""" + + # Mock the service response - deletion successful + mock_conversation_service.delete_conversation.return_value = True + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.delete(f"/api/conversations/{sample_conversation_id}") + + assert response.status_code == 200 + data = response.json() + assert data["success"] is True + + # Verify service was called with correct conversation ID + mock_conversation_service.delete_conversation.assert_called_once_with( + sample_conversation_id + ) + finally: + client.app.dependency_overrides.clear() + + +def test_delete_conversation_failure( + client, mock_conversation_service, sample_conversation_id +): + """Test delete_conversation endpoint with deletion failure.""" + + # Mock the service response - deletion failed + mock_conversation_service.delete_conversation.return_value = False + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.delete(f"/api/conversations/{sample_conversation_id}") + + assert response.status_code == 400 # Bad Request + + # Verify service was called + mock_conversation_service.delete_conversation.assert_called_once_with( + sample_conversation_id + ) + finally: + client.app.dependency_overrides.clear() + + +def test_run_conversation_success( + client, 
mock_conversation_service, mock_event_service, sample_conversation_id +): + """Test run_conversation endpoint with successful run.""" + + # Mock the service responses + mock_conversation_service.get_event_service.return_value = mock_event_service + mock_event_service.run.return_value = None # Successful run + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.post(f"/api/conversations/{sample_conversation_id}/run") + + assert response.status_code == 200 + data = response.json() + assert data["success"] is True + + # Verify services were called + mock_conversation_service.get_event_service.assert_called_once_with( + sample_conversation_id + ) + mock_event_service.run.assert_called_once() + finally: + client.app.dependency_overrides.clear() + + +def test_run_conversation_not_found( + client, mock_conversation_service, sample_conversation_id +): + """Test run_conversation endpoint when conversation is not found.""" + + # Mock the service response - conversation not found + mock_conversation_service.get_event_service.return_value = None + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.post(f"/api/conversations/{sample_conversation_id}/run") + + assert response.status_code == 404 + + # Verify service was called + mock_conversation_service.get_event_service.assert_called_once_with( + sample_conversation_id + ) + finally: + client.app.dependency_overrides.clear() + + +def test_run_conversation_already_running( + client, mock_conversation_service, mock_event_service, sample_conversation_id +): + """Test run_conversation endpoint when conversation is already running.""" + + # Mock the service responses + mock_conversation_service.get_event_service.return_value = mock_event_service + mock_event_service.run.side_effect = ValueError("conversation_already_running") + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.post(f"/api/conversations/{sample_conversation_id}/run") + + assert response.status_code == 409 # Conflict + data = response.json() + assert "already running" in data["detail"] + + # Verify services were called + mock_conversation_service.get_event_service.assert_called_once_with( + sample_conversation_id + ) + mock_event_service.run.assert_called_once() + finally: + client.app.dependency_overrides.clear() + + +def test_run_conversation_other_error( + client, mock_conversation_service, mock_event_service, sample_conversation_id +): + """Test run_conversation endpoint with other ValueError.""" + + # Mock the service responses + mock_conversation_service.get_event_service.return_value = mock_event_service + mock_event_service.run.side_effect = ValueError("some other error") + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + response = client.post(f"/api/conversations/{sample_conversation_id}/run") + + assert response.status_code == 400 # Bad Request + data = response.json() + assert data["detail"] == "some other error" + finally: + client.app.dependency_overrides.clear() + + +def test_update_conversation_secrets_success( + client, mock_conversation_service, mock_event_service, sample_conversation_id +): + """Test update_conversation_secrets endpoint with successful update.""" + + # Mock the service responses + mock_conversation_service.get_event_service.return_value = 
mock_event_service + mock_event_service.update_secrets.return_value = None + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + # Use proper secret source format + request_data = { + "secrets": { + "API_KEY": {"kind": "StaticSecret", "value": "secret-value"}, + "TOKEN": {"kind": "StaticSecret", "value": "token-value"}, + } + } + + response = client.post( + f"/api/conversations/{sample_conversation_id}/secrets", json=request_data + ) + + assert response.status_code == 200 + data = response.json() + assert data["success"] is True + + # Verify services were called + mock_conversation_service.get_event_service.assert_called_once_with( + sample_conversation_id + ) + mock_event_service.update_secrets.assert_called_once() + finally: + client.app.dependency_overrides.clear() + + +def test_update_conversation_secrets_not_found( + client, mock_conversation_service, sample_conversation_id +): + """Test update_conversation_secrets endpoint when conversation is not found.""" + + # Mock the service response - conversation not found + mock_conversation_service.get_event_service.return_value = None + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + request_data = { + "secrets": {"API_KEY": {"kind": "StaticSecret", "value": "secret-value"}} + } + + response = client.post( + f"/api/conversations/{sample_conversation_id}/secrets", json=request_data + ) + + assert response.status_code == 404 + + # Verify service was called + mock_conversation_service.get_event_service.assert_called_once_with( + sample_conversation_id + ) + finally: + client.app.dependency_overrides.clear() + + +def test_set_conversation_confirmation_policy_success( + client, mock_conversation_service, mock_event_service, sample_conversation_id +): + """Test set_conversation_confirmation_policy endpoint with successful update.""" + + # Mock the service responses + mock_conversation_service.get_event_service.return_value = mock_event_service + mock_event_service.set_confirmation_policy.return_value = None + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + request_data = {"policy": {"kind": "NeverConfirm"}} + + response = client.post( + f"/api/conversations/{sample_conversation_id}/confirmation_policy", + json=request_data, + ) + + assert response.status_code == 200 + data = response.json() + assert data["success"] is True + + # Verify services were called + mock_conversation_service.get_event_service.assert_called_once_with( + sample_conversation_id + ) + mock_event_service.set_confirmation_policy.assert_called_once() + finally: + client.app.dependency_overrides.clear() + + +def test_set_conversation_confirmation_policy_not_found( + client, mock_conversation_service, sample_conversation_id +): + """Test set_conversation_confirmation_policy endpoint when conversation is not found.""" # noqa: E501 + + # Mock the service response - conversation not found + mock_conversation_service.get_event_service.return_value = None + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + request_data = {"policy": {"kind": "NeverConfirm"}} + + response = client.post( + f"/api/conversations/{sample_conversation_id}/confirmation_policy", + json=request_data, + ) + + assert response.status_code == 404 + + # Verify service was called + mock_conversation_service.get_event_service.assert_called_once_with( + 
sample_conversation_id + ) + finally: + client.app.dependency_overrides.clear() + + +def test_update_conversation_success( + client, mock_conversation_service, sample_conversation_id +): + """Test update_conversation endpoint with successful update.""" + + # Mock the service response - update successful + mock_conversation_service.update_conversation.return_value = True + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + request_data = {"title": "Updated Conversation Title"} + + response = client.patch( + f"/api/conversations/{sample_conversation_id}", json=request_data + ) + + assert response.status_code == 200 + data = response.json() + assert data["success"] is True + + # Verify service was called with correct parameters + mock_conversation_service.update_conversation.assert_called_once() + call_args = mock_conversation_service.update_conversation.call_args + assert call_args[0][0] == sample_conversation_id + assert call_args[0][1].title == "Updated Conversation Title" + finally: + client.app.dependency_overrides.clear() + + +def test_update_conversation_failure( + client, mock_conversation_service, sample_conversation_id +): + """Test update_conversation endpoint with update failure.""" + + # Mock the service response - update failed + mock_conversation_service.update_conversation.return_value = False + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + request_data = {"title": "Updated Title"} + + response = client.patch( + f"/api/conversations/{sample_conversation_id}", json=request_data + ) + + assert response.status_code == 200 + data = response.json() + assert data["success"] is False + + # Verify service was called + mock_conversation_service.update_conversation.assert_called_once() + finally: + client.app.dependency_overrides.clear() + + +def test_update_conversation_invalid_title( + client, mock_conversation_service, sample_conversation_id +): + """Test update_conversation endpoint with invalid title.""" + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + # Test with empty title + request_data = {"title": ""} + response = client.patch( + f"/api/conversations/{sample_conversation_id}", json=request_data + ) + assert response.status_code == 422 # Validation error + + # Test with too long title + long_title = "x" * 201 # Exceeds max_length=200 + request_data = {"title": long_title} + response = client.patch( + f"/api/conversations/{sample_conversation_id}", json=request_data + ) + assert response.status_code == 422 # Validation error + finally: + client.app.dependency_overrides.clear() + + +def test_generate_conversation_title_success( + client, mock_conversation_service, sample_conversation_id +): + """Test generate_conversation_title endpoint with successful generation.""" + + # Mock the service response + mock_conversation_service.generate_conversation_title.return_value = ( + "Generated Title" + ) + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + request_data = {"max_length": 30} + + response = client.post( + f"/api/conversations/{sample_conversation_id}/generate_title", + json=request_data, + ) + + assert response.status_code == 200 + data = response.json() + assert data["title"] == "Generated Title" + + # Verify service was called with correct parameters + 
mock_conversation_service.generate_conversation_title.assert_called_once() + call_args = mock_conversation_service.generate_conversation_title.call_args + assert call_args[0][0] == sample_conversation_id + assert call_args[0][1] == 30 # max_length + assert call_args[0][2] is None # llm (default) + finally: + client.app.dependency_overrides.clear() + + +def test_generate_conversation_title_with_llm( + client, mock_conversation_service, sample_conversation_id +): + """Test generate_conversation_title endpoint with custom LLM.""" + + # Mock the service response + mock_conversation_service.generate_conversation_title.return_value = ( + "Custom LLM Title" + ) + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + request_data = { + "max_length": 40, + "llm": { + "model": "gpt-3.5-turbo", + "api_key": "custom-key", + "usage_id": "custom-llm", + }, + } + + response = client.post( + f"/api/conversations/{sample_conversation_id}/generate_title", + json=request_data, + ) + + assert response.status_code == 200 + data = response.json() + assert data["title"] == "Custom LLM Title" + + # Verify service was called + mock_conversation_service.generate_conversation_title.assert_called_once() + call_args = mock_conversation_service.generate_conversation_title.call_args + assert call_args[0][0] == sample_conversation_id + assert call_args[0][1] == 40 # max_length + assert call_args[0][2] is not None # llm provided + finally: + client.app.dependency_overrides.clear() + + +def test_generate_conversation_title_failure( + client, mock_conversation_service, sample_conversation_id +): + """Test generate_conversation_title endpoint with generation failure.""" + + # Mock the service response - generation failed + mock_conversation_service.generate_conversation_title.return_value = None + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + request_data = {"max_length": 50} + + response = client.post( + f"/api/conversations/{sample_conversation_id}/generate_title", + json=request_data, + ) + + assert response.status_code == 500 # Internal Server Error + + # Verify service was called + mock_conversation_service.generate_conversation_title.assert_called_once() + finally: + client.app.dependency_overrides.clear() + + +def test_generate_conversation_title_invalid_params( + client, mock_conversation_service, sample_conversation_id +): + """Test generate_conversation_title endpoint with invalid parameters.""" + + client.app.dependency_overrides[get_conversation_service] = ( + lambda: mock_conversation_service + ) + + try: + # Test with max_length too low + request_data = {"max_length": 0} + response = client.post( + f"/api/conversations/{sample_conversation_id}/generate_title", + json=request_data, + ) + assert response.status_code == 422 # Validation error + + # Test with max_length too high + request_data = {"max_length": 201} + response = client.post( + f"/api/conversations/{sample_conversation_id}/generate_title", + json=request_data, + ) + assert response.status_code == 422 # Validation error + finally: + client.app.dependency_overrides.clear() diff --git a/tests/agent_server/test_conversation_service.py b/tests/agent_server/test_conversation_service.py index 8e500c8f9c..23f99c4ba9 100644 --- a/tests/agent_server/test_conversation_service.py +++ b/tests/agent_server/test_conversation_service.py @@ -18,7 +18,10 @@ ) from openhands.sdk import LLM, Agent from openhands.sdk.conversation.secret_source 
import SecretSource, StaticSecret -from openhands.sdk.conversation.state import AgentExecutionStatus, ConversationState +from openhands.sdk.conversation.state import ( + ConversationExecutionStatus, + ConversationState, +) from openhands.sdk.security.confirmation_policy import NeverConfirm from openhands.sdk.workspace import LocalWorkspace @@ -89,7 +92,7 @@ async def test_search_conversations_basic( id=sample_stored_conversation.id, agent=sample_stored_conversation.agent, workspace=sample_stored_conversation.workspace, - agent_status=AgentExecutionStatus.IDLE, + execution_status=ConversationExecutionStatus.IDLE, confirmation_policy=sample_stored_conversation.confirmation_policy, ) mock_service.get_state.return_value = mock_state @@ -101,7 +104,7 @@ async def test_search_conversations_basic( assert len(result.items) == 1 assert result.items[0].id == conversation_id - assert result.items[0].agent_status == AgentExecutionStatus.IDLE + assert result.items[0].execution_status == ConversationExecutionStatus.IDLE assert result.next_page_id is None @pytest.mark.asyncio @@ -111,9 +114,9 @@ async def test_search_conversations_status_filter(self, conversation_service): conversations = [] for i, status in enumerate( [ - AgentExecutionStatus.IDLE, - AgentExecutionStatus.RUNNING, - AgentExecutionStatus.FINISHED, + ConversationExecutionStatus.IDLE, + ConversationExecutionStatus.RUNNING, + ConversationExecutionStatus.FINISHED, ] ): stored_conv = StoredConversation( @@ -133,7 +136,7 @@ async def test_search_conversations_status_filter(self, conversation_service): id=stored_conv.id, agent=stored_conv.agent, workspace=stored_conv.workspace, - agent_status=status, + execution_status=status, confirmation_policy=stored_conv.confirmation_policy, ) mock_service.get_state.return_value = mock_state @@ -143,21 +146,21 @@ async def test_search_conversations_status_filter(self, conversation_service): # Test filtering by IDLE status result = await conversation_service.search_conversations( - agent_status=AgentExecutionStatus.IDLE + execution_status=ConversationExecutionStatus.IDLE ) assert len(result.items) == 1 - assert result.items[0].agent_status == AgentExecutionStatus.IDLE + assert result.items[0].execution_status == ConversationExecutionStatus.IDLE # Test filtering by RUNNING status result = await conversation_service.search_conversations( - agent_status=AgentExecutionStatus.RUNNING + execution_status=ConversationExecutionStatus.RUNNING ) assert len(result.items) == 1 - assert result.items[0].agent_status == AgentExecutionStatus.RUNNING + assert result.items[0].execution_status == ConversationExecutionStatus.RUNNING # Test filtering by non-existent status result = await conversation_service.search_conversations( - agent_status=AgentExecutionStatus.ERROR + execution_status=ConversationExecutionStatus.ERROR ) assert len(result.items) == 0 @@ -187,7 +190,7 @@ async def test_search_conversations_sorting(self, conversation_service): id=stored_conv.id, agent=stored_conv.agent, workspace=stored_conv.workspace, - agent_status=AgentExecutionStatus.IDLE, + execution_status=ConversationExecutionStatus.IDLE, confirmation_policy=stored_conv.confirmation_policy, ) mock_service.get_state.return_value = mock_state @@ -262,7 +265,7 @@ async def test_search_conversations_pagination(self, conversation_service): id=stored_conv.id, agent=stored_conv.agent, workspace=stored_conv.workspace, - agent_status=AgentExecutionStatus.IDLE, + execution_status=ConversationExecutionStatus.IDLE, confirmation_policy=stored_conv.confirmation_policy, 
) mock_service.get_state.return_value = mock_state @@ -297,19 +300,19 @@ async def test_search_conversations_combined_filter_and_sort( # Create conversations with mixed statuses and timestamps conversations_data = [ ( - AgentExecutionStatus.IDLE, + ConversationExecutionStatus.IDLE, datetime(2025, 1, 1, 12, 0, 0, tzinfo=UTC), ), ( - AgentExecutionStatus.RUNNING, + ConversationExecutionStatus.RUNNING, datetime(2025, 1, 2, 12, 0, 0, tzinfo=UTC), ), ( - AgentExecutionStatus.IDLE, + ConversationExecutionStatus.IDLE, datetime(2025, 1, 3, 12, 0, 0, tzinfo=UTC), ), ( - AgentExecutionStatus.FINISHED, + ConversationExecutionStatus.FINISHED, datetime(2025, 1, 4, 12, 0, 0, tzinfo=UTC), ), ] @@ -332,7 +335,7 @@ async def test_search_conversations_combined_filter_and_sort( id=stored_conv.id, agent=stored_conv.agent, workspace=stored_conv.workspace, - agent_status=status, + execution_status=status, confirmation_policy=stored_conv.confirmation_policy, ) mock_service.get_state.return_value = mock_state @@ -341,7 +344,7 @@ async def test_search_conversations_combined_filter_and_sort( # Filter by IDLE status and sort by CREATED_AT_DESC result = await conversation_service.search_conversations( - agent_status=AgentExecutionStatus.IDLE, + execution_status=ConversationExecutionStatus.IDLE, sort_order=ConversationSortOrder.CREATED_AT_DESC, ) @@ -360,7 +363,7 @@ async def test_search_conversations_invalid_page_id( id=sample_stored_conversation.id, agent=sample_stored_conversation.agent, workspace=sample_stored_conversation.workspace, - agent_status=AgentExecutionStatus.IDLE, + execution_status=ConversationExecutionStatus.IDLE, confirmation_policy=sample_stored_conversation.confirmation_policy, ) mock_service.get_state.return_value = mock_state @@ -409,7 +412,7 @@ async def test_count_conversations_basic( id=sample_stored_conversation.id, agent=sample_stored_conversation.agent, workspace=sample_stored_conversation.workspace, - agent_status=AgentExecutionStatus.IDLE, + execution_status=ConversationExecutionStatus.IDLE, confirmation_policy=sample_stored_conversation.confirmation_policy, ) mock_service.get_state.return_value = mock_state @@ -425,10 +428,10 @@ async def test_count_conversations_status_filter(self, conversation_service): """Test counting conversations with status filter.""" # Create multiple conversations with different statuses statuses = [ - AgentExecutionStatus.IDLE, - AgentExecutionStatus.RUNNING, - AgentExecutionStatus.FINISHED, - AgentExecutionStatus.IDLE, # Another IDLE one + ConversationExecutionStatus.IDLE, + ConversationExecutionStatus.RUNNING, + ConversationExecutionStatus.FINISHED, + ConversationExecutionStatus.IDLE, # Another IDLE one ] for i, status in enumerate(statuses): @@ -449,7 +452,7 @@ async def test_count_conversations_status_filter(self, conversation_service): id=stored_conv.id, agent=stored_conv.agent, workspace=stored_conv.workspace, - agent_status=status, + execution_status=status, confirmation_policy=stored_conv.confirmation_policy, ) mock_service.get_state.return_value = mock_state @@ -462,19 +465,19 @@ async def test_count_conversations_status_filter(self, conversation_service): # Test counting by IDLE status (should be 2) result = await conversation_service.count_conversations( - agent_status=AgentExecutionStatus.IDLE + execution_status=ConversationExecutionStatus.IDLE ) assert result == 2 # Test counting by RUNNING status (should be 1) result = await conversation_service.count_conversations( - agent_status=AgentExecutionStatus.RUNNING + 
execution_status=ConversationExecutionStatus.RUNNING ) assert result == 1 # Test counting by non-existent status (should be 0) result = await conversation_service.count_conversations( - agent_status=AgentExecutionStatus.ERROR + execution_status=ConversationExecutionStatus.ERROR ) assert result == 0 @@ -514,7 +517,7 @@ async def test_start_conversation_with_secrets(self, conversation_service): id=uuid4(), agent=request.agent, workspace=request.workspace, - agent_status=AgentExecutionStatus.IDLE, + execution_status=ConversationExecutionStatus.IDLE, confirmation_policy=request.confirmation_policy, ) mock_event_service.get_state.return_value = mock_state @@ -551,7 +554,7 @@ async def test_start_conversation_with_secrets(self, conversation_service): # Verify the result assert result.id == mock_state.id - assert result.agent_status == AgentExecutionStatus.IDLE + assert result.execution_status == ConversationExecutionStatus.IDLE @pytest.mark.asyncio async def test_start_conversation_without_secrets(self, conversation_service): @@ -576,7 +579,7 @@ async def test_start_conversation_without_secrets(self, conversation_service): id=uuid4(), agent=request.agent, workspace=request.workspace, - agent_status=AgentExecutionStatus.IDLE, + execution_status=ConversationExecutionStatus.IDLE, confirmation_policy=request.confirmation_policy, ) mock_event_service.get_state.return_value = mock_state @@ -603,7 +606,7 @@ async def test_start_conversation_without_secrets(self, conversation_service): # Verify the result assert result.id == mock_state.id - assert result.agent_status == AgentExecutionStatus.IDLE + assert result.execution_status == ConversationExecutionStatus.IDLE @pytest.mark.asyncio async def test_start_conversation_with_custom_id(self, conversation_service): @@ -670,7 +673,7 @@ async def test_update_conversation_success( id=sample_stored_conversation.id, agent=sample_stored_conversation.agent, workspace=sample_stored_conversation.workspace, - agent_status=AgentExecutionStatus.IDLE, + execution_status=ConversationExecutionStatus.IDLE, confirmation_policy=sample_stored_conversation.confirmation_policy, ) mock_service.get_state.return_value = mock_state @@ -701,7 +704,7 @@ async def test_update_conversation_strips_whitespace( id=sample_stored_conversation.id, agent=sample_stored_conversation.agent, workspace=sample_stored_conversation.workspace, - agent_status=AgentExecutionStatus.IDLE, + execution_status=ConversationExecutionStatus.IDLE, confirmation_policy=sample_stored_conversation.confirmation_policy, ) mock_service.get_state.return_value = mock_state @@ -753,7 +756,7 @@ async def test_update_conversation_notifies_webhooks( id=sample_stored_conversation.id, agent=sample_stored_conversation.agent, workspace=sample_stored_conversation.workspace, - agent_status=AgentExecutionStatus.IDLE, + execution_status=ConversationExecutionStatus.IDLE, confirmation_policy=sample_stored_conversation.confirmation_policy, ) mock_service.get_state.return_value = mock_state @@ -790,7 +793,7 @@ async def test_update_conversation_persists_changes( id=sample_stored_conversation.id, agent=sample_stored_conversation.agent, workspace=sample_stored_conversation.workspace, - agent_status=AgentExecutionStatus.IDLE, + execution_status=ConversationExecutionStatus.IDLE, confirmation_policy=sample_stored_conversation.confirmation_policy, ) mock_service.get_state.return_value = mock_state @@ -822,7 +825,7 @@ async def test_update_conversation_multiple_times( id=sample_stored_conversation.id, agent=sample_stored_conversation.agent, 
workspace=sample_stored_conversation.workspace, - agent_status=AgentExecutionStatus.IDLE, + execution_status=ConversationExecutionStatus.IDLE, confirmation_policy=sample_stored_conversation.confirmation_policy, ) mock_service.get_state.return_value = mock_state diff --git a/tests/agent_server/test_event_router.py b/tests/agent_server/test_event_router.py index 60bff117a6..9e8d3a3cb5 100644 --- a/tests/agent_server/test_event_router.py +++ b/tests/agent_server/test_event_router.py @@ -8,10 +8,12 @@ from fastapi import FastAPI from fastapi.testclient import TestClient +from openhands.agent_server.dependencies import get_event_service from openhands.agent_server.event_router import event_router from openhands.agent_server.event_service import EventService +from openhands.agent_server.models import SendMessageRequest from openhands.sdk import Message -from openhands.sdk.llm.message import TextContent +from openhands.sdk.llm.message import ImageContent, TextContent @pytest.fixture @@ -44,8 +46,6 @@ async def test_send_message_with_run_true( self, client, sample_conversation_id, mock_event_service ): """Test send_message endpoint with run=True.""" - from openhands.agent_server.dependencies import get_event_service - # Override the dependency to return our mock client.app.dependency_overrides[get_event_service] = lambda: mock_event_service @@ -83,8 +83,6 @@ async def test_send_message_with_run_false( self, client, sample_conversation_id, mock_event_service ): """Test send_message endpoint with run=False.""" - from openhands.agent_server.dependencies import get_event_service - # Override the dependency to return our mock client.app.dependency_overrides[get_event_service] = lambda: mock_event_service @@ -119,8 +117,6 @@ async def test_send_message_default_run_value( self, client, sample_conversation_id, mock_event_service ): """Test send_message endpoint with default run value.""" - from openhands.agent_server.dependencies import get_event_service - # Override the dependency to return our mock client.app.dependency_overrides[get_event_service] = lambda: mock_event_service @@ -157,8 +153,6 @@ async def test_send_message_conversation_not_found( """Test send_message endpoint when conversation is not found.""" from fastapi import HTTPException, status - from openhands.agent_server.dependencies import get_event_service - def raise_not_found(): raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, @@ -189,8 +183,6 @@ async def test_send_message_with_different_content_types( self, client, sample_conversation_id, mock_event_service ): """Test send_message endpoint with different content types.""" - from openhands.agent_server.dependencies import get_event_service - # Override the dependency to return our mock client.app.dependency_overrides[get_event_service] = lambda: mock_event_service @@ -234,8 +226,6 @@ async def test_send_message_with_system_role( self, client, sample_conversation_id, mock_event_service ): """Test send_message endpoint with system role.""" - from openhands.agent_server.dependencies import get_event_service - # Override the dependency to return our mock client.app.dependency_overrides[get_event_service] = lambda: mock_event_service @@ -270,8 +260,6 @@ async def test_send_message_invalid_request_data( self, client, sample_conversation_id ): """Test send_message endpoint with invalid request data.""" - from openhands.agent_server.dependencies import get_event_service - # Override the dependency (though it shouldn't be called for validation errors) 
client.app.dependency_overrides[get_event_service] = lambda: None @@ -306,3 +294,219 @@ async def test_send_message_invalid_request_data( finally: # Clean up the dependency override client.app.dependency_overrides.clear() + + def test_create_message(self): + content: list[TextContent | ImageContent] = [ + TextContent( + text="This is a message", + ) + ] + request = SendMessageRequest( + role="user", + content=content, + ) + message = request.create_message() + assert message.content == content + + +class TestSearchEventsEndpoint: + """Test cases for the search events endpoint with timestamp filtering.""" + + @pytest.mark.asyncio + async def test_search_events_with_naive_datetime( + self, client, sample_conversation_id, mock_event_service + ): + """Test search events with naive datetime (no timezone).""" + # Override the dependency to return our mock + client.app.dependency_overrides[get_event_service] = lambda: mock_event_service + + try: + # Mock the search_events method to return a sample result + mock_event_service.search_events = AsyncMock( + return_value={"items": [], "next_page_id": None} + ) + + # Test with naive datetime + response = client.get( + f"/api/conversations/{sample_conversation_id}/events/search", + params={ + "timestamp__gte": "2025-01-01T12:00:00", # Naive datetime string + "limit": 10, + }, + ) + + assert response.status_code == 200 + mock_event_service.search_events.assert_called_once() + # Verify that the datetime was normalized (converted to datetime object) + call_args = mock_event_service.search_events.call_args + # Check positional arguments: (page_id, limit, kind, sort_order, + # timestamp__gte, timestamp__lt) + assert len(call_args[0]) >= 6 # Should have at least 6 positional args + assert call_args[0][4] is not None # timestamp__gte should be normalized + assert call_args[0][5] is None # timestamp__lt should be None + finally: + # Clean up the dependency override + client.app.dependency_overrides.clear() + + @pytest.mark.asyncio + async def test_search_events_with_timezone_aware_datetime( + self, client, sample_conversation_id, mock_event_service + ): + """Test search events with timezone-aware datetime.""" + # Override the dependency to return our mock + client.app.dependency_overrides[get_event_service] = lambda: mock_event_service + + try: + # Mock the search_events method to return a sample result + mock_event_service.search_events = AsyncMock( + return_value={"items": [], "next_page_id": None} + ) + + # Test with timezone-aware datetime (UTC) + response = client.get( + f"/api/conversations/{sample_conversation_id}/events/search", + params={ + "timestamp__gte": "2025-01-01T12:00:00Z", # UTC timezone + "limit": 10, + }, + ) + + assert response.status_code == 200 + mock_event_service.search_events.assert_called_once() + # Verify that the datetime was normalized + call_args = mock_event_service.search_events.call_args + # Check positional arguments: (page_id, limit, kind, sort_order, + # timestamp__gte, timestamp__lt) + assert len(call_args[0]) >= 6 # Should have at least 6 positional args + assert call_args[0][4] is not None # timestamp__gte should be normalized + assert call_args[0][5] is None # timestamp__lt should be None + finally: + # Clean up the dependency override + client.app.dependency_overrides.clear() + + @pytest.mark.asyncio + async def test_search_events_with_timezone_range( + self, client, sample_conversation_id, mock_event_service + ): + """Test search events with both timestamp filters using + timezone-aware datetimes."""
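+ # NOTE: "normalized" in these tests refers to the endpoint converting the + # incoming timestamp to the server's internal representation before calling + # the service. A minimal sketch of the assumed behavior (the helper name is + # hypothetical; tz-aware values reduce to naive UTC, naive values pass + # through unchanged): + # + # def _normalize(dt: datetime) -> datetime: + # if dt.tzinfo is None: + # return dt + # return dt.astimezone(UTC).replace(tzinfo=None)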
+ # Override the dependency to return our mock + client.app.dependency_overrides[get_event_service] = lambda: mock_event_service + + try: + # Mock the search_events method to return a sample result + mock_event_service.search_events = AsyncMock( + return_value={"items": [], "next_page_id": None} + ) + + # Test with both timestamp filters using timezone-aware datetimes + response = client.get( + f"/api/conversations/{sample_conversation_id}/events/search", + params={ + "timestamp__gte": "2025-01-01T10:00:00+05:00", # UTC+5 + "timestamp__lt": "2025-01-01T14:00:00-08:00", # UTC-8 + "limit": 10, + }, + ) + + assert response.status_code == 200 + mock_event_service.search_events.assert_called_once() + # Verify that both datetimes were normalized + call_args = mock_event_service.search_events.call_args + # Check positional arguments: (page_id, limit, kind, sort_order, + # timestamp__gte, timestamp__lt) + assert len(call_args[0]) >= 6 # Should have at least 6 positional args + assert call_args[0][4] is not None # timestamp__gte should be normalized + assert call_args[0][5] is not None # timestamp__lt should be normalized + finally: + # Clean up the dependency override + client.app.dependency_overrides.clear() + + @pytest.mark.asyncio + async def test_count_events_with_timezone_aware_datetime( + self, client, sample_conversation_id, mock_event_service + ): + """Test count events with timezone-aware datetime.""" + # Override the dependency to return our mock + client.app.dependency_overrides[get_event_service] = lambda: mock_event_service + + try: + # Mock the count_events method to return a sample result + mock_event_service.count_events = AsyncMock(return_value=5) + + # Test with timezone-aware datetime + response = client.get( + f"/api/conversations/{sample_conversation_id}/events/count", + params={ + "timestamp__gte": "2025-01-01T12:00:00+02:00", # UTC+2 + }, + ) + + assert response.status_code == 200 + assert response.json() == 5 + mock_event_service.count_events.assert_called_once() + # Verify that the datetime was normalized + call_args = mock_event_service.count_events.call_args + # Check positional arguments: (kind, timestamp__gte, timestamp__lt) + assert len(call_args[0]) >= 3 # Should have at least 3 positional args + assert call_args[0][1] is not None # timestamp__gte should be normalized + assert call_args[0][2] is None # timestamp__lt should be None + finally: + # Clean up the dependency override + client.app.dependency_overrides.clear() + + @pytest.mark.asyncio + async def test_search_events_timezone_normalization_consistency( + self, client, sample_conversation_id, mock_event_service + ): + """Test that different timezone representations of the same moment + normalize consistently.""" + # Override the dependency to return our mock + client.app.dependency_overrides[get_event_service] = lambda: mock_event_service + + try: + # Mock the search_events method to return a sample result + mock_event_service.search_events = AsyncMock( + return_value={"items": [], "next_page_id": None} + ) + + # Test 1: UTC timezone + response1 = client.get( + f"/api/conversations/{sample_conversation_id}/events/search", + params={ + "timestamp__gte": "2025-01-01T12:00:00Z", # 12:00 UTC + "limit": 10, + }, + ) + + # Test 2: EST timezone (UTC-5) - same moment as 12:00 UTC + response2 = client.get( + f"/api/conversations/{sample_conversation_id}/events/search", + params={ + # 07:00 EST = 12:00 UTC + "timestamp__gte": "2025-01-01T07:00:00-05:00", + "limit": 10, + }, + ) + + assert response1.status_code == 200 + assert 
response2.status_code == 200 + + # Both calls should have been made + assert mock_event_service.search_events.call_count == 2 + + # Get the normalized datetimes from both calls + call1_args = mock_event_service.search_events.call_args_list[0] + call2_args = mock_event_service.search_events.call_args_list[1] + + # Both should normalize to the same server time + # Check positional arguments: (page_id, limit, kind, sort_order, + # timestamp__gte, timestamp__lt) + normalized_time1 = call1_args[0][4] # timestamp__gte from first call + normalized_time2 = call2_args[0][4] # timestamp__gte from second call + + # They should be the same after normalization + assert normalized_time1 == normalized_time2 + finally: + # Clean up the dependency override + client.app.dependency_overrides.clear() diff --git a/tests/agent_server/test_event_service.py b/tests/agent_server/test_event_service.py index 1f2ff9b2a2..c213e605e0 100644 --- a/tests/agent_server/test_event_service.py +++ b/tests/agent_server/test_event_service.py @@ -12,7 +12,10 @@ StoredConversation, ) from openhands.sdk import LLM, Agent, Conversation, Message -from openhands.sdk.conversation.state import AgentExecutionStatus, ConversationState +from openhands.sdk.conversation.state import ( + ConversationExecutionStatus, + ConversationState, +) from openhands.sdk.event.llm_convertible import MessageEvent from openhands.sdk.security.confirmation_policy import NeverConfirm from openhands.sdk.workspace import LocalWorkspace @@ -65,6 +68,40 @@ def mock_conversation_with_events(): return conversation +@pytest.fixture +def mock_conversation_with_timestamped_events(): + """Create a mock conversation with events having specific timestamps for testing.""" + conversation = MagicMock(spec=Conversation) + state = MagicMock(spec=ConversationState) + + # Create events with specific ISO format timestamps + # These timestamps are in chronological order + timestamps = [ + "2025-01-01T10:00:00.000000", + "2025-01-01T11:00:00.000000", + "2025-01-01T12:00:00.000000", + "2025-01-01T13:00:00.000000", + "2025-01-01T14:00:00.000000", + ] + + events = [] + for index, timestamp in enumerate(timestamps, 1): + event = MessageEvent( + id=f"event{index}", + source="user", + llm_message=Message(role="user"), + timestamp=timestamp, + ) + events.append(event) + + state.events = events + state.__enter__ = MagicMock(return_value=state) + state.__exit__ = MagicMock(return_value=None) + conversation._state = state + + return conversation + + class TestEventServiceSearchEvents: """Test cases for EventService.search_events method.""" @@ -286,6 +323,115 @@ async def test_search_events_exact_pagination_boundary(self, event_service): assert len(result.items) == 3 assert result.next_page_id is None # No more events available + @pytest.mark.asyncio + async def test_search_events_timestamp_gte_filter( + self, event_service, mock_conversation_with_timestamped_events + ): + """Test filtering events with timestamp__gte (greater than or equal).""" + event_service._conversation = mock_conversation_with_timestamped_events + + # Filter events >= 12:00:00 (should return events 3, 4, 5) + filter_time = datetime(2025, 1, 1, 12, 0, 0) + result = await event_service.search_events(timestamp__gte=filter_time) + + assert len(result.items) == 3 + assert result.items[0].id == "event3" + assert result.items[1].id == "event4" + assert result.items[2].id == "event5" + # All returned events should have timestamp >= filter value + filter_iso = filter_time.isoformat() + for event in result.items: + assert 
event.timestamp >= filter_iso + + @pytest.mark.asyncio + async def test_search_events_timestamp_lt_filter( + self, event_service, mock_conversation_with_timestamped_events + ): + """Test filtering events with timestamp__lt (less than).""" + event_service._conversation = mock_conversation_with_timestamped_events + + # Filter events < 13:00:00 (should return events 1, 2, 3) + filter_time = datetime(2025, 1, 1, 13, 0, 0) + result = await event_service.search_events(timestamp__lt=filter_time) + + assert len(result.items) == 3 + assert result.items[0].id == "event1" + assert result.items[1].id == "event2" + assert result.items[2].id == "event3" + # All returned events should have timestamp < filter value + filter_iso = filter_time.isoformat() + for event in result.items: + assert event.timestamp < filter_iso + + @pytest.mark.asyncio + async def test_search_events_timestamp_range_filter( + self, event_service, mock_conversation_with_timestamped_events + ): + """Test filtering events with both timestamp__gte and timestamp__lt.""" + event_service._conversation = mock_conversation_with_timestamped_events + + # Filter events between 11:00:00 and 13:00:00 (should return events 2, 3) + gte_time = datetime(2025, 1, 1, 11, 0, 0) + lt_time = datetime(2025, 1, 1, 13, 0, 0) + result = await event_service.search_events( + timestamp__gte=gte_time, timestamp__lt=lt_time + ) + + assert len(result.items) == 2 + assert result.items[0].id == "event2" + assert result.items[1].id == "event3" + # All returned events should be within the range + gte_iso = gte_time.isoformat() + lt_iso = lt_time.isoformat() + for event in result.items: + assert event.timestamp >= gte_iso + assert event.timestamp < lt_iso + + @pytest.mark.asyncio + async def test_search_events_timestamp_filter_with_timezone_aware( + self, event_service, mock_conversation_with_timestamped_events + ): + """Test filtering events with timezone-aware datetime.""" + event_service._conversation = mock_conversation_with_timestamped_events + + # Filter events >= 12:00:00 UTC (should return events 3, 4, 5) + filter_time = datetime(2025, 1, 1, 12, 0, 0, tzinfo=UTC) + result = await event_service.search_events(timestamp__gte=filter_time) + + assert len(result.items) == 3 + assert result.items[0].id == "event3" + assert result.items[1].id == "event4" + assert result.items[2].id == "event5" + + @pytest.mark.asyncio + async def test_search_events_timestamp_filter_no_matches( + self, event_service, mock_conversation_with_timestamped_events + ): + """Test filtering events with timestamps that don't match any events.""" + event_service._conversation = mock_conversation_with_timestamped_events + + # Filter events >= 15:00:00 (should return no events) + filter_time = datetime(2025, 1, 1, 15, 0, 0) + result = await event_service.search_events(timestamp__gte=filter_time) + + assert len(result.items) == 0 + assert result.next_page_id is None + + @pytest.mark.asyncio + async def test_search_events_timestamp_filter_all_events( + self, event_service, mock_conversation_with_timestamped_events + ): + """Test filtering events with timestamps that include all events.""" + event_service._conversation = mock_conversation_with_timestamped_events + + # Filter events >= 09:00:00 (should return all events) + filter_time = datetime(2025, 1, 1, 9, 0, 0) + result = await event_service.search_events(timestamp__gte=filter_time) + + assert len(result.items) == 5 + assert result.items[0].id == "event1" + assert result.items[4].id == "event5" + class TestEventServiceCountEvents: """Test cases 
for EventService.count_events method.""" @@ -344,6 +490,81 @@ async def test_count_events_kind_filter( result = await event_service.count_events(kind="NonExistentEvent") assert result == 0 + @pytest.mark.asyncio + async def test_count_events_timestamp_gte_filter( + self, event_service, mock_conversation_with_timestamped_events + ): + """Test counting events with timestamp__gte filter.""" + event_service._conversation = mock_conversation_with_timestamped_events + + # Count events >= 12:00:00 (should return 3) + filter_time = datetime(2025, 1, 1, 12, 0, 0) + result = await event_service.count_events(timestamp__gte=filter_time) + assert result == 3 + + @pytest.mark.asyncio + async def test_count_events_timestamp_lt_filter( + self, event_service, mock_conversation_with_timestamped_events + ): + """Test counting events with timestamp__lt filter.""" + event_service._conversation = mock_conversation_with_timestamped_events + + # Count events < 13:00:00 (should return 3) + filter_time = datetime(2025, 1, 1, 13, 0, 0) + result = await event_service.count_events(timestamp__lt=filter_time) + assert result == 3 + + @pytest.mark.asyncio + async def test_count_events_timestamp_range_filter( + self, event_service, mock_conversation_with_timestamped_events + ): + """Test counting events with both timestamp filters.""" + event_service._conversation = mock_conversation_with_timestamped_events + + # Count events between 11:00:00 and 13:00:00 (should return 2) + gte_time = datetime(2025, 1, 1, 11, 0, 0) + lt_time = datetime(2025, 1, 1, 13, 0, 0) + result = await event_service.count_events( + timestamp__gte=gte_time, timestamp__lt=lt_time + ) + assert result == 2 + + @pytest.mark.asyncio + async def test_count_events_timestamp_filter_with_timezone_aware( + self, event_service, mock_conversation_with_timestamped_events + ): + """Test counting events with timezone-aware datetime.""" + event_service._conversation = mock_conversation_with_timestamped_events + + # Count events >= 12:00:00 UTC (should return 3) + filter_time = datetime(2025, 1, 1, 12, 0, 0, tzinfo=UTC) + result = await event_service.count_events(timestamp__gte=filter_time) + assert result == 3 + + @pytest.mark.asyncio + async def test_count_events_timestamp_filter_no_matches( + self, event_service, mock_conversation_with_timestamped_events + ): + """Test counting events with timestamps that don't match any events.""" + event_service._conversation = mock_conversation_with_timestamped_events + + # Count events >= 15:00:00 (should return 0) + filter_time = datetime(2025, 1, 1, 15, 0, 0) + result = await event_service.count_events(timestamp__gte=filter_time) + assert result == 0 + + @pytest.mark.asyncio + async def test_count_events_timestamp_filter_all_events( + self, event_service, mock_conversation_with_timestamped_events + ): + """Test counting events with timestamps that include all events.""" + event_service._conversation = mock_conversation_with_timestamped_events + + # Count events >= 09:00:00 (should return 5) + filter_time = datetime(2025, 1, 1, 9, 0, 0) + result = await event_service.count_events(timestamp__gte=filter_time) + assert result == 5 + class TestEventServiceSendMessage: """Test cases for EventService.send_message method.""" @@ -367,7 +588,7 @@ async def test_send_message_with_run_false_default(self, event_service): # Mock conversation and its methods conversation = MagicMock() state = MagicMock() - state.agent_status = AgentExecutionStatus.IDLE + state.execution_status = ConversationExecutionStatus.IDLE state.__enter__ = 
MagicMock(return_value=state) state.__exit__ = MagicMock(return_value=None) conversation.state = state @@ -431,7 +652,7 @@ async def test_send_message_with_run_true_agent_already_running( # Mock conversation and its methods conversation = MagicMock() state = MagicMock() - state.agent_status = AgentExecutionStatus.RUNNING + state.execution_status = ConversationExecutionStatus.RUNNING state.__enter__ = MagicMock(return_value=state) state.__exit__ = MagicMock(return_value=None) conversation.state = state @@ -463,7 +684,7 @@ async def test_send_message_with_run_true_agent_idle(self, event_service): # Mock conversation and its methods conversation = MagicMock() state = MagicMock() - state.agent_status = AgentExecutionStatus.IDLE + state.execution_status = ConversationExecutionStatus.IDLE state.__enter__ = MagicMock(return_value=state) state.__exit__ = MagicMock(return_value=None) conversation.state = state diff --git a/tests/agent_server/test_webhook_subscriber.py b/tests/agent_server/test_webhook_subscriber.py index 43b61844dd..b3b45747e0 100644 --- a/tests/agent_server/test_webhook_subscriber.py +++ b/tests/agent_server/test_webhook_subscriber.py @@ -992,7 +992,7 @@ async def test_post_conversation_info_success( ConversationWebhookSubscriber, ) from openhands.agent_server.models import ConversationInfo - from openhands.sdk.conversation.state import AgentExecutionStatus + from openhands.sdk.conversation.state import ConversationExecutionStatus # Setup mock client mock_client = AsyncMock() @@ -1012,7 +1012,7 @@ async def test_post_conversation_info_success( workspace=mock_event_service.stored.workspace, created_at=utc_now(), updated_at=utc_now(), - agent_status=AgentExecutionStatus.RUNNING, + execution_status=ConversationExecutionStatus.RUNNING, ) await subscriber.post_conversation_info(conversation_info) @@ -1039,7 +1039,7 @@ async def test_post_conversation_info_with_session_api_key( ConversationWebhookSubscriber, ) from openhands.agent_server.models import ConversationInfo - from openhands.sdk.conversation.state import AgentExecutionStatus + from openhands.sdk.conversation.state import ConversationExecutionStatus # Setup mock client mock_client = AsyncMock() @@ -1060,7 +1060,7 @@ async def test_post_conversation_info_with_session_api_key( workspace=mock_event_service.stored.workspace, created_at=utc_now(), updated_at=utc_now(), - agent_status=AgentExecutionStatus.PAUSED, + execution_status=ConversationExecutionStatus.PAUSED, ) await subscriber.post_conversation_info(conversation_info) @@ -1088,7 +1088,7 @@ async def test_post_conversation_info_http_error_with_retries( ConversationWebhookSubscriber, ) from openhands.agent_server.models import ConversationInfo - from openhands.sdk.conversation.state import AgentExecutionStatus + from openhands.sdk.conversation.state import ConversationExecutionStatus subscriber = ConversationWebhookSubscriber( spec=webhook_spec, @@ -1101,7 +1101,7 @@ async def test_post_conversation_info_http_error_with_retries( workspace=mock_event_service.stored.workspace, created_at=utc_now(), updated_at=utc_now(), - agent_status=AgentExecutionStatus.FINISHED, + execution_status=ConversationExecutionStatus.FINISHED, ) # Track retry attempts diff --git a/tests/conftest.py b/tests/conftest.py index 654dc755fd..9d19019b61 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,17 @@ """Common test fixtures and utilities.""" +import uuid from unittest.mock import MagicMock import pytest from pydantic import SecretStr +from openhands.sdk import Agent +from 
openhands.sdk.conversation.state import ConversationState +from openhands.sdk.io import InMemoryFileStore from openhands.sdk.llm import LLM from openhands.sdk.tool import ToolExecutor +from openhands.sdk.workspace import LocalWorkspace @pytest.fixture @@ -22,6 +27,26 @@ def mock_llm(): ) +@pytest.fixture +def mock_conversation_state(mock_llm, tmp_path): + """Create a standard mock ConversationState for testing.""" + agent = Agent(llm=mock_llm) + workspace = LocalWorkspace(working_dir=str(tmp_path)) + + state = ConversationState( + id=uuid.uuid4(), + workspace=workspace, + persistence_dir=str(tmp_path / ".state"), + agent=agent, + ) + + # Set up filestore for state persistence + state._fs = InMemoryFileStore() + state._autosave_enabled = False + + return state + + @pytest.fixture def mock_tool(): """Create a mock tool for testing.""" diff --git a/tests/cross/test_agent_secrets_integration.py b/tests/cross/test_agent_secrets_integration.py index 56388126c6..3d9fdcf3c9 100644 --- a/tests/cross/test_agent_secrets_integration.py +++ b/tests/cross/test_agent_secrets_integration.py @@ -46,7 +46,7 @@ def conversation(agent: Agent, tmp_path) -> LocalConversation: @pytest.fixture def bash_executor(conversation: LocalConversation) -> BashExecutor: tools_map = conversation.agent.tools_map - bash_tool = tools_map["execute_bash"] + bash_tool = tools_map["bash"] return cast(BashExecutor, bash_tool.executor) @@ -73,17 +73,17 @@ def test_agent_configures_bash_tools_env_provider( ) # Get the bash tool from agent - bash_tool = agent.tools_map["execute_bash"] + bash_tool = agent.tools_map["bash"] assert bash_tool is not None assert bash_tool.executor is not None # Test that secrets are accessible via conversation - secrets_manager = conversation.state.secrets_manager - env_vars = secrets_manager.get_secrets_as_env_vars("echo $API_KEY") + secret_registry = conversation.state.secret_registry + env_vars = secret_registry.get_secrets_as_env_vars("echo $API_KEY") assert env_vars == {"API_KEY": "test-api-key"} - env_vars = secrets_manager.get_secrets_as_env_vars("echo $NOT_A_KEY") + env_vars = secret_registry.get_secrets_as_env_vars("echo $NOT_A_KEY") assert env_vars == {} @@ -104,8 +104,8 @@ def get_value(self): } ) - secrets_manager = conversation.state.secrets_manager - env_vars = secrets_manager.get_secrets_as_env_vars( + secret_registry = conversation.state.secret_registry + env_vars = secret_registry.get_secrets_as_env_vars( "export DYNAMIC_TOKEN=$DYNAMIC_TOKEN" ) assert env_vars == {"DYNAMIC_TOKEN": "dynamic-token-123"} @@ -128,16 +128,16 @@ def get_value(self): } ) - secrets_manager = conversation.state.secrets_manager + secret_registry = conversation.state.secret_registry # Should not raise exception, should return empty dict - env_vars = secrets_manager.get_secrets_as_env_vars( + env_vars = secret_registry.get_secrets_as_env_vars( "export FAILING_KEY=$FAILING_KEY" ) assert env_vars == {} # Working key should still work - env_vars = secrets_manager.get_secrets_as_env_vars( + env_vars = secret_registry.get_secrets_as_env_vars( "export WORKING_KEY=$WORKING_KEY" ) assert env_vars == {"WORKING_KEY": "working-value"} @@ -151,8 +151,8 @@ def test_agent_env_provider_no_matches( conversation.update_secrets({"API_KEY": "test-value"}) # Test secrets manager with command that doesn't reference secrets - secrets_manager = conversation.state.secrets_manager - env_vars = secrets_manager.get_secrets_as_env_vars("echo hello world") + secret_registry = conversation.state.secret_registry + env_vars = 
secret_registry.get_secrets_as_env_vars("echo hello world") assert env_vars == {} @@ -186,31 +186,31 @@ def test_agent_secrets_integration_workflow( } ) - secrets_manager = conversation.state.secrets_manager + secret_registry = conversation.state.secret_registry # Single secret - env_vars = secrets_manager.get_secrets_as_env_vars( + env_vars = secret_registry.get_secrets_as_env_vars( "curl -H 'X-API-Key: $API_KEY'" ) assert env_vars == {"API_KEY": "static-api-key-123"} # Multiple secrets command = "export API_KEY=$API_KEY && export AUTH_TOKEN=$AUTH_TOKEN" - env_vars = secrets_manager.get_secrets_as_env_vars(command) + env_vars = secret_registry.get_secrets_as_env_vars(command) assert env_vars == { "API_KEY": "static-api-key-123", "AUTH_TOKEN": "bearer-token-456", } # No secrets referenced - env_vars = secrets_manager.get_secrets_as_env_vars("echo hello world") + env_vars = secret_registry.get_secrets_as_env_vars("echo hello world") assert env_vars == {} # Step 5: Update secrets and verify changes propagate conversation.update_secrets({"API_KEY": "updated-api-key-789"}) - secrets_manager = conversation.state.secrets_manager - env_vars = secrets_manager.get_secrets_as_env_vars("curl -H 'X-API-Key: $API_KEY'") + secret_registry = conversation.state.secret_registry + env_vars = secret_registry.get_secrets_as_env_vars("curl -H 'X-API-Key: $API_KEY'") assert env_vars == {"API_KEY": "updated-api-key-789"} diff --git a/tests/cross/test_automatic_naming.py b/tests/cross/test_automatic_naming.py new file mode 100644 index 0000000000..3878fa3a54 --- /dev/null +++ b/tests/cross/test_automatic_naming.py @@ -0,0 +1,60 @@ +"""Test automatic tool naming functionality.""" + + +def test_camel_to_snake_conversion(): + """Test the _camel_to_snake utility function.""" + from openhands.sdk.tool.tool import _camel_to_snake + + # Test basic conversions + assert _camel_to_snake("BashTool") == "bash_tool" + assert _camel_to_snake("FileEditorTool") == "file_editor_tool" + assert _camel_to_snake("GrepTool") == "grep_tool" + assert _camel_to_snake("PlanningFileEditorTool") == "planning_file_editor_tool" + assert _camel_to_snake("BrowserToolSet") == "browser_tool_set" + assert _camel_to_snake("TaskTrackerTool") == "task_tracker_tool" + assert _camel_to_snake("GlobTool") == "glob_tool" + + # Test edge cases + assert _camel_to_snake("Tool") == "tool" + assert _camel_to_snake("A") == "a" + assert _camel_to_snake("AB") == "ab" # All uppercase, no separation needed + assert _camel_to_snake("ABC") == "abc" # All uppercase, no separation needed + assert _camel_to_snake("XMLParser") == "xml_parser" + assert _camel_to_snake("HTTPClient") == "http_client" + + +def test_real_tools_have_correct_names(): + """Test that real tools have the expected automatic names.""" + from openhands.tools.execute_bash import BashTool + from openhands.tools.file_editor import FileEditorTool + from openhands.tools.glob import GlobTool + from openhands.tools.grep import GrepTool + from openhands.tools.planning_file_editor import PlanningFileEditorTool + from openhands.tools.task_tracker import TaskTrackerTool + + # Verify all tools have correct automatic names + assert BashTool.name == "bash" + assert FileEditorTool.name == "file_editor" + assert GrepTool.name == "grep" + assert PlanningFileEditorTool.name == "planning_file_editor" + assert TaskTrackerTool.name == "task_tracker" + assert GlobTool.name == "glob" + + +def test_tool_name_consistency(): + """Test that tool names are consistent across imports.""" + # Import the same tool multiple times to 
ensure consistency + from openhands.tools.execute_bash import ( + BashTool as BashTool1, + BashTool as BashTool2, + ) + + assert BashTool1.name == BashTool2.name == "bash" + + # Test with different tools + from openhands.tools.file_editor import FileEditorTool + from openhands.tools.grep import GrepTool + + assert FileEditorTool.name == "file_editor" + assert GrepTool.name == "grep" + assert FileEditorTool.name != GrepTool.name diff --git a/tests/cross/test_automatic_registration.py b/tests/cross/test_automatic_registration.py new file mode 100644 index 0000000000..a079e89b02 --- /dev/null +++ b/tests/cross/test_automatic_registration.py @@ -0,0 +1,108 @@ +"""Test automatic tool registration functionality.""" + +from openhands.sdk.tool.registry import list_registered_tools + + +def test_bash_tool_automatic_registration(): + """Test that BashTool is automatically registered when imported.""" + # Import the module to trigger registration + import openhands.tools.execute_bash.definition # noqa: F401 + + # Check that the tool is registered with snake_case name + registered_tools = list_registered_tools() + assert "bash" in registered_tools + + +def test_file_editor_tool_automatic_registration(): + """Test that FileEditorTool is automatically registered when imported.""" + # Import the module to trigger registration + import openhands.tools.file_editor.definition # noqa: F401 + + # Check that the tool is registered with snake_case name + registered_tools = list_registered_tools() + assert "file_editor" in registered_tools + + +def test_task_tracker_tool_automatic_registration(): + """Test that TaskTrackerTool is automatically registered when imported.""" + # Import the module to trigger registration + import openhands.tools.task_tracker.definition # noqa: F401 + + # Check that the tool is registered with snake_case name + registered_tools = list_registered_tools() + assert "task_tracker" in registered_tools + + +def test_browser_tool_automatic_registration(): + """Test that BrowserToolSet is automatically registered when imported.""" + # Import the module to trigger registration + import openhands.tools.browser_use.definition # noqa: F401 + + # Check that the tool is registered with snake_case name + registered_tools = list_registered_tools() + assert "browser_tool_set" in registered_tools + + +def test_grep_tool_automatic_registration(): + """Test that GrepTool is automatically registered when imported.""" + # Import the module to trigger registration + import openhands.tools.grep.definition # noqa: F401 + + # Check that the tool is registered with snake_case name + registered_tools = list_registered_tools() + assert "grep" in registered_tools + + +def test_glob_tool_automatic_registration(): + """Test that GlobTool is automatically registered when imported.""" + # Import the module to trigger registration + import openhands.tools.glob.definition # noqa: F401 + + # Check that the tool is registered with snake_case name + registered_tools = list_registered_tools() + assert "glob" in registered_tools + + +def test_planning_file_editor_tool_automatic_registration(): + """Test that PlanningFileEditorTool is automatically registered when imported.""" + # Import the module to trigger registration + import openhands.tools.planning_file_editor.definition # noqa: F401 + + # Check that the tool is registered with snake_case name + registered_tools = list_registered_tools() + assert "planning_file_editor" in registered_tools + + +def test_import_from_init_triggers_registration(): + """Test that importing 
from __init__.py also triggers registration.""" + # Import from the __init__.py file + from openhands.tools.execute_bash import BashTool # noqa: F401 + + # Check that the tool is registered with snake_case name + registered_tools = list_registered_tools() + assert "bash" in registered_tools + + +def test_tool_can_be_resolved_after_automatic_registration(): + """Test that automatically registered tools can be resolved and used.""" + from unittest.mock import MagicMock + + # Import to trigger registration + import openhands.tools.execute_bash.definition # noqa: F401 + from openhands.sdk.conversation.state import ConversationState + from openhands.sdk.tool.registry import resolve_tool + from openhands.sdk.tool.spec import Tool + + # Create a mock conversation state + mock_conv_state = MagicMock(spec=ConversationState) + mock_workspace = MagicMock() + mock_workspace.working_dir = "/tmp" + mock_conv_state.workspace = mock_workspace + + # Try to resolve the tool using snake_case name + tool_spec = Tool(name="bash") + resolved_tools = resolve_tool(tool_spec, mock_conv_state) + + # Should successfully resolve + assert len(resolved_tools) == 1 + assert resolved_tools[0].name == "bash" diff --git a/tests/cross/test_hello_world.py b/tests/cross/test_hello_world.py index 5540b583ed..4f0960aa1c 100644 --- a/tests/cross/test_hello_world.py +++ b/tests/cross/test_hello_world.py @@ -111,7 +111,7 @@ def create_mock_llm_responses(self): "id": "call_1", "type": "function", "function": { - "name": "str_replace_editor", + "name": "file_editor", "arguments": f'{{"command": "create", ' f'"path": "{hello_path}", ' f'"file_text": "print(\\"Hello, World!\\")"}}', @@ -165,11 +165,11 @@ def test_hello_world_with_real_llm_data(self, mock_completion, fncall_raw_logs): ) # Tools setup with temporary directory - use registry + Tool as in runtime - register_tool("BashTool", BashTool) - register_tool("FileEditorTool", FileEditorTool) + register_tool("bash", BashTool) + register_tool("file_editor", FileEditorTool) tools = [ - Tool(name="BashTool"), - Tool(name="FileEditorTool"), + Tool(name="bash"), + Tool(name="file_editor"), ] # Agent setup @@ -285,11 +285,11 @@ def test_llm_completion_logging_fidelity(self, mock_completion, fncall_raw_logs) ) # Tools setup with temporary directory - use registry + Tool as in runtime - register_tool("BashTool", BashTool) - register_tool("FileEditorTool", FileEditorTool) + register_tool("bash", BashTool) + register_tool("file_editor", FileEditorTool) tools = [ - Tool(name="BashTool"), - Tool(name="FileEditorTool"), + Tool(name="bash"), + Tool(name="file_editor"), ] # Create agent and conversation diff --git a/tests/cross/test_remote_conversation_live_server.py b/tests/cross/test_remote_conversation_live_server.py index 093879c36e..8d0b1e797a 100644 --- a/tests/cross/test_remote_conversation_live_server.py +++ b/tests/cross/test_remote_conversation_live_server.py @@ -209,7 +209,7 @@ def test_remote_conversation_over_real_server(server_env, patched_llm): # Validate state transitions and that we received an assistant message state = conv.state - assert state.agent_status.value in {"finished", "idle", "running"} + assert state.execution_status.value in {"finished", "idle", "running"} # Wait for WS-delivered events and validate them using proper type checking found_system_prompt = False @@ -367,3 +367,62 @@ def test_bash_command_endpoint_with_live_server(server_env): assert "8" in result.stdout, ( f"Expected '8' (result of 5+3) not found in stdout: {result.stdout}" ) + + +def 
test_file_upload_endpoint_with_live_server(server_env, tmp_path: Path): + """Integration test for file upload through live server. + + This test validates that the /api/file/upload/{path} endpoint works + correctly end-to-end by: + 1. Starting a real FastAPI server with file upload endpoints + 2. Creating a RemoteWorkspace pointing to that server + 3. Creating a test file and uploading it + 4. Verifying the file was uploaded to the correct location with correct content + + This is a regression test for the file upload issue where the client was + calling /api/file/upload (without the path parameter) instead of + /api/file/upload/{path} as the server expects. + """ + # Create a RemoteWorkspace pointing to the live server + workspace = RemoteWorkspace( + host=server_env["host"], working_dir="/tmp/test_workspace" + ) + + # Create a test file to upload + test_file = tmp_path / "test_upload.txt" + test_content = "Hello from file upload test!\nThis is line 2.\n" + test_file.write_text(test_content) + + # Define the destination path (must be absolute for the server) + destination = "/tmp/test_workspace/uploaded_file.txt" + + # Upload the file + result = workspace.file_upload(str(test_file), destination) + + # Verify the upload was successful + assert result.success is True, ( + f"File upload failed. Error: {result.error}, " + f"Source: {result.source_path}, Destination: {result.destination_path}" + ) + assert result.source_path == str(test_file), ( + f"Expected source_path to be {test_file}, got {result.source_path}" + ) + assert result.destination_path == destination, ( + f"Expected destination_path to be {destination}, got {result.destination_path}" + ) + + # Verify the file exists at the destination with correct content + # Use bash command to check file existence and read content + check_cmd = f"test -f {destination} && cat {destination}" + check_result = workspace.execute_command(check_cmd, timeout=5.0) + + assert check_result.exit_code == 0, ( + f"File does not exist at destination or could not be read. " + f"Exit code: {check_result.exit_code}, " + f"stderr: {check_result.stderr}" + ) + + # Verify the content matches what we uploaded + assert check_result.stdout == test_content, ( + f"File content mismatch. 
Expected:\n{test_content}\nGot:\n{check_result.stdout}" + ) diff --git a/tests/cross/test_stuck_detector.py b/tests/cross/test_stuck_detector.py index 06ae4ae0fd..8ce47b9c09 100644 --- a/tests/cross/test_stuck_detector.py +++ b/tests/cross/test_stuck_detector.py @@ -44,11 +44,11 @@ def test_history_too_short(): source="agent", thought=[TextContent(text="I need to run ls command")], action=ExecuteBashAction(command="ls"), - tool_name="execute_bash", + tool_name="bash", tool_call_id="call_1", tool_call=MessageToolCall( id="call_1", - name="execute_bash", + name="bash", arguments='{"command": "ls"}', origin="completion", ), @@ -62,7 +62,7 @@ def test_history_too_short(): output="file1.txt\nfile2.txt", command="ls", exit_code=0 ), action_id=action.id, - tool_name="execute_bash", + tool_name="bash", tool_call_id="call_1", ) state.events.append(observation) @@ -93,11 +93,11 @@ def test_repeating_action_observation_not_stuck_less_than_4_repeats(): source="agent", thought=[TextContent(text="I need to run ls command")], action=ExecuteBashAction(command="ls"), - tool_name="execute_bash", + tool_name="bash", tool_call_id=f"call_{i}", tool_call=MessageToolCall( id=f"call_{i}", - name="execute_bash", + name="bash", arguments='{"command": "ls"}', origin="completion", ), @@ -111,7 +111,7 @@ def test_repeating_action_observation_not_stuck_less_than_4_repeats(): output="file1.txt\nfile2.txt", command="ls", exit_code=0 ), action_id=action.id, - tool_name="execute_bash", + tool_name="bash", tool_call_id=f"call_{i}", ) state.events.append(observation) @@ -142,11 +142,11 @@ def test_repeating_action_observation_stuck(): source="agent", thought=[TextContent(text="I need to run ls command")], action=ExecuteBashAction(command="ls"), - tool_name="execute_bash", + tool_name="bash", tool_call_id=f"call_{i}", tool_call=MessageToolCall( id=f"call_{i}", - name="execute_bash", + name="bash", arguments='{"command": "ls"}', origin="completion", ), @@ -160,7 +160,7 @@ def test_repeating_action_observation_stuck(): output="file1.txt\nfile2.txt", command="ls", exit_code=0 ), action_id=action.id, - tool_name="execute_bash", + tool_name="bash", tool_call_id=f"call_{i}", ) state.events.append(observation) @@ -192,11 +192,11 @@ def create_action_and_error(i): source="agent", thought=[TextContent(text="I need to run invalid_command")], action=ExecuteBashAction(command="invalid_command"), - tool_name="execute_bash", + tool_name="bash", tool_call_id=f"call_{i}", tool_call=MessageToolCall( id=f"call_{i}", - name="execute_bash", + name="bash", arguments='{"command": "invalid_command"}', origin="completion", ), @@ -283,11 +283,11 @@ def test_not_stuck_with_different_actions(): source="agent", thought=[TextContent(text=f"I need to run {cmd} command")], action=ExecuteBashAction(command=cmd), - tool_name="execute_bash", + tool_name="bash", tool_call_id=f"call_{i}", tool_call=MessageToolCall( id=f"call_{i}", - name="execute_bash", + name="bash", arguments=f'{{"command": "{cmd}"}}', origin="completion", ), @@ -301,7 +301,7 @@ def test_not_stuck_with_different_actions(): output=f"output from {cmd}", command=cmd, exit_code=0 ), action_id=action.id, - tool_name="execute_bash", + tool_name="bash", tool_call_id=f"call_{i}", ) state.events.append(observation) @@ -332,11 +332,11 @@ def test_reset_after_user_message(): source="agent", thought=[TextContent(text="I need to run ls command")], action=ExecuteBashAction(command="ls"), - tool_name="execute_bash", + tool_name="bash", tool_call_id=f"call_{i}", tool_call=MessageToolCall( id=f"call_{i}", - 
name="execute_bash", + name="bash", arguments='{"command": "ls"}', origin="completion", ), @@ -350,7 +350,7 @@ def test_reset_after_user_message(): output="file1.txt\nfile2.txt", command="ls", exit_code=0 ), action_id=action.id, - tool_name="execute_bash", + tool_name="bash", tool_call_id=f"call_{i}", ) state.events.append(observation) @@ -375,11 +375,11 @@ def test_reset_after_user_message(): source="agent", thought=[TextContent(text="I'll try pwd command")], action=ExecuteBashAction(command="pwd"), - tool_name="execute_bash", + tool_name="bash", tool_call_id="call_new", tool_call=MessageToolCall( id="call_new", - name="execute_bash", + name="bash", arguments='{"command": "pwd"}', origin="completion", ), @@ -393,7 +393,7 @@ def test_reset_after_user_message(): output="/home/user", command="pwd", exit_code=0 ), action_id=action.id, - tool_name="execute_bash", + tool_name="bash", tool_call_id="call_new", ) state.events.append(observation) diff --git a/tests/sdk/agent/test_agent_context_window_condensation.py b/tests/sdk/agent/test_agent_context_window_condensation.py new file mode 100644 index 0000000000..8711998b71 --- /dev/null +++ b/tests/sdk/agent/test_agent_context_window_condensation.py @@ -0,0 +1,64 @@ +import pytest +from pydantic import PrivateAttr + +from openhands.sdk.agent import Agent +from openhands.sdk.context.condenser.base import CondenserBase +from openhands.sdk.context.view import View +from openhands.sdk.conversation import Conversation +from openhands.sdk.event.condenser import CondensationRequest +from openhands.sdk.llm import LLM +from openhands.sdk.llm.exceptions import LLMContextWindowExceedError + + +class RaisingLLM(LLM): + _force_responses: bool = PrivateAttr(default=False) + + def __init__(self, *, model: str = "test-model", force_responses: bool = False): + super().__init__(model=model, usage_id="test-llm") + self._force_responses = force_responses + + def uses_responses_api(self) -> bool: # override gating + return self._force_responses + + def completion(self, *, messages, tools=None, **kwargs): # type: ignore[override] + raise LLMContextWindowExceedError() + + def responses(self, *, messages, tools=None, **kwargs): # type: ignore[override] + raise LLMContextWindowExceedError() + + +class HandlesRequestsCondenser(CondenserBase): + def condense(self, view: View): # pragma: no cover - trivial passthrough + return view + + def handles_condensation_requests(self) -> bool: + return True + + +@pytest.mark.parametrize("force_responses", [True, False]) +def test_agent_triggers_condensation_request_when_ctx_exceeded_with_condenser( + force_responses: bool, +): + llm = RaisingLLM(force_responses=force_responses) + agent = Agent(llm=llm, tools=[], condenser=HandlesRequestsCondenser()) + convo = Conversation(agent=agent) + + seen = [] + + def on_event(e): + seen.append(e) + + # Expect Agent to emit CondensationRequest and not raise + agent.step(convo, on_event=on_event) + + assert any(isinstance(e, CondensationRequest) for e in seen) + + +@pytest.mark.parametrize("force_responses", [True, False]) +def test_agent_raises_ctx_exceeded_when_no_condenser(force_responses: bool): + llm = RaisingLLM(force_responses=force_responses) + agent = Agent(llm=llm, tools=[], condenser=None) + convo = Conversation(agent=agent) + + with pytest.raises(LLMContextWindowExceedError): + agent.step(convo, on_event=lambda e: None) diff --git a/tests/sdk/agent/test_agent_serialization.py b/tests/sdk/agent/test_agent_serialization.py index 707bc4a017..0636c42978 100644 --- 
a/tests/sdk/agent/test_agent_serialization.py +++ b/tests/sdk/agent/test_agent_serialization.py @@ -13,7 +13,7 @@ from openhands.sdk.llm import LLM from openhands.sdk.mcp.client import MCPClient from openhands.sdk.mcp.tool import MCPToolDefinition -from openhands.sdk.tool.tool import ToolBase +from openhands.sdk.tool.tool import ToolDefinition from openhands.sdk.utils.models import OpenHandsModel @@ -55,7 +55,7 @@ def test_agent_supports_polymorphic_json_serialization() -> None: def test_mcp_tool_serialization(): tool = create_mock_mcp_tool("test_mcp_tool_serialization") dumped = tool.model_dump_json() - loaded = ToolBase.model_validate_json(dumped) + loaded = ToolDefinition.model_validate_json(dumped) assert loaded.model_dump_json() == dumped diff --git a/tests/sdk/agent/test_agent_tool_init.py b/tests/sdk/agent/test_agent_tool_init.py index 5d4ce971c3..2539671d80 100644 --- a/tests/sdk/agent/test_agent_tool_init.py +++ b/tests/sdk/agent/test_agent_tool_init.py @@ -1,4 +1,5 @@ from collections.abc import Sequence +from typing import ClassVar from unittest.mock import patch from openhands.sdk import LLM, Conversation @@ -27,16 +28,25 @@ def __call__(self, action: _Action, conversation=None) -> _Obs: return _Obs(out=action.text.upper()) +class _UpperTool(ToolDefinition[_Action, _Obs]): + """Concrete tool for uppercase testing.""" + + name: ClassVar[str] = "upper" + + @classmethod + def create(cls, conv_state=None, **params) -> Sequence["_UpperTool"]: + return [ + cls( + description="Uppercase", + action_type=_Action, + observation_type=_Obs, + executor=_Exec(), + ) + ] + + def _make_tool(conv_state=None, **kwargs) -> Sequence[ToolDefinition]: - return [ - ToolDefinition( - name="upper", - description="Uppercase", - action_type=_Action, - observation_type=_Obs, - executor=_Exec(), - ) - ] + return _UpperTool.create(conv_state, **kwargs) def test_agent_initializes_tools_from_toolspec_locally(monkeypatch): diff --git a/tests/sdk/agent/test_message_while_finishing.py b/tests/sdk/agent/test_message_while_finishing.py index 57be3b27ee..2b028f4294 100644 --- a/tests/sdk/agent/test_message_while_finishing.py +++ b/tests/sdk/agent/test_message_while_finishing.py @@ -30,7 +30,7 @@ import sys from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime -from typing import Any +from typing import Any, ClassVar # Ensure repo root on sys.path when running this file as a script @@ -129,17 +129,26 @@ def __call__(self, action: SleepAction, conversation=None) -> SleepObservation: return SleepObservation(message=action.message) +class SleepTool(ToolDefinition[SleepAction, SleepObservation]): + """Sleep tool for testing message processing during finish.""" + + name: ClassVar[str] = "sleep" + + @classmethod + def create(cls, conv_state=None, **params) -> Sequence["SleepTool"]: + return [ + cls( + action_type=SleepAction, + observation_type=SleepObservation, + description="Sleep for specified duration and return a message", + executor=SleepExecutor(), + ) + ] + + def _make_sleep_tool(conv_state=None, **kwargs) -> Sequence[ToolDefinition]: """Create sleep tool for testing.""" - return [ - ToolDefinition( - name="sleep_tool", - action_type=SleepAction, - observation_type=SleepObservation, - description="Sleep for specified duration and return a message", - executor=SleepExecutor(), - ) - ] + return SleepTool.create(conv_state, **kwargs) # Register the tool @@ -152,9 +161,7 @@ class TestMessageWhileFinishing: def setup_method(self): """Set up test fixtures.""" # Use gpt-4o which 
supports native function calling and multiple tool calls - self.llm: LLM = LLM( - model="gpt-4o", native_tool_calling=True, usage_id="test-llm" - ) + self.llm: LLM = LLM(model="gpt-4o", usage_id="test-llm") self.llm_completion_calls: list[Any] = [] self.agent: Agent = Agent(llm=self.llm, tools=[Tool(name="SleepTool")]) self.step_count: int = 0 @@ -182,7 +189,7 @@ def _mock_llm_response(self, messages, **kwargs): id="sleep_call_1", type="function", function=Function( - name="sleep_tool", + name="sleep", arguments='{"duration": 2.0, "message": "First sleep completed"}', ), ) @@ -225,7 +232,7 @@ def _mock_llm_response(self, messages, **kwargs): id="sleep_call_2", type="function", function=Function( - name="sleep_tool", + name="sleep", arguments=f'{{"duration": 3.0, "message": "{sleep_message}"}}', ), ) @@ -308,7 +315,7 @@ def test_message_processing_fix_verification(self): # Set the test start time reference for the sleep executor # Access the actual tool instances from the agent's _tools dict - sleep_tool = self.agent._tools.get("sleep_tool") + sleep_tool = self.agent._tools.get("sleep") if sleep_tool and sleep_tool.executor is not None: setattr(sleep_tool.executor, "test_start_time", self.test_start_time) setattr(sleep_tool.executor, "test_instance", self) diff --git a/tests/sdk/agent/test_nonexistent_tool_handling.py b/tests/sdk/agent/test_nonexistent_tool_handling.py index e2e7263dc8..3c4394ce6e 100644 --- a/tests/sdk/agent/test_nonexistent_tool_handling.py +++ b/tests/sdk/agent/test_nonexistent_tool_handling.py @@ -13,7 +13,7 @@ from openhands.sdk.agent import Agent from openhands.sdk.conversation import Conversation -from openhands.sdk.conversation.state import AgentExecutionStatus +from openhands.sdk.conversation.state import ConversationExecutionStatus from openhands.sdk.event import AgentErrorEvent, MessageEvent from openhands.sdk.llm import LLM, Message, TextContent @@ -97,9 +97,9 @@ def event_callback(event): # Verify that the conversation is NOT finished (this is the key fix) with conversation.state: - assert conversation.state.agent_status != AgentExecutionStatus.FINISHED, ( - "Agent should not be finished after encountering non-existent tool" - ) + assert ( + conversation.state.execution_status != ConversationExecutionStatus.FINISHED + ), "Agent should not be finished after encountering non-existent tool" # Verify that the error event is properly formatted for LLM llm_message = error_event.to_llm_message() @@ -278,7 +278,10 @@ def event_callback(event): # Verify conversation is not finished with conversation.state: - assert conversation.state.agent_status != AgentExecutionStatus.FINISHED + assert ( + conversation.state.execution_status + != ConversationExecutionStatus.FINISHED + ) # Run second step - should continue normally agent.step(conversation, on_event=event_callback) @@ -298,7 +301,10 @@ def event_callback(event): # Now the conversation should be finished with conversation.state: - assert conversation.state.agent_status == AgentExecutionStatus.FINISHED + assert ( + conversation.state.execution_status + == ConversationExecutionStatus.FINISHED + ) # Verify we made two LLM calls assert call_count == 2 diff --git a/tests/sdk/config/test_llm_config.py b/tests/sdk/config/test_llm_config.py index 17316b95b2..638edbbba8 100644 --- a/tests/sdk/config/test_llm_config.py +++ b/tests/sdk/config/test_llm_config.py @@ -36,7 +36,7 @@ def test_llm_config_defaults(): assert config.caching_prompt is True assert config.log_completions is False assert config.custom_tokenizer is None - assert 
config.native_tool_calling is None + assert config.native_tool_calling is True assert config.reasoning_effort == "high" # Default for non-Gemini models assert config.seed is None assert config.safety_settings is None @@ -336,7 +336,6 @@ def test_llm_config_optional_fields(): disable_vision=None, disable_stop_word=None, custom_tokenizer=None, - native_tool_calling=None, reasoning_effort=None, seed=None, safety_settings=None, @@ -364,7 +363,6 @@ def test_llm_config_optional_fields(): assert config.disable_vision is None assert config.disable_stop_word is None assert config.custom_tokenizer is None - assert config.native_tool_calling is None assert ( config.reasoning_effort == "high" ) # Even when set to None, post_init sets it to "high" for non-Gemini models diff --git a/tests/sdk/context/condenser/test_llm_summarizing_condenser.py b/tests/sdk/context/condenser/test_llm_summarizing_condenser.py index ac8790457b..8b8f344b05 100644 --- a/tests/sdk/context/condenser/test_llm_summarizing_condenser.py +++ b/tests/sdk/context/condenser/test_llm_summarizing_condenser.py @@ -43,6 +43,7 @@ def create_completion_result(content: str) -> LLMResponse: ) # Create a mock ModelResponse raw_response = MagicMock(spec=ModelResponse) + raw_response.id = "mock-llm-response-id" return LLMResponse(message=message, metrics=metrics, raw_response=raw_response) mock_llm.completion.return_value = create_completion_result( @@ -63,7 +64,7 @@ def create_completion_result(content: str) -> LLMResponse: mock_llm.custom_tokenizer = None mock_llm.base_url = None mock_llm.reasoning_effort = None - mock_llm.metadata = {} + mock_llm.litellm_extra_body = {} # Explicitly set pricing attributes required by LLM -> Telemetry wiring mock_llm.input_cost_per_token = None @@ -158,6 +159,7 @@ def test_get_condensation_with_previous_summary(mock_llm: LLM) -> None: forgotten_event_ids=[events[3].id, events[4].id], summary="Previous summary content", summary_offset=keep_first, + llm_response_id="condensation_response_1", ) events_with_condensation = ( events[:keep_first] + [condensation] + events[keep_first:] diff --git a/tests/sdk/context/skill/test_load_user_skills.py b/tests/sdk/context/skill/test_load_user_skills.py new file mode 100644 index 0000000000..0cd2459360 --- /dev/null +++ b/tests/sdk/context/skill/test_load_user_skills.py @@ -0,0 +1,274 @@ +"""Tests for load_user_skills functionality.""" + +import tempfile +from pathlib import Path + +import pytest + +from openhands.sdk.context.agent_context import AgentContext +from openhands.sdk.context.skills import ( + KeywordTrigger, + Skill, + load_user_skills, +) + + +@pytest.fixture +def temp_user_skills_dir(): + """Create a temporary user skills directory structure.""" + with tempfile.TemporaryDirectory() as temp_dir: + root = Path(temp_dir) + + # Create .openhands/skills directory + skills_dir = root / ".openhands" / "skills" + skills_dir.mkdir(parents=True) + + yield root, skills_dir + + +@pytest.fixture +def temp_microagents_dir(): + """Create a temporary microagents directory structure.""" + with tempfile.TemporaryDirectory() as temp_dir: + root = Path(temp_dir) + + # Create .openhands/microagents directory + microagents_dir = root / ".openhands" / "microagents" + microagents_dir.mkdir(parents=True) + + yield root, microagents_dir + + +def test_load_user_skills_no_directories(tmp_path): + """Test load_user_skills when no user skills directories exist.""" + # Point USER_SKILLS_DIRS to non-existent directories + from openhands.sdk.context.skills import skill + + original_dirs = 
skill.USER_SKILLS_DIRS + try: + skill.USER_SKILLS_DIRS = [ + tmp_path / "nonexistent1", + tmp_path / "nonexistent2", + ] + skills = load_user_skills() + assert skills == [] + finally: + skill.USER_SKILLS_DIRS = original_dirs + + +def test_load_user_skills_with_skills_directory(temp_user_skills_dir): + """Test load_user_skills loads from skills directory.""" + root, skills_dir = temp_user_skills_dir + + # Create a test skill file + skill_file = skills_dir / "test_skill.md" + skill_file.write_text( + "---\nname: test_skill\ntriggers:\n - test\n---\nThis is a test skill." + ) + + from openhands.sdk.context.skills import skill + + original_dirs = skill.USER_SKILLS_DIRS + try: + skill.USER_SKILLS_DIRS = [skills_dir] + skills = load_user_skills() + assert len(skills) == 1 + assert skills[0].name == "test_skill" + assert skills[0].content == "This is a test skill." + assert isinstance(skills[0].trigger, KeywordTrigger) + finally: + skill.USER_SKILLS_DIRS = original_dirs + + +def test_load_user_skills_with_microagents_directory(temp_microagents_dir): + """Test load_user_skills loads from microagents directory (legacy).""" + root, microagents_dir = temp_microagents_dir + + # Create a test microagent file + microagent_file = microagents_dir / "legacy_skill.md" + microagent_file.write_text( + "---\n" + "name: legacy_skill\n" + "triggers:\n" + " - legacy\n" + "---\n" + "This is a legacy microagent skill." + ) + + from openhands.sdk.context.skills import skill + + original_dirs = skill.USER_SKILLS_DIRS + try: + skill.USER_SKILLS_DIRS = [microagents_dir] + skills = load_user_skills() + assert len(skills) == 1 + assert skills[0].name == "legacy_skill" + assert skills[0].content == "This is a legacy microagent skill." + finally: + skill.USER_SKILLS_DIRS = original_dirs + + +def test_load_user_skills_priority_order(tmp_path): + """Test that skills/ directory takes precedence over microagents/.""" + # Create both directories + skills_dir = tmp_path / ".openhands" / "skills" + microagents_dir = tmp_path / ".openhands" / "microagents" + skills_dir.mkdir(parents=True) + microagents_dir.mkdir(parents=True) + + # Create duplicate skill in both directories + (skills_dir / "duplicate.md").write_text( + "---\nname: duplicate\n---\nFrom skills directory." + ) + + (microagents_dir / "duplicate.md").write_text( + "---\nname: duplicate\n---\nFrom microagents directory." + ) + + from openhands.sdk.context.skills import skill + + original_dirs = skill.USER_SKILLS_DIRS + try: + skill.USER_SKILLS_DIRS = [skills_dir, microagents_dir] + skills = load_user_skills() + assert len(skills) == 1 + assert skills[0].name == "duplicate" + # Should be from skills directory (takes precedence) + assert skills[0].content == "From skills directory." + finally: + skill.USER_SKILLS_DIRS = original_dirs + + +def test_load_user_skills_both_directories(tmp_path): + """Test loading unique skills from both directories.""" + # Create both directories + skills_dir = tmp_path / ".openhands" / "skills" + microagents_dir = tmp_path / ".openhands" / "microagents" + skills_dir.mkdir(parents=True) + microagents_dir.mkdir(parents=True) + + # Create different skills in each directory + (skills_dir / "skill1.md").write_text("---\nname: skill1\n---\nSkill 1 content.") + (microagents_dir / "skill2.md").write_text( + "---\nname: skill2\n---\nSkill 2 content." 
+ ) + + from openhands.sdk.context.skills import skill + + original_dirs = skill.USER_SKILLS_DIRS + try: + skill.USER_SKILLS_DIRS = [skills_dir, microagents_dir] + skills = load_user_skills() + assert len(skills) == 2 + skill_names = {s.name for s in skills} + assert skill_names == {"skill1", "skill2"} + finally: + skill.USER_SKILLS_DIRS = original_dirs + + +def test_load_user_skills_handles_errors_gracefully(temp_user_skills_dir): + """Test that errors in loading are handled gracefully.""" + root, skills_dir = temp_user_skills_dir + + # Create an invalid skill file + invalid_file = skills_dir / "invalid.md" + invalid_file.write_text( + "---\n" + "triggers: not_a_list\n" # Invalid: triggers must be a list + "---\n" + "Invalid skill." + ) + + from openhands.sdk.context.skills import skill + + original_dirs = skill.USER_SKILLS_DIRS + try: + skill.USER_SKILLS_DIRS = [skills_dir] + # Should not raise exception, just return empty list + skills = load_user_skills() + assert skills == [] + finally: + skill.USER_SKILLS_DIRS = original_dirs + + +def test_agent_context_loads_user_skills_by_default(temp_user_skills_dir): + """Test that AgentContext loads user skills when enabled.""" + root, skills_dir = temp_user_skills_dir + + # Create a test skill + skill_file = skills_dir / "auto_skill.md" + skill_file.write_text("---\nname: auto_skill\n---\nAutomatically loaded skill.") + + from openhands.sdk.context.skills import skill + + original_dirs = skill.USER_SKILLS_DIRS + try: + skill.USER_SKILLS_DIRS = [skills_dir] + context = AgentContext(load_user_skills=True) + skill_names = [s.name for s in context.skills] + assert "auto_skill" in skill_names + finally: + skill.USER_SKILLS_DIRS = original_dirs + + +def test_agent_context_can_disable_user_skills_loading(): + """Test that user skills loading can be disabled.""" + context = AgentContext(load_user_skills=False) + assert context.skills == [] + + +def test_agent_context_merges_explicit_and_user_skills(temp_user_skills_dir): + """Test that explicit skills and user skills are merged correctly.""" + root, skills_dir = temp_user_skills_dir + + # Create user skill + user_skill_file = skills_dir / "user_skill.md" + user_skill_file.write_text("---\nname: user_skill\n---\nUser skill content.") + + # Create explicit skill + explicit_skill = Skill( + name="explicit_skill", + content="Explicit skill content.", + trigger=None, + ) + + from openhands.sdk.context.skills import skill + + original_dirs = skill.USER_SKILLS_DIRS + try: + skill.USER_SKILLS_DIRS = [skills_dir] + context = AgentContext(skills=[explicit_skill], load_user_skills=True) + skill_names = [s.name for s in context.skills] + assert "explicit_skill" in skill_names + assert "user_skill" in skill_names + assert len(context.skills) == 2 + finally: + skill.USER_SKILLS_DIRS = original_dirs + + +def test_agent_context_explicit_skill_takes_precedence(temp_user_skills_dir): + """Test that explicitly provided skills take precedence over user skills.""" + root, skills_dir = temp_user_skills_dir + + # Create user skill with same name + user_skill_file = skills_dir / "duplicate.md" + user_skill_file.write_text("---\nname: duplicate\n---\nUser skill content.") + + # Create explicit skill with same name + explicit_skill = Skill( + name="duplicate", + content="Explicit skill content.", + trigger=None, + ) + + from openhands.sdk.context.skills import skill + + original_dirs = skill.USER_SKILLS_DIRS + try: + skill.USER_SKILLS_DIRS = [skills_dir] + context = AgentContext(skills=[explicit_skill], load_user_skills=True) + 
assert len(context.skills) == 1 + # Explicit skill should be used, not the user skill + assert context.skills[0].content == "Explicit skill content." + finally: + skill.USER_SKILLS_DIRS = original_dirs diff --git a/tests/sdk/context/test_view.py b/tests/sdk/context/test_view.py index f58fbbee27..92aa73d2fb 100644 --- a/tests/sdk/context/test_view.py +++ b/tests/sdk/context/test_view.py @@ -42,7 +42,10 @@ def test_view_forgets_events() -> None: # The condensation specifically targets the IDs of all M_i messages events: list[Event] = [ *message_events, - Condensation(forgotten_event_ids=message_event_ids), + Condensation( + forgotten_event_ids=message_event_ids, + llm_response_id="condensation_response_1", + ), ] # All events should be forgotten and removed. @@ -62,7 +65,10 @@ def test_view_keeps_non_forgotten_events() -> None: # `test_view_forgets_events`, in this test we only want to forget # one of the events. That way we can check that the rest of the # events are preserved. - Condensation(forgotten_event_ids=[forgotten_event_id]), + Condensation( + forgotten_event_ids=[forgotten_event_id], + llm_response_id="condensation_response_1", + ), ] view = View.from_events(events) @@ -78,10 +84,13 @@ def test_view_inserts_summary() -> None: message_events = [message_event(f"Event {i}") for i in range(5)] for offset in range(5): - events: list[Event] = [ + events = [ *message_events, Condensation( - forgotten_event_ids=[], summary="My Summary", summary_offset=offset + forgotten_event_ids=[], + summary="My Summary", + summary_offset=offset, + llm_response_id="condensation_response_1", ), ] view = View.from_events(events) @@ -120,7 +129,12 @@ def test_no_condensation_action_in_view() -> None: events: list[Event] = [] events.extend(message_events[:2]) - events.append(Condensation(forgotten_event_ids=[message_events[0].id])) + events.append( + Condensation( + forgotten_event_ids=[message_events[0].id], + llm_response_id="condensation_response_1", + ) + ) events.extend(message_events[2:]) view = View.from_events(events) @@ -167,7 +181,12 @@ def test_handled_condensation_request_with_condensation_action() -> None: message_event("Event 2"), ] ) - events.append(Condensation(forgotten_event_ids=[event.id for event in events[:2]])) + events.append( + Condensation( + forgotten_event_ids=[event.id for event in events[:2]], + llm_response_id="condensation_response_1", + ) + ) events.append(message_event("Event 3")) view = View.from_events(events) @@ -184,11 +203,13 @@ def test_handled_condensation_request_with_condensation_action() -> None: def test_multiple_condensation_requests_pattern() -> None: """Test the pattern with multiple condensation requests and actions.""" - events: list[Event] = [ + events = [ message_event(content="Event 0"), CondensationRequest(), # First request message_event(content="Event 1"), - Condensation(forgotten_event_ids=[]), # Handles first request + Condensation( + forgotten_event_ids=[], llm_response_id="condensation_response_1" + ), # Handles first request message_event(content="Event 2"), CondensationRequest(), # Second request - should be unhandled message_event(content="Event 3"), @@ -209,9 +230,11 @@ def test_condensation_action_before_request() -> None: """Test that CondensationAction before CondensationRequestAction doesn't affect the unhandled status. 
""" - events: list[Event] = [ + events = [ message_event(content="Event 0"), - Condensation(forgotten_event_ids=[]), # This doesn't handle the later request + Condensation( + forgotten_event_ids=[], llm_response_id="condensation_response_1" + ), # This doesn't handle the later request message_event(content="Event 1"), CondensationRequest(), # This should be unhandled message_event(content="Event 2"), @@ -266,11 +289,11 @@ def test_condensation_request_always_removed_from_view() -> None: assert not isinstance(event, CondensationRequest) # Test case 2: Handled request - events_handled: list[Event] = [ + events_handled = [ message_event(content="Event 0"), CondensationRequest(), message_event(content="Event 1"), - Condensation(forgotten_event_ids=[]), + Condensation(forgotten_event_ids=[], llm_response_id="condensation_response_1"), message_event(content="Event 2"), ] view_handled = View.from_events(events_handled) @@ -299,12 +322,20 @@ def test_condensations_field_stores_all_condensations_in_order() -> None: # Create multiple condensations condensation1 = Condensation( - forgotten_event_ids=[message_events[0].id], summary="Summary 1" + forgotten_event_ids=[message_events[0].id], + summary="Summary 1", + llm_response_id="condensation_response_1", ) condensation2 = Condensation( - forgotten_event_ids=[message_events[1].id], summary="Summary 2" + forgotten_event_ids=[message_events[1].id], + summary="Summary 2", + llm_response_id="condensation_response_2", + ) + condensation3 = Condensation( + forgotten_event_ids=[], + summary="Summary 3", + llm_response_id="condensation_response_3", ) - condensation3 = Condensation(forgotten_event_ids=[], summary="Summary 3") events: list[Event] = [ message_events[0], @@ -336,14 +367,26 @@ def test_most_recent_condensation_property() -> None: assert view_no_condensation.most_recent_condensation is None # Test with single condensation - condensation1 = Condensation(forgotten_event_ids=[], summary="First summary") + condensation1 = Condensation( + forgotten_event_ids=[], + summary="First summary", + llm_response_id="condensation_response_1", + ) events_single: list[Event] = [*message_events, condensation1] view_single = View.from_events(events_single) assert view_single.most_recent_condensation == condensation1 # Test with multiple condensations - condensation2 = Condensation(forgotten_event_ids=[], summary="Second summary") - condensation3 = Condensation(forgotten_event_ids=[], summary="Third summary") + condensation2 = Condensation( + forgotten_event_ids=[], + summary="Second summary", + llm_response_id="condensation_response_2", + ) + condensation3 = Condensation( + forgotten_event_ids=[], + summary="Third summary", + llm_response_id="condensation_response_3", + ) events_multiple: list[Event] = [ message_events[0], condensation1, @@ -360,8 +403,13 @@ def test_condensations_field_with_mixed_events() -> None: """Test condensations field behavior with mixed event types including requests.""" message_events = [message_event(f"Event {i}") for i in range(4)] - condensation1 = Condensation(forgotten_event_ids=[message_events[0].id]) - condensation2 = Condensation(forgotten_event_ids=[]) + condensation1 = Condensation( + forgotten_event_ids=[message_events[0].id], + llm_response_id="condensation_response_1", + ) + condensation2 = Condensation( + forgotten_event_ids=[], llm_response_id="condensation_response_2" + ) events: list[Event] = [ message_events[0], @@ -399,7 +447,10 @@ def test_summary_event_index_none_when_condensation_has_no_summary() -> None: 
message_events = [message_event(f"Event {i}") for i in range(3)] # Condensation without summary - condensation = Condensation(forgotten_event_ids=[message_events[0].id]) + condensation = Condensation( + forgotten_event_ids=[message_events[0].id], + llm_response_id="condensation_response_1", + ) events: list[Event] = [ message_events[0], @@ -426,6 +477,7 @@ def test_summary_event_index_and_event_with_summary() -> None: forgotten_event_ids=[message_events[0].id], summary="This is a test summary", summary_offset=1, + llm_response_id="condensation_response_1", ) events: list[Event] = [ @@ -460,6 +512,7 @@ def test_summary_event_with_multiple_condensations() -> None: forgotten_event_ids=[message_events[0].id], summary="First summary", summary_offset=0, + llm_response_id="condensation_response_1", ) # Second condensation with different summary (should override) @@ -467,6 +520,7 @@ def test_summary_event_with_multiple_condensations() -> None: forgotten_event_ids=[message_events[1].id], summary="Second summary", summary_offset=1, + llm_response_id="condensation_response_2", ) events: list[Event] = [ @@ -498,6 +552,7 @@ def test_summary_event_with_condensation_without_offset() -> None: condensation = Condensation( forgotten_event_ids=[message_events[0].id], summary="This summary should be ignored", + llm_response_id="condensation_response_1", # No summary_offset ) @@ -522,6 +577,7 @@ def test_summary_event_with_zero_offset() -> None: forgotten_event_ids=[message_events[0].id], summary="Summary at beginning", summary_offset=0, + llm_response_id="condensation_response_1", ) events: list[Event] = [ diff --git a/tests/sdk/context/test_view_batch_atomicity.py b/tests/sdk/context/test_view_batch_atomicity.py index dd27f82de9..ac37464cef 100644 --- a/tests/sdk/context/test_view_batch_atomicity.py +++ b/tests/sdk/context/test_view_batch_atomicity.py @@ -7,7 +7,6 @@ """ from openhands.sdk.context.view import View -from openhands.sdk.event.base import Event from openhands.sdk.event.condenser import Condensation from openhands.sdk.event.llm_convertible import ( ActionEvent, @@ -109,7 +108,7 @@ def test_batch_atomicity_partial_batch_forgotten() -> None: # Condensation forgets the first 3 actions (E44-E46), but not the 4th (E47) # This simulates what might happen if the condenser uses event indices without # considering batch boundaries - events: list[Event] = [ + events = [ message_event("User message"), action1, action2, @@ -119,7 +118,10 @@ def test_batch_atomicity_partial_batch_forgotten() -> None: obs2, obs3, obs4, - Condensation(forgotten_event_ids=[action1.id, action2.id, action3.id]), + Condensation( + forgotten_event_ids=[action1.id, action2.id, action3.id], + llm_response_id="condensation_response_1", + ), ] view = View.from_events(events) @@ -160,13 +162,16 @@ def test_batch_atomicity_complete_batch_forgotten() -> None: obs1 = create_observation_event("tool_call_1") obs2 = create_observation_event("tool_call_2") - events: list[Event] = [ + events = [ message_event("User message"), action1, action2, obs1, obs2, - Condensation(forgotten_event_ids=[action1.id, action2.id]), + Condensation( + forgotten_event_ids=[action1.id, action2.id], + llm_response_id="condensation_response_1", + ), ] view = View.from_events(events) @@ -203,7 +208,7 @@ def test_batch_atomicity_no_forgetting_preserves_batch() -> None: obs2 = create_observation_event("tool_call_2") obs3 = create_observation_event("tool_call_3") - events: list[Event] = [ + events = [ message_event("User message"), action1, action2, @@ -211,7 +216,9 @@ 
def test_batch_atomicity_no_forgetting_preserves_batch() -> None: obs1, obs2, obs3, - Condensation(forgotten_event_ids=[]), # Don't forget anything + Condensation( + forgotten_event_ids=[], llm_response_id="condensation_response_1" + ), # Don't forget anything ] view = View.from_events(events) @@ -255,7 +262,7 @@ def test_batch_atomicity_multiple_batches() -> None: # Forget only the first action of the first batch # This should cause the entire first batch to be forgotten, but not the second batch - events: list[Event] = [ + events = [ message_event("User message"), action1_1, action1_2, @@ -266,7 +273,10 @@ def test_batch_atomicity_multiple_batches() -> None: action2_2, obs2_1, obs2_2, - Condensation(forgotten_event_ids=[action1_1.id]), + Condensation( + forgotten_event_ids=[action1_1.id], + llm_response_id="condensation_response_1", + ), ] view = View.from_events(events) @@ -297,11 +307,13 @@ def test_batch_atomicity_single_action_batch() -> None: ) obs = create_observation_event("tool_call_1") - events: list[Event] = [ + events = [ message_event("User message"), action, obs, - Condensation(forgotten_event_ids=[action.id]), + Condensation( + forgotten_event_ids=[action.id], llm_response_id="condensation_response_1" + ), ] view = View.from_events(events) @@ -328,7 +340,7 @@ def test_batch_atomicity_no_thinking_blocks() -> None: obs3 = create_observation_event("tool_call_3") # Forget first two actions - events: list[Event] = [ + events = [ message_event("User message"), action1, obs1, @@ -336,7 +348,10 @@ def test_batch_atomicity_no_thinking_blocks() -> None: obs2, action3, obs3, - Condensation(forgotten_event_ids=[action1.id, action2.id]), + Condensation( + forgotten_event_ids=[action1.id, action2.id], + llm_response_id="condensation_response_1", + ), ] view = View.from_events(events) diff --git a/tests/sdk/conversation/local/test_agent_status_transition.py b/tests/sdk/conversation/local/test_agent_status_transition.py index 9f0e518448..80d163b631 100644 --- a/tests/sdk/conversation/local/test_agent_status_transition.py +++ b/tests/sdk/conversation/local/test_agent_status_transition.py @@ -17,6 +17,7 @@ import threading from collections.abc import Sequence +from typing import ClassVar from unittest.mock import patch from litellm import ChatCompletionMessageToolCall @@ -30,7 +31,7 @@ from openhands.sdk.agent import Agent from openhands.sdk.conversation import Conversation -from openhands.sdk.conversation.state import AgentExecutionStatus +from openhands.sdk.conversation.state import ConversationExecutionStatus from openhands.sdk.event import MessageEvent from openhands.sdk.llm import LLM, ImageContent, Message, TextContent from openhands.sdk.tool import ( @@ -64,8 +65,8 @@ class StatusCheckingExecutor( ): """Executor that captures the agent status when executed.""" - def __init__(self, status_during_execution: list[AgentExecutionStatus]): - self.status_during_execution: list[AgentExecutionStatus] = ( + def __init__(self, status_during_execution: list[ConversationExecutionStatus]): + self.status_during_execution: list[ConversationExecutionStatus] = ( status_during_execution ) @@ -74,26 +75,41 @@ def __call__( ) -> StatusTransitionMockObservation: # Capture the agent status during execution if conversation: - self.status_during_execution.append(conversation.state.agent_status) + self.status_during_execution.append(conversation.state.execution_status) return StatusTransitionMockObservation(result=f"Executed: {action.command}") -@patch("openhands.sdk.llm.llm.litellm_completion") -def 
test_agent_status_transitions_to_running_from_idle(mock_completion): - """Test that agent status transitions to RUNNING when run() is called from IDLE.""" - status_during_execution: list[AgentExecutionStatus] = [] +class StatusTransitionTestTool( + ToolDefinition[StatusTransitionMockAction, StatusTransitionMockObservation] +): + """Concrete tool for status transition testing.""" - def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: + name: ClassVar[str] = "test_tool" + + @classmethod + def create( + cls, conv_state=None, *, executor: ToolExecutor, **params + ) -> Sequence["StatusTransitionTestTool"]: return [ - ToolDefinition( - name="test_tool", + cls( description="A test tool", action_type=StatusTransitionMockAction, observation_type=StatusTransitionMockObservation, - executor=StatusCheckingExecutor(status_during_execution), + executor=executor, ) ] + +@patch("openhands.sdk.llm.llm.litellm_completion") +def test_execution_status_transitions_to_running_from_idle(mock_completion): + """Test that agent status transitions to RUNNING when run() is called from IDLE.""" + status_during_execution: list[ConversationExecutionStatus] = [] + + def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: + return StatusTransitionTestTool.create( + executor=StatusCheckingExecutor(status_during_execution) + ) + register_tool("test_tool", _make_tool) llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") @@ -101,7 +117,7 @@ def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: conversation = Conversation(agent=agent) # Verify initial state is IDLE - assert conversation.state.agent_status == AgentExecutionStatus.IDLE + assert conversation.state.execution_status == ConversationExecutionStatus.IDLE # Mock LLM to return a message that finishes execution mock_completion.return_value = ModelResponse( @@ -119,7 +135,7 @@ def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: conversation.run() # After run completes, status should be FINISHED - assert conversation.state.agent_status == AgentExecutionStatus.FINISHED + assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED # Verify we have agent response agent_messages = [ @@ -131,21 +147,15 @@ def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: @patch("openhands.sdk.llm.llm.litellm_completion") -def test_agent_status_is_running_during_execution_from_idle(mock_completion): +def test_execution_status_is_running_during_execution_from_idle(mock_completion): """Test that agent status is RUNNING during execution when started from IDLE.""" - status_during_execution: list[AgentExecutionStatus] = [] + status_during_execution: list[ConversationExecutionStatus] = [] execution_started = threading.Event() def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: - return [ - ToolDefinition( - name="test_tool", - description="A test tool", - action_type=StatusTransitionMockAction, - observation_type=StatusTransitionMockObservation, - executor=StatusCheckingExecutor(status_during_execution), - ) - ] + return StatusTransitionTestTool.create( + executor=StatusCheckingExecutor(status_during_execution) + ) register_tool("test_tool", _make_tool) @@ -157,7 +167,7 @@ def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: conversation = Conversation(agent=agent) # Verify initial state is IDLE - assert conversation.state.agent_status == AgentExecutionStatus.IDLE + assert conversation.state.execution_status == 
ConversationExecutionStatus.IDLE # Mock LLM to return an action first, then finish tool_call = ChatCompletionMessageToolCall( @@ -217,7 +227,7 @@ def side_effect(*args, **kwargs): # Run in a separate thread so we can check status during execution status_checked = threading.Event() run_complete = threading.Event() - status_during_run: list[AgentExecutionStatus | None] = [None] + status_during_run: list[ConversationExecutionStatus | None] = [None] def run_agent(): conversation.run() @@ -230,7 +240,7 @@ def run_agent(): assert execution_started.wait(timeout=2.0), "Execution never started" # Check status while running - status_during_run[0] = conversation.state.agent_status + status_during_run[0] = conversation.state.execution_status status_checked.set() # Wait for run to complete @@ -238,16 +248,16 @@ def run_agent(): t.join(timeout=0.1) # Verify status was RUNNING during execution - assert status_during_run[0] == AgentExecutionStatus.RUNNING, ( + assert status_during_run[0] == ConversationExecutionStatus.RUNNING, ( f"Expected RUNNING status during execution, got {status_during_run[0]}" ) # After run completes, status should be FINISHED - assert conversation.state.agent_status == AgentExecutionStatus.FINISHED + assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED @patch("openhands.sdk.llm.llm.litellm_completion") -def test_agent_status_transitions_to_running_from_paused(mock_completion): +def test_execution_status_transitions_to_running_from_paused(mock_completion): """Test that agent status transitions to RUNNING when run() is called from PAUSED.""" llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") @@ -256,7 +266,7 @@ def test_agent_status_transitions_to_running_from_paused(mock_completion): # Pause the conversation conversation.pause() - assert conversation.state.agent_status == AgentExecutionStatus.PAUSED + assert conversation.state.execution_status == ConversationExecutionStatus.PAUSED # Mock LLM to return a message that finishes execution mock_completion.return_value = ModelResponse( @@ -274,7 +284,7 @@ def test_agent_status_transitions_to_running_from_paused(mock_completion): conversation.run() # After run completes, status should be FINISHED - assert conversation.state.agent_status == AgentExecutionStatus.FINISHED + assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED # Verify we have agent response agent_messages = [ @@ -286,22 +296,14 @@ def test_agent_status_transitions_to_running_from_paused(mock_completion): @patch("openhands.sdk.llm.llm.litellm_completion") -def test_agent_status_transitions_from_waiting_for_confirmation(mock_completion): +def test_execution_status_transitions_from_waiting_for_confirmation(mock_completion): """Test WAITING_FOR_CONFIRMATION -> RUNNING transition when run() is called.""" from openhands.sdk.security.confirmation_policy import AlwaysConfirm llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: - return [ - ToolDefinition( - name="test_tool", - description="A test tool", - action_type=StatusTransitionMockAction, - observation_type=StatusTransitionMockObservation, - executor=StatusCheckingExecutor([]), - ) - ] + return StatusTransitionTestTool.create(executor=StatusCheckingExecutor([])) register_tool("test_tool", _make_tool) @@ -366,18 +368,19 @@ def side_effect(*args, **kwargs): # Should be waiting for confirmation assert ( - conversation.state.agent_status == 
AgentExecutionStatus.WAITING_FOR_CONFIRMATION + conversation.state.execution_status + == ConversationExecutionStatus.WAITING_FOR_CONFIRMATION ) # Call run again - this confirms and should transition to RUNNING, then FINISHED conversation.run() # After confirmation and execution, should be FINISHED - assert conversation.state.agent_status == AgentExecutionStatus.FINISHED + assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED @patch("openhands.sdk.llm.llm.litellm_completion") -def test_agent_status_finished_to_idle_to_running(mock_completion): +def test_execution_status_finished_to_idle_to_running(mock_completion): """Test FINISHED -> IDLE -> RUNNING transition when new message is sent.""" llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") agent = Agent(llm=llm, tools=[]) @@ -399,17 +402,17 @@ def test_agent_status_finished_to_idle_to_running(mock_completion): Message(role="user", content=[TextContent(text="First task")]) ) conversation.run() - assert conversation.state.agent_status == AgentExecutionStatus.FINISHED + assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED # Send new message - should transition to IDLE conversation.send_message( Message(role="user", content=[TextContent(text="Second task")]) ) - assert conversation.state.agent_status == AgentExecutionStatus.IDLE + assert conversation.state.execution_status == ConversationExecutionStatus.IDLE # Run again - should transition to RUNNING then FINISHED conversation.run() - assert conversation.state.agent_status == AgentExecutionStatus.FINISHED + assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED @patch("openhands.sdk.llm.llm.litellm_completion") @@ -433,7 +436,7 @@ def test_run_exits_immediately_when_already_finished(mock_completion): # Complete a task conversation.send_message(Message(role="user", content=[TextContent(text="Task")])) conversation.run() - assert conversation.state.agent_status == AgentExecutionStatus.FINISHED + assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED # Call run again without sending a new message # Should exit immediately without calling LLM again @@ -441,7 +444,7 @@ def test_run_exits_immediately_when_already_finished(mock_completion): conversation.run() # Status should still be FINISHED - assert conversation.state.agent_status == AgentExecutionStatus.FINISHED + assert conversation.state.execution_status == ConversationExecutionStatus.FINISHED # LLM should not be called again assert mock_completion.call_count == initial_call_count @@ -454,12 +457,12 @@ def test_run_exits_immediately_when_stuck(mock_completion): conversation = Conversation(agent=agent) # Manually set status to STUCK (simulating stuck detection) - conversation._state.agent_status = AgentExecutionStatus.STUCK + conversation._state.execution_status = ConversationExecutionStatus.STUCK # Call run - should exit immediately conversation.run() # Status should still be STUCK - assert conversation.state.agent_status == AgentExecutionStatus.STUCK + assert conversation.state.execution_status == ConversationExecutionStatus.STUCK # LLM should not be called assert mock_completion.call_count == 0 diff --git a/tests/sdk/conversation/local/test_confirmation_mode.py b/tests/sdk/conversation/local/test_confirmation_mode.py index 1c863c61f2..bd9ecbe064 100644 --- a/tests/sdk/conversation/local/test_confirmation_mode.py +++ b/tests/sdk/conversation/local/test_confirmation_mode.py @@ -5,6 +5,7 @@ """ from 
collections.abc import Sequence +from typing import ClassVar from unittest.mock import MagicMock, Mock, patch import pytest @@ -19,7 +20,10 @@ from openhands.sdk.agent import Agent from openhands.sdk.conversation import Conversation, LocalConversation -from openhands.sdk.conversation.state import AgentExecutionStatus, ConversationState +from openhands.sdk.conversation.state import ( + ConversationExecutionStatus, + ConversationState, +) from openhands.sdk.event import ActionEvent, MessageEvent, ObservationEvent from openhands.sdk.event.base import Event from openhands.sdk.event.llm_convertible import UserRejectObservation @@ -58,6 +62,43 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]: return [TextContent(text=self.result)] +class TestExecutor( + ToolExecutor[MockConfirmationModeAction, MockConfirmationModeObservation] +): + """Test executor for confirmation mode testing.""" + + def __call__( + self, + action: MockConfirmationModeAction, + conversation=None, # noqa: ARG002 + ) -> MockConfirmationModeObservation: + return MockConfirmationModeObservation(result=f"Executed: {action.command}") + + +class ConfirmationTestTool( + ToolDefinition[MockConfirmationModeAction, MockConfirmationModeObservation] +): + """Concrete tool for confirmation mode testing.""" + + name: ClassVar[str] = "test_tool" + + @classmethod + def create(cls, conv_state=None, **params) -> Sequence["ConfirmationTestTool"]: + return [ + cls( + description="A test tool", + action_type=MockConfirmationModeAction, + observation_type=MockConfirmationModeObservation, + executor=TestExecutor(), + ) + ] + + +def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: + """Factory function for creating test tools.""" + return ConfirmationTestTool.create(conv_state, **params) + + class TestConfirmationMode: """Test suite for confirmation mode functionality.""" @@ -91,29 +132,6 @@ def setup_method(self): ) self.mock_llm.metrics.get_snapshot.return_value = mock_metrics_snapshot - class TestExecutor( - ToolExecutor[MockConfirmationModeAction, MockConfirmationModeObservation] - ): - def __call__( - self, - action: MockConfirmationModeAction, - conversation=None, # noqa: ARG002 - ) -> MockConfirmationModeObservation: - return MockConfirmationModeObservation( - result=f"Executed: {action.command}" - ) - - def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: - return [ - ToolDefinition( - name="test_tool", - description="A test tool", - action_type=MockConfirmationModeAction, - observation_type=MockConfirmationModeObservation, - executor=TestExecutor(), - ) - ] - register_tool("test_tool", _make_tool) self.agent: Agent = Agent( @@ -150,8 +168,8 @@ def _make_pending_action(self) -> None: self.conversation.run() assert self.conversation.state.confirmation_policy == AlwaysConfirm() assert ( - self.conversation.state.agent_status - == AgentExecutionStatus.WAITING_FOR_CONFIRMATION + self.conversation.state.execution_status + == ConversationExecutionStatus.WAITING_FOR_CONFIRMATION ) def _mock_action_once( @@ -334,7 +352,9 @@ def test_confirmation_mode_basic_functionality(self): """Test basic confirmation mode operations.""" # Test initial state assert self.conversation.state.confirmation_policy == NeverConfirm() - assert self.conversation.state.agent_status == AgentExecutionStatus.IDLE + assert ( + self.conversation.state.execution_status == ConversationExecutionStatus.IDLE + ) assert ( ConversationState.get_unmatched_actions(self.conversation.state.events) == [] @@ -422,7 +442,10 @@ def 
test_message_only_in_confirmation_mode_does_not_wait(self): ) self.conversation.run() - assert self.conversation.state.agent_status == AgentExecutionStatus.FINISHED + assert ( + self.conversation.state.execution_status + == ConversationExecutionStatus.FINISHED + ) msg_events = [ e @@ -466,7 +489,10 @@ def test_action_then_confirm_or_reject(self, should_reject: bool): if isinstance(e, UserRejectObservation) ] assert len(rejection_events) == 0 - assert self.conversation.state.agent_status == AgentExecutionStatus.FINISHED + assert ( + self.conversation.state.execution_status + == ConversationExecutionStatus.FINISHED + ) else: self.conversation.reject_pending_actions("Not safe to run") @@ -511,7 +537,8 @@ def test_single_finish_action_skips_confirmation_entirely(self): self.conversation.state.confirmation_policy == AlwaysConfirm() ) # Still in confirmation mode assert ( - self.conversation.state.agent_status == AgentExecutionStatus.FINISHED + self.conversation.state.execution_status + == ConversationExecutionStatus.FINISHED ) # Agent should be finished # Should have no pending actions (FinishAction was executed immediately) @@ -550,7 +577,10 @@ def test_think_and_finish_action_skips_confirmation_entirely(self): # Still in confirmation mode overall, but both actions should have executed assert self.conversation.state.confirmation_policy == AlwaysConfirm() - assert self.conversation.state.agent_status == AgentExecutionStatus.FINISHED + assert ( + self.conversation.state.execution_status + == ConversationExecutionStatus.FINISHED + ) # No pending actions pending_actions = ConversationState.get_unmatched_actions( @@ -584,8 +614,8 @@ def test_pause_during_confirmation_preserves_waiting_status(self): # Verify we're in the expected state assert ( - self.conversation.state.agent_status - == AgentExecutionStatus.WAITING_FOR_CONFIRMATION + self.conversation.state.execution_status + == ConversationExecutionStatus.WAITING_FOR_CONFIRMATION ) assert self.conversation.state.confirmation_policy == AlwaysConfirm() @@ -595,26 +625,34 @@ def test_pause_during_confirmation_preserves_waiting_status(self): # Status should remain WAITING_FOR_CONFIRMATION, not change to PAUSED # This is the key fix: waiting for confirmation is a special type of pause assert ( - self.conversation.state.agent_status - == AgentExecutionStatus.WAITING_FOR_CONFIRMATION + self.conversation.state.execution_status + == ConversationExecutionStatus.WAITING_FOR_CONFIRMATION ) # Test that pause works correctly for other states # Reset to IDLE state with self.conversation._state: - self.conversation._state.agent_status = AgentExecutionStatus.IDLE + self.conversation._state.execution_status = ConversationExecutionStatus.IDLE # Pause from IDLE should change status to PAUSED self.conversation.pause() - assert self.conversation._state.agent_status == AgentExecutionStatus.PAUSED + assert ( + self.conversation._state.execution_status + == ConversationExecutionStatus.PAUSED + ) # Reset to RUNNING state with self.conversation._state: - self.conversation._state.agent_status = AgentExecutionStatus.RUNNING + self.conversation._state.execution_status = ( + ConversationExecutionStatus.RUNNING + ) # Pause from RUNNING should change status to PAUSED self.conversation.pause() - assert self.conversation._state.agent_status == AgentExecutionStatus.PAUSED + assert ( + self.conversation._state.execution_status + == ConversationExecutionStatus.PAUSED + ) def test_is_confirmation_mode_active_property(self): """Test the is_confirmation_mode_active property behavior.""" 
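
The hunks above converge on one pattern: instead of registering a closure that instantiates ToolDefinition directly, each test now defines a named ToolDefinition subclass and registers a thin factory around its create() classmethod. A minimal sketch of that target pattern, reusing the MockConfirmationModeAction / MockConfirmationModeObservation types defined earlier in this test file; the EchoExecutor / EchoTool names are illustrative, and the openhands.sdk.tool import path is an assumption, since the import hunks are not shown in this diff:

from collections.abc import Sequence
from typing import ClassVar

# Import path assumed; the actual import lines sit outside these hunks.
from openhands.sdk.tool import ToolDefinition, ToolExecutor, register_tool


class EchoExecutor(
    ToolExecutor[MockConfirmationModeAction, MockConfirmationModeObservation]
):
    """Illustrative executor; mirrors the mock executor defined above."""

    def __call__(
        self,
        action: MockConfirmationModeAction,
        conversation=None,
    ) -> MockConfirmationModeObservation:
        # Echo the command back as the observation result, as the tests do.
        return MockConfirmationModeObservation(result=f"Executed: {action.command}")


class EchoTool(
    ToolDefinition[MockConfirmationModeAction, MockConfirmationModeObservation]
):
    """Illustrative tool subclass; the registered name is now a ClassVar."""

    name: ClassVar[str] = "test_tool"

    @classmethod
    def create(cls, conv_state=None, **params) -> Sequence["EchoTool"]:
        # create() returns a sequence so one factory can expose several tools.
        return [
            cls(
                description="A test tool",
                action_type=MockConfirmationModeAction,
                observation_type=MockConfirmationModeObservation,
                executor=EchoExecutor(),
            )
        ]


def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]:
    return EchoTool.create(conv_state, **params)


register_tool("test_tool", _make_tool)

The diffs below apply the same shape to the pause-functionality and status-transition tests, swapping only the action/observation types and the executor (e.g. BlockingExecutor for the pause tests).
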
diff --git a/tests/sdk/conversation/local/test_conversation_pause_functionality.py b/tests/sdk/conversation/local/test_conversation_pause_functionality.py index 6a95a8d7b4..838c5ce626 100644 --- a/tests/sdk/conversation/local/test_conversation_pause_functionality.py +++ b/tests/sdk/conversation/local/test_conversation_pause_functionality.py @@ -11,6 +11,7 @@ import threading from collections.abc import Sequence +from typing import ClassVar from unittest.mock import patch import pytest @@ -26,7 +27,7 @@ from openhands.sdk.agent import Agent from openhands.sdk.conversation import Conversation, LocalConversation from openhands.sdk.conversation.base import BaseConversation -from openhands.sdk.conversation.state import AgentExecutionStatus +from openhands.sdk.conversation.state import ConversationExecutionStatus from openhands.sdk.event import ActionEvent, MessageEvent, ObservationEvent, PauseEvent from openhands.sdk.llm import ( LLM, @@ -77,6 +78,68 @@ def __call__( return PauseFunctionalityMockObservation(result=f"Executed: {action.command}") +class TestExecutor( + ToolExecutor[PauseFunctionalityMockAction, PauseFunctionalityMockObservation] +): + """Test executor for pause functionality testing.""" + + def __call__( + self, + action: PauseFunctionalityMockAction, + conversation: BaseConversation | None = None, + ) -> PauseFunctionalityMockObservation: + return PauseFunctionalityMockObservation(result=f"Executed: {action.command}") + + +class PauseFunctionalityTestTool( + ToolDefinition[PauseFunctionalityMockAction, PauseFunctionalityMockObservation] +): + """Concrete tool for pause functionality testing.""" + + name: ClassVar[str] = "test_tool" + + @classmethod + def create( + cls, conv_state=None, **params + ) -> Sequence["PauseFunctionalityTestTool"]: + return [ + cls( + description="A test tool", + action_type=PauseFunctionalityMockAction, + observation_type=PauseFunctionalityMockObservation, + executor=TestExecutor(), + ) + ] + + +def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: + """Factory function for creating test tools.""" + return PauseFunctionalityTestTool.create(conv_state, **params) + + +class BlockingTestTool( + ToolDefinition[PauseFunctionalityMockAction, PauseFunctionalityMockObservation] +): + """Concrete tool for blocking pause testing.""" + + name: ClassVar[str] = "test_tool" + + @classmethod + def create( + cls, conv_state=None, step_entered=None, **params + ) -> Sequence["BlockingTestTool"]: + if step_entered is None: + raise ValueError("step_entered is required for BlockingTestTool") + return [ + cls( + description="Blocking tool for pause test", + action_type=PauseFunctionalityMockAction, + observation_type=PauseFunctionalityMockObservation, + executor=BlockingExecutor(step_entered), + ) + ] + + class TestPauseFunctionality: """Test suite for pause functionality.""" @@ -87,31 +150,6 @@ def setup_method(self): model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm" ) - class TestExecutor( - ToolExecutor[ - PauseFunctionalityMockAction, PauseFunctionalityMockObservation - ] - ): - def __call__( - self, - action: PauseFunctionalityMockAction, - conversation: BaseConversation | None = None, - ) -> PauseFunctionalityMockObservation: - return PauseFunctionalityMockObservation( - result=f"Executed: {action.command}" - ) - - def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: - return [ - ToolDefinition( - name="test_tool", - description="A test tool", - action_type=PauseFunctionalityMockAction, - 
observation_type=PauseFunctionalityMockObservation, - executor=TestExecutor(), - ) - ] - register_tool("test_tool", _make_tool) self.agent: Agent = Agent( @@ -123,12 +161,17 @@ def _make_tool(conv_state=None, **params) -> Sequence[ToolDefinition]: def test_pause_basic_functionality(self): """Test basic pause operations.""" # Test initial state - assert self.conversation.state.agent_status == AgentExecutionStatus.IDLE + assert ( + self.conversation.state.execution_status == ConversationExecutionStatus.IDLE + ) assert len(self.conversation.state.events) == 1 # System prompt event # Test pause method self.conversation.pause() - assert self.conversation.state.agent_status == AgentExecutionStatus.PAUSED + assert ( + self.conversation.state.execution_status + == ConversationExecutionStatus.PAUSED + ) pause_events = [ event @@ -163,13 +206,19 @@ def test_pause_during_normal_execution(self, mock_completion): self.conversation.pause() # Verify pause was set - assert self.conversation.state.agent_status == AgentExecutionStatus.PAUSED + assert ( + self.conversation.state.execution_status + == ConversationExecutionStatus.PAUSED + ) # Run resets pause flag at start and proceeds normally self.conversation.run() # Agent should be finished (pause was reset at start of run) - assert self.conversation.state.agent_status == AgentExecutionStatus.FINISHED + assert ( + self.conversation.state.execution_status + == ConversationExecutionStatus.FINISHED + ) # Should have pause event from the pause() call pause_events = [ @@ -202,13 +251,19 @@ def test_resume_paused_agent(self, mock_completion): # Pause before run self.conversation.pause() - assert self.conversation.state.agent_status == AgentExecutionStatus.PAUSED + assert ( + self.conversation.state.execution_status + == ConversationExecutionStatus.PAUSED + ) # First run() call resets pause and runs normally self.conversation.run() # Agent should be finished (pause was reset at start of run) - assert self.conversation.state.agent_status == AgentExecutionStatus.FINISHED + assert ( + self.conversation.state.execution_status + == ConversationExecutionStatus.FINISHED + ) # Should have agent message since run completed normally agent_messages = [ @@ -224,7 +279,10 @@ def test_pause_with_confirmation_mode(self, mock_completion): # Enable confirmation mode self.conversation.set_confirmation_policy(AlwaysConfirm()) self.conversation.pause() - assert self.conversation.state.agent_status == AgentExecutionStatus.PAUSED + assert ( + self.conversation.state.execution_status + == ConversationExecutionStatus.PAUSED + ) # Mock action tool_call = ChatCompletionMessageToolCall( @@ -261,8 +319,8 @@ def test_pause_with_confirmation_mode(self, mock_completion): # Pause should be reset, agent should be waiting for confirmation assert ( - self.conversation.state.agent_status - == AgentExecutionStatus.WAITING_FOR_CONFIRMATION + self.conversation.state.execution_status + == ConversationExecutionStatus.WAITING_FOR_CONFIRMATION ) # Action did not execute (no ObservationEvent should be recorded) @@ -301,7 +359,10 @@ def test_multiple_pause_calls_create_one_event(self): ) # State should be paused - assert self.conversation.state.agent_status == AgentExecutionStatus.PAUSED + assert ( + self.conversation.state.execution_status + == ConversationExecutionStatus.PAUSED + ) @pytest.mark.timeout(3) @patch("openhands.sdk.llm.llm.litellm_completion") @@ -309,15 +370,9 @@ def test_pause_while_running_continuous_actions(self, mock_completion): step_entered = threading.Event() def 
_make_blocking_tool(conv_state=None, **kwargs) -> Sequence[ToolDefinition]: - return [ - ToolDefinition( - name="test_tool", - description="Blocking tool for pause test", - action_type=PauseFunctionalityMockAction, - observation_type=PauseFunctionalityMockObservation, - executor=BlockingExecutor(step_entered), - ) - ] + return BlockingTestTool.create( + conv_state, step_entered=step_entered, **kwargs + ) register_tool("test_tool", _make_blocking_tool) agent = Agent( @@ -384,14 +439,20 @@ def run_agent(): # Wait until we're *inside* tool execution of the current iteration assert step_entered.wait(timeout=3.0), "Agent never reached tool execution" self.conversation.pause() - assert self.conversation.state.agent_status == AgentExecutionStatus.PAUSED + assert ( + self.conversation.state.execution_status + == ConversationExecutionStatus.PAUSED + ) assert finished.wait(timeout=3.0), "run() did not exit after pause" t.join(timeout=0.1) assert run_exc[0] is None, f"Run thread failed with: {run_exc[0]}" # paused, not finished, exactly one PauseEvent - assert self.conversation.state.agent_status == AgentExecutionStatus.PAUSED + assert ( + self.conversation.state.execution_status + == ConversationExecutionStatus.PAUSED + ) pause_events = [ e for e in self.conversation.state.events if isinstance(e, PauseEvent) ] diff --git a/tests/sdk/conversation/local/test_state_serialization.py b/tests/sdk/conversation/local/test_state_serialization.py index f878e27e2e..f87b4f36df 100644 --- a/tests/sdk/conversation/local/test_state_serialization.py +++ b/tests/sdk/conversation/local/test_state_serialization.py @@ -11,7 +11,10 @@ from openhands.sdk import Agent, Conversation from openhands.sdk.agent.base import AgentBase from openhands.sdk.conversation.impl.local_conversation import LocalConversation -from openhands.sdk.conversation.state import AgentExecutionStatus, ConversationState +from openhands.sdk.conversation.state import ( + ConversationExecutionStatus, + ConversationState, +) from openhands.sdk.event.llm_convertible import MessageEvent, SystemPromptEvent from openhands.sdk.llm import LLM, Message, TextContent from openhands.sdk.llm.llm_registry import RegistryEvent @@ -466,7 +469,7 @@ def test_conversation_state_flags_persistence(): state.stats.register_llm(RegistryEvent(llm=llm)) # Set various flags - state.agent_status = AgentExecutionStatus.FINISHED + state.execution_status = ConversationExecutionStatus.FINISHED state.confirmation_policy = AlwaysConfirm() state.activated_knowledge_skills = ["agent1", "agent2"] @@ -482,7 +485,7 @@ def test_conversation_state_flags_persistence(): assert loaded_state.id == state.id assert loaded_state.agent.llm.model == state.agent.llm.model # Verify flags are preserved - assert loaded_state.agent_status == AgentExecutionStatus.FINISHED + assert loaded_state.execution_status == ConversationExecutionStatus.FINISHED assert loaded_state.confirmation_policy == AlwaysConfirm() assert loaded_state.activated_knowledge_skills == ["agent1", "agent2"] # Test model_dump equality diff --git a/tests/sdk/conversation/remote/test_remote_conversation.py b/tests/sdk/conversation/remote/test_remote_conversation.py index 6d1c9a6125..2056d72e37 100644 --- a/tests/sdk/conversation/remote/test_remote_conversation.py +++ b/tests/sdk/conversation/remote/test_remote_conversation.py @@ -9,7 +9,7 @@ from openhands.sdk.agent import Agent from openhands.sdk.conversation.impl.remote_conversation import RemoteConversation -from openhands.sdk.conversation.secrets_manager import SecretValue +from 
openhands.sdk.conversation.secret_registry import SecretValue from openhands.sdk.llm import LLM, Message, TextContent from openhands.sdk.security.confirmation_policy import AlwaysConfirm from openhands.sdk.workspace import RemoteWorkspace @@ -472,7 +472,7 @@ def test_remote_conversation_update_secrets(self, mock_ws_client): # Test with string secrets from typing import cast - from openhands.sdk.conversation.secrets_manager import SecretValue + from openhands.sdk.conversation.secret_registry import SecretValue secrets = cast( dict[str, SecretValue], diff --git a/tests/sdk/conversation/remote/test_remote_state.py b/tests/sdk/conversation/remote/test_remote_state.py index 0879bdf0b6..4356dbc6d7 100644 --- a/tests/sdk/conversation/remote/test_remote_state.py +++ b/tests/sdk/conversation/remote/test_remote_state.py @@ -9,7 +9,7 @@ from openhands.sdk.agent import Agent from openhands.sdk.conversation.impl.remote_conversation import RemoteState -from openhands.sdk.conversation.state import AgentExecutionStatus +from openhands.sdk.conversation.state import ConversationExecutionStatus from openhands.sdk.llm import LLM from openhands.sdk.security.confirmation_policy import AlwaysConfirm @@ -37,7 +37,7 @@ def create_mock_conversation_info(conversation_id: str, mock_agent: Agent, **ove """Create mock conversation info response.""" default_info = { "id": conversation_id, - "agent_status": "running", + "execution_status": "running", "confirmation_policy": {"kind": "NeverConfirm"}, "activated_knowledge_skills": [], "agent": mock_agent.model_dump(mode="json"), @@ -86,27 +86,29 @@ def test_remote_state_initialization(mock_client, conversation_id): @pytest.mark.parametrize( "status_value,expected", [ - ("running", AgentExecutionStatus.RUNNING), - ("paused", AgentExecutionStatus.PAUSED), - ("finished", AgentExecutionStatus.FINISHED), + ("running", ConversationExecutionStatus.RUNNING), + ("paused", ConversationExecutionStatus.PAUSED), + ("finished", ConversationExecutionStatus.FINISHED), ], ) -def test_remote_state_agent_status( +def test_remote_state_execution_status( mock_client, conversation_id, mock_agent, status_value, expected ): - """Test agent_status property with different values.""" + """Test execution_status property with different values.""" conversation_info = create_mock_conversation_info( - conversation_id, mock_agent, agent_status=status_value + conversation_id, mock_agent, execution_status=status_value ) setup_mock_responses(mock_client, conversation_info) state = RemoteState(mock_client, conversation_id) - assert state.agent_status == expected + assert state.execution_status == expected -def test_remote_state_agent_status_setter_not_implemented(mock_client, conversation_id): - """Test that setting agent_status raises NotImplementedError.""" +def test_remote_state_execution_status_setter_not_implemented( + mock_client, conversation_id +): + """Test that setting execution_status raises NotImplementedError.""" mock_events_response = Mock() mock_events_response.raise_for_status.return_value = None mock_events_response.json.return_value = {"items": [], "next_page_id": None} @@ -115,9 +117,10 @@ def test_remote_state_agent_status_setter_not_implemented(mock_client, conversat state = RemoteState(mock_client, conversation_id) with pytest.raises( - NotImplementedError, match="Setting agent_status on RemoteState has no effect" + NotImplementedError, + match="Setting execution_status on RemoteState has no effect", ): - state.agent_status = AgentExecutionStatus.PAUSED + state.execution_status = 
ConversationExecutionStatus.PAUSED def test_remote_state_confirmation_policy(mock_client, conversation_id, mock_agent): @@ -162,7 +165,11 @@ def test_remote_state_agent_property(mock_client, conversation_id, mock_agent): @pytest.mark.parametrize( "missing_field,property_name,error_match", [ - ("agent_status", "agent_status", "agent_status missing in conversation info"), + ( + "execution_status", + "execution_status", + "execution_status missing in conversation info", + ), ( "confirmation_policy", "confirmation_policy", @@ -241,4 +248,4 @@ def test_remote_state_api_error_handling(mock_client, conversation_id): state = RemoteState(mock_client, conversation_id) with pytest.raises(httpx.HTTPStatusError): - _ = state.agent_status + _ = state.execution_status diff --git a/tests/sdk/conversation/test_agent_status_enum.py b/tests/sdk/conversation/test_agent_status_enum.py deleted file mode 100644 index b8e389261f..0000000000 --- a/tests/sdk/conversation/test_agent_status_enum.py +++ /dev/null @@ -1,74 +0,0 @@ -"""Test the AgentExecutionStatus enum functionality.""" - -from pydantic import SecretStr - -from openhands.sdk import Agent, Conversation -from openhands.sdk.conversation.state import AgentExecutionStatus -from openhands.sdk.llm import LLM - - -def test_agent_execution_state_enum_basic(): - """Test basic AgentExecutionStatus enum functionality.""" - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") - agent = Agent(llm=llm, tools=[]) - conversation = Conversation(agent=agent) - - # Test initial state - assert conversation._state.agent_status == AgentExecutionStatus.IDLE - - # Test setting enum directly - conversation._state.agent_status = AgentExecutionStatus.RUNNING - assert conversation._state.agent_status == AgentExecutionStatus.RUNNING - - # Test setting to FINISHED - conversation._state.agent_status = AgentExecutionStatus.FINISHED - assert conversation._state.agent_status == AgentExecutionStatus.FINISHED - - # Test setting to PAUSED - conversation._state.agent_status = AgentExecutionStatus.PAUSED - assert conversation._state.agent_status == AgentExecutionStatus.PAUSED - - # Test setting to WAITING_FOR_CONFIRMATION - conversation._state.agent_status = AgentExecutionStatus.WAITING_FOR_CONFIRMATION - assert ( - conversation._state.agent_status - == AgentExecutionStatus.WAITING_FOR_CONFIRMATION - ) - - # Test setting to ERROR - conversation._state.agent_status = AgentExecutionStatus.ERROR - assert conversation._state.agent_status == AgentExecutionStatus.ERROR - - -def test_enum_values(): - """Test that all enum values are correct.""" - assert AgentExecutionStatus.IDLE == "idle" - assert AgentExecutionStatus.RUNNING == "running" - assert AgentExecutionStatus.PAUSED == "paused" - assert AgentExecutionStatus.WAITING_FOR_CONFIRMATION == "waiting_for_confirmation" - assert AgentExecutionStatus.FINISHED == "finished" - assert AgentExecutionStatus.ERROR == "error" - - -def test_enum_serialization(): - """Test that the enum serializes and deserializes correctly.""" - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") - agent = Agent(llm=llm, tools=[]) - conversation = Conversation(agent=agent) - - # Set to different states and test serialization - conversation._state.agent_status = AgentExecutionStatus.FINISHED - serialized = conversation._state.model_dump_json() - assert '"agent_status": "finished"' in serialized - - conversation._state.agent_status = AgentExecutionStatus.PAUSED - serialized = conversation._state.model_dump_json() - 
assert '"agent_status": "paused"' in serialized - - conversation._state.agent_status = AgentExecutionStatus.WAITING_FOR_CONFIRMATION - serialized = conversation._state.model_dump_json() - assert '"agent_status": "waiting_for_confirmation"' in serialized - - conversation._state.agent_status = AgentExecutionStatus.ERROR - serialized = conversation._state.model_dump_json() - assert '"agent_status": "error"' in serialized diff --git a/tests/sdk/conversation/test_conversation_execution_status_enum.py b/tests/sdk/conversation/test_conversation_execution_status_enum.py new file mode 100644 index 0000000000..de9737cb8a --- /dev/null +++ b/tests/sdk/conversation/test_conversation_execution_status_enum.py @@ -0,0 +1,81 @@ +"""Test the ConversationExecutionStatus enum functionality.""" + +from pydantic import SecretStr + +from openhands.sdk import Agent, Conversation +from openhands.sdk.conversation.state import ConversationExecutionStatus +from openhands.sdk.llm import LLM + + +def test_agent_execution_state_enum_basic(): + """Test basic ConversationExecutionStatus enum functionality.""" + llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + agent = Agent(llm=llm, tools=[]) + conversation = Conversation(agent=agent) + + # Test initial state + assert conversation._state.execution_status == ConversationExecutionStatus.IDLE + + # Test setting enum directly + conversation._state.execution_status = ConversationExecutionStatus.RUNNING + assert conversation._state.execution_status == ConversationExecutionStatus.RUNNING + + # Test setting to FINISHED + conversation._state.execution_status = ConversationExecutionStatus.FINISHED + assert conversation._state.execution_status == ConversationExecutionStatus.FINISHED + + # Test setting to PAUSED + conversation._state.execution_status = ConversationExecutionStatus.PAUSED + assert conversation._state.execution_status == ConversationExecutionStatus.PAUSED + + # Test setting to WAITING_FOR_CONFIRMATION + conversation._state.execution_status = ( + ConversationExecutionStatus.WAITING_FOR_CONFIRMATION + ) + assert ( + conversation._state.execution_status + == ConversationExecutionStatus.WAITING_FOR_CONFIRMATION + ) + + # Test setting to ERROR + conversation._state.execution_status = ConversationExecutionStatus.ERROR + assert conversation._state.execution_status == ConversationExecutionStatus.ERROR + + +def test_enum_values(): + """Test that all enum values are correct.""" + assert ConversationExecutionStatus.IDLE == "idle" + assert ConversationExecutionStatus.RUNNING == "running" + assert ConversationExecutionStatus.PAUSED == "paused" + assert ( + ConversationExecutionStatus.WAITING_FOR_CONFIRMATION + == "waiting_for_confirmation" + ) + assert ConversationExecutionStatus.FINISHED == "finished" + assert ConversationExecutionStatus.ERROR == "error" + + +def test_enum_serialization(): + """Test that the enum serializes and deserializes correctly.""" + llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + agent = Agent(llm=llm, tools=[]) + conversation = Conversation(agent=agent) + + # Set to different states and test serialization + conversation._state.execution_status = ConversationExecutionStatus.FINISHED + serialized = conversation._state.model_dump_json() + assert '"execution_status": "finished"' in serialized + + conversation._state.execution_status = ConversationExecutionStatus.PAUSED + serialized = conversation._state.model_dump_json() + assert '"execution_status": "paused"' in serialized + + 
conversation._state.execution_status = ( + ConversationExecutionStatus.WAITING_FOR_CONFIRMATION + ) + serialized = conversation._state.model_dump_json() + assert '"execution_status": "waiting_for_confirmation"' in serialized + + conversation._state.execution_status = ConversationExecutionStatus.ERROR + serialized = conversation._state.model_dump_json() + assert '"execution_status": "error"' in serialized diff --git a/tests/sdk/conversation/test_conversation_secrets_constructor.py b/tests/sdk/conversation/test_conversation_secrets_constructor.py index a93e65ac08..3ca366b6f3 100644 --- a/tests/sdk/conversation/test_conversation_secrets_constructor.py +++ b/tests/sdk/conversation/test_conversation_secrets_constructor.py @@ -43,18 +43,18 @@ def test_local_conversation_constructor_with_secrets(): assert isinstance(conv, LocalConversation) # Verify secrets were initialized - secrets_manager = conv.state.secrets_manager - assert secrets_manager is not None + secret_registry = conv.state.secret_registry + assert secret_registry is not None - # Verify secrets are accessible through the secrets manager - env_vars = secrets_manager.get_secrets_as_env_vars("echo $API_KEY") + # Verify secrets are accessible through the secret registry + env_vars = secret_registry.get_secrets_as_env_vars("echo $API_KEY") assert env_vars == {"API_KEY": "test-api-key-123"} - env_vars = secrets_manager.get_secrets_as_env_vars("echo $DATABASE_URL") + env_vars = secret_registry.get_secrets_as_env_vars("echo $DATABASE_URL") assert env_vars == {"DATABASE_URL": "postgresql://localhost/test"} # Test multiple secrets in one command - env_vars = secrets_manager.get_secrets_as_env_vars( + env_vars = secret_registry.get_secrets_as_env_vars( "export API_KEY=$API_KEY && export AUTH_TOKEN=$AUTH_TOKEN" ) assert env_vars == { @@ -90,15 +90,15 @@ def get_value(self): assert isinstance(conv, LocalConversation) # Verify callable secrets work - secrets_manager = conv.state.secrets_manager + secret_registry = conv.state.secret_registry - env_vars = secrets_manager.get_secrets_as_env_vars("echo $DYNAMIC_TOKEN") + env_vars = secret_registry.get_secrets_as_env_vars("echo $DYNAMIC_TOKEN") assert env_vars == {"DYNAMIC_TOKEN": "dynamic-token-789"} - env_vars = secrets_manager.get_secrets_as_env_vars("echo $API_KEY") + env_vars = secret_registry.get_secrets_as_env_vars("echo $API_KEY") assert env_vars == {"API_KEY": "callable-api-key"} - env_vars = secrets_manager.get_secrets_as_env_vars("echo $STATIC_KEY") + env_vars = secret_registry.get_secrets_as_env_vars("echo $STATIC_KEY") assert env_vars == {"STATIC_KEY": "static-value"} @@ -118,11 +118,11 @@ def test_local_conversation_constructor_without_secrets(): assert isinstance(conv, LocalConversation) # Verify secrets manager exists but is empty - secrets_manager = conv.state.secrets_manager - assert secrets_manager is not None + secret_registry = conv.state.secret_registry + assert secret_registry is not None # Should return empty dict for any command - env_vars = secrets_manager.get_secrets_as_env_vars("echo $API_KEY") + env_vars = secret_registry.get_secrets_as_env_vars("echo $API_KEY") assert env_vars == {} @@ -142,11 +142,11 @@ def test_local_conversation_constructor_with_empty_secrets(): assert isinstance(conv, LocalConversation) # Verify secrets manager exists but is empty - secrets_manager = conv.state.secrets_manager - assert secrets_manager is not None + secret_registry = conv.state.secret_registry + assert secret_registry is not None # Should return empty dict for any command - env_vars = 
secrets_manager.get_secrets_as_env_vars("echo $API_KEY") + env_vars = secret_registry.get_secrets_as_env_vars("echo $API_KEY") assert env_vars == {} diff --git a/tests/sdk/conversation/test_remote_conversation_state_updates.py b/tests/sdk/conversation/test_remote_conversation_state_updates.py index af51b1fd0c..472118c3d6 100644 --- a/tests/sdk/conversation/test_remote_conversation_state_updates.py +++ b/tests/sdk/conversation/test_remote_conversation_state_updates.py @@ -41,7 +41,7 @@ def test_update_state_from_event_with_full_state(): # Create a full state event full_state = { - "agent_status": "running", + "execution_status": "running", "confirmation_policy": {"kind": "NeverConfirm"}, "max_iterations": 100, } @@ -53,7 +53,7 @@ def test_update_state_from_event_with_full_state(): # Verify all fields were updated assert conv.state._cached_state is not None assert conv.state._cached_state == full_state - assert conv.state._cached_state["agent_status"] == "running" + assert conv.state._cached_state["execution_status"] == "running" assert conv.state._cached_state["max_iterations"] == 100 @@ -79,19 +79,19 @@ def test_update_state_from_event_with_individual_field(): # Set initial cached state conv.state._cached_state = { - "agent_status": "idle", + "execution_status": "idle", "max_iterations": 50, } # Create an individual field update event - event = ConversationStateUpdateEvent(key="agent_status", value="running") + event = ConversationStateUpdateEvent(key="execution_status", value="running") # Update state using the real RemoteState conv.state.update_state_from_event(event) # Verify only that field was updated assert conv.state._cached_state is not None - assert conv.state._cached_state["agent_status"] == "running" + assert conv.state._cached_state["execution_status"] == "running" assert conv.state._cached_state["max_iterations"] == 50 # Unchanged @@ -119,12 +119,12 @@ def test_update_state_initializes_cache_if_none(): conv.state._cached_state = None # Update with individual field when cache is None - event = ConversationStateUpdateEvent(key="agent_status", value="running") + event = ConversationStateUpdateEvent(key="execution_status", value="running") conv.state.update_state_from_event(event) # Verify cache was initialized assert conv.state._cached_state is not None - assert conv.state._cached_state["agent_status"] == "running" + assert conv.state._cached_state["execution_status"] == "running" def test_update_state_from_multiple_events(): @@ -149,7 +149,7 @@ def test_update_state_from_multiple_events(): # First, full state full_state = { - "agent_status": "idle", + "execution_status": "idle", "max_iterations": 50, "stuck_detection": True, } @@ -157,7 +157,7 @@ def test_update_state_from_multiple_events(): conv.state.update_state_from_event(event1) # Then, individual updates - event2 = ConversationStateUpdateEvent(key="agent_status", value="running") + event2 = ConversationStateUpdateEvent(key="execution_status", value="running") conv.state.update_state_from_event(event2) event3 = ConversationStateUpdateEvent(key="max_iterations", value=100) @@ -165,7 +165,7 @@ def test_update_state_from_multiple_events(): # Verify final state assert conv.state._cached_state is not None - assert conv.state._cached_state["agent_status"] == "running" + assert conv.state._cached_state["execution_status"] == "running" assert conv.state._cached_state["max_iterations"] == 100 assert conv.state._cached_state["stuck_detection"] is True @@ -192,14 +192,14 @@ def test_update_state_full_state_overwrites_fields(): # Set 
initial cached state conv.state._cached_state = { - "agent_status": "running", + "execution_status": "running", "max_iterations": 100, "old_field": "old_value", } # Update with full state (without old_field) full_state = { - "agent_status": "idle", + "execution_status": "idle", "max_iterations": 50, } event = ConversationStateUpdateEvent(key="full_state", value=full_state) @@ -207,7 +207,7 @@ def test_update_state_full_state_overwrites_fields(): # Verify new fields are set and old field still exists (update, not replace) assert conv.state._cached_state is not None - assert conv.state._cached_state["agent_status"] == "idle" + assert conv.state._cached_state["execution_status"] == "idle" assert conv.state._cached_state["max_iterations"] == 50 assert "old_field" in conv.state._cached_state # Still there from .update() @@ -321,11 +321,11 @@ def test_state_update_callback_integration(): state_update_callback = conv.state.create_state_update_callback() # Test that the callback properly handles ConversationStateUpdateEvent - event = ConversationStateUpdateEvent(key="agent_status", value="running") + event = ConversationStateUpdateEvent(key="execution_status", value="running") # Call the callback directly (simulating websocket event) state_update_callback(event) # Verify the state was updated assert conv.state._cached_state is not None - assert conv.state._cached_state["agent_status"] == "running" + assert conv.state._cached_state["execution_status"] == "running" diff --git a/tests/sdk/conversation/test_secrets_manager.py b/tests/sdk/conversation/test_secrets_manager.py index ab93d1b350..6540704d2b 100644 --- a/tests/sdk/conversation/test_secrets_manager.py +++ b/tests/sdk/conversation/test_secrets_manager.py @@ -2,20 +2,20 @@ from pydantic import SecretStr +from openhands.sdk.conversation.secret_registry import SecretRegistry from openhands.sdk.conversation.secret_source import SecretSource, StaticSecret -from openhands.sdk.conversation.secrets_manager import SecretsManager def test_update_secrets_with_static_values(): """Test updating secrets with static string values.""" - manager = SecretsManager() + secret_registry = SecretRegistry() secrets = { "API_KEY": "test-api-key", "DATABASE_URL": "postgresql://localhost/test", } - manager.update_secrets(secrets) - assert manager.secret_sources == { + secret_registry.update_secrets(secrets) + assert secret_registry.secret_sources == { "API_KEY": StaticSecret(value=SecretStr("test-api-key")), "DATABASE_URL": StaticSecret(value=SecretStr("postgresql://localhost/test")), } @@ -23,30 +23,30 @@ def test_update_secrets_with_static_values(): def test_update_secrets_overwrites_existing(): """Test that update_secrets overwrites existing keys.""" - manager = SecretsManager() + secret_registry = SecretRegistry() # Add initial secrets - manager.update_secrets({"API_KEY": "old-value"}) - assert manager.secret_sources["API_KEY"] == StaticSecret( + secret_registry.update_secrets({"API_KEY": "old-value"}) + assert secret_registry.secret_sources["API_KEY"] == StaticSecret( value=SecretStr("old-value") ) # Update with new value - manager.update_secrets({"API_KEY": "new-value", "NEW_KEY": "key-value"}) - assert manager.secret_sources["API_KEY"] == StaticSecret( + secret_registry.update_secrets({"API_KEY": "new-value", "NEW_KEY": "key-value"}) + assert secret_registry.secret_sources["API_KEY"] == StaticSecret( value=SecretStr("new-value") ) - manager.update_secrets({"API_KEY": "new-value-2"}) - assert manager.secret_sources["API_KEY"] == StaticSecret( + 
secret_registry.update_secrets({"API_KEY": "new-value-2"}) + assert secret_registry.secret_sources["API_KEY"] == StaticSecret( value=SecretStr("new-value-2") ) def test_find_secrets_in_text_case_insensitive(): """Test that find_secrets_in_text is case insensitive.""" - manager = SecretsManager() - manager.update_secrets( + secret_registry = SecretRegistry() + secret_registry.update_secrets( { "API_KEY": "test-key", "DATABASE_PASSWORD": "test-password", @@ -54,23 +54,23 @@ def test_find_secrets_in_text_case_insensitive(): ) # Test various case combinations - found = manager.find_secrets_in_text("echo api_key=$API_KEY") + found = secret_registry.find_secrets_in_text("echo api_key=$API_KEY") assert found == {"API_KEY"} - found = manager.find_secrets_in_text("echo $database_password") + found = secret_registry.find_secrets_in_text("echo $database_password") assert found == {"DATABASE_PASSWORD"} - found = manager.find_secrets_in_text("API_KEY and DATABASE_PASSWORD") + found = secret_registry.find_secrets_in_text("API_KEY and DATABASE_PASSWORD") assert found == {"API_KEY", "DATABASE_PASSWORD"} - found = manager.find_secrets_in_text("echo hello world") + found = secret_registry.find_secrets_in_text("echo hello world") assert found == set() def test_find_secrets_in_text_partial_matches(): """Test that find_secrets_in_text handles partial matches correctly.""" - manager = SecretsManager() - manager.update_secrets( + secret_registry = SecretRegistry() + secret_registry.update_secrets( { "API_KEY": "test-key", "API": "test-api", # Shorter key that's contained in API_KEY @@ -78,25 +78,25 @@ def test_find_secrets_in_text_partial_matches(): ) # Both should be found since "API" is contained in "API_KEY" - found = manager.find_secrets_in_text("export API_KEY=$API_KEY") + found = secret_registry.find_secrets_in_text("export API_KEY=$API_KEY") assert "API_KEY" in found assert "API" in found def test_get_secrets_as_env_vars_static_values(): """Test get_secrets_as_env_vars with static values.""" - manager = SecretsManager() - manager.update_secrets( + secret_registry = SecretRegistry() + secret_registry.update_secrets( { "API_KEY": "test-api-key", "DATABASE_URL": "postgresql://localhost/test", } ) - env_vars = manager.get_secrets_as_env_vars("curl -H 'X-API-Key: $API_KEY'") + env_vars = secret_registry.get_secrets_as_env_vars("curl -H 'X-API-Key: $API_KEY'") assert env_vars == {"API_KEY": "test-api-key"} - env_vars = manager.get_secrets_as_env_vars( + env_vars = secret_registry.get_secrets_as_env_vars( "export API_KEY=$API_KEY && export DATABASE_URL=$DATABASE_URL" ) assert env_vars == { @@ -107,26 +107,28 @@ def test_get_secrets_as_env_vars_static_values(): def test_get_secrets_as_env_vars_callable_values(): """Test get_secrets_as_env_vars with callable values.""" - manager = SecretsManager() + secret_registry = SecretRegistry() class MyTokenSource(SecretSource): def get_value(self): return "dynamic-token-456" - manager.update_secrets( + secret_registry.update_secrets( { "STATIC_KEY": "static-value", "DYNAMIC_TOKEN": MyTokenSource(), } ) - env_vars = manager.get_secrets_as_env_vars("export DYNAMIC_TOKEN=$DYNAMIC_TOKEN") + env_vars = secret_registry.get_secrets_as_env_vars( + "export DYNAMIC_TOKEN=$DYNAMIC_TOKEN" + ) assert env_vars == {"DYNAMIC_TOKEN": "dynamic-token-456"} def test_get_secrets_as_env_vars_handles_callable_exceptions(): """Test that get_secrets_as_env_vars handles exceptions from callables.""" - manager = SecretsManager() + secret_registry = SecretRegistry() class 
MyFailingTokenSource(SecretSource): def get_value(self): @@ -136,7 +138,7 @@ class MyWorkingTokenSource(SecretSource): def get_value(self): return "working-value" - manager.update_secrets( + secret_registry.update_secrets( { "FAILING_SECRET": MyFailingTokenSource(), "WORKING_SECRET": MyWorkingTokenSource(), @@ -144,7 +146,7 @@ def get_value(self): ) # Should not raise exception, should skip failing secret - env_vars = manager.get_secrets_as_env_vars( + env_vars = secret_registry.get_secrets_as_env_vars( "export FAILING_SECRET=$FAILING_SECRET && export WORKING_SECRET=$WORKING_SECRET" ) diff --git a/tests/sdk/conversation/test_state_change_callback.py b/tests/sdk/conversation/test_state_change_callback.py index 3ac08ca00d..9443a94291 100644 --- a/tests/sdk/conversation/test_state_change_callback.py +++ b/tests/sdk/conversation/test_state_change_callback.py @@ -6,7 +6,10 @@ from pydantic import SecretStr from openhands.sdk import LLM, Agent -from openhands.sdk.conversation.state import AgentExecutionStatus, ConversationState +from openhands.sdk.conversation.state import ( + ConversationExecutionStatus, + ConversationState, +) from openhands.sdk.event.conversation_state import ConversationStateUpdateEvent from openhands.sdk.io import InMemoryFileStore from openhands.sdk.workspace import LocalWorkspace @@ -45,14 +48,14 @@ def callback(event: ConversationStateUpdateEvent): # Change state - should trigger callback with state: - state.agent_status = AgentExecutionStatus.RUNNING + state.execution_status = ConversationExecutionStatus.RUNNING # Verify callback was called assert len(callback_calls) == 1 event = callback_calls[0] assert isinstance(event, ConversationStateUpdateEvent) - assert event.key == "agent_status" - assert event.value == AgentExecutionStatus.RUNNING + assert event.key == "execution_status" + assert event.value == ConversationExecutionStatus.RUNNING def test_callback_called_multiple_times(state): @@ -66,15 +69,15 @@ def callback(event: ConversationStateUpdateEvent): # Make multiple state changes with state: - state.agent_status = AgentExecutionStatus.RUNNING - state.agent_status = AgentExecutionStatus.PAUSED - state.agent_status = AgentExecutionStatus.FINISHED + state.execution_status = ConversationExecutionStatus.RUNNING + state.execution_status = ConversationExecutionStatus.PAUSED + state.execution_status = ConversationExecutionStatus.FINISHED # Verify callback was called for each change assert len(callback_calls) == 3 - assert callback_calls[0].value == AgentExecutionStatus.RUNNING - assert callback_calls[1].value == AgentExecutionStatus.PAUSED - assert callback_calls[2].value == AgentExecutionStatus.FINISHED + assert callback_calls[0].value == ConversationExecutionStatus.RUNNING + assert callback_calls[1].value == ConversationExecutionStatus.PAUSED + assert callback_calls[2].value == ConversationExecutionStatus.FINISHED def test_callback_can_be_cleared(state): @@ -90,7 +93,7 @@ def callback(event: ConversationStateUpdateEvent): # Change state - callback should not be called with state: - state.agent_status = AgentExecutionStatus.RUNNING + state.execution_status = ConversationExecutionStatus.RUNNING # Verify callback was not called assert len(callback_calls) == 0 @@ -106,10 +109,10 @@ def bad_callback(event: ConversationStateUpdateEvent): # Change state - should not raise despite callback error with state: - state.agent_status = AgentExecutionStatus.RUNNING + state.execution_status = ConversationExecutionStatus.RUNNING # Verify state was still changed - assert 
state.agent_status == AgentExecutionStatus.RUNNING + assert state.execution_status == ConversationExecutionStatus.RUNNING def test_callback_not_called_without_lock(state): @@ -123,7 +126,7 @@ def callback(event: ConversationStateUpdateEvent): # This should still trigger callback since __setattr__ is called with state: - state.agent_status = AgentExecutionStatus.RUNNING + state.execution_status = ConversationExecutionStatus.RUNNING # Verify callback was called assert len(callback_calls) == 1 @@ -140,13 +143,13 @@ def callback(event: ConversationStateUpdateEvent): # Change different types of fields with state: - state.agent_status = AgentExecutionStatus.RUNNING + state.execution_status = ConversationExecutionStatus.RUNNING state.max_iterations = 100 state.stuck_detection = False # Verify callback was called for each change assert len(callback_calls) == 3 - assert callback_calls[0].key == "agent_status" + assert callback_calls[0].key == "execution_status" assert callback_calls[1].key == "max_iterations" assert callback_calls[2].key == "stuck_detection" diff --git a/tests/sdk/conversation/test_visualizer.py b/tests/sdk/conversation/test_visualizer.py index 7b479c2c0d..fcac0c8a33 100644 --- a/tests/sdk/conversation/test_visualizer.py +++ b/tests/sdk/conversation/test_visualizer.py @@ -206,7 +206,7 @@ def test_agent_error_event_visualize(): event = AgentErrorEvent( error="Failed to execute command: permission denied", tool_call_id="call_err_1", - tool_name="execute_bash", + tool_name="bash", ) result = event.visualize @@ -333,13 +333,53 @@ def test_metrics_formatting(): # Test the metrics subtitle formatting subtitle = visualizer._format_metrics_subtitle() assert subtitle is not None - assert "1.50K" in subtitle # Input tokens abbreviated + assert "1.5K" in subtitle # Input tokens abbreviated (trailing zeros removed) assert "500" in subtitle # Output tokens assert "20.00%" in subtitle # Cache hit rate assert "200" in subtitle # Reasoning tokens assert "0.0234" in subtitle # Cost +def test_metrics_abbreviation_formatting(): + """Test number abbreviation with various edge cases.""" + from openhands.sdk.conversation.conversation_stats import ConversationStats + from openhands.sdk.llm.utils.metrics import Metrics + + test_cases = [ + # (input_tokens, expected_abbr) + (999, "999"), # Below threshold + (1000, "1K"), # Exact K boundary, trailing zeros removed + (1500, "1.5K"), # K with one decimal, trailing zero removed + (89080, "89.08K"), # K with two decimals (regression test for bug) + (89000, "89K"), # K with trailing zeros removed + (1000000, "1M"), # Exact M boundary + (1234567, "1.23M"), # M with decimals + (1000000000, "1B"), # Exact B boundary + ] + + for tokens, expected in test_cases: + stats = ConversationStats() + metrics = Metrics(model_name="test-model") + metrics.add_token_usage( + prompt_tokens=tokens, + completion_tokens=100, + cache_read_tokens=0, + cache_write_tokens=0, + reasoning_tokens=0, + context_window=8000, + response_id="test", + ) + stats.usage_to_metrics["test"] = metrics + + visualizer = ConversationVisualizer(conversation_stats=stats) + subtitle = visualizer._format_metrics_subtitle() + + assert subtitle is not None, f"Failed for {tokens}" + assert expected in subtitle, ( + f"Expected '{expected}' in subtitle for {tokens}, got: {subtitle}" + ) + + def test_event_base_fallback_visualize(): """Test that Event provides fallback visualization.""" from openhands.sdk.event.base import Event diff --git a/tests/sdk/event/test_event_immutability.py 
b/tests/sdk/event/test_event_immutability.py index cbc8bff629..6d9173e099 100644 --- a/tests/sdk/event/test_event_immutability.py +++ b/tests/sdk/event/test_event_immutability.py @@ -220,7 +220,9 @@ def test_pause_event_is_frozen(): def test_condensation_is_frozen(): """Test that Condensation instances are frozen.""" event = Condensation( - forgotten_event_ids=["event1", "event2"], summary="Test summary" + forgotten_event_ids=["event1", "event2"], + summary="Test summary", + llm_response_id="condensation_response_1", ) # Test that we cannot modify any field diff --git a/tests/sdk/event/test_event_serialization.py b/tests/sdk/event/test_event_serialization.py index 9abf4495e0..fd3a3d17ee 100644 --- a/tests/sdk/event/test_event_serialization.py +++ b/tests/sdk/event/test_event_serialization.py @@ -151,6 +151,7 @@ def test_condensation_serialization() -> None: event = Condensation( summary="This is a summary", forgotten_event_ids=["event1", "event2", "event3", "event4", "event5"], + llm_response_id="condensation_response_1", ) # Serialize diff --git a/tests/sdk/event/test_events_to_messages.py b/tests/sdk/event/test_events_to_messages.py index 953d3910e9..eba2bacae6 100644 --- a/tests/sdk/event/test_events_to_messages.py +++ b/tests/sdk/event/test_events_to_messages.py @@ -104,7 +104,7 @@ def test_single_action_event(self): """Test conversion of single ActionEvent.""" action_event = create_action_event( thought_text="I need to run a command", - tool_name="execute_bash", + tool_name="bash", tool_call_id="call_123", llm_response_id="response_1", action_args={"command": "ls -la"}, @@ -121,7 +121,7 @@ def test_single_action_event(self): assert messages[0].tool_calls is not None assert len(messages[0].tool_calls) == 1 assert messages[0].tool_calls[0].id == "call_123" - assert messages[0].tool_calls[0].name == "execute_bash" + assert messages[0].tool_calls[0].name == "bash" def test_parallel_function_calling_same_response_id(self): """Test parallel function calling with multiple ActionEvents having same ID. 
@@ -200,7 +200,7 @@ def test_multiple_separate_action_events(self): """Test multiple ActionEvents with different response_ids (separate calls).""" action1 = create_action_event( thought_text="First command", - tool_name="execute_bash", + tool_name="bash", tool_call_id="call_1", llm_response_id="response_1", action_args={"command": "ls"}, @@ -208,7 +208,7 @@ def test_multiple_separate_action_events(self): action2 = create_action_event( thought_text="Second command", - tool_name="execute_bash", + tool_name="bash", tool_call_id="call_2", llm_response_id="response_2", action_args={"command": "pwd"}, @@ -291,7 +291,7 @@ def test_agent_error_event(self): error_event = AgentErrorEvent( error="Command failed with exit code 1", tool_call_id="call_err", - tool_name="execute_bash", + tool_name="bash", ) events = [error_event] @@ -355,7 +355,7 @@ def test_complex_parallel_and_sequential_mix(self): # Fourth: Separate file listing call (different response_id) list_files = create_action_event( thought_text="Now I'll list the files", - tool_name="execute_bash", + tool_name="bash", tool_call_id="call_ls", llm_response_id="list_files_response", action_args={"command": "ls -la"}, diff --git a/tests/sdk/llm/test_api_connection_error_retry.py b/tests/sdk/llm/test_api_connection_error_retry.py index 7209262088..315490d18f 100644 --- a/tests/sdk/llm/test_api_connection_error_retry.py +++ b/tests/sdk/llm/test_api_connection_error_retry.py @@ -6,6 +6,7 @@ from pydantic import SecretStr from openhands.sdk.llm import LLM, LLMResponse, Message, TextContent +from openhands.sdk.llm.exceptions import LLMServiceUnavailableError def create_mock_response(content: str = "Test response", response_id: str = "test-id"): @@ -80,7 +81,7 @@ def test_completion_retries_api_connection_error( def test_completion_max_retries_api_connection_error( mock_litellm_completion, default_config ): - """Test that APIConnectionError respects max retries.""" + """Test that APIConnectionError respects max retries and is mapped to SDK error.""" # Mock the litellm_completion to raise APIConnectionError multiple times mock_litellm_completion.side_effect = [ APIConnectionError( @@ -110,8 +111,9 @@ def test_completion_max_retries_api_connection_error( usage_id="test-service", ) - # The completion should raise an APIConnectionError after exhausting all retries - with pytest.raises(APIConnectionError) as excinfo: + # The completion should raise an SDK typed error after exhausting all retries + + with pytest.raises(LLMServiceUnavailableError) as excinfo: llm.completion( messages=[Message(role="user", content=[TextContent(text="Hello!")])], ) @@ -123,6 +125,9 @@ def test_completion_max_retries_api_connection_error( # The exception should contain connection error information assert "API connection error" in str(excinfo.value) + # Ensure the original provider exception is preserved as the cause + assert isinstance(excinfo.value.__cause__, APIConnectionError) + @patch("openhands.sdk.llm.llm.litellm_completion") def test_completion_no_retry_on_success(mock_litellm_completion, default_config): diff --git a/tests/sdk/llm/test_exception.py b/tests/sdk/llm/test_exception.py index 8ac1c63914..c15620c49d 100644 --- a/tests/sdk/llm/test_exception.py +++ b/tests/sdk/llm/test_exception.py @@ -61,9 +61,7 @@ def test_llm_context_window_exceed_error_default(): error = LLMContextWindowExceedError() expected_message = "Conversation history longer than LLM context window limit. 
" - expected_message += ( - "Consider turning on enable_history_truncation config to avoid this error" - ) + expected_message += "Consider enabling a condenser or shortening inputs." assert str(error) == expected_message assert error.message == expected_message diff --git a/tests/sdk/llm/test_exception_classifier.py b/tests/sdk/llm/test_exception_classifier.py new file mode 100644 index 0000000000..08c76dbbac --- /dev/null +++ b/tests/sdk/llm/test_exception_classifier.py @@ -0,0 +1,46 @@ +from litellm.exceptions import BadRequestError, ContextWindowExceededError + +from openhands.sdk.llm.exceptions import ( + is_context_window_exceeded, + looks_like_auth_error, +) + + +MODEL = "test-model" +PROVIDER = "test-provider" + + +def test_is_context_window_exceeded_direct_type(): + assert ( + is_context_window_exceeded(ContextWindowExceededError("boom", MODEL, PROVIDER)) + is True + ) + + +def test_is_context_window_exceeded_via_text(): + # BadRequest containing context-window-ish text should be detected + e = BadRequestError( + "The request exceeds the available context size", MODEL, PROVIDER + ) + assert is_context_window_exceeded(e) is True + + +def test_is_context_window_exceeded_negative(): + assert ( + is_context_window_exceeded(BadRequestError("irrelevant", MODEL, PROVIDER)) + is False + ) + + +def test_looks_like_auth_error_positive(): + assert ( + looks_like_auth_error(BadRequestError("Invalid API key", MODEL, PROVIDER)) + is True + ) + + +def test_looks_like_auth_error_negative(): + assert ( + looks_like_auth_error(BadRequestError("Something else", MODEL, PROVIDER)) + is False + ) diff --git a/tests/sdk/llm/test_exception_mapping.py b/tests/sdk/llm/test_exception_mapping.py new file mode 100644 index 0000000000..af23e439fa --- /dev/null +++ b/tests/sdk/llm/test_exception_mapping.py @@ -0,0 +1,41 @@ +from litellm.exceptions import BadRequestError + +from openhands.sdk.llm.exceptions import ( + LLMAuthenticationError, + LLMBadRequestError, + map_provider_exception, +) + + +MODEL = "test-model" +PROVIDER = "test-provider" + + +def test_map_auth_error_from_bad_request(): + e = BadRequestError("Invalid API key provided", MODEL, PROVIDER) + mapped = map_provider_exception(e) + assert isinstance(mapped, LLMAuthenticationError) + + +def test_map_auth_error_from_openai_error(): + # OpenAIError has odd behavior; create a BadRequestError that wraps an + # auth-like message instead, as providers commonly route auth issues + # through BadRequestError in LiteLLM + e = BadRequestError("status 401 Unauthorized: missing API key", MODEL, PROVIDER) + mapped = map_provider_exception(e) + assert isinstance(mapped, LLMAuthenticationError) + + +def test_map_generic_bad_request(): + e = BadRequestError("Some client-side error not related to auth", MODEL, PROVIDER) + mapped = map_provider_exception(e) + assert isinstance(mapped, LLMBadRequestError) + + +def test_passthrough_unknown_exception(): + class MyCustom(Exception): + pass + + e = MyCustom("random") + mapped = map_provider_exception(e) + assert mapped is e diff --git a/tests/sdk/llm/test_llm.py b/tests/sdk/llm/test_llm.py index 0d34e4640a..e5a2eef089 100644 --- a/tests/sdk/llm/test_llm.py +++ b/tests/sdk/llm/test_llm.py @@ -5,6 +5,9 @@ from litellm.exceptions import ( RateLimitError, ) +from litellm.types.llms.openai import ResponseAPIUsage, ResponsesAPIResponse +from openai.types.responses.response_output_message import ResponseOutputMessage +from openai.types.responses.response_output_text import ResponseOutputText from pydantic import SecretStr from 
openhands.sdk.llm import LLM, LLMResponse, Message, TextContent @@ -261,6 +264,189 @@ def test_llm_token_counting(default_llm): assert token_count >= 0 +@patch("openhands.sdk.llm.llm.litellm_completion") +def test_llm_forwards_extra_headers_to_litellm(mock_completion): + mock_response = create_mock_litellm_response("ok") + mock_completion.return_value = mock_response + + headers = {"anthropic-beta": "context-1m-2025-08-07"} # Enable 1M context + llm = LLM( + usage_id="test-llm", + model="gpt-4o", + api_key=SecretStr("test_key"), + extra_headers=headers, + num_retries=0, + ) + + messages = [Message(role="user", content=[TextContent(text="Hi")])] + _ = llm.completion(messages=messages) + + assert mock_completion.call_count == 1 + _, kwargs = mock_completion.call_args + # extra_headers forwarded either directly or inside **kwargs + assert kwargs.get("extra_headers") == headers + + +@patch("openhands.sdk.llm.llm.litellm_responses") +def test_llm_responses_forwards_extra_headers_to_litellm(mock_responses): + # Build a minimal, but valid, ResponsesAPIResponse instance per litellm types + # Build typed message output using OpenAI types to satisfy litellm schema + msg = ResponseOutputMessage.model_construct( + id="m1", + type="message", + role="assistant", + status="completed", + content=[ResponseOutputText(type="output_text", text="ok", annotations=[])], + ) + usage = ResponseAPIUsage(input_tokens=0, output_tokens=0, total_tokens=0) + resp = ResponsesAPIResponse( + id="resp123", + created_at=0, + output=[msg], + usage=usage, + parallel_tool_calls=False, + tool_choice="auto", + top_p=None, + tools=[], + instructions="", + status="completed", + ) + + mock_responses.return_value = resp + + headers = {"anthropic-beta": "context-1m-2025-08-07"} + llm = LLM( + usage_id="test-llm", + model="gpt-4o", + api_key=SecretStr("test_key"), + extra_headers=headers, + num_retries=0, + ) + + messages = [ + Message(role="system", content=[TextContent(text="sys")]), + Message(role="user", content=[TextContent(text="Hi")]), + ] + _ = llm.responses(messages=messages) + + assert mock_responses.call_count == 1 + _, kwargs = mock_responses.call_args + assert kwargs.get("extra_headers") == headers + + +@patch("openhands.sdk.llm.llm.litellm_completion") +def test_completion_merges_llm_extra_headers_with_extended_thinking_default( + mock_completion, +): + mock_response = create_mock_litellm_response("ok") + mock_completion.return_value = mock_response + + llm = LLM( + usage_id="test-llm", + model="claude-sonnet-4-5-20250514", + api_key=SecretStr("test_key"), + extra_headers={"X-Trace": "1"}, + extended_thinking_budget=1000, + num_retries=0, + ) + + messages = [Message(role="user", content=[TextContent(text="Hi")])] + _ = llm.completion(messages=messages) + + assert mock_completion.call_count == 1 + _, kwargs = mock_completion.call_args + headers = kwargs.get("extra_headers") or {} + # Intended behavior: + # - No per-call headers provided. + # - LLM.extra_headers should be used. + # - Extended thinking default (anthropic-beta) should be merged in. + # - Result keeps both the default and configured headers. 
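+    # A minimal sketch of the merge these assertions assume (hypothetical
+    # shape, not the SDK's actual implementation):
+    #     defaults = {"anthropic-beta": "interleaved-thinking-2025-05-14"}
+    #     headers = {**defaults, **(llm.extra_headers or {})}
+    # Precedence between defaults and configured headers is not exercised
+    # here, since the two dicts share no keys.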
+ assert headers.get("anthropic-beta") == "interleaved-thinking-2025-05-14" + assert headers.get("X-Trace") == "1" + + +@patch("openhands.sdk.llm.llm.litellm_completion") +def test_completion_call_time_extra_headers_override_config_and_defaults( + mock_completion, +): + mock_response = create_mock_litellm_response("ok") + mock_completion.return_value = mock_response + + llm = LLM( + usage_id="test-llm", + model="claude-sonnet-4-5-20250514", + api_key=SecretStr("test_key"), + # Config sets a conflicting header + extra_headers={"anthropic-beta": "context-1m-2025-08-07", "X-Trace": "1"}, + extended_thinking_budget=1000, + num_retries=0, + ) + + messages = [Message(role="user", content=[TextContent(text="Hi")])] + # Intended behavior: + # - Per-call headers should replace any LLM.extra_headers. + # - Extended thinking default should still be merged in. + # - On conflicts, per-call headers win (anthropic-beta => custom-beta). + call_headers = {"anthropic-beta": "custom-beta", "Header-Only": "H"} + _ = llm.completion(messages=messages, extra_headers=call_headers) + + assert mock_completion.call_count == 1 + _, kwargs = mock_completion.call_args + headers = kwargs.get("extra_headers") or {} + assert headers.get("anthropic-beta") == "custom-beta" + assert headers.get("Header-Only") == "H" + # LLM.config headers should not be merged when user specifies their own + # (except defaults we explicitly add) + assert "X-Trace" not in headers + + +@patch("openhands.sdk.llm.llm.litellm_responses") +def test_responses_call_time_extra_headers_override_config(mock_responses): + # Build a minimal valid Responses response + msg = ResponseOutputMessage.model_construct( + id="m1", + type="message", + role="assistant", + status="completed", + content=[ResponseOutputText(type="output_text", text="ok", annotations=[])], + ) + usage = ResponseAPIUsage(input_tokens=0, output_tokens=0, total_tokens=0) + resp = ResponsesAPIResponse( + id="resp123", + created_at=0, + output=[msg], + usage=usage, + parallel_tool_calls=False, + tool_choice="auto", + top_p=None, + tools=[], + instructions="", + status="completed", + ) + mock_responses.return_value = resp + + llm = LLM( + usage_id="test-llm", + model="gpt-4o", + api_key=SecretStr("test_key"), + extra_headers={"X-Trace": "1"}, + num_retries=0, + ) + + messages = [Message(role="user", content=[TextContent(text="Hi")])] + # Intended behavior: + # - Per-call headers should replace any LLM.extra_headers for Responses path. + # - No Anthropic default is currently added on the Responses path. 
+ call_headers = {"Header-Only": "H"} + _ = llm.responses(messages=messages, extra_headers=call_headers) + + assert mock_responses.call_count == 1 + _, kwargs = mock_responses.call_args + headers = kwargs.get("extra_headers") or {} + assert headers.get("Header-Only") == "H" + assert "X-Trace" not in headers + + def test_llm_vision_support(default_llm): """Test LLM vision support detection.""" llm = default_llm @@ -275,8 +461,47 @@ def test_llm_function_calling_support(default_llm): llm = default_llm # Function calling support detection should work without errors - function_calling_active = llm.is_function_calling_active() - assert isinstance(function_calling_active, bool) + native_tool_calling = llm.native_tool_calling + assert isinstance(native_tool_calling, bool) + + +def test_llm_function_calling_enabled_by_default(): + """Test that function calling is enabled by default for all models.""" + # Test with a known model + llm_known = LLM( + model="gpt-4o", api_key=SecretStr("test_key"), usage_id="test-known" + ) + assert llm_known.native_tool_calling is True + + # Test with an unknown model - should still be enabled by default + llm_unknown = LLM( + model="some-unknown-model-xyz", + api_key=SecretStr("test_key"), + usage_id="test-unknown", + ) + assert llm_unknown.native_tool_calling is True + + +def test_llm_function_calling_can_be_disabled(): + """Test that users can opt-out of function calling via + native_tool_calling=False.""" + # Test with a known model that normally has function calling + llm_disabled = LLM( + model="gpt-4o", + api_key=SecretStr("test_key"), + native_tool_calling=False, + usage_id="test-disabled", + ) + assert llm_disabled.native_tool_calling is False + + # Test with an unknown model with function calling disabled + llm_unknown_disabled = LLM( + model="some-unknown-model-xyz", + api_key=SecretStr("test_key"), + native_tool_calling=False, + usage_id="test-unknown-disabled", + ) + assert llm_unknown_disabled.native_tool_calling is False def test_llm_caching_support(default_llm): diff --git a/tests/sdk/llm/test_llm_completion.py b/tests/sdk/llm/test_llm_completion.py index 395ed6ff61..2a90b2ac37 100644 --- a/tests/sdk/llm/test_llm_completion.py +++ b/tests/sdk/llm/test_llm_completion.py @@ -1,5 +1,7 @@ """Tests for LLM completion functionality, configuration, and metrics tracking.""" +from collections.abc import Sequence +from typing import ClassVar from unittest.mock import patch import pytest @@ -19,7 +21,7 @@ TextContent, ) from openhands.sdk.tool.schema import Action -from openhands.sdk.tool.tool import ToolBase, ToolDefinition +from openhands.sdk.tool.tool import ToolDefinition def create_mock_response(content: str = "Test response", response_id: str = "test-id"): @@ -41,6 +43,23 @@ def create_mock_response(content: str = "Test response", response_id: str = "tes ) +# Helper tool classes for testing +class _ArgsBasic(Action): + """Basic action for testing.""" + + param: str + + +class _MockTool(ToolDefinition[_ArgsBasic, None]): + """Mock tool for LLM completion testing.""" + + name: ClassVar[str] = "test_tool" + + @classmethod + def create(cls, conv_state=None, **params) -> Sequence["_MockTool"]: + return [cls(description="A test tool", action_type=_ArgsBasic)] + + @pytest.fixture def default_config(): return LLM( @@ -127,13 +146,7 @@ def test_llm_completion_with_tools(mock_completion): # Test completion with tools messages = [Message(role="user", content=[TextContent(text="Use the test tool")])] - class _ArgsBasic(Action): - param: str - - tool: ToolBase = 
ToolDefinition( - name="test_tool", description="A test tool", action_type=_ArgsBasic - ) - tools_list: list[ToolBase] = [tool] + tools_list = list(_MockTool.create()) response = llm.completion(messages=messages, tools=tools_list) @@ -205,7 +218,7 @@ def test_llm_feature_detection(default_config): # All feature detection methods should return booleans assert isinstance(llm.vision_is_active(), bool) - assert isinstance(llm.is_function_calling_active(), bool) + assert isinstance(llm.native_tool_calling, bool) assert isinstance(llm.is_caching_prompt_active(), bool) @@ -315,14 +328,15 @@ def test_llm_completion_non_function_call_mode(mock_completion): usage_id="test-llm", model="gpt-4o", api_key=SecretStr("test_key"), - native_tool_calling=False, # This is the key setting for non-function call mode + # This is the key setting for non-function call mode + native_tool_calling=False, num_retries=2, retry_min_wait=1, retry_max_wait=2, ) # Verify that function calling is not active - assert not llm.is_function_calling_active() + assert not llm.native_tool_calling # Test completion with tools - this should trigger the non-function call path messages = [ @@ -332,16 +346,7 @@ def test_llm_completion_non_function_call_mode(mock_completion): ) ] - class TestNonFCArgs(Action): - param: str - - tools: list[ToolBase] = [ - ToolDefinition( - name="test_tool", - description="A test tool for non-function call mode", - action_type=TestNonFCArgs, - ) - ] + tools = list(_MockTool.create()) # Verify that tools should be mocked (non-function call path) cc_tools = [t.to_openai_tool(add_security_risk_prediction=False) for t in tools] @@ -389,14 +394,7 @@ def test_llm_completion_function_call_vs_non_function_call_mode(mock_completion) mock_response = create_mock_response("Test response") mock_completion.return_value = mock_response - class TestFCArgs(Action): - param: str | None = None - - tools: list[ToolBase] = [ - ToolDefinition( - name="test_tool", description="A test tool", action_type=TestFCArgs - ) - ] + tools = list(_MockTool.create()) messages = [Message(role="user", content=[TextContent(text="Use the test tool")])] # Test with native function calling enabled (default behavior for gpt-4o) @@ -411,7 +409,7 @@ class TestFCArgs(Action): ) # Verify function calling is active - assert llm_native.is_function_calling_active() + assert llm_native.native_tool_calling # Should not mock tools when native function calling is active # Test with native function calling disabled @@ -426,7 +424,7 @@ class TestFCArgs(Action): ) # Verify function calling is not active - assert not llm_non_native.is_function_calling_active() + assert not llm_non_native.native_tool_calling # Call both and verify different behavior mock_completion.reset_mock() diff --git a/tests/sdk/llm/test_llm_fncall_converter.py b/tests/sdk/llm/test_llm_fncall_converter.py index 43a5bedb1f..39f894e34d 100644 --- a/tests/sdk/llm/test_llm_fncall_converter.py +++ b/tests/sdk/llm/test_llm_fncall_converter.py @@ -21,7 +21,7 @@ { "type": "function", "function": { - "name": "execute_bash", + "name": "bash", "description": "Execute a bash command in the terminal.", "parameters": { "type": "object", @@ -65,7 +65,7 @@ def test_convert_fncall_to_non_fncall_basic(): "id": "call_123", "type": "function", "function": { - "name": "execute_bash", + "name": "bash", "arguments": '{"command": "ls"}', }, } @@ -84,14 +84,12 @@ def test_convert_fncall_to_non_fncall_basic(): # Check that tool calls are converted to text format assistant_msg = None for msg in non_fncall_messages: - 
if msg.get("role") == "assistant" and "execute_bash" in str( - msg.get("content", "") - ): + if msg.get("role") == "assistant" and "bash" in str(msg.get("content", "")): assistant_msg = msg break assert assistant_msg is not None - assert "execute_bash" in assistant_msg["content"] + assert "bash" in assistant_msg["content"] def test_convert_non_fncall_to_fncall_basic(): @@ -102,7 +100,7 @@ def test_convert_non_fncall_to_fncall_basic(): { "role": "assistant", "content": ( - "I'll run the ls command for you.\n\n\n" + "I'll run the ls command for you.\n\n\n" "ls\n" ), }, @@ -125,7 +123,7 @@ def test_convert_non_fncall_to_fncall_basic(): assert assistant_msg is not None assert "tool_calls" in assistant_msg assert len(assistant_msg["tool_calls"]) == 1 - assert assistant_msg["tool_calls"][0]["function"]["name"] == "execute_bash" + assert assistant_msg["tool_calls"][0]["function"]["name"] == "bash" def test_convert_fncall_to_non_fncall_with_in_context_learning(): @@ -180,7 +178,7 @@ def test_convert_with_multiple_tool_calls(): "id": "call_123", "type": "function", "function": { - "name": "execute_bash", + "name": "bash", "arguments": '{"command": "ls"}', }, }, @@ -188,7 +186,7 @@ def test_convert_with_multiple_tool_calls(): "id": "call_456", "type": "function", "function": { - "name": "execute_bash", + "name": "bash", "arguments": '{"command": "pwd"}', }, }, @@ -215,7 +213,7 @@ def test_convert_with_tool_response(): "id": "call_123", "type": "function", "function": { - "name": "execute_bash", + "name": "bash", "arguments": '{"command": "ls"}', }, } @@ -262,7 +260,7 @@ def test_convert_roundtrip(): "id": "call_123", "type": "function", "function": { - "name": "execute_bash", + "name": "bash", "arguments": '{"command": "ls"}', }, } @@ -412,7 +410,7 @@ def test_convert_with_system_message(): "id": "call_123", "type": "function", "function": { - "name": "execute_bash", + "name": "bash", "arguments": '{"command": "ls"}', }, } @@ -476,14 +474,11 @@ def test_convert_with_finish_tool(): "id": "test_id", "type": "function", "function": { - "name": "execute_bash", + "name": "bash", "arguments": '{"command": "ls -la"}', }, }, - ( - "\nls -la\n" - "" - ), + ("\nls -la\n"), ), # Multiple parameters with different types ( @@ -491,7 +486,7 @@ def test_convert_with_finish_tool(): "id": "test_id", "type": "function", "function": { - "name": "str_replace_editor", + "name": "file_editor", "arguments": ( '{"command": "view", "path": "/test/file.py", ' '"view_range": [1, 10]}' @@ -499,7 +494,7 @@ def test_convert_with_finish_tool(): }, }, ( - "\nview\n" + "\nview\n" "/test/file.py\n" "[1, 10]\n" ), @@ -510,7 +505,7 @@ def test_convert_with_finish_tool(): "id": "test_id", "type": "function", "function": { - "name": "str_replace_editor", + "name": "file_editor", "arguments": json.dumps( { "command": "str_replace", @@ -525,7 +520,7 @@ def test_convert_with_finish_tool(): }, }, ( - "\nstr_replace\n" + "\nstr_replace\n" "/test/file.py\n\n" "def example():\n pass\n\n\n" 'def example():\n # This is indented\n print("hello")\n' diff --git a/tests/sdk/llm/test_llm_litellm_extra_body.py b/tests/sdk/llm/test_llm_litellm_extra_body.py new file mode 100644 index 0000000000..ba334ea98c --- /dev/null +++ b/tests/sdk/llm/test_llm_litellm_extra_body.py @@ -0,0 +1,56 @@ +from unittest.mock import patch + +from litellm.types.utils import ModelResponse + +from openhands.sdk.llm import LLM, Message, TextContent + + +def test_litellm_extra_body_passed_to_completion(): + """Test that litellm_extra_body is correctly passed to 
litellm.completion().""" + custom_extra_body = { + "cluster_id": "prod-cluster-1", + "routing_key": "high-priority", + "user_tier": "premium", + "custom_headers": { + "X-Request-Source": "openhands-agent", + }, + } + + llm = LLM(model="gpt-4o", usage_id="test", litellm_extra_body=custom_extra_body) + messages = [Message(role="user", content=[TextContent(text="Hello")])] + + with patch("openhands.sdk.llm.llm.litellm_completion") as mock_completion: + # Create a proper ModelResponse mock + mock_response = ModelResponse( + id="test-id", + choices=[ + { + "index": 0, + "message": {"role": "assistant", "content": "Hello!"}, + "finish_reason": "stop", + } + ], + created=1234567890, + model="gpt-4o", + object="chat.completion", + ) + mock_completion.return_value = mock_response + + # Call completion + llm.completion(messages=messages) + + # Verify that litellm.completion was called with our extra_body + mock_completion.assert_called_once() + call_kwargs = mock_completion.call_args[1] + + # Check that extra_body was passed correctly + assert "extra_body" in call_kwargs + assert call_kwargs["extra_body"] == custom_extra_body + + # Verify specific custom fields were passed through + assert call_kwargs["extra_body"]["cluster_id"] == "prod-cluster-1" + assert call_kwargs["extra_body"]["routing_key"] == "high-priority" + assert ( + call_kwargs["extra_body"]["custom_headers"]["X-Request-Source"] + == "openhands-agent" + ) diff --git a/tests/sdk/llm/test_llm_metadata.py b/tests/sdk/llm/test_llm_metadata.py deleted file mode 100644 index d02c5b853c..0000000000 --- a/tests/sdk/llm/test_llm_metadata.py +++ /dev/null @@ -1,57 +0,0 @@ -from openhands.sdk.llm import LLM - - -def test_llm_metadata_default(): - """Test that metadata field defaults to empty dict.""" - llm = LLM(model="gpt-4o", usage_id="test") - assert llm.metadata == {} - - -def test_llm_metadata_initialization(): - """Test metadata field initialization with custom values.""" - custom_metadata = { - "trace_version": "1.0.0", - "tags": ["model:gpt-4", "agent:my-agent"], - "session_id": "session-123", - "trace_user_id": "user-456", - } - llm = LLM(model="gpt-4o", usage_id="test", metadata=custom_metadata) - assert llm.metadata == custom_metadata - - -def test_llm_metadata_modification(): - """Test that metadata field can be modified after initialization.""" - llm = LLM(model="gpt-4o", usage_id="test") - - # Start with empty metadata - assert llm.metadata == {} - - # Add some metadata - llm.metadata["custom_key"] = "custom_value" - llm.metadata["session_id"] = "session-123" - - assert llm.metadata["custom_key"] == "custom_value" - assert llm.metadata["session_id"] == "session-123" - - -def test_llm_metadata_complex_structure(): - """Test metadata field with complex nested structure.""" - complex_metadata = { - "trace_version": "2.1.0", - "tags": ["model:claude-3", "agent:coding-agent", "env:production"], - "session_info": { - "id": "session-789", - "user_id": "user-101", - "created_at": "2024-01-01T00:00:00Z", - }, - "metrics": { - "tokens_used": 1500, - "response_time_ms": 250, - }, - } - llm = LLM(model="claude-3-5-sonnet", usage_id="test", metadata=complex_metadata) - assert llm.metadata == complex_metadata - - # Test nested access - assert llm.metadata["session_info"]["id"] == "session-789" - assert llm.metadata["metrics"]["tokens_used"] == 1500 diff --git a/tests/sdk/llm/test_llm_retry_telemetry.py b/tests/sdk/llm/test_llm_retry_telemetry.py new file mode 100644 index 0000000000..bf1ab0a774 --- /dev/null +++ 
b/tests/sdk/llm/test_llm_retry_telemetry.py @@ -0,0 +1,306 @@ +""" +Test that telemetry records are accurate when LLM calls are retried. + +This test ensures that when an LLM call is retried, the telemetry only +records the latency and metrics for the successful attempt, not the +combined time of all failed attempts plus the successful one. +""" + +import time +from unittest.mock import patch + +from litellm.exceptions import APIConnectionError +from litellm.types.utils import Choices, Message as LiteLLMMessage, ModelResponse, Usage +from pydantic import SecretStr + +from openhands.sdk.llm import LLM, Message, TextContent + + +def create_mock_response( + content: str = "Test response", + response_id: str = "test-id", + prompt_tokens: int = 10, + completion_tokens: int = 5, +): + """Helper function to create properly structured mock responses.""" + return ModelResponse( + id=response_id, + choices=[ + Choices( + finish_reason="stop", + index=0, + message=LiteLLMMessage(content=content, role="assistant"), + ) + ], + created=1234567890, + model="gpt-4o", + object="chat.completion", + system_fingerprint="test", + usage=Usage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ), + ) + + +@patch("openhands.sdk.llm.llm.litellm_completion") +def test_telemetry_records_only_successful_attempt_latency(mock_litellm_completion): + """ + Test that when LLM calls are retried, telemetry only records the latency + of the successful attempt, not the cumulative time of all attempts. + + Before the fix, on_request was called once before retry logic, causing + the latency to include all failed attempts + wait times. After the fix, + on_request is called for each retry attempt, so only the successful + attempt's latency is recorded. 
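+
+    As a worked illustration (hypothetical timings): two failed attempts of
+    ~0.1s each plus two ~1s retry waits give a total wall time of ~2.2s,
+    while the recorded latency should stay near the ~0.1s of the final,
+    successful attempt.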
+ """ + # Create mock responses for failed and successful attempts + mock_response = create_mock_response("Success after retry") + + # Simulate 2 failures followed by success + mock_litellm_completion.side_effect = [ + APIConnectionError( + message="Connection failed 1", + llm_provider="test_provider", + model="test_model", + ), + APIConnectionError( + message="Connection failed 2", + llm_provider="test_provider", + model="test_model", + ), + mock_response, # Third attempt succeeds + ] + + # Create LLM with retry configuration and minimal wait times for faster test + llm = LLM( + model="gpt-4o", + api_key=SecretStr("test_key"), + num_retries=3, + retry_min_wait=1, # 1 second minimum wait + retry_max_wait=1, # 1 second maximum wait (same as min for consistent timing) + usage_id="test-service", + ) + + # Record the start time of the entire operation + operation_start = time.time() + + # Make the completion call (will retry twice, then succeed) + response = llm.completion( + messages=[Message(role="user", content=[TextContent(text="Hello!")])], + ) + + # Record the total operation time + total_operation_time = time.time() - operation_start + + # Verify the call succeeded + assert response.raw_response == mock_response + assert mock_litellm_completion.call_count == 3 + + # Get the metrics to check recorded latency + metrics = llm.metrics + + # The recorded latency should be much less than the total operation time + # because it should only include the successful attempt, not the failed ones + recorded_latencies = [latency.latency for latency in metrics.response_latencies] + + # There should be exactly one latency record (for the successful attempt) + assert len(recorded_latencies) == 1 + + recorded_latency = recorded_latencies[0] + + # The recorded latency should be significantly less than total operation time + # Total operation time includes: + # - First attempt (failed) + wait time + # - Second attempt (failed) + wait time + # - Third attempt (successful) + # + # The recorded latency should only include the third attempt + assert recorded_latency < total_operation_time * 0.5, ( + f"Recorded latency ({recorded_latency:.3f}s) should be much less " + f"than total operation time ({total_operation_time:.3f}s)" + ) + + # The recorded latency should be relatively small (just the mock call time) + # Since we're mocking, it should be very quick (< 100ms typically) + assert recorded_latency < 0.5, ( + f"Recorded latency ({recorded_latency:.3f}s) should be < 0.5s for a mocked call" + ) + + +@patch("openhands.sdk.llm.llm.litellm_completion") +def test_telemetry_on_request_called_per_retry(mock_litellm_completion): + """ + Test that telemetry.on_request() is called for each retry attempt. + + This ensures that each retry resets the request timer, so only the + successful attempt's latency is recorded. + + We verify this by checking the _req_start timestamps which are set + by on_request(). With the fix, _req_start should be reset for each retry. 
+ """ + # Track _req_start values to see when on_request is called + req_start_values = [] + + mock_response = create_mock_response("Success after one retry") + + # Create a side effect function that captures _req_start after each attempt + def mock_transport_call_side_effect(*args, **kwargs): + # Capture the current _req_start value (set by on_request) + # This runs inside _one_attempt, after on_request is called + nonlocal req_start_values + req_start_values.append(time.time()) + + # First call fails, second succeeds + if len(req_start_values) == 1: + raise APIConnectionError( + message="Connection failed", + llm_provider="test_provider", + model="test_model", + ) + return mock_response + + mock_litellm_completion.side_effect = mock_transport_call_side_effect + + # Create LLM instance + llm = LLM( + model="gpt-4o", + api_key=SecretStr("test_key"), + num_retries=2, + retry_min_wait=1, + retry_max_wait=1, + usage_id="test-service", + ) + + # Make the completion call + response = llm.completion( + messages=[Message(role="user", content=[TextContent(text="Test")])], + ) + + # Verify the call succeeded + assert response.raw_response == mock_response + + # Should have attempted twice (one failure, one success) + assert len(req_start_values) == 2, ( + f"Expected 2 attempts, got {len(req_start_values)}" + ) + + # Verify there was a time gap between the attempts (retry wait time) + # This proves on_request was called for each attempt + time_gap = req_start_values[1] - req_start_values[0] + assert time_gap >= 0.5, ( + "There should be a wait time between retry attempts " + f"(gap: {time_gap:.3f}s, expected >= 0.5s due to 1 second retry wait)" + ) + + +@patch("openhands.sdk.llm.llm.litellm_completion") +def test_telemetry_metrics_accurate_with_retries(mock_litellm_completion): + """ + Test that all telemetry metrics (tokens, cost, latency) are accurate + when retries occur. 
+ """ + # Create a response with specific token counts + mock_response = create_mock_response( + "Success", prompt_tokens=100, completion_tokens=50 + ) + + # Simulate one failure then success + mock_litellm_completion.side_effect = [ + APIConnectionError( + message="Connection failed", + llm_provider="test_provider", + model="test_model", + ), + mock_response, + ] + + # Create LLM with cost tracking + llm = LLM( + model="gpt-4o", + api_key=SecretStr("test_key"), + num_retries=2, + retry_min_wait=1, + retry_max_wait=1, + usage_id="test-service", + input_cost_per_token=0.001, + output_cost_per_token=0.002, + ) + + # Make the completion call + response = llm.completion( + messages=[Message(role="user", content=[TextContent(text="Test")])], + ) + + # Verify the call succeeded + assert response.raw_response == mock_response + + # Get metrics + metrics = llm.metrics + + # Token usage should only reflect the successful attempt + assert len(metrics.token_usages) == 1 + token_usage = metrics.token_usages[0] + assert token_usage.prompt_tokens == 100 + assert token_usage.completion_tokens == 50 + + # Cost should only reflect the successful attempt + # Note: Cost calculation depends on litellm, so we just verify it's positive + assert metrics.accumulated_cost > 0 + + # Latency should only reflect the successful attempt (should be small) + assert len(metrics.response_latencies) == 1 + assert metrics.response_latencies[0].latency < 0.5 + + +@patch("openhands.sdk.llm.llm.litellm_completion") +def test_telemetry_no_multiple_records_on_retry(mock_litellm_completion): + """ + Test that telemetry doesn't create multiple records for failed attempts. + + Only the successful attempt should result in telemetry records. + """ + mock_response = create_mock_response("Success") + + # Simulate multiple failures then success + mock_litellm_completion.side_effect = [ + APIConnectionError( + message="Fail 1", llm_provider="test_provider", model="test_model" + ), + APIConnectionError( + message="Fail 2", llm_provider="test_provider", model="test_model" + ), + APIConnectionError( + message="Fail 3", llm_provider="test_provider", model="test_model" + ), + mock_response, + ] + + llm = LLM( + model="gpt-4o", + api_key=SecretStr("test_key"), + num_retries=5, + retry_min_wait=1, + retry_max_wait=1, + usage_id="test-service", + ) + + # Make the completion call + response = llm.completion( + messages=[Message(role="user", content=[TextContent(text="Test")])], + ) + + assert response.raw_response == mock_response + + metrics = llm.metrics + + # Should only have ONE latency record (for the successful attempt) + assert len(metrics.response_latencies) == 1 + + # Should only have ONE token usage record (for the successful attempt) + assert len(metrics.token_usages) == 1 + + # Should only have ONE cost record (for the successful attempt) + # Cost is accumulated, so we just check it's positive + assert metrics.accumulated_cost > 0 diff --git a/tests/sdk/llm/test_llm_serialization.py b/tests/sdk/llm/test_llm_serialization.py index 1b568e8e27..b49771bf33 100644 --- a/tests/sdk/llm/test_llm_serialization.py +++ b/tests/sdk/llm/test_llm_serialization.py @@ -95,7 +95,6 @@ def test_llm_private_attributes_not_serialized() -> None: # Set private attributes (these would normally be set internally) llm._model_info = {"some": "info"} llm._tokenizer = "mock-tokenizer" - llm._function_calling_active = True # Serialize to dict llm_dict = llm.model_dump() @@ -103,7 +102,6 @@ def test_llm_private_attributes_not_serialized() -> None: # Private 
attributes should not be present assert "_model_info" not in llm_dict assert "_tokenizer" not in llm_dict - assert "_function_calling_active" not in llm_dict assert "_telemetry" not in llm_dict # Serialize to JSON and deserialize @@ -114,7 +112,7 @@ def test_llm_private_attributes_not_serialized() -> None: # (LLM creates telemetry automatically) assert deserialized_llm._model_info is None assert deserialized_llm._tokenizer is None - assert deserialized_llm._function_calling_active is False + assert deserialized_llm.native_tool_calling is True assert ( deserialized_llm._telemetry is not None ) # LLM creates telemetry automatically diff --git a/tests/sdk/llm/test_model_features.py b/tests/sdk/llm/test_model_features.py index 8f2b200e0d..7a6c424c96 100644 --- a/tests/sdk/llm/test_model_features.py +++ b/tests/sdk/llm/test_model_features.py @@ -3,40 +3,17 @@ from openhands.sdk.llm.utils.model_features import ( get_features, model_matches, - normalize_model_name, ) -@pytest.mark.parametrize( - "raw,expected", - [ - (" OPENAI/gpt-4o ", "gpt-4o"), - ("anthropic/claude-3-7-sonnet", "claude-3-7-sonnet"), - ("litellm_proxy/gemini-2.5-pro", "gemini-2.5-pro"), - ("qwen3-coder-480b-a35b-instruct", "qwen3-coder-480b-a35b-instruct"), - ("gpt-5", "gpt-5"), - ("openai/GLM-4.5-GGUF", "glm-4.5"), - ("openrouter/gpt-4o-mini", "gpt-4o-mini"), - ( - "bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0", - "claude-3-5-sonnet-20241022-v2", - ), - ("", ""), - (None, ""), # type: ignore[arg-type] - ], -) -def test_normalize_model_name(raw, expected): - assert normalize_model_name(raw) == expected - - @pytest.mark.parametrize( "name,pattern,expected", [ - ("gpt-4o", "gpt-4o*", True), - ("openai/gpt-4o", "gpt-4o*", True), - ("litellm_proxy/gpt-4o-mini", "gpt-4o*", True), - ("claude-3-7-sonnet-20250219", "claude-3-7-sonnet*", True), - ("o1-2024-12-17", "o1*", True), + ("gpt-4o", "gpt-4o", True), + ("openai/gpt-4o", "gpt-4o", True), + ("litellm_proxy/gpt-4o-mini", "gpt-4o", True), + ("claude-3-7-sonnet-20250219", "claude-3-7-sonnet", True), + ("o1-2024-12-17", "o1", True), ("grok-4-0709", "grok-4-0709", True), ("grok-4-0801", "grok-4-0709", False), ], @@ -45,30 +22,6 @@ def test_model_matches(name, pattern, expected): assert model_matches(name, [pattern]) is expected -@pytest.mark.parametrize( - "model,expected_function_calling", - [ - ("gpt-4o", True), - ("gpt-4o-mini", True), - ("claude-3-5-sonnet", True), - ("claude-3-7-sonnet", True), - ("gemini-2.5-pro", True), - # AWS Bedrock models - ("bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0", True), - ("bedrock/anthropic.claude-3-7-sonnet-20250219-v1:0", True), - ("bedrock/anthropic.claude-sonnet-4-20250514-v1:0", True), - ( - "llama-3.1-70b", - False, - ), # Most open source models don't support native function calling - ("unknown-model", False), # Default to False for unknown models - ], -) -def test_function_calling_support(model, expected_function_calling): - features = get_features(model) - assert features.supports_function_calling == expected_function_calling - - @pytest.mark.parametrize( "model,expected_reasoning", [ @@ -94,10 +47,22 @@ def test_reasoning_effort_support(model, expected_reasoning): ("claude-3-7-sonnet", True), ("claude-3-haiku-20240307", True), ("claude-3-opus-20240229", True), - # AWS Bedrock models + # AWS Bedrock model ids (provider-prefixed) ("bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0", True), ("bedrock/anthropic.claude-3-haiku-20240307-v1:0", True), + # Anthropic Haiku 4.5 variants (dot and dash) + ("claude-haiku-4.5", True), + 
("claude-haiku-4-5", True), + ("us.anthropic.claude-haiku-4.5-20251001", True), + ("us.anthropic.claude-haiku-4-5-20251001", True), ("bedrock/anthropic.claude-3-opus-20240229-v1:0", True), + # Anthropic 4.5 variants (dash and dot) + ("claude-sonnet-4-5", True), + ("claude-sonnet-4.5", True), + # User-facing model names (no provider prefix) + ("anthropic.claude-3-5-sonnet-20241022", True), + ("anthropic.claude-3-haiku-20240307", True), + ("anthropic.claude-3-opus-20240229", True), ("gpt-4o", False), # OpenAI doesn't support explicit prompt caching ("gemini-1.5-pro", False), ("unknown-model", False), @@ -134,9 +99,11 @@ def test_stop_words_support(model, expected_stop_words): def test_get_features_with_provider_prefix(): """Test that get_features works with provider prefixes.""" # Test with various provider prefixes - assert get_features("openai/gpt-4o").supports_function_calling is True - assert get_features("anthropic/claude-3-5-sonnet").supports_function_calling is True - assert get_features("litellm_proxy/gpt-4o").supports_function_calling is True + assert get_features("openai/gpt-4o").supports_reasoning_effort is False + assert ( + get_features("anthropic/claude-3-5-sonnet").supports_reasoning_effort is False + ) + assert get_features("litellm_proxy/gpt-4o").supports_reasoning_effort is False def test_get_features_case_insensitive(): @@ -145,17 +112,14 @@ def test_get_features_case_insensitive(): features_upper = get_features("GPT-4O") features_mixed = get_features("Gpt-4O") - assert ( - features_lower.supports_function_calling - == features_upper.supports_function_calling - ) assert ( features_lower.supports_reasoning_effort == features_upper.supports_reasoning_effort ) + assert features_lower.supports_stop_words == features_upper.supports_stop_words assert ( - features_lower.supports_function_calling - == features_mixed.supports_function_calling + features_lower.supports_reasoning_effort + == features_mixed.supports_reasoning_effort ) @@ -165,14 +129,11 @@ def test_get_features_with_version_suffixes(): base_features = get_features("claude-3-5-sonnet") versioned_features = get_features("claude-3-5-sonnet-20241022") - assert ( - base_features.supports_function_calling - == versioned_features.supports_function_calling - ) assert ( base_features.supports_reasoning_effort == versioned_features.supports_reasoning_effort ) + assert base_features.supports_stop_words == versioned_features.supports_stop_words assert ( base_features.supports_prompt_cache == versioned_features.supports_prompt_cache ) @@ -180,7 +141,7 @@ def test_get_features_with_version_suffixes(): def test_model_matches_multiple_patterns(): """Test model_matches with multiple patterns.""" - patterns = ["gpt-4*", "claude-3*", "gemini-*"] + patterns = ["gpt-4", "claude-3", "gemini-"] assert model_matches("gpt-4o", patterns) is True assert model_matches("claude-3-5-sonnet", patterns) is True @@ -188,37 +149,22 @@ def test_model_matches_multiple_patterns(): assert model_matches("llama-3.1-70b", patterns) is False -def test_model_matches_exact_match(): - """Test model_matches with exact patterns (no wildcards).""" +def test_model_matches_substring_semantics(): + """Test model_matches uses substring semantics (no globbing).""" patterns = ["gpt-4o", "claude-3-5-sonnet"] assert model_matches("gpt-4o", patterns) is True assert model_matches("claude-3-5-sonnet", patterns) is True - assert model_matches("gpt-4o-mini", patterns) is False + # Substring match: 'gpt-4o' matches 'gpt-4o-mini' + assert model_matches("gpt-4o-mini", patterns) is 
True assert model_matches("claude-3-haiku", patterns) is False -def test_normalize_model_name_edge_cases(): - """Test normalize_model_name with edge cases.""" - # Test with multiple slashes - assert normalize_model_name("provider/sub/model-name") == "model-name" - - # Test with colons and special characters - assert normalize_model_name("provider/model:version:tag") == "model" - - # Test with whitespace and case - assert normalize_model_name(" PROVIDER/Model-Name ") == "model-name" - - # Test with underscores and hyphens - assert normalize_model_name("provider/model_name-v1") == "model_name-v1" - - def test_get_features_unknown_model(): """Test get_features with completely unknown model.""" features = get_features("completely-unknown-model-12345") - # Unknown models should have conservative defaults - assert features.supports_function_calling is False + # Unknown models should have default feature values assert features.supports_reasoning_effort is False assert features.supports_prompt_cache is False assert features.supports_stop_words is True # Most models support stop words @@ -229,19 +175,18 @@ def test_get_features_empty_model(): features_empty = get_features("") features_none = get_features(None) # type: ignore[arg-type] - # Both should return conservative defaults - assert features_empty.supports_function_calling is False - assert features_none.supports_function_calling is False + # Empty models should have default feature values assert features_empty.supports_reasoning_effort is False assert features_none.supports_reasoning_effort is False + assert features_empty.supports_stop_words is True + assert features_none.supports_stop_words is True def test_model_matches_with_provider_pattern(): - """Test model_matches with pattern containing '/' matches raw model string.""" - # Test pattern with '/' matches against raw model string (lines 43-44) - assert model_matches("openai/gpt-4", ["openai/*"]) - assert model_matches("anthropic/claude-3", ["anthropic/claude*"]) - assert not model_matches("openai/gpt-4", ["anthropic/*"]) + """model_matches uses substring on raw model name incl. provider prefixes.""" + assert model_matches("openai/gpt-4", ["openai/"]) + assert model_matches("anthropic/claude-3", ["anthropic/claude"]) + assert not model_matches("openai/gpt-4", ["anthropic/"]) def test_stop_words_grok_provider_prefixed(): @@ -265,3 +210,33 @@ def test_supports_stop_words_false_models(model): """Test models that don't support stop words.""" features = get_features(model) assert features.supports_stop_words is False + + +@pytest.mark.parametrize( + "model,expected_responses", + [ + ("gpt-5", True), + ("openai/gpt-5-mini", True), + ("codex-mini-latest", True), + ("openai/codex-mini-latest", True), + ("gpt-4o", False), + ("unknown-model", False), + ], +) +def test_responses_api_support(model, expected_responses): + features = get_features(model) + assert features.supports_responses_api is expected_responses + + +def test_force_string_serializer_full_model_names(): + """Ensure full model names match substring patterns for string serializer. + + Regression coverage for patterns like deepseek/glm without wildcards; Kimi + should only match when provider-prefixed with groq/. 
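+
+    Matching is a case-insensitive substring test against the full model
+    string, so (presumably) a pattern like "glm" matches "GLM-4.5", while a
+    "groq/kimi"-style pattern matches only when the raw name carries the
+    groq/ provider prefix.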
+ """ + assert get_features("DeepSeek-V3.2-Exp").force_string_serializer is True + assert get_features("GLM-4.5").force_string_serializer is True + # Provider-agnostic Kimi should not force string serializer + assert get_features("Kimi K2-Instruct-0905").force_string_serializer is False + # Groq-prefixed Kimi should force string serializer + assert get_features("groq/kimi-k2-instruct-0905").force_string_serializer is True diff --git a/tests/sdk/mcp/test_mcp_tool_immutability.py b/tests/sdk/mcp/test_mcp_tool_immutability.py index 5472f08ad8..b8ef34cb73 100644 --- a/tests/sdk/mcp/test_mcp_tool_immutability.py +++ b/tests/sdk/mcp/test_mcp_tool_immutability.py @@ -47,7 +47,11 @@ def test_mcp_tool_is_frozen(self): with pytest.raises( Exception ): # Pydantic raises ValidationError for frozen models - self.tool.name = "modified_name" + self.tool.mcp_tool = mcp.types.Tool( + name="modified_name", + description="modified description", + inputSchema={"type": "object", "properties": {}}, + ) with pytest.raises(Exception): self.tool.description = "modified_description" @@ -66,9 +70,21 @@ def test_mcp_tool_set_executor_returns_new_instance(self): def test_mcp_tool_model_copy_creates_modified_instance(self): """Test that model_copy can create modified versions of MCPTool instances.""" + # Create a modified MCP tool with a different name + from mcp.types import Tool as MCPTool + + modified_mcp_tool = MCPTool( + name="modified_tool", + description="Modified MCP tool description", + inputSchema=self.tool.mcp_tool.inputSchema, + ) + # Create a copy with modified fields modified_tool = self.tool.model_copy( - update={"name": "modified_tool", "description": "Modified description"} + update={ + "mcp_tool": modified_mcp_tool, + "description": "Modified description", + } ) # Verify that a new instance was created with modifications @@ -118,7 +134,11 @@ def test_mcp_tool_create_immutable_instance(self): # Verify it's immutable with pytest.raises(Exception): - tool2.name = "modified_name" + tool2.mcp_tool = mcp.types.Tool( + name="modified_name", + description="modified description", + inputSchema={"type": "object", "properties": {}}, + ) # Verify it has the correct properties assert tool2.name == "another_tool" diff --git a/tests/sdk/mcp/test_mcp_tool_serialization.py b/tests/sdk/mcp/test_mcp_tool_serialization.py index a073927610..a8f7e0d037 100644 --- a/tests/sdk/mcp/test_mcp_tool_serialization.py +++ b/tests/sdk/mcp/test_mcp_tool_serialization.py @@ -13,7 +13,7 @@ from openhands.sdk.mcp.definition import MCPToolAction, MCPToolObservation from openhands.sdk.mcp.tool import MCPToolDefinition from openhands.sdk.tool.schema import Action -from openhands.sdk.tool.tool import ToolBase +from openhands.sdk.tool.tool import ToolDefinition def create_mock_mcp_tool(name: str) -> mcp.types.Tool: @@ -57,8 +57,8 @@ def test_mcp_tool_polymorphic_behavior() -> None: tools = MCPToolDefinition.create(mock_mcp_tool, mock_client) mcp_tool = tools[0] # Extract single tool from sequence - # Should be instance of ToolBase - assert isinstance(mcp_tool, ToolBase) + # Should be instance of ToolDefinition + assert isinstance(mcp_tool, ToolDefinition) assert isinstance(mcp_tool, MCPToolDefinition) # Check basic properties @@ -99,8 +99,8 @@ def test_mcp_tool_fallback_behavior() -> None: }, } - deserialized_tool = ToolBase.model_validate(tool_data) - assert isinstance(deserialized_tool, ToolBase) + deserialized_tool = ToolDefinition.model_validate(tool_data) + assert isinstance(deserialized_tool, ToolDefinition) assert 
deserialized_tool.name == "fallback-tool" assert issubclass(deserialized_tool.action_type, Action) assert deserialized_tool.observation_type and issubclass( diff --git a/tests/sdk/tool/test_builtins.py b/tests/sdk/tool/test_builtins.py index ca3da4caa8..d69f0d589d 100644 --- a/tests/sdk/tool/test_builtins.py +++ b/tests/sdk/tool/test_builtins.py @@ -2,13 +2,22 @@ def test_all_tools_property(): - for tool in BUILT_IN_TOOLS: - assert tool.description is not None - assert tool.executor is not None - assert tool.annotations is not None - # Annotations should have specific hints - # Builtin tools should have all these properties - assert tool.annotations.readOnlyHint - assert not tool.annotations.destructiveHint - assert tool.annotations.idempotentHint - assert not tool.annotations.openWorldHint + # BUILT_IN_TOOLS contains tool classes, so we need to instantiate them + for tool_class in BUILT_IN_TOOLS: + # Create tool instances using .create() method + tool_instances = tool_class.create() + assert len(tool_instances) > 0, ( + f"{tool_class.__name__}.create() should return at least one tool" + ) + + # Check properties for all instances (usually just one) + for tool in tool_instances: + assert tool.description is not None + assert tool.executor is not None + assert tool.annotations is not None + # Annotations should have specific hints + # Builtin tools should have all these properties + assert tool.annotations.readOnlyHint + assert not tool.annotations.destructiveHint + assert tool.annotations.idempotentHint + assert not tool.annotations.openWorldHint diff --git a/tests/sdk/tool/test_registry.py b/tests/sdk/tool/test_registry.py index 5cdb176f73..49213bd87a 100644 --- a/tests/sdk/tool/test_registry.py +++ b/tests/sdk/tool/test_registry.py @@ -60,7 +60,6 @@ def __call__( return [ cls( - name="say_configurable_hello", description=f"{greeting}{punctuation}", action_type=_HelloAction, observation_type=_HelloObservation, @@ -69,16 +68,23 @@ def __call__( ] +class _SimpleHelloTool(ToolDefinition[_HelloAction, _HelloObservation]): + """Simple concrete tool for registry testing.""" + + @classmethod + def create(cls, conv_state=None, **params) -> Sequence["_SimpleHelloTool"]: + return [ + cls( + description="Says hello", + action_type=_HelloAction, + observation_type=_HelloObservation, + executor=_HelloExec(), + ) + ] + + def _hello_tool_factory(conv_state=None, **params) -> list[ToolDefinition]: - return [ - ToolDefinition( - name="say_hello", - description="Says hello", - action_type=_HelloAction, - observation_type=_HelloObservation, - executor=_HelloExec(), - ) - ] + return list(_SimpleHelloTool.create(conv_state, **params)) def test_register_and_resolve_callable_factory(): @@ -86,7 +92,7 @@ def test_register_and_resolve_callable_factory(): tools = resolve_tool(Tool(name="say_hello"), _create_mock_conv_state()) assert len(tools) == 1 assert isinstance(tools[0], ToolDefinition) - assert tools[0].name == "say_hello" + assert tools[0].name == "__simple_hello" def test_register_tool_instance_rejects_params(): diff --git a/tests/sdk/tool/test_to_responses_tool.py b/tests/sdk/tool/test_to_responses_tool.py index c33d90659f..e6244961cd 100644 --- a/tests/sdk/tool/test_to_responses_tool.py +++ b/tests/sdk/tool/test_to_responses_tool.py @@ -1,5 +1,7 @@ +from typing import ClassVar + from openhands.sdk.tool.schema import Action, Observation -from openhands.sdk.tool.tool import ToolBase +from openhands.sdk.tool.tool import ToolDefinition class A(Action): @@ -13,16 +15,16 @@ def to_llm_content(self): # type: 
ignore[override] return [TextContent(text="ok")] -class T(ToolBase[A, Obs]): +class T(ToolDefinition[A, Obs]): + name: ClassVar[str] = "t" + @classmethod def create(cls, *args, **kwargs): # pragma: no cover raise NotImplementedError def test_to_responses_tool_includes_strict_and_params(): - out = T( - name="t", description="d", action_type=A, observation_type=Obs - ).to_responses_tool() + out = T(description="d", action_type=A, observation_type=Obs).to_responses_tool() assert out["type"] == "function" assert out["name"] == "t" # description is optional in the TypedDict; access via get for type safety diff --git a/tests/sdk/tool/test_to_responses_tool_security.py b/tests/sdk/tool/test_to_responses_tool_security.py index 87b20c4c97..bc78bd1c38 100644 --- a/tests/sdk/tool/test_to_responses_tool_security.py +++ b/tests/sdk/tool/test_to_responses_tool_security.py @@ -1,16 +1,48 @@ +from collections.abc import Sequence +from typing import ClassVar + from pydantic import Field -from openhands.sdk.tool import Action, ToolAnnotations, ToolDefinition +from openhands.sdk.tool import Action, Observation, ToolAnnotations, ToolDefinition class TRTSAction(Action): x: int = Field(description="x") +class MockSecurityTool1(ToolDefinition[TRTSAction, Observation]): + """Concrete mock tool for security testing - readonly.""" + + name: ClassVar[str] = "t1" + + @classmethod + def create(cls, conv_state=None, **params) -> Sequence["MockSecurityTool1"]: + return [cls(**params)] + + +class MockSecurityTool2(ToolDefinition[TRTSAction, Observation]): + """Concrete mock tool for security testing - writable.""" + + name: ClassVar[str] = "t2" + + @classmethod + def create(cls, conv_state=None, **params) -> Sequence["MockSecurityTool2"]: + return [cls(**params)] + + +class MockSecurityTool3(ToolDefinition[TRTSAction, Observation]): + """Concrete mock tool for security testing - no flag.""" + + name: ClassVar[str] = "t3" + + @classmethod + def create(cls, conv_state=None, **params) -> Sequence["MockSecurityTool3"]: + return [cls(**params)] + + def test_to_responses_tool_security_gating(): # readOnlyHint=True -> do not add security_risk even if requested - readonly = ToolDefinition( - name="t1", + readonly = MockSecurityTool1( description="d", action_type=TRTSAction, observation_type=None, @@ -24,8 +56,7 @@ def test_to_responses_tool_security_gating(): assert "security_risk" not in props # readOnlyHint=False -> add when requested - writable = ToolDefinition( - name="t2", + writable = MockSecurityTool2( description="d", action_type=TRTSAction, observation_type=None, @@ -39,8 +70,7 @@ def test_to_responses_tool_security_gating(): assert "security_risk" in props2 # add_security_risk_prediction=False -> never add - noflag = ToolDefinition( - name="t3", + noflag = MockSecurityTool3( description="d", action_type=TRTSAction, observation_type=None, diff --git a/tests/sdk/tool/test_tool_call_output_coercion.py b/tests/sdk/tool/test_tool_call_output_coercion.py index 2d32ba5461..0eff69a9ef 100644 --- a/tests/sdk/tool/test_tool_call_output_coercion.py +++ b/tests/sdk/tool/test_tool_call_output_coercion.py @@ -1,3 +1,5 @@ +from collections.abc import Sequence + import pytest from pydantic import Field @@ -19,6 +21,14 @@ def to_llm_content(self): # type: ignore[override] return [TextContent(text=str(self.value))] +class MockCoercionTool(ToolDefinition[OCAAction, OCAObs]): + """Concrete mock tool for output coercion testing.""" + + @classmethod + def create(cls, conv_state=None, **params) -> Sequence["MockCoercionTool"]: + return 
[cls(**params)] + + def test_tool_call_with_observation_none_result_shapes(): # When observation_type is None, results are wrapped/coerced to Observation # 1) dict -> Observation @@ -26,8 +36,7 @@ class E1(ToolExecutor[OCAAction, dict[str, object]]): def __call__(self, action: OCAAction, conversation=None) -> dict[str, object]: return {"kind": "OCAObs", "value": 1} - t = ToolDefinition( - name="t", + t = MockCoercionTool( description="d", action_type=OCAAction, observation_type=None, @@ -50,8 +59,7 @@ class E2(ToolExecutor[OCAAction, MObs]): def __call__(self, action: OCAAction, conversation=None) -> MObs: return MObs(value=2) - t2 = ToolDefinition( - name="t2", + t2 = MockCoercionTool( description="d", action_type=OCAAction, observation_type=None, @@ -66,8 +74,7 @@ class E3(ToolExecutor[OCAAction, list[int]]): def __call__(self, action: OCAAction, conversation=None) -> list[int]: return [1, 2, 3] - t3 = ToolDefinition( - name="t3", + t3 = MockCoercionTool( description="d", action_type=OCAAction, observation_type=None, diff --git a/tests/sdk/tool/test_tool_definition.py b/tests/sdk/tool/test_tool_definition.py index 9c50a46b68..3de62155f0 100644 --- a/tests/sdk/tool/test_tool_definition.py +++ b/tests/sdk/tool/test_tool_definition.py @@ -36,19 +36,26 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]: return [TextContent(text=self.result)] +class MockTestTool(ToolDefinition[ToolMockAction, ToolMockObservation]): + """Concrete mock tool for testing.""" + + @classmethod + def create(cls, conv_state=None, **params) -> Sequence["MockTestTool"]: + return [cls(**params)] + + class TestTool: """Test cases for the Tool class.""" def test_tool_creation_basic(self): """Test basic tool creation.""" - tool = ToolDefinition( - name="test_tool", + tool = MockTestTool( description="A test tool", action_type=ToolMockAction, observation_type=ToolMockObservation, ) - assert tool.name == "test_tool" + assert tool.name == "mock_test" assert tool.description == "A test tool" assert tool.action_type == ToolMockAction assert tool.observation_type == ToolMockObservation @@ -61,8 +68,7 @@ class MockExecutor(ToolExecutor): def __call__(self, action, conversation=None) -> ToolMockObservation: return ToolMockObservation(result=f"Executed: {action.command}") - tool = ToolDefinition( - name="test_tool", + tool = MockTestTool( description="A test tool", action_type=ToolMockAction, observation_type=ToolMockObservation, @@ -84,8 +90,7 @@ def test_tool_creation_with_annotations(self): destructiveHint=False, ) - tool = ToolDefinition( - name="test_tool", + tool = MockTestTool( description="A test tool", action_type=ToolMockAction, observation_type=ToolMockObservation, @@ -100,8 +105,7 @@ def test_tool_creation_with_annotations(self): def test_to_mcp_tool_basic(self): """Test conversion to MCP tool format.""" - tool = ToolDefinition( - name="test_tool", + tool = MockTestTool( description="A test tool", action_type=ToolMockAction, observation_type=ToolMockObservation, @@ -109,7 +113,7 @@ def test_to_mcp_tool_basic(self): mcp_tool = tool.to_mcp_tool() - assert mcp_tool["name"] == "test_tool" + assert mcp_tool["name"] == "mock_test" assert mcp_tool["description"] == "A test tool" assert "inputSchema" in mcp_tool assert mcp_tool["inputSchema"]["type"] == "object" @@ -129,8 +133,7 @@ def test_to_mcp_tool_with_annotations(self): readOnlyHint=True, ) - tool = ToolDefinition( - name="test_tool", + tool = MockTestTool( description="A test tool", action_type=ToolMockAction, observation_type=ToolMockObservation, @@ 
diff --git a/tests/sdk/tool/test_tool_definition.py b/tests/sdk/tool/test_tool_definition.py
index 9c50a46b68..3de62155f0 100644
--- a/tests/sdk/tool/test_tool_definition.py
+++ b/tests/sdk/tool/test_tool_definition.py
@@ -36,19 +36,26 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         return [TextContent(text=self.result)]


+class MockTestTool(ToolDefinition[ToolMockAction, ToolMockObservation]):
+    """Concrete mock tool for testing."""
+
+    @classmethod
+    def create(cls, conv_state=None, **params) -> Sequence["MockTestTool"]:
+        return [cls(**params)]
+
+
 class TestTool:
     """Test cases for the Tool class."""

     def test_tool_creation_basic(self):
         """Test basic tool creation."""
-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
         )

-        assert tool.name == "test_tool"
+        assert tool.name == "mock_test"
         assert tool.description == "A test tool"
         assert tool.action_type == ToolMockAction
         assert tool.observation_type == ToolMockObservation
@@ -61,8 +68,7 @@ class MockExecutor(ToolExecutor):
             def __call__(self, action, conversation=None) -> ToolMockObservation:
                 return ToolMockObservation(result=f"Executed: {action.command}")

-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -84,8 +90,7 @@ def test_tool_creation_with_annotations(self):
             destructiveHint=False,
         )

-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -100,8 +105,7 @@ def test_tool_creation_with_annotations(self):

     def test_to_mcp_tool_basic(self):
         """Test conversion to MCP tool format."""
-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -109,7 +113,7 @@ def test_to_mcp_tool_basic(self):

         mcp_tool = tool.to_mcp_tool()

-        assert mcp_tool["name"] == "test_tool"
+        assert mcp_tool["name"] == "mock_test"
         assert mcp_tool["description"] == "A test tool"
         assert "inputSchema" in mcp_tool
         assert mcp_tool["inputSchema"]["type"] == "object"
@@ -129,8 +133,7 @@ def test_to_mcp_tool_with_annotations(self):
             readOnlyHint=True,
         )

-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -140,15 +143,14 @@ def test_to_mcp_tool_with_annotations(self):
         mcp_tool = tool.to_mcp_tool()

         # Tool should include annotations
-        assert mcp_tool["name"] == "test_tool"
+        assert mcp_tool["name"] == "mock_test"
         assert mcp_tool["description"] == "A test tool"
         assert "annotations" in mcp_tool
         assert mcp_tool["annotations"] == annotations

     def test_call_without_executor(self):
         """Test calling tool without executor raises error."""
-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -156,7 +158,7 @@ def test_call_without_executor(self):
         action = ToolMockAction(command="test")

         with pytest.raises(
-            NotImplementedError, match="Tool 'test_tool' has no executor"
+            NotImplementedError, match="Tool 'mock_test' has no executor"
         ):
             tool(action)

@@ -167,8 +169,7 @@ class MockExecutor(ToolExecutor):
             def __call__(self, action, conversation=None) -> ToolMockObservation:
                 return ToolMockObservation(result=f"Processed: {action.command}")

-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -193,8 +194,7 @@ class ComplexAction(Action):
                 default_factory=list, description="List of strings"
             )

-        tool = ToolDefinition(
-            name="complex_tool",
+        tool = MockTestTool(
             description="Tool with complex types",
             action_type=ComplexAction,
             observation_type=ToolMockObservation,
@@ -217,8 +217,7 @@ class MockExecutor(ToolExecutor):
             def __call__(self, action, conversation=None) -> ToolMockObservation:
                 return ToolMockObservation(result="success")

-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -239,8 +238,7 @@ class MockExecutor(ToolExecutor):
             def __call__(self, action, conversation=None) -> ToolMockObservation:
                 return ToolMockObservation(result="test", extra_field="extra_data")

-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -256,8 +254,7 @@ def __call__(self, action, conversation=None) -> ToolMockObservation:

     def test_action_validation_with_nested_data(self):
         """Test action validation with nested data structures."""
-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -282,8 +279,7 @@ def test_schema_roundtrip_conversion(self):
         original_schema = ToolMockAction.to_mcp_schema()

         # Create tool and get its schema
-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -298,8 +294,7 @@ def test_schema_roundtrip_conversion(self):

     def test_tool_with_no_observation_type(self):
         """Test tool creation with None observation type."""
-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=None,
@@ -309,7 +304,7 @@ def test_tool_with_no_observation_type(self):

         # Should still be able to create MCP tool
         mcp_tool = tool.to_mcp_tool()
-        assert mcp_tool["name"] == "test_tool"
+        assert mcp_tool["name"] == "mock_test"

     def test_executor_function_attachment(self):
         """Test creating tool with executor."""
@@ -321,8 +316,7 @@ def __call__(self, action, conversation=None) -> ToolMockObservation:

         executor = MockExecutor()
-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -338,23 +332,13 @@ def __call__(self, action, conversation=None) -> ToolMockObservation:

     def test_tool_name_validation(self):
         """Test tool name validation."""
-        # Valid names should work
-        tool = ToolDefinition(
-            name="valid_tool_name",
-            description="A test tool",
-            action_type=ToolMockAction,
-            observation_type=ToolMockObservation,
-        )
-        assert tool.name == "valid_tool_name"
-
-        # Empty name should still work (validation might be elsewhere)
-        tool2 = ToolDefinition(
-            name="",
+        # Name is now automatically generated from class name
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
         )
-        assert tool2.name == ""
+        assert tool.name == "mock_test"

     def test_complex_executor_return_types(self):
         """Test executor with complex return types."""
@@ -376,8 +360,7 @@ def __call__(self, action, conversation=None) -> ComplexObservation:
                 count=len(action.command) if hasattr(action, "command") else 0,
             )

-        tool = ToolDefinition(
-            name="complex_tool",
+        tool = MockTestTool(
             description="Tool with complex observation",
             action_type=ToolMockAction,
             observation_type=ComplexObservation,
@@ -398,8 +381,7 @@ class FailingExecutor(ToolExecutor):
             def __call__(self, action, conversation=None) -> ToolMockObservation:
                 raise RuntimeError("Executor failed")

-        tool = ToolDefinition(
-            name="failing_tool",
+        tool = MockTestTool(
             description="Tool that fails",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -425,8 +407,7 @@ class ValidExecutor(ToolExecutor):
             def __call__(self, action, conversation=None) -> StrictObservation:
                 return StrictObservation(message="success", value=42)

-        tool = ToolDefinition(
-            name="strict_tool",
+        tool = MockTestTool(
             description="Tool with strict observation",
             action_type=ToolMockAction,
             observation_type=StrictObservation,
@@ -441,15 +422,13 @@ def __call__(self, action, conversation=None) -> StrictObservation:

     def test_tool_equality_and_hashing(self):
         """Test tool equality and hashing behavior."""
-        tool1 = ToolDefinition(
-            name="test_tool",
+        tool1 = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
         )

-        tool2 = ToolDefinition(
-            name="test_tool",
+        tool2 = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -469,8 +448,7 @@ class RequiredFieldAction(Action):
             default=None, description="This field is optional"
         )

-        tool = ToolDefinition(
-            name="required_tool",
+        tool = MockTestTool(
             description="Tool with required fields",
             action_type=RequiredFieldAction,
             observation_type=ToolMockObservation,
@@ -488,8 +466,7 @@ def test_tool_with_meta_data(self):
         """Test tool creation with metadata."""
         meta_data = {"version": "1.0", "author": "test"}

-        tool = ToolDefinition(
-            name="meta_tool",
+        tool = MockTestTool(
             description="Tool with metadata",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -525,8 +502,7 @@ class ComplexNestedAction(Action):
             default=None, description="Optional array"
         )

-        tool = ToolDefinition(
-            name="complex_nested_tool",
+        tool = MockTestTool(
             description="Tool with complex nested types",
             action_type=ComplexNestedAction,
             observation_type=ToolMockObservation,
@@ -574,8 +550,7 @@ def test_security_risk_only_added_for_non_readonly_tools(self):
             readOnlyHint=True,
         )

-        readonly_tool = ToolDefinition(
-            name="readonly_tool",
+        readonly_tool = MockTestTool(
             description="A read-only tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -588,8 +563,7 @@ def test_security_risk_only_added_for_non_readonly_tools(self):
             readOnlyHint=False,
         )

-        writable_tool = ToolDefinition(
-            name="writable_tool",
+        writable_tool = MockTestTool(
             description="A writable tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -597,8 +571,7 @@ def test_security_risk_only_added_for_non_readonly_tools(self):
         )

         # Test with tool that has no annotations (should be treated as writable)
-        no_annotations_tool = ToolDefinition(
-            name="no_annotations_tool",
+        no_annotations_tool = MockTestTool(
             description="A tool with no annotations",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -662,8 +635,7 @@ def test_security_risk_is_required_field_in_schema(self):
         assert "security_risk" in schema["required"]

         # Test via to_openai_tool method
-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -686,8 +658,7 @@ def test_security_risk_is_required_field_in_schema(self):
             readOnlyHint=False,
         )

-        writable_tool = ToolDefinition(
-            name="writable_tool",
+        writable_tool = MockTestTool(
             description="A writable tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -713,8 +684,7 @@ def __call__(self, action, conversation=None) -> ToolMockObservation:
                 return ToolMockObservation(result=f"Executed: {action.command}")

         executor = MockExecutor()
-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -723,7 +693,7 @@ def __call__(self, action, conversation=None) -> ToolMockObservation:

         # Should return ExecutableTool without error
         executable_tool = tool.as_executable()
-        assert executable_tool.name == "test_tool"
+        assert executable_tool.name == "mock_test"
         assert executable_tool.executor is executor

         # Should be able to call it
@@ -734,8 +704,7 @@ def __call__(self, action, conversation=None) -> ToolMockObservation:

     def test_as_executable_without_executor(self):
         """Test as_executable() method with a tool that has no executor."""
-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockTestTool(
             description="A test tool",
             action_type=ToolMockAction,
             observation_type=ToolMockObservation,
@@ -743,6 +712,6 @@ def test_as_executable_without_executor(self):

         # Should raise NotImplementedError
         with pytest.raises(
-            NotImplementedError, match="Tool 'test_tool' has no executor"
+            NotImplementedError, match="Tool 'mock_test' has no executor"
         ):
             tool.as_executable()
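The file above captures the central API change: ToolDefinition is no longer instantiated directly with a name= argument; tests subclass it, implement create(), and the tool's name is derived from the class name. A minimal sketch of the pattern under those assumptions — MyCustomTool, MyAction, and MyObservation are hypothetical placeholders, not names from the SDK:

from collections.abc import Sequence
from openhands.sdk.tool import ToolDefinition

class MyCustomTool(ToolDefinition[MyAction, MyObservation]):  # placeholder types
    """A user-defined tool; its name is derived automatically."""

    @classmethod
    def create(cls, conv_state=None, **params) -> Sequence["MyCustomTool"]:
        # create() returns a sequence so tool sets can fan out to many tools
        return [cls(**params)]

tools = MyCustomTool.create(
    description="...",
    action_type=MyAction,
    observation_type=MyObservation,
)
assert tools[0].name == "my_custom"  # snake_case of the class name, "Tool" suffix dropped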
diff --git a/tests/sdk/tool/test_tool_immutability.py b/tests/sdk/tool/test_tool_immutability.py
index f94cbaf448..17a88a51a7 100644
--- a/tests/sdk/tool/test_tool_immutability.py
+++ b/tests/sdk/tool/test_tool_immutability.py
@@ -36,24 +36,29 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         return [TextContent(text=self.result)]


+class MockImmutableTool(
+    ToolDefinition[ToolImmutabilityMockAction, ToolImmutabilityMockObservation]
+):
+    """Concrete mock tool for immutability testing."""
+
+    @classmethod
+    def create(cls, conv_state=None, **params) -> Sequence["MockImmutableTool"]:
+        return [cls(**params)]
+
+
 class TestToolImmutability:
     """Test suite for Tool immutability features."""

     def test_tool_is_frozen(self):
         """Test that Tool instances are frozen and cannot be modified."""
-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockImmutableTool(
             description="Test tool",
             action_type=ToolImmutabilityMockAction,
             observation_type=ToolImmutabilityMockObservation,
         )

         # Test that we cannot modify any field
-        with pytest.raises(
-            Exception
-        ):  # Pydantic raises ValidationError for frozen models
-            tool.name = "modified_name"
-
+        # Note: name is now a ClassVar and cannot be assigned through instance
         with pytest.raises(Exception):
             tool.description = "modified_description"
@@ -62,8 +67,7 @@ def test_tool_is_frozen(self):

     def test_tool_set_executor_returns_new_instance(self):
         """Test that set_executor returns a new Tool instance."""
-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockImmutableTool(
             description="Test tool",
             action_type=ToolImmutabilityMockAction,
             observation_type=ToolImmutabilityMockObservation,
@@ -89,8 +93,7 @@ def __call__(

     def test_tool_model_copy_creates_modified_instance(self):
         """Test that model_copy can create modified versions of Tool instances."""
-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockImmutableTool(
             description="Test tool",
             action_type=ToolImmutabilityMockAction,
             observation_type=ToolImmutabilityMockObservation,
@@ -103,7 +106,7 @@ def test_tool_model_copy_creates_modified_instance(self):

         # Verify that a new instance was created with modifications
         assert modified_tool is not tool
-        assert tool.name == "test_tool"
+        assert tool.name == "mock_immutable"
         assert tool.description == "Test tool"
         assert modified_tool.name == "modified_tool"
         assert modified_tool.description == "Modified description"
@@ -111,8 +114,7 @@ def test_tool_meta_field_immutability(self):
         """Test that the meta field works correctly and is immutable."""
         meta_data = {"version": "1.0", "author": "test"}
-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockImmutableTool(
             description="Test tool",
             action_type=ToolImmutabilityMockAction,
             observation_type=ToolImmutabilityMockObservation,
@@ -135,8 +137,7 @@ def test_tool_constructor_parameter_validation(self):
         """Test that Tool constructor validates parameters correctly."""
         # Test that new parameter names work
-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockImmutableTool(
             description="Test tool",
             action_type=ToolImmutabilityMockAction,
             observation_type=ToolImmutabilityMockObservation,
@@ -146,8 +147,7 @@ def test_tool_constructor_parameter_validation(self):

         # Test that invalid field types are rejected
         with pytest.raises(ValidationError):
-            ToolDefinition(
-                name="test_tool",
+            MockImmutableTool(
                 description="Test tool",
                 action_type="invalid_type",  # type: ignore[arg-type] # Should be a class, not string
                 observation_type=ToolImmutabilityMockObservation,
@@ -161,8 +161,7 @@ def test_tool_annotations_immutability(self):
             destructiveHint=False,
         )

-        tool = ToolDefinition(
-            name="test_tool",
+        tool = MockImmutableTool(
             description="Test tool",
             action_type=ToolImmutabilityMockAction,
             observation_type=ToolImmutabilityMockObservation,
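The immutability tests lean on standard pydantic v2 behavior: a model configured with frozen=True rejects attribute assignment, and model_copy(update={...}) is the sanctioned way to derive a modified instance. A self-contained sketch of just that mechanism, using plain pydantic rather than the SDK classes:

from pydantic import BaseModel, ConfigDict, ValidationError

class FrozenToolLike(BaseModel):
    # frozen=True makes instances immutable after construction
    model_config = ConfigDict(frozen=True)
    description: str

t = FrozenToolLike(description="d")
try:
    t.description = "x"  # frozen model: assignment raises ValidationError
except ValidationError:
    pass
t2 = t.model_copy(update={"description": "x"})  # returns a new, modified instance
assert t.description == "d" and t2.description == "x"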
diff --git a/tests/sdk/tool/test_tool_serialization.py b/tests/sdk/tool/test_tool_serialization.py
index 20c4636611..aa8c861cde 100644
--- a/tests/sdk/tool/test_tool_serialization.py
+++ b/tests/sdk/tool/test_tool_serialization.py
@@ -3,17 +3,17 @@
 import json

 import pytest
-from pydantic import BaseModel, ValidationError
+from pydantic import BaseModel

 from openhands.sdk.tool import ToolDefinition
 from openhands.sdk.tool.builtins import FinishTool, ThinkTool
-from openhands.sdk.tool.tool import ToolBase


 def test_tool_serialization_deserialization() -> None:
     """Test that Tool supports polymorphic JSON serialization/deserialization."""
     # Use FinishTool which is a simple built-in tool
-    tool = FinishTool
+    tool_instances = FinishTool.create()
+    tool = tool_instances[0]

     # Serialize to JSON
     tool_json = tool.model_dump_json()
@@ -33,7 +33,8 @@ class Container(BaseModel):
         tool: ToolDefinition

     # Create container with tool
-    tool = FinishTool
+    tool_instances = FinishTool.create()
+    tool = tool_instances[0]
     container = Container(tool=tool)

     # Serialize to JSON
@@ -54,8 +55,10 @@ class NestedContainer(BaseModel):
         tools: list[ToolDefinition]

     # Create container with multiple tools
-    tool1 = FinishTool
-    tool2 = ThinkTool
+    tool1_instances = FinishTool.create()
+    tool1 = tool1_instances[0]
+    tool2_instances = ThinkTool.create()
+    tool2 = tool2_instances[0]
     container = NestedContainer(tools=[tool1, tool2])

     # Serialize to JSON
@@ -75,7 +78,8 @@ class NestedContainer(BaseModel):
 def test_tool_model_validate_json_dict() -> None:
     """Test that Tool.model_validate works with dict from JSON."""
     # Create tool
-    tool = FinishTool
+    tool_instances = FinishTool.create()
+    tool = tool_instances[0]

     # Serialize to JSON, then parse to dict
     tool_json = tool.model_dump_json()
@@ -101,14 +105,15 @@ def test_tool_no_fallback_behavior_json() -> None:
     }
     tool_json = json.dumps(tool_dict)

-    with pytest.raises(ValidationError):
-        ToolBase.model_validate_json(tool_json)
+    with pytest.raises(ValueError, match="Unknown kind 'UnknownToolType'"):
+        ToolDefinition.model_validate_json(tool_json)


 def test_tool_type_annotation_works_json() -> None:
     """Test that ToolType annotation works correctly with JSON."""
     # Create tool
-    tool = FinishTool
+    tool_instances = FinishTool.create()
+    tool = tool_instances[0]

     # Use ToolType annotation
     class TestModel(BaseModel):
@@ -130,7 +135,8 @@ class TestModel(BaseModel):
 def test_tool_kind_field_json() -> None:
     """Test Tool kind field is correctly set and preserved through JSON."""
     # Create tool
-    tool = FinishTool
+    tool_instances = FinishTool.create()
+    tool = tool_instances[0]

     # Check kind field
     assert hasattr(tool, "kind")
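These serialization tests revolve around a discriminator field: each ToolDefinition subclass serializes with a "kind" entry, and validation dispatches on it (hence the "Unknown kind 'UnknownToolType'" error for unregistered types). A generic sketch of that mechanism with plain pydantic discriminated unions — the FinishLike/ThinkLike classes are illustrative stand-ins, not the SDK's registry implementation:

from typing import Annotated, Literal, Union
from pydantic import BaseModel, Field, TypeAdapter

class FinishLike(BaseModel):
    kind: Literal["FinishTool"] = "FinishTool"

class ThinkLike(BaseModel):
    kind: Literal["ThinkTool"] = "ThinkTool"

# The union dispatches on the "kind" field during validation
ToolUnion = Annotated[Union[FinishLike, ThinkLike], Field(discriminator="kind")]
adapter = TypeAdapter(ToolUnion)

restored = adapter.validate_json('{"kind": "ThinkTool"}')
assert isinstance(restored, ThinkLike)  # round-trips to the right subclass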
diff --git a/tests/sdk/workspace/remote/test_remote_workspace_mixin.py b/tests/sdk/workspace/remote/test_remote_workspace_mixin.py
index 205a29f63a..a144a708aa 100644
--- a/tests/sdk/workspace/remote/test_remote_workspace_mixin.py
+++ b/tests/sdk/workspace/remote/test_remote_workspace_mixin.py
@@ -294,12 +294,15 @@ def test_file_upload_generator_basic_flow(temp_file):
     upload_response.raise_for_status = Mock()
     upload_response.json.return_value = {"success": True, "file_size": 12}

+    destination = "/remote/file.txt"
     generator = mixin._file_upload_generator(temp_file, "/remote/file.txt")

     # Get upload request
     upload_kwargs = next(generator)
     assert upload_kwargs["method"] == "POST"
-    assert upload_kwargs["url"] == "http://localhost:8000/api/file/upload"
+    assert (
+        upload_kwargs["url"] == f"http://localhost:8000/api/file/upload/{destination}"
+    )
     assert upload_kwargs["data"]["destination_path"] == "/remote/file.txt"
     assert "file" in upload_kwargs["files"]
     assert upload_kwargs["headers"] == {"X-Session-API-Key": "test-key"}
diff --git a/tests/tools/browser_use/test_browser_toolset.py b/tests/tools/browser_use/test_browser_toolset.py
index bb51b64e88..ec2c3221a7 100644
--- a/tests/tools/browser_use/test_browser_toolset.py
+++ b/tests/tools/browser_use/test_browser_toolset.py
@@ -1,20 +1,7 @@
 """Test BrowserToolSet functionality."""

 from openhands.sdk.tool import ToolDefinition
-from openhands.sdk.tool.tool import ToolBase
-from openhands.tools.browser_use import (
-    BrowserToolSet,
-    browser_click_tool,
-    browser_close_tab_tool,
-    browser_get_content_tool,
-    browser_get_state_tool,
-    browser_go_back_tool,
-    browser_list_tabs_tool,
-    browser_navigate_tool,
-    browser_scroll_tool,
-    browser_switch_tab_tool,
-    browser_type_tool,
-)
+from openhands.tools.browser_use import BrowserToolSet
 from openhands.tools.browser_use.impl import BrowserToolExecutor
@@ -39,16 +26,16 @@ def test_browser_toolset_create_includes_all_browser_tools():

     # Expected tool names based on the browser tools
     expected_names = [
-        browser_navigate_tool.name,
-        browser_click_tool.name,
-        browser_get_state_tool.name,
-        browser_get_content_tool.name,
-        browser_type_tool.name,
-        browser_scroll_tool.name,
-        browser_go_back_tool.name,
-        browser_list_tabs_tool.name,
-        browser_switch_tab_tool.name,
-        browser_close_tab_tool.name,
+        "browser_navigate",
+        "browser_click",
+        "browser_get_state",
+        "browser_get_content",
+        "browser_type",
+        "browser_scroll",
+        "browser_go_back",
+        "browser_list_tabs",
+        "browser_switch_tab",
+        "browser_close_tab",
     ]

     # Verify all expected tools are present
@@ -80,14 +67,14 @@ def test_browser_toolset_create_tools_are_properly_configured():
     # Find a specific tool to test (e.g., navigate tool)
     navigate_tool = None
     for tool in tools:
-        if tool.name == browser_navigate_tool.name:
+        if tool.name == "browser_navigate":
             navigate_tool = tool
             break

     assert navigate_tool is not None
-    assert navigate_tool.description == browser_navigate_tool.description
-    assert navigate_tool.action_type == browser_navigate_tool.action_type
-    assert navigate_tool.observation_type == browser_navigate_tool.observation_type
+    assert navigate_tool.description is not None
+    assert navigate_tool.action_type is not None
+    assert navigate_tool.observation_type is not None
     assert navigate_tool.executor is not None
@@ -134,11 +121,11 @@ def test_browser_toolset_create_no_parameters():

 def test_browser_toolset_inheritance():
     """Test that BrowserToolSet properly inherits from Tool."""
-    assert issubclass(BrowserToolSet, ToolBase)
+    assert issubclass(BrowserToolSet, ToolDefinition)

     # BrowserToolSet should not be instantiable directly (it's a factory)
     # The create method returns a list, not an instance of BrowserToolSet
     tools = BrowserToolSet.create()
     for tool in tools:
         assert not isinstance(tool, BrowserToolSet)
-        assert isinstance(tool, ToolBase)
+        assert isinstance(tool, ToolDefinition)
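Since the browser module no longer exports per-tool singletons, the only stable handle on an individual browser tool is its snake_case name string. A short sketch of how a caller might look one up after creation, assuming (as the tests above do) that the BrowserToolExecutor startup is available or mocked:

# Sketch: locate a specific tool from the toolset by its literal name
tools = BrowserToolSet.create()
by_name = {tool.name: tool for tool in tools}

navigate = by_name["browser_navigate"]  # names are now plain string literals
assert navigate.executor is not None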
diff --git a/tests/tools/delegation/test_delegation.py b/tests/tools/delegation/test_delegation.py
index d3f9d855b5..ee1a26a465 100644
--- a/tests/tools/delegation/test_delegation.py
+++ b/tests/tools/delegation/test_delegation.py
@@ -5,7 +5,7 @@
 from pydantic import SecretStr

-from openhands.sdk.conversation.state import AgentExecutionStatus
+from openhands.sdk.conversation.state import ConversationExecutionStatus
 from openhands.sdk.llm import LLM
 from openhands.tools.delegate import (
     DelegateAction,
@@ -38,7 +38,7 @@ def create_mock_conversation():
     """Helper to create a mock conversation."""
     mock_conv = MagicMock()
     mock_conv.id = str(uuid.uuid4())
-    mock_conv.state.agent_status = AgentExecutionStatus.FINISHED
+    mock_conv.state.execution_status = ConversationExecutionStatus.FINISHED
     return mock_conv
diff --git a/tests/tools/execute_bash/test_bash_tool.py b/tests/tools/execute_bash/test_bash_tool.py
index 985b4d94e2..61aed7fee2 100644
--- a/tests/tools/execute_bash/test_bash_tool.py
+++ b/tests/tools/execute_bash/test_bash_tool.py
@@ -35,7 +35,7 @@ def test_bash_tool_initialization():
     tool = tools[0]

     # Check that the tool has the correct name and properties
-    assert tool.name == "execute_bash"
+    assert tool.name == "bash"
     assert tool.executor is not None
     assert tool.action_type == ExecuteBashAction
@@ -48,7 +48,7 @@ def test_bash_tool_with_username():
     tool = tools[0]

     # Check that the tool has the correct name and properties
-    assert tool.name == "execute_bash"
+    assert tool.name == "bash"
     assert tool.executor is not None
     assert tool.action_type == ExecuteBashAction
@@ -102,6 +102,6 @@ def test_bash_tool_to_openai_tool():
     # Check the format
     assert openai_tool["type"] == "function"
-    assert openai_tool["function"]["name"] == "execute_bash"
+    assert openai_tool["function"]["name"] == "bash"
     assert "description" in openai_tool["function"]
     assert "parameters" in openai_tool["function"]
diff --git a/tests/tools/execute_bash/test_bash_tool_auto_detection.py b/tests/tools/execute_bash/test_bash_tool_auto_detection.py
index 500595f38f..5dcb10427b 100644
--- a/tests/tools/execute_bash/test_bash_tool_auto_detection.py
+++ b/tests/tools/execute_bash/test_bash_tool_auto_detection.py
@@ -148,7 +148,7 @@ def test_tool_metadata():
         tools = BashTool.create(_create_conv_state(temp_dir))
         tool = tools[0]

-        assert tool.name == "execute_bash"
+        assert tool.name == "bash"
         assert tool.description is not None
         assert tool.action_type == ExecuteBashAction
         assert hasattr(tool, "annotations")
diff --git a/tests/tools/execute_bash/test_schema.py b/tests/tools/execute_bash/test_schema.py
index 86a821168a..829672d500 100644
--- a/tests/tools/execute_bash/test_schema.py
+++ b/tests/tools/execute_bash/test_schema.py
@@ -1,9 +1,14 @@
-from openhands.tools.execute_bash import execute_bash_tool
+from openhands.tools.execute_bash import BashTool


-def test_to_mcp_tool_detailed_type_validation_bash():
+def test_to_mcp_tool_detailed_type_validation_bash(mock_conversation_state):
    """Test detailed type validation for MCP tool schema generation (execute_bash)."""  # noqa: E501
+    execute_bash_tool = BashTool.create(conv_state=mock_conversation_state)
+    assert len(execute_bash_tool) == 1
+    execute_bash_tool = execute_bash_tool[0]
+    assert isinstance(execute_bash_tool, BashTool)
+
     # Test execute_bash tool schema
     bash_mcp = execute_bash_tool.to_mcp_tool()
     bash_schema = bash_mcp["inputSchema"]
diff --git a/tests/tools/execute_bash/test_secrets_masking.py b/tests/tools/execute_bash/test_secrets_masking.py
index fcdebeae66..2fa56d2777 100644
--- a/tests/tools/execute_bash/test_secrets_masking.py
+++ b/tests/tools/execute_bash/test_secrets_masking.py
@@ -32,7 +32,7 @@ def test_bash_executor_without_conversation():

 def test_bash_executor_with_conversation_secrets():
-    """Test that BashExecutor uses secrets from conversation.state.secrets_manager."""
+    """Test that BashExecutor uses secrets from conversation.state.secret_registry."""
     with tempfile.TemporaryDirectory() as temp_dir:
         # Create a conversation with secrets
         llm = LLM(
@@ -65,6 +65,7 @@
             output="Token: secret-value-123, Key: another-secret-456",
         )
         mock_session.execute.return_value = mock_observation
+        mock_session._closed = False
         executor.session = mock_session

         # Execute command with conversation - secrets should be exported and masked
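Across all of the bash tests the visible contract is the same: BashTool.create(conv_state) returns a single tool whose name is now "bash" rather than "execute_bash", and that name flows through to the OpenAI and MCP schemas. A sketch of the one-line adjustment any downstream call site would need, with conv_state assumed to be built the same way as in the tests above:

# Sketch of a downstream call-site after the rename; conv_state is assumed
tools = BashTool.create(conv_state)
tool = tools[0]
assert tool.name == "bash"  # was "execute_bash" before this change

openai_tool = tool.to_openai_tool()
assert openai_tool["function"]["name"] == "bash"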
diff --git a/tests/tools/file_editor/test_file_editor_tool.py b/tests/tools/file_editor/test_file_editor_tool.py
index 31c2b9ee1c..dafce1c4d3 100644
--- a/tests/tools/file_editor/test_file_editor_tool.py
+++ b/tests/tools/file_editor/test_file_editor_tool.py
@@ -36,7 +36,7 @@ def test_file_editor_tool_initialization():
     tool = tools[0]

     # Check that the tool has the correct name and properties
-    assert tool.name == "str_replace_editor"
+    assert tool.name == "file_editor"
     assert tool.executor is not None
     assert issubclass(tool.action_type, FileEditorAction)
@@ -147,7 +147,7 @@ def test_file_editor_tool_to_openai_tool():

     # Check the format
     assert openai_tool["type"] == "function"
-    assert openai_tool["function"]["name"] == "str_replace_editor"
+    assert openai_tool["function"]["name"] == "file_editor"
     assert "description" in openai_tool["function"]
     assert "parameters" in openai_tool["function"]
diff --git a/tests/tools/file_editor/test_schema.py b/tests/tools/file_editor/test_schema.py
index 3fc526226e..deee278d30 100644
--- a/tests/tools/file_editor/test_schema.py
+++ b/tests/tools/file_editor/test_schema.py
@@ -1,9 +1,14 @@
-from openhands.tools.file_editor import file_editor_tool
+from openhands.tools.file_editor import FileEditorTool


-def test_to_mcp_tool_detailed_type_validation_editor():
+def test_to_mcp_tool_detailed_type_validation_editor(mock_conversation_state):
     """Test detailed type validation for MCP tool schema generation."""
+    file_editor_tool = FileEditorTool.create(conv_state=mock_conversation_state)
+    assert len(file_editor_tool) == 1
+    file_editor_tool = file_editor_tool[0]
+    assert isinstance(file_editor_tool, FileEditorTool)
+
     # Test file_editor tool schema
     str_editor_mcp = file_editor_tool.to_mcp_tool()
     str_editor_schema = str_editor_mcp["inputSchema"]
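Both schema tests now take a mock_conversation_state fixture instead of importing a module-level singleton tool. The fixture itself is not shown in this diff; a conftest-style sketch of what such a fixture might look like follows — its body is an assumption, since the tests above only require an object that BashTool.create() and FileEditorTool.create() will accept:

import pytest
from unittest.mock import MagicMock

@pytest.fixture
def mock_conversation_state(tmp_path):
    # Hypothetical fixture body; the real one lives in the repo's conftest.py.
    # It only needs to satisfy what create(conv_state=...) reads from it.
    state = MagicMock()
    state.workspace.working_dir = str(tmp_path)
    return state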
"planning_file_editor", + } + + for tool_class, expected_name in expected_names.items(): + assert tool_class.name == expected_name, ( + f"{tool_class.__name__}.name should be '{expected_name}'" + ) + + +def test_tool_name_accessible_at_class_level(): + """Test that name can be accessed at the class level without instantiation.""" + # This should not raise any errors and should return snake_case names + assert BashTool.name == "bash" + assert FileEditorTool.name == "file_editor" + assert TaskTrackerTool.name == "task_tracker" + assert BrowserToolSet.name == "browser_tool_set" + assert GrepTool.name == "grep" + assert GlobTool.name == "glob" + assert PlanningFileEditorTool.name == "planning_file_editor" diff --git a/uv.lock b/uv.lock index 57671df86b..33f043b1ee 100644 --- a/uv.lock +++ b/uv.lock @@ -1089,6 +1089,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/0d/6660d55f7373b2ff8152401a83e02084956da23ae58cddbfb0b330978fe9/greenlet-3.2.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3b3812d8d0c9579967815af437d96623f45c0f2ae5f04e366de62a12d83a8fb0", size = 607586, upload-time = "2025-08-07T13:18:28.544Z" }, { url = "https://files.pythonhosted.org/packages/8e/1a/c953fdedd22d81ee4629afbb38d2f9d71e37d23caace44775a3a969147d4/greenlet-3.2.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:abbf57b5a870d30c4675928c37278493044d7c14378350b3aa5d484fa65575f0", size = 1123281, upload-time = "2025-08-07T13:42:39.858Z" }, { url = "https://files.pythonhosted.org/packages/3f/c7/12381b18e21aef2c6bd3a636da1088b888b97b7a0362fac2e4de92405f97/greenlet-3.2.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:20fb936b4652b6e307b8f347665e2c615540d4b42b3b4c8a321d8286da7e520f", size = 1151142, upload-time = "2025-08-07T13:18:22.981Z" }, + { url = "https://files.pythonhosted.org/packages/27/45/80935968b53cfd3f33cf99ea5f08227f2646e044568c9b1555b58ffd61c2/greenlet-3.2.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ee7a6ec486883397d70eec05059353b8e83eca9168b9f3f9a361971e77e0bcd0", size = 1564846, upload-time = "2025-11-04T12:42:15.191Z" }, + { url = "https://files.pythonhosted.org/packages/69/02/b7c30e5e04752cb4db6202a3858b149c0710e5453b71a3b2aec5d78a1aab/greenlet-3.2.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:326d234cbf337c9c3def0676412eb7040a35a768efc92504b947b3e9cfc7543d", size = 1633814, upload-time = "2025-11-04T12:42:17.175Z" }, { url = "https://files.pythonhosted.org/packages/e9/08/b0814846b79399e585f974bbeebf5580fbe59e258ea7be64d9dfb253c84f/greenlet-3.2.4-cp312-cp312-win_amd64.whl", hash = "sha256:a7d4e128405eea3814a12cc2605e0e6aedb4035bf32697f72deca74de4105e02", size = 299899, upload-time = "2025-08-07T13:38:53.448Z" }, { url = "https://files.pythonhosted.org/packages/49/e8/58c7f85958bda41dafea50497cbd59738c5c43dbbea5ee83d651234398f4/greenlet-3.2.4-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:1a921e542453fe531144e91e1feedf12e07351b1cf6c9e8a3325ea600a715a31", size = 272814, upload-time = "2025-08-07T13:15:50.011Z" }, { url = "https://files.pythonhosted.org/packages/62/dd/b9f59862e9e257a16e4e610480cfffd29e3fae018a68c2332090b53aac3d/greenlet-3.2.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd3c8e693bff0fff6ba55f140bf390fa92c994083f838fece0f63be121334945", size = 641073, upload-time = "2025-08-07T13:42:57.23Z" }, @@ -1098,6 +1100,8 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, { url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" }, + { url = "https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" }, + { url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" }, { url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" }, { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, @@ -1105,6 +1109,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = "https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = 
"2025-08-07T13:18:33.969Z" }, + { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, + { url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760, upload-time = "2025-11-04T12:42:25.341Z" }, { url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" }, ] @@ -1125,6 +1131,47 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9d/08/24d62fccb01c4e86c59ba79073af7e5c8ab643846823c2fa3e957bde4b58/groq-0.32.0-py3-none-any.whl", hash = "sha256:0ed0be290042f8826f851f3a1defaac4f979dcfce86ec4a0681a23af00ec800b", size = 135387, upload-time = "2025-09-27T23:01:33.223Z" }, ] +[[package]] +name = "grpcio" +version = "1.75.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9d/f7/8963848164c7604efb3a3e6ee457fdb3a469653e19002bd24742473254f8/grpcio-1.75.1.tar.gz", hash = "sha256:3e81d89ece99b9ace23a6916880baca613c03a799925afb2857887efa8b1b3d2", size = 12731327, upload-time = "2025-09-26T09:03:36.887Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/81/42be79e73a50aaa20af66731c2defeb0e8c9008d9935a64dd8ea8e8c44eb/grpcio-1.75.1-cp312-cp312-linux_armv7l.whl", hash = "sha256:7b888b33cd14085d86176b1628ad2fcbff94cfbbe7809465097aa0132e58b018", size = 5668314, upload-time = "2025-09-26T09:01:55.424Z" }, + { url = "https://files.pythonhosted.org/packages/c5/a7/3686ed15822fedc58c22f82b3a7403d9faf38d7c33de46d4de6f06e49426/grpcio-1.75.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:8775036efe4ad2085975531d221535329f5dac99b6c2a854a995456098f99546", size = 11476125, upload-time = "2025-09-26T09:01:57.927Z" }, + { url = "https://files.pythonhosted.org/packages/14/85/21c71d674f03345ab183c634ecd889d3330177e27baea8d5d247a89b6442/grpcio-1.75.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bb658f703468d7fbb5dcc4037c65391b7dc34f808ac46ed9136c24fc5eeb041d", size = 6246335, upload-time = "2025-09-26T09:02:00.76Z" }, + { url = "https://files.pythonhosted.org/packages/fd/db/3beb661bc56a385ae4fa6b0e70f6b91ac99d47afb726fe76aaff87ebb116/grpcio-1.75.1-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:4b7177a1cdb3c51b02b0c0a256b0a72fdab719600a693e0e9037949efffb200b", size = 6916309, upload-time = "2025-09-26T09:02:02.894Z" }, + { url = "https://files.pythonhosted.org/packages/1e/9c/eda9fe57f2b84343d44c1b66cf3831c973ba29b078b16a27d4587a1fdd47/grpcio-1.75.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7d4fa6ccc3ec2e68a04f7b883d354d7fea22a34c44ce535a2f0c0049cf626ddf", size = 6435419, upload-time = "2025-09-26T09:02:05.055Z" }, + { url = "https://files.pythonhosted.org/packages/c3/b8/090c98983e0a9d602e3f919a6e2d4e470a8b489452905f9a0fa472cac059/grpcio-1.75.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:3d86880ecaeb5b2f0a8afa63824de93adb8ebe4e49d0e51442532f4e08add7d6", size = 7064893, upload-time = "2025-09-26T09:02:07.275Z" }, + { url = "https://files.pythonhosted.org/packages/ec/c0/6d53d4dbbd00f8bd81571f5478d8a95528b716e0eddb4217cc7cb45aae5f/grpcio-1.75.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a8041d2f9e8a742aeae96f4b047ee44e73619f4f9d24565e84d5446c623673b6", size = 8011922, upload-time = "2025-09-26T09:02:09.527Z" }, + { url = "https://files.pythonhosted.org/packages/f2/7c/48455b2d0c5949678d6982c3e31ea4d89df4e16131b03f7d5c590811cbe9/grpcio-1.75.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3652516048bf4c314ce12be37423c79829f46efffb390ad64149a10c6071e8de", size = 7466181, upload-time = "2025-09-26T09:02:12.279Z" }, + { url = "https://files.pythonhosted.org/packages/fd/12/04a0e79081e3170b6124f8cba9b6275871276be06c156ef981033f691880/grpcio-1.75.1-cp312-cp312-win32.whl", hash = "sha256:44b62345d8403975513af88da2f3d5cc76f73ca538ba46596f92a127c2aea945", size = 3938543, upload-time = "2025-09-26T09:02:14.77Z" }, + { url = "https://files.pythonhosted.org/packages/5f/d7/11350d9d7fb5adc73d2b0ebf6ac1cc70135577701e607407fe6739a90021/grpcio-1.75.1-cp312-cp312-win_amd64.whl", hash = "sha256:b1e191c5c465fa777d4cafbaacf0c01e0d5278022082c0abbd2ee1d6454ed94d", size = 4641938, upload-time = "2025-09-26T09:02:16.927Z" }, + { url = "https://files.pythonhosted.org/packages/46/74/bac4ab9f7722164afdf263ae31ba97b8174c667153510322a5eba4194c32/grpcio-1.75.1-cp313-cp313-linux_armv7l.whl", hash = "sha256:3bed22e750d91d53d9e31e0af35a7b0b51367e974e14a4ff229db5b207647884", size = 5672779, upload-time = "2025-09-26T09:02:19.11Z" }, + { url = "https://files.pythonhosted.org/packages/a6/52/d0483cfa667cddaa294e3ab88fd2c2a6e9dc1a1928c0e5911e2e54bd5b50/grpcio-1.75.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:5b8f381eadcd6ecaa143a21e9e80a26424c76a0a9b3d546febe6648f3a36a5ac", size = 11470623, upload-time = "2025-09-26T09:02:22.117Z" }, + { url = "https://files.pythonhosted.org/packages/cf/e4/d1954dce2972e32384db6a30273275e8c8ea5a44b80347f9055589333b3f/grpcio-1.75.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5bf4001d3293e3414d0cf99ff9b1139106e57c3a66dfff0c5f60b2a6286ec133", size = 6248838, upload-time = "2025-09-26T09:02:26.426Z" }, + { url = "https://files.pythonhosted.org/packages/06/43/073363bf63826ba8077c335d797a8d026f129dc0912b69c42feaf8f0cd26/grpcio-1.75.1-cp313-cp313-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:9f82ff474103e26351dacfe8d50214e7c9322960d8d07ba7fa1d05ff981c8b2d", size = 6922663, upload-time = "2025-09-26T09:02:28.724Z" }, + { url = "https://files.pythonhosted.org/packages/c2/6f/076ac0df6c359117676cacfa8a377e2abcecec6a6599a15a672d331f6680/grpcio-1.75.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0ee119f4f88d9f75414217823d21d75bfe0e6ed40135b0cbbfc6376bc9f7757d", size = 6436149, upload-time = "2025-09-26T09:02:30.971Z" }, + { url = "https://files.pythonhosted.org/packages/6b/27/1d08824f1d573fcb1fa35ede40d6020e68a04391709939e1c6f4193b445f/grpcio-1.75.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:664eecc3abe6d916fa6cf8dd6b778e62fb264a70f3430a3180995bf2da935446", size = 7067989, upload-time = "2025-09-26T09:02:33.233Z" }, + { url = "https://files.pythonhosted.org/packages/c6/98/98594cf97b8713feb06a8cb04eeef60b4757e3e2fb91aa0d9161da769843/grpcio-1.75.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:c32193fa08b2fbebf08fe08e84f8a0aad32d87c3ad42999c65e9449871b1c66e", size = 
8010717, upload-time = "2025-09-26T09:02:36.011Z" }, + { url = "https://files.pythonhosted.org/packages/8c/7e/bb80b1bba03c12158f9254762cdf5cced4a9bc2e8ed51ed335915a5a06ef/grpcio-1.75.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5cebe13088b9254f6e615bcf1da9131d46cfa4e88039454aca9cb65f639bd3bc", size = 7463822, upload-time = "2025-09-26T09:02:38.26Z" }, + { url = "https://files.pythonhosted.org/packages/23/1c/1ea57fdc06927eb5640f6750c697f596f26183573069189eeaf6ef86ba2d/grpcio-1.75.1-cp313-cp313-win32.whl", hash = "sha256:4b4c678e7ed50f8ae8b8dbad15a865ee73ce12668b6aaf411bf3258b5bc3f970", size = 3938490, upload-time = "2025-09-26T09:02:40.268Z" }, + { url = "https://files.pythonhosted.org/packages/4b/24/fbb8ff1ccadfbf78ad2401c41aceaf02b0d782c084530d8871ddd69a2d49/grpcio-1.75.1-cp313-cp313-win_amd64.whl", hash = "sha256:5573f51e3f296a1bcf71e7a690c092845fb223072120f4bdb7a5b48e111def66", size = 4642538, upload-time = "2025-09-26T09:02:42.519Z" }, + { url = "https://files.pythonhosted.org/packages/f2/1b/9a0a5cecd24302b9fdbcd55d15ed6267e5f3d5b898ff9ac8cbe17ee76129/grpcio-1.75.1-cp314-cp314-linux_armv7l.whl", hash = "sha256:c05da79068dd96723793bffc8d0e64c45f316248417515f28d22204d9dae51c7", size = 5673319, upload-time = "2025-09-26T09:02:44.742Z" }, + { url = "https://files.pythonhosted.org/packages/c6/ec/9d6959429a83fbf5df8549c591a8a52bb313976f6646b79852c4884e3225/grpcio-1.75.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:06373a94fd16ec287116a825161dca179a0402d0c60674ceeec8c9fba344fe66", size = 11480347, upload-time = "2025-09-26T09:02:47.539Z" }, + { url = "https://files.pythonhosted.org/packages/09/7a/26da709e42c4565c3d7bf999a9569da96243ce34a8271a968dee810a7cf1/grpcio-1.75.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4484f4b7287bdaa7a5b3980f3c7224c3c622669405d20f69549f5fb956ad0421", size = 6254706, upload-time = "2025-09-26T09:02:50.4Z" }, + { url = "https://files.pythonhosted.org/packages/f1/08/dcb26a319d3725f199c97e671d904d84ee5680de57d74c566a991cfab632/grpcio-1.75.1-cp314-cp314-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:2720c239c1180eee69f7883c1d4c83fc1a495a2535b5fa322887c70bf02b16e8", size = 6922501, upload-time = "2025-09-26T09:02:52.711Z" }, + { url = "https://files.pythonhosted.org/packages/78/66/044d412c98408a5e23cb348845979a2d17a2e2b6c3c34c1ec91b920f49d0/grpcio-1.75.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:07a554fa31c668cf0e7a188678ceeca3cb8fead29bbe455352e712ec33ca701c", size = 6437492, upload-time = "2025-09-26T09:02:55.542Z" }, + { url = "https://files.pythonhosted.org/packages/4e/9d/5e3e362815152aa1afd8b26ea613effa005962f9da0eec6e0e4527e7a7d1/grpcio-1.75.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:3e71a2105210366bfc398eef7f57a664df99194f3520edb88b9c3a7e46ee0d64", size = 7081061, upload-time = "2025-09-26T09:02:58.261Z" }, + { url = "https://files.pythonhosted.org/packages/1e/1a/46615682a19e100f46e31ddba9ebc297c5a5ab9ddb47b35443ffadb8776c/grpcio-1.75.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:8679aa8a5b67976776d3c6b0521e99d1c34db8a312a12bcfd78a7085cb9b604e", size = 8010849, upload-time = "2025-09-26T09:03:00.548Z" }, + { url = "https://files.pythonhosted.org/packages/67/8e/3204b94ac30b0f675ab1c06540ab5578660dc8b690db71854d3116f20d00/grpcio-1.75.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:aad1c774f4ebf0696a7f148a56d39a3432550612597331792528895258966dc0", size = 7464478, upload-time = "2025-09-26T09:03:03.096Z" }, + { url = 
"https://files.pythonhosted.org/packages/b7/97/2d90652b213863b2cf466d9c1260ca7e7b67a16780431b3eb1d0420e3d5b/grpcio-1.75.1-cp314-cp314-win32.whl", hash = "sha256:62ce42d9994446b307649cb2a23335fa8e927f7ab2cbf5fcb844d6acb4d85f9c", size = 4012672, upload-time = "2025-09-26T09:03:05.477Z" }, + { url = "https://files.pythonhosted.org/packages/f9/df/e2e6e9fc1c985cd1a59e6996a05647c720fe8a03b92f5ec2d60d366c531e/grpcio-1.75.1-cp314-cp314-win_amd64.whl", hash = "sha256:f86e92275710bea3000cb79feca1762dc0ad3b27830dd1a74e82ab321d4ee464", size = 4772475, upload-time = "2025-09-26T09:03:07.661Z" }, +] + [[package]] name = "h11" version = "0.16.0" @@ -1440,6 +1487,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fc/fb/38a48efe3e05a8e9a9765b991740282e0358a83fb896ec00d70bf1448791/litellm-1.78.0-py3-none-any.whl", hash = "sha256:a9d6deee882de8df38ca24beb930689f49209340137ff8a3dcab0c5fc4a0513d", size = 9677983, upload-time = "2025-10-11T19:28:23.242Z" }, ] +[[package]] +name = "lmnr" +version = "0.7.20" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "grpcio" }, + { name = "httpx" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-grpc" }, + { name = "opentelemetry-exporter-otlp-proto-http" }, + { name = "opentelemetry-instrumentation" }, + { name = "opentelemetry-instrumentation-threading" }, + { name = "opentelemetry-sdk" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "opentelemetry-semantic-conventions-ai" }, + { name = "orjson" }, + { name = "packaging" }, + { name = "pydantic" }, + { name = "python-dotenv" }, + { name = "tenacity" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d4/c0/996403cc2f6967881a42af4b27ff8931956d57ab3ed2d8bf11e5b37aed40/lmnr-0.7.20.tar.gz", hash = "sha256:1f484cd618db2d71af65f90a0b8b36d20d80dc91a5138b811575c8677bf7c4fd", size = 194075, upload-time = "2025-11-04T16:53:34.49Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b5/df/4665a3931b2fbc5f5b66e4906ffab106f3f65ab7e78732ecdaf3ba4a3076/lmnr-0.7.20-py3-none-any.whl", hash = "sha256:5f9fa7444e6f96c25e097f66484ff29e632bdd1de0e9346948bf5595f4a8af38", size = 247465, upload-time = "2025-11-04T16:53:32.713Z" }, +] + [[package]] name = "macholib" version = "1.16.3" @@ -1867,7 +1941,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.0.0a5" +version = "1.0.0a6" source = { editable = "openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -1896,12 +1970,13 @@ requires-dist = [ [[package]] name = "openhands-sdk" -version = "1.0.0a5" +version = "1.0.0a6" source = { editable = "openhands-sdk" } dependencies = [ { name = "fastmcp" }, { name = "httpx" }, { name = "litellm" }, + { name = "lmnr" }, { name = "pydantic" }, { name = "python-frontmatter" }, { name = "python-json-logger" }, @@ -1920,6 +1995,7 @@ requires-dist = [ { name = "fastmcp", specifier = ">=2.11.3" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "litellm", specifier = ">=1.77.7.dev9" }, + { name = "lmnr", specifier = ">=0.7.20" }, { name = "pydantic", specifier = ">=2.11.7" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, { name = "python-json-logger", specifier = ">=3.3.0" }, @@ -1930,7 +2006,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.0.0a5" +version = "1.0.0a6" source = { editable = "openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -1957,7 +2033,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = 
"1.0.0a5" +version = "1.0.0a6" source = { editable = "openhands-workspace" } dependencies = [ { name = "openhands-sdk" }, @@ -1970,6 +2046,193 @@ requires-dist = [ { name = "pydantic", specifier = ">=2.11.7" }, ] +[[package]] +name = "opentelemetry-api" +version = "1.37.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "importlib-metadata" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/63/04/05040d7ce33a907a2a02257e601992f0cdf11c73b33f13c4492bf6c3d6d5/opentelemetry_api-1.37.0.tar.gz", hash = "sha256:540735b120355bd5112738ea53621f8d5edb35ebcd6fe21ada3ab1c61d1cd9a7", size = 64923, upload-time = "2025-09-11T10:29:01.662Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/91/48/28ed9e55dcf2f453128df738210a980e09f4e468a456fa3c763dbc8be70a/opentelemetry_api-1.37.0-py3-none-any.whl", hash = "sha256:accf2024d3e89faec14302213bc39550ec0f4095d1cf5ca688e1bfb1c8612f47", size = 65732, upload-time = "2025-09-11T10:28:41.826Z" }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-common" +version = "1.37.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-proto" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dc/6c/10018cbcc1e6fff23aac67d7fd977c3d692dbe5f9ef9bb4db5c1268726cc/opentelemetry_exporter_otlp_proto_common-1.37.0.tar.gz", hash = "sha256:c87a1bdd9f41fdc408d9cc9367bb53f8d2602829659f2b90be9f9d79d0bfe62c", size = 20430, upload-time = "2025-09-11T10:29:03.605Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/13/b4ef09837409a777f3c0af2a5b4ba9b7af34872bc43609dda0c209e4060d/opentelemetry_exporter_otlp_proto_common-1.37.0-py3-none-any.whl", hash = "sha256:53038428449c559b0c564b8d718df3314da387109c4d36bd1b94c9a641b0292e", size = 18359, upload-time = "2025-09-11T10:28:44.939Z" }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-grpc" +version = "1.37.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "googleapis-common-protos" }, + { name = "grpcio" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-common" }, + { name = "opentelemetry-proto" }, + { name = "opentelemetry-sdk" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/11/4ad0979d0bb13ae5a845214e97c8d42da43980034c30d6f72d8e0ebe580e/opentelemetry_exporter_otlp_proto_grpc-1.37.0.tar.gz", hash = "sha256:f55bcb9fc848ce05ad3dd954058bc7b126624d22c4d9e958da24d8537763bec5", size = 24465, upload-time = "2025-09-11T10:29:04.172Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/39/17/46630b74751031a658706bef23ac99cdc2953cd3b2d28ec90590a0766b3e/opentelemetry_exporter_otlp_proto_grpc-1.37.0-py3-none-any.whl", hash = "sha256:aee5104835bf7993b7ddaaf380b6467472abaedb1f1dbfcc54a52a7d781a3890", size = 19305, upload-time = "2025-09-11T10:28:45.776Z" }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-http" +version = "1.37.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "googleapis-common-protos" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-common" }, + { name = "opentelemetry-proto" }, + { name = "opentelemetry-sdk" }, + { name = "requests" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5d/e3/6e320aeb24f951449e73867e53c55542bebbaf24faeee7623ef677d66736/opentelemetry_exporter_otlp_proto_http-1.37.0.tar.gz", 
hash = "sha256:e52e8600f1720d6de298419a802108a8f5afa63c96809ff83becb03f874e44ac", size = 17281, upload-time = "2025-09-11T10:29:04.844Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/e9/70d74a664d83976556cec395d6bfedd9b85ec1498b778367d5f93e373397/opentelemetry_exporter_otlp_proto_http-1.37.0-py3-none-any.whl", hash = "sha256:54c42b39945a6cc9d9a2a33decb876eabb9547e0dcb49df090122773447f1aef", size = 19576, upload-time = "2025-09-11T10:28:46.726Z" }, +] + +[[package]] +name = "opentelemetry-instrumentation" +version = "0.58b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "packaging" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f6/36/7c307d9be8ce4ee7beb86d7f1d31027f2a6a89228240405a858d6e4d64f9/opentelemetry_instrumentation-0.58b0.tar.gz", hash = "sha256:df640f3ac715a3e05af145c18f527f4422c6ab6c467e40bd24d2ad75a00cb705", size = 31549, upload-time = "2025-09-11T11:42:14.084Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d4/db/5ff1cd6c5ca1d12ecf1b73be16fbb2a8af2114ee46d4b0e6d4b23f4f4db7/opentelemetry_instrumentation-0.58b0-py3-none-any.whl", hash = "sha256:50f97ac03100676c9f7fc28197f8240c7290ca1baa12da8bfbb9a1de4f34cc45", size = 33019, upload-time = "2025-09-11T11:41:00.624Z" }, +] + +[[package]] +name = "opentelemetry-instrumentation-threading" +version = "0.58b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-instrumentation" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/70/a9/3888cb0470e6eb48ea17b6802275ae71df411edd6382b9a8e8f391936fda/opentelemetry_instrumentation_threading-0.58b0.tar.gz", hash = "sha256:f68c61f77841f9ff6270176f4d496c10addbceacd782af434d705f83e4504862", size = 8770, upload-time = "2025-09-11T11:42:56.308Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a5/54/add1076cb37980e617723a96e29c84006983e8ad6fc589dde7f69ddc57d4/opentelemetry_instrumentation_threading-0.58b0-py3-none-any.whl", hash = "sha256:eacc072881006aceb5b9b6831bcdce718c67ef6f31ac0b32bd6a23a94d979b4a", size = 9312, upload-time = "2025-09-11T11:41:58.603Z" }, +] + +[[package]] +name = "opentelemetry-proto" +version = "1.37.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dd/ea/a75f36b463a36f3c5a10c0b5292c58b31dbdde74f6f905d3d0ab2313987b/opentelemetry_proto-1.37.0.tar.gz", hash = "sha256:30f5c494faf66f77faeaefa35ed4443c5edb3b0aa46dad073ed7210e1a789538", size = 46151, upload-time = "2025-09-11T10:29:11.04Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c4/25/f89ea66c59bd7687e218361826c969443c4fa15dfe89733f3bf1e2a9e971/opentelemetry_proto-1.37.0-py3-none-any.whl", hash = "sha256:8ed8c066ae8828bbf0c39229979bdf583a126981142378a9cbe9d6fd5701c6e2", size = 72534, upload-time = "2025-09-11T10:28:56.831Z" }, +] + +[[package]] +name = "opentelemetry-sdk" +version = "1.37.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f4/62/2e0ca80d7fe94f0b193135375da92c640d15fe81f636658d2acf373086bc/opentelemetry_sdk-1.37.0.tar.gz", hash = 
"sha256:cc8e089c10953ded765b5ab5669b198bbe0af1b3f89f1007d19acd32dc46dda5", size = 170404, upload-time = "2025-09-11T10:29:11.779Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/62/9f4ad6a54126fb00f7ed4bb5034964c6e4f00fcd5a905e115bd22707e20d/opentelemetry_sdk-1.37.0-py3-none-any.whl", hash = "sha256:8f3c3c22063e52475c5dbced7209495c2c16723d016d39287dfc215d1771257c", size = 131941, upload-time = "2025-09-11T10:28:57.83Z" }, +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.58b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/aa/1b/90701d91e6300d9f2fb352153fb1721ed99ed1f6ea14fa992c756016e63a/opentelemetry_semantic_conventions-0.58b0.tar.gz", hash = "sha256:6bd46f51264279c433755767bb44ad00f1c9e2367e1b42af563372c5a6fa0c25", size = 129867, upload-time = "2025-09-11T10:29:12.597Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/90/68152b7465f50285d3ce2481b3aec2f82822e3f52e5152eeeaf516bab841/opentelemetry_semantic_conventions-0.58b0-py3-none-any.whl", hash = "sha256:5564905ab1458b96684db1340232729fce3b5375a06e140e8904c78e4f815b28", size = 207954, upload-time = "2025-09-11T10:28:59.218Z" }, +] + +[[package]] +name = "opentelemetry-semantic-conventions-ai" +version = "0.4.13" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ba/e6/40b59eda51ac47009fb47afcdf37c6938594a0bd7f3b9fadcbc6058248e3/opentelemetry_semantic_conventions_ai-0.4.13.tar.gz", hash = "sha256:94efa9fb4ffac18c45f54a3a338ffeb7eedb7e1bb4d147786e77202e159f0036", size = 5368, upload-time = "2025-08-22T10:14:17.387Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/35/b5/cf25da2218910f0d6cdf7f876a06bed118c4969eacaf60a887cbaef44f44/opentelemetry_semantic_conventions_ai-0.4.13-py3-none-any.whl", hash = "sha256:883a30a6bb5deaec0d646912b5f9f6dcbb9f6f72557b73d0f2560bf25d13e2d5", size = 6080, upload-time = "2025-08-22T10:14:16.477Z" }, +] + +[[package]] +name = "orjson" +version = "3.11.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/be/4d/8df5f83256a809c22c4d6792ce8d43bb503be0fb7a8e4da9025754b09658/orjson-3.11.3.tar.gz", hash = "sha256:1c0603b1d2ffcd43a411d64797a19556ef76958aef1c182f22dc30860152a98a", size = 5482394, upload-time = "2025-08-26T17:46:43.171Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3d/b0/a7edab2a00cdcb2688e1c943401cb3236323e7bfd2839815c6131a3742f4/orjson-3.11.3-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:8c752089db84333e36d754c4baf19c0e1437012242048439c7e80eb0e6426e3b", size = 238259, upload-time = "2025-08-26T17:45:15.093Z" }, + { url = "https://files.pythonhosted.org/packages/e1/c6/ff4865a9cc398a07a83342713b5932e4dc3cb4bf4bc04e8f83dedfc0d736/orjson-3.11.3-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:9b8761b6cf04a856eb544acdd82fc594b978f12ac3602d6374a7edb9d86fd2c2", size = 127633, upload-time = "2025-08-26T17:45:16.417Z" }, + { url = "https://files.pythonhosted.org/packages/6e/e6/e00bea2d9472f44fe8794f523e548ce0ad51eb9693cf538a753a27b8bda4/orjson-3.11.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b13974dc8ac6ba22feaa867fc19135a3e01a134b4f7c9c28162fed4d615008a", size = 123061, upload-time = "2025-08-26T17:45:17.673Z" }, + { url = 
"https://files.pythonhosted.org/packages/54/31/9fbb78b8e1eb3ac605467cb846e1c08d0588506028b37f4ee21f978a51d4/orjson-3.11.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f83abab5bacb76d9c821fd5c07728ff224ed0e52d7a71b7b3de822f3df04e15c", size = 127956, upload-time = "2025-08-26T17:45:19.172Z" }, + { url = "https://files.pythonhosted.org/packages/36/88/b0604c22af1eed9f98d709a96302006915cfd724a7ebd27d6dd11c22d80b/orjson-3.11.3-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e6fbaf48a744b94091a56c62897b27c31ee2da93d826aa5b207131a1e13d4064", size = 130790, upload-time = "2025-08-26T17:45:20.586Z" }, + { url = "https://files.pythonhosted.org/packages/0e/9d/1c1238ae9fffbfed51ba1e507731b3faaf6b846126a47e9649222b0fd06f/orjson-3.11.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc779b4f4bba2847d0d2940081a7b6f7b5877e05408ffbb74fa1faf4a136c424", size = 132385, upload-time = "2025-08-26T17:45:22.036Z" }, + { url = "https://files.pythonhosted.org/packages/a3/b5/c06f1b090a1c875f337e21dd71943bc9d84087f7cdf8c6e9086902c34e42/orjson-3.11.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd4b909ce4c50faa2192da6bb684d9848d4510b736b0611b6ab4020ea6fd2d23", size = 135305, upload-time = "2025-08-26T17:45:23.4Z" }, + { url = "https://files.pythonhosted.org/packages/a0/26/5f028c7d81ad2ebbf84414ba6d6c9cac03f22f5cd0d01eb40fb2d6a06b07/orjson-3.11.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:524b765ad888dc5518bbce12c77c2e83dee1ed6b0992c1790cc5fb49bb4b6667", size = 132875, upload-time = "2025-08-26T17:45:25.182Z" }, + { url = "https://files.pythonhosted.org/packages/fe/d4/b8df70d9cfb56e385bf39b4e915298f9ae6c61454c8154a0f5fd7efcd42e/orjson-3.11.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:84fd82870b97ae3cdcea9d8746e592b6d40e1e4d4527835fc520c588d2ded04f", size = 130940, upload-time = "2025-08-26T17:45:27.209Z" }, + { url = "https://files.pythonhosted.org/packages/da/5e/afe6a052ebc1a4741c792dd96e9f65bf3939d2094e8b356503b68d48f9f5/orjson-3.11.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:fbecb9709111be913ae6879b07bafd4b0785b44c1eb5cac8ac76da048b3885a1", size = 403852, upload-time = "2025-08-26T17:45:28.478Z" }, + { url = "https://files.pythonhosted.org/packages/f8/90/7bbabafeb2ce65915e9247f14a56b29c9334003536009ef5b122783fe67e/orjson-3.11.3-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:9dba358d55aee552bd868de348f4736ca5a4086d9a62e2bfbbeeb5629fe8b0cc", size = 146293, upload-time = "2025-08-26T17:45:29.86Z" }, + { url = "https://files.pythonhosted.org/packages/27/b3/2d703946447da8b093350570644a663df69448c9d9330e5f1d9cce997f20/orjson-3.11.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:eabcf2e84f1d7105f84580e03012270c7e97ecb1fb1618bda395061b2a84a049", size = 135470, upload-time = "2025-08-26T17:45:31.243Z" }, + { url = "https://files.pythonhosted.org/packages/38/70/b14dcfae7aff0e379b0119c8a812f8396678919c431efccc8e8a0263e4d9/orjson-3.11.3-cp312-cp312-win32.whl", hash = "sha256:3782d2c60b8116772aea8d9b7905221437fdf53e7277282e8d8b07c220f96cca", size = 136248, upload-time = "2025-08-26T17:45:32.567Z" }, + { url = "https://files.pythonhosted.org/packages/35/b8/9e3127d65de7fff243f7f3e53f59a531bf6bb295ebe5db024c2503cc0726/orjson-3.11.3-cp312-cp312-win_amd64.whl", hash = "sha256:79b44319268af2eaa3e315b92298de9a0067ade6e6003ddaef72f8e0bedb94f1", size = 131437, upload-time = "2025-08-26T17:45:34.949Z" }, + { url = 
"https://files.pythonhosted.org/packages/51/92/a946e737d4d8a7fd84a606aba96220043dcc7d6988b9e7551f7f6d5ba5ad/orjson-3.11.3-cp312-cp312-win_arm64.whl", hash = "sha256:0e92a4e83341ef79d835ca21b8bd13e27c859e4e9e4d7b63defc6e58462a3710", size = 125978, upload-time = "2025-08-26T17:45:36.422Z" }, + { url = "https://files.pythonhosted.org/packages/fc/79/8932b27293ad35919571f77cb3693b5906cf14f206ef17546052a241fdf6/orjson-3.11.3-cp313-cp313-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:af40c6612fd2a4b00de648aa26d18186cd1322330bd3a3cc52f87c699e995810", size = 238127, upload-time = "2025-08-26T17:45:38.146Z" }, + { url = "https://files.pythonhosted.org/packages/1c/82/cb93cd8cf132cd7643b30b6c5a56a26c4e780c7a145db6f83de977b540ce/orjson-3.11.3-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:9f1587f26c235894c09e8b5b7636a38091a9e6e7fe4531937534749c04face43", size = 127494, upload-time = "2025-08-26T17:45:39.57Z" }, + { url = "https://files.pythonhosted.org/packages/a4/b8/2d9eb181a9b6bb71463a78882bcac1027fd29cf62c38a40cc02fc11d3495/orjson-3.11.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61dcdad16da5bb486d7227a37a2e789c429397793a6955227cedbd7252eb5a27", size = 123017, upload-time = "2025-08-26T17:45:40.876Z" }, + { url = "https://files.pythonhosted.org/packages/b4/14/a0e971e72d03b509190232356d54c0f34507a05050bd026b8db2bf2c192c/orjson-3.11.3-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:11c6d71478e2cbea0a709e8a06365fa63da81da6498a53e4c4f065881d21ae8f", size = 127898, upload-time = "2025-08-26T17:45:42.188Z" }, + { url = "https://files.pythonhosted.org/packages/8e/af/dc74536722b03d65e17042cc30ae586161093e5b1f29bccda24765a6ae47/orjson-3.11.3-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ff94112e0098470b665cb0ed06efb187154b63649403b8d5e9aedeb482b4548c", size = 130742, upload-time = "2025-08-26T17:45:43.511Z" }, + { url = "https://files.pythonhosted.org/packages/62/e6/7a3b63b6677bce089fe939353cda24a7679825c43a24e49f757805fc0d8a/orjson-3.11.3-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae8b756575aaa2a855a75192f356bbda11a89169830e1439cfb1a3e1a6dde7be", size = 132377, upload-time = "2025-08-26T17:45:45.525Z" }, + { url = "https://files.pythonhosted.org/packages/fc/cd/ce2ab93e2e7eaf518f0fd15e3068b8c43216c8a44ed82ac2b79ce5cef72d/orjson-3.11.3-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c9416cc19a349c167ef76135b2fe40d03cea93680428efee8771f3e9fb66079d", size = 135313, upload-time = "2025-08-26T17:45:46.821Z" }, + { url = "https://files.pythonhosted.org/packages/d0/b4/f98355eff0bd1a38454209bbc73372ce351ba29933cb3e2eba16c04b9448/orjson-3.11.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b822caf5b9752bc6f246eb08124c3d12bf2175b66ab74bac2ef3bbf9221ce1b2", size = 132908, upload-time = "2025-08-26T17:45:48.126Z" }, + { url = "https://files.pythonhosted.org/packages/eb/92/8f5182d7bc2a1bed46ed960b61a39af8389f0ad476120cd99e67182bfb6d/orjson-3.11.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:414f71e3bdd5573893bf5ecdf35c32b213ed20aa15536fe2f588f946c318824f", size = 130905, upload-time = "2025-08-26T17:45:49.414Z" }, + { url = "https://files.pythonhosted.org/packages/1a/60/c41ca753ce9ffe3d0f67b9b4c093bdd6e5fdb1bc53064f992f66bb99954d/orjson-3.11.3-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:828e3149ad8815dc14468f36ab2a4b819237c155ee1370341b91ea4c8672d2ee", size = 403812, upload-time = 
"2025-08-26T17:45:51.085Z" }, + { url = "https://files.pythonhosted.org/packages/dd/13/e4a4f16d71ce1868860db59092e78782c67082a8f1dc06a3788aef2b41bc/orjson-3.11.3-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ac9e05f25627ffc714c21f8dfe3a579445a5c392a9c8ae7ba1d0e9fb5333f56e", size = 146277, upload-time = "2025-08-26T17:45:52.851Z" }, + { url = "https://files.pythonhosted.org/packages/8d/8b/bafb7f0afef9344754a3a0597a12442f1b85a048b82108ef2c956f53babd/orjson-3.11.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e44fbe4000bd321d9f3b648ae46e0196d21577cf66ae684a96ff90b1f7c93633", size = 135418, upload-time = "2025-08-26T17:45:54.806Z" }, + { url = "https://files.pythonhosted.org/packages/60/d4/bae8e4f26afb2c23bea69d2f6d566132584d1c3a5fe89ee8c17b718cab67/orjson-3.11.3-cp313-cp313-win32.whl", hash = "sha256:2039b7847ba3eec1f5886e75e6763a16e18c68a63efc4b029ddf994821e2e66b", size = 136216, upload-time = "2025-08-26T17:45:57.182Z" }, + { url = "https://files.pythonhosted.org/packages/88/76/224985d9f127e121c8cad882cea55f0ebe39f97925de040b75ccd4b33999/orjson-3.11.3-cp313-cp313-win_amd64.whl", hash = "sha256:29be5ac4164aa8bdcba5fa0700a3c9c316b411d8ed9d39ef8a882541bd452fae", size = 131362, upload-time = "2025-08-26T17:45:58.56Z" }, + { url = "https://files.pythonhosted.org/packages/e2/cf/0dce7a0be94bd36d1346be5067ed65ded6adb795fdbe3abd234c8d576d01/orjson-3.11.3-cp313-cp313-win_arm64.whl", hash = "sha256:18bd1435cb1f2857ceb59cfb7de6f92593ef7b831ccd1b9bfb28ca530e539dce", size = 125989, upload-time = "2025-08-26T17:45:59.95Z" }, + { url = "https://files.pythonhosted.org/packages/ef/77/d3b1fef1fc6aaeed4cbf3be2b480114035f4df8fa1a99d2dac1d40d6e924/orjson-3.11.3-cp314-cp314-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:cf4b81227ec86935568c7edd78352a92e97af8da7bd70bdfdaa0d2e0011a1ab4", size = 238115, upload-time = "2025-08-26T17:46:01.669Z" }, + { url = "https://files.pythonhosted.org/packages/e4/6d/468d21d49bb12f900052edcfbf52c292022d0a323d7828dc6376e6319703/orjson-3.11.3-cp314-cp314-macosx_15_0_arm64.whl", hash = "sha256:bc8bc85b81b6ac9fc4dae393a8c159b817f4c2c9dee5d12b773bddb3b95fc07e", size = 127493, upload-time = "2025-08-26T17:46:03.466Z" }, + { url = "https://files.pythonhosted.org/packages/67/46/1e2588700d354aacdf9e12cc2d98131fb8ac6f31ca65997bef3863edb8ff/orjson-3.11.3-cp314-cp314-manylinux_2_34_aarch64.whl", hash = "sha256:88dcfc514cfd1b0de038443c7b3e6a9797ffb1b3674ef1fd14f701a13397f82d", size = 122998, upload-time = "2025-08-26T17:46:04.803Z" }, + { url = "https://files.pythonhosted.org/packages/3b/94/11137c9b6adb3779f1b34fd98be51608a14b430dbc02c6d41134fbba484c/orjson-3.11.3-cp314-cp314-manylinux_2_34_x86_64.whl", hash = "sha256:d61cd543d69715d5fc0a690c7c6f8dcc307bc23abef9738957981885f5f38229", size = 132915, upload-time = "2025-08-26T17:46:06.237Z" }, + { url = "https://files.pythonhosted.org/packages/10/61/dccedcf9e9bcaac09fdabe9eaee0311ca92115699500efbd31950d878833/orjson-3.11.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2b7b153ed90ababadbef5c3eb39549f9476890d339cf47af563aea7e07db2451", size = 130907, upload-time = "2025-08-26T17:46:07.581Z" }, + { url = "https://files.pythonhosted.org/packages/0e/fd/0e935539aa7b08b3ca0f817d73034f7eb506792aae5ecc3b7c6e679cdf5f/orjson-3.11.3-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:7909ae2460f5f494fecbcd10613beafe40381fd0316e35d6acb5f3a05bfda167", size = 403852, upload-time = "2025-08-26T17:46:08.982Z" }, + { url = 
"https://files.pythonhosted.org/packages/4a/2b/50ae1a5505cd1043379132fdb2adb8a05f37b3e1ebffe94a5073321966fd/orjson-3.11.3-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:2030c01cbf77bc67bee7eef1e7e31ecf28649353987775e3583062c752da0077", size = 146309, upload-time = "2025-08-26T17:46:10.576Z" }, + { url = "https://files.pythonhosted.org/packages/cd/1d/a473c158e380ef6f32753b5f39a69028b25ec5be331c2049a2201bde2e19/orjson-3.11.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:a0169ebd1cbd94b26c7a7ad282cf5c2744fce054133f959e02eb5265deae1872", size = 135424, upload-time = "2025-08-26T17:46:12.386Z" }, + { url = "https://files.pythonhosted.org/packages/da/09/17d9d2b60592890ff7382e591aa1d9afb202a266b180c3d4049b1ec70e4a/orjson-3.11.3-cp314-cp314-win32.whl", hash = "sha256:0c6d7328c200c349e3a4c6d8c83e0a5ad029bdc2d417f234152bf34842d0fc8d", size = 136266, upload-time = "2025-08-26T17:46:13.853Z" }, + { url = "https://files.pythonhosted.org/packages/15/58/358f6846410a6b4958b74734727e582ed971e13d335d6c7ce3e47730493e/orjson-3.11.3-cp314-cp314-win_amd64.whl", hash = "sha256:317bbe2c069bbc757b1a2e4105b64aacd3bc78279b66a6b9e51e846e4809f804", size = 131351, upload-time = "2025-08-26T17:46:15.27Z" }, + { url = "https://files.pythonhosted.org/packages/28/01/d6b274a0635be0468d4dbd9cafe80c47105937a0d42434e805e67cd2ed8b/orjson-3.11.3-cp314-cp314-win_arm64.whl", hash = "sha256:e8f6a7a27d7b7bec81bd5924163e9af03d49bbb63013f107b48eb5d16db711bc", size = 125985, upload-time = "2025-08-26T17:46:16.67Z" }, +] + [[package]] name = "packaging" version = "25.0" @@ -6136,6 +6399,55 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/ea/c67e1dee1ba208ed22c06d1d547ae5e293374bfc43e0eb0ef5e262b68561/werkzeug-3.1.1-py3-none-any.whl", hash = "sha256:a71124d1ef06008baafa3d266c02f56e1836a5984afd6dd6c9230669d60d9fb5", size = 224371, upload-time = "2024-11-01T16:40:43.994Z" }, ] +[[package]] +name = "wrapt" +version = "1.17.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/95/8f/aeb76c5b46e273670962298c23e7ddde79916cb74db802131d49a85e4b7d/wrapt-1.17.3.tar.gz", hash = "sha256:f66eb08feaa410fe4eebd17f2a2c8e2e46d3476e9f8c783daa8e09e0faa666d0", size = 55547, upload-time = "2025-08-12T05:53:21.714Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/41/cad1aba93e752f1f9268c77270da3c469883d56e2798e7df6240dcb2287b/wrapt-1.17.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ab232e7fdb44cdfbf55fc3afa31bcdb0d8980b9b95c38b6405df2acb672af0e0", size = 53998, upload-time = "2025-08-12T05:51:47.138Z" }, + { url = "https://files.pythonhosted.org/packages/60/f8/096a7cc13097a1869fe44efe68dace40d2a16ecb853141394047f0780b96/wrapt-1.17.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9baa544e6acc91130e926e8c802a17f3b16fbea0fd441b5a60f5cf2cc5c3deba", size = 39020, upload-time = "2025-08-12T05:51:35.906Z" }, + { url = "https://files.pythonhosted.org/packages/33/df/bdf864b8997aab4febb96a9ae5c124f700a5abd9b5e13d2a3214ec4be705/wrapt-1.17.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6b538e31eca1a7ea4605e44f81a48aa24c4632a277431a6ed3f328835901f4fd", size = 39098, upload-time = "2025-08-12T05:51:57.474Z" }, + { url = "https://files.pythonhosted.org/packages/9f/81/5d931d78d0eb732b95dc3ddaeeb71c8bb572fb01356e9133916cd729ecdd/wrapt-1.17.3-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:042ec3bb8f319c147b1301f2393bc19dba6e176b7da446853406d041c36c7828", size = 88036, upload-time = 
"2025-08-12T05:52:34.784Z" }, + { url = "https://files.pythonhosted.org/packages/ca/38/2e1785df03b3d72d34fc6252d91d9d12dc27a5c89caef3335a1bbb8908ca/wrapt-1.17.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3af60380ba0b7b5aeb329bc4e402acd25bd877e98b3727b0135cb5c2efdaefe9", size = 88156, upload-time = "2025-08-12T05:52:13.599Z" }, + { url = "https://files.pythonhosted.org/packages/b3/8b/48cdb60fe0603e34e05cffda0b2a4adab81fd43718e11111a4b0100fd7c1/wrapt-1.17.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0b02e424deef65c9f7326d8c19220a2c9040c51dc165cddb732f16198c168396", size = 87102, upload-time = "2025-08-12T05:52:14.56Z" }, + { url = "https://files.pythonhosted.org/packages/3c/51/d81abca783b58f40a154f1b2c56db1d2d9e0d04fa2d4224e357529f57a57/wrapt-1.17.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:74afa28374a3c3a11b3b5e5fca0ae03bef8450d6aa3ab3a1e2c30e3a75d023dc", size = 87732, upload-time = "2025-08-12T05:52:36.165Z" }, + { url = "https://files.pythonhosted.org/packages/9e/b1/43b286ca1392a006d5336412d41663eeef1ad57485f3e52c767376ba7e5a/wrapt-1.17.3-cp312-cp312-win32.whl", hash = "sha256:4da9f45279fff3543c371d5ababc57a0384f70be244de7759c85a7f989cb4ebe", size = 36705, upload-time = "2025-08-12T05:53:07.123Z" }, + { url = "https://files.pythonhosted.org/packages/28/de/49493f962bd3c586ab4b88066e967aa2e0703d6ef2c43aa28cb83bf7b507/wrapt-1.17.3-cp312-cp312-win_amd64.whl", hash = "sha256:e71d5c6ebac14875668a1e90baf2ea0ef5b7ac7918355850c0908ae82bcb297c", size = 38877, upload-time = "2025-08-12T05:53:05.436Z" }, + { url = "https://files.pythonhosted.org/packages/f1/48/0f7102fe9cb1e8a5a77f80d4f0956d62d97034bbe88d33e94699f99d181d/wrapt-1.17.3-cp312-cp312-win_arm64.whl", hash = "sha256:604d076c55e2fdd4c1c03d06dc1a31b95130010517b5019db15365ec4a405fc6", size = 36885, upload-time = "2025-08-12T05:52:54.367Z" }, + { url = "https://files.pythonhosted.org/packages/fc/f6/759ece88472157acb55fc195e5b116e06730f1b651b5b314c66291729193/wrapt-1.17.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a47681378a0439215912ef542c45a783484d4dd82bac412b71e59cf9c0e1cea0", size = 54003, upload-time = "2025-08-12T05:51:48.627Z" }, + { url = "https://files.pythonhosted.org/packages/4f/a9/49940b9dc6d47027dc850c116d79b4155f15c08547d04db0f07121499347/wrapt-1.17.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:54a30837587c6ee3cd1a4d1c2ec5d24e77984d44e2f34547e2323ddb4e22eb77", size = 39025, upload-time = "2025-08-12T05:51:37.156Z" }, + { url = "https://files.pythonhosted.org/packages/45/35/6a08de0f2c96dcdd7fe464d7420ddb9a7655a6561150e5fc4da9356aeaab/wrapt-1.17.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:16ecf15d6af39246fe33e507105d67e4b81d8f8d2c6598ff7e3ca1b8a37213f7", size = 39108, upload-time = "2025-08-12T05:51:58.425Z" }, + { url = "https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6fd1ad24dc235e4ab88cda009e19bf347aabb975e44fd5c2fb22a3f6e4141277", size = 88072, upload-time = "2025-08-12T05:52:37.53Z" }, + { url = "https://files.pythonhosted.org/packages/78/f2/efe19ada4a38e4e15b6dff39c3e3f3f73f5decf901f66e6f72fe79623a06/wrapt-1.17.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ed61b7c2d49cee3c027372df5809a59d60cf1b6c2f81ee980a091f3afed6a2d", size = 88214, upload-time = "2025-08-12T05:52:15.886Z" }, + { url = 
"https://files.pythonhosted.org/packages/40/90/ca86701e9de1622b16e09689fc24b76f69b06bb0150990f6f4e8b0eeb576/wrapt-1.17.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:423ed5420ad5f5529db9ce89eac09c8a2f97da18eb1c870237e84c5a5c2d60aa", size = 87105, upload-time = "2025-08-12T05:52:17.914Z" }, + { url = "https://files.pythonhosted.org/packages/fd/e0/d10bd257c9a3e15cbf5523025252cc14d77468e8ed644aafb2d6f54cb95d/wrapt-1.17.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e01375f275f010fcbf7f643b4279896d04e571889b8a5b3f848423d91bf07050", size = 87766, upload-time = "2025-08-12T05:52:39.243Z" }, + { url = "https://files.pythonhosted.org/packages/e8/cf/7d848740203c7b4b27eb55dbfede11aca974a51c3d894f6cc4b865f42f58/wrapt-1.17.3-cp313-cp313-win32.whl", hash = "sha256:53e5e39ff71b3fc484df8a522c933ea2b7cdd0d5d15ae82e5b23fde87d44cbd8", size = 36711, upload-time = "2025-08-12T05:53:10.074Z" }, + { url = "https://files.pythonhosted.org/packages/57/54/35a84d0a4d23ea675994104e667ceff49227ce473ba6a59ba2c84f250b74/wrapt-1.17.3-cp313-cp313-win_amd64.whl", hash = "sha256:1f0b2f40cf341ee8cc1a97d51ff50dddb9fcc73241b9143ec74b30fc4f44f6cb", size = 38885, upload-time = "2025-08-12T05:53:08.695Z" }, + { url = "https://files.pythonhosted.org/packages/01/77/66e54407c59d7b02a3c4e0af3783168fff8e5d61def52cda8728439d86bc/wrapt-1.17.3-cp313-cp313-win_arm64.whl", hash = "sha256:7425ac3c54430f5fc5e7b6f41d41e704db073309acfc09305816bc6a0b26bb16", size = 36896, upload-time = "2025-08-12T05:52:55.34Z" }, + { url = "https://files.pythonhosted.org/packages/02/a2/cd864b2a14f20d14f4c496fab97802001560f9f41554eef6df201cd7f76c/wrapt-1.17.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:cf30f6e3c077c8e6a9a7809c94551203c8843e74ba0c960f4a98cd80d4665d39", size = 54132, upload-time = "2025-08-12T05:51:49.864Z" }, + { url = "https://files.pythonhosted.org/packages/d5/46/d011725b0c89e853dc44cceb738a307cde5d240d023d6d40a82d1b4e1182/wrapt-1.17.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e228514a06843cae89621384cfe3a80418f3c04aadf8a3b14e46a7be704e4235", size = 39091, upload-time = "2025-08-12T05:51:38.935Z" }, + { url = "https://files.pythonhosted.org/packages/2e/9e/3ad852d77c35aae7ddebdbc3b6d35ec8013af7d7dddad0ad911f3d891dae/wrapt-1.17.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:5ea5eb3c0c071862997d6f3e02af1d055f381b1d25b286b9d6644b79db77657c", size = 39172, upload-time = "2025-08-12T05:51:59.365Z" }, + { url = "https://files.pythonhosted.org/packages/c3/f7/c983d2762bcce2326c317c26a6a1e7016f7eb039c27cdf5c4e30f4160f31/wrapt-1.17.3-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:281262213373b6d5e4bb4353bc36d1ba4084e6d6b5d242863721ef2bf2c2930b", size = 87163, upload-time = "2025-08-12T05:52:40.965Z" }, + { url = "https://files.pythonhosted.org/packages/e4/0f/f673f75d489c7f22d17fe0193e84b41540d962f75fce579cf6873167c29b/wrapt-1.17.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dc4a8d2b25efb6681ecacad42fca8859f88092d8732b170de6a5dddd80a1c8fa", size = 87963, upload-time = "2025-08-12T05:52:20.326Z" }, + { url = "https://files.pythonhosted.org/packages/df/61/515ad6caca68995da2fac7a6af97faab8f78ebe3bf4f761e1b77efbc47b5/wrapt-1.17.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:373342dd05b1d07d752cecbec0c41817231f29f3a89aa8b8843f7b95992ed0c7", size = 86945, upload-time = "2025-08-12T05:52:21.581Z" }, + { url = 
"https://files.pythonhosted.org/packages/d3/bd/4e70162ce398462a467bc09e768bee112f1412e563620adc353de9055d33/wrapt-1.17.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d40770d7c0fd5cbed9d84b2c3f2e156431a12c9a37dc6284060fb4bec0b7ffd4", size = 86857, upload-time = "2025-08-12T05:52:43.043Z" }, + { url = "https://files.pythonhosted.org/packages/2b/b8/da8560695e9284810b8d3df8a19396a6e40e7518059584a1a394a2b35e0a/wrapt-1.17.3-cp314-cp314-win32.whl", hash = "sha256:fbd3c8319de8e1dc79d346929cd71d523622da527cca14e0c1d257e31c2b8b10", size = 37178, upload-time = "2025-08-12T05:53:12.605Z" }, + { url = "https://files.pythonhosted.org/packages/db/c8/b71eeb192c440d67a5a0449aaee2310a1a1e8eca41676046f99ed2487e9f/wrapt-1.17.3-cp314-cp314-win_amd64.whl", hash = "sha256:e1a4120ae5705f673727d3253de3ed0e016f7cd78dc463db1b31e2463e1f3cf6", size = 39310, upload-time = "2025-08-12T05:53:11.106Z" }, + { url = "https://files.pythonhosted.org/packages/45/20/2cda20fd4865fa40f86f6c46ed37a2a8356a7a2fde0773269311f2af56c7/wrapt-1.17.3-cp314-cp314-win_arm64.whl", hash = "sha256:507553480670cab08a800b9463bdb881b2edeed77dc677b0a5915e6106e91a58", size = 37266, upload-time = "2025-08-12T05:52:56.531Z" }, + { url = "https://files.pythonhosted.org/packages/77/ed/dd5cf21aec36c80443c6f900449260b80e2a65cf963668eaef3b9accce36/wrapt-1.17.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:ed7c635ae45cfbc1a7371f708727bf74690daedc49b4dba310590ca0bd28aa8a", size = 56544, upload-time = "2025-08-12T05:51:51.109Z" }, + { url = "https://files.pythonhosted.org/packages/8d/96/450c651cc753877ad100c7949ab4d2e2ecc4d97157e00fa8f45df682456a/wrapt-1.17.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:249f88ed15503f6492a71f01442abddd73856a0032ae860de6d75ca62eed8067", size = 40283, upload-time = "2025-08-12T05:51:39.912Z" }, + { url = "https://files.pythonhosted.org/packages/d1/86/2fcad95994d9b572db57632acb6f900695a648c3e063f2cd344b3f5c5a37/wrapt-1.17.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5a03a38adec8066d5a37bea22f2ba6bbf39fcdefbe2d91419ab864c3fb515454", size = 40366, upload-time = "2025-08-12T05:52:00.693Z" }, + { url = "https://files.pythonhosted.org/packages/64/0e/f4472f2fdde2d4617975144311f8800ef73677a159be7fe61fa50997d6c0/wrapt-1.17.3-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5d4478d72eb61c36e5b446e375bbc49ed002430d17cdec3cecb36993398e1a9e", size = 108571, upload-time = "2025-08-12T05:52:44.521Z" }, + { url = "https://files.pythonhosted.org/packages/cc/01/9b85a99996b0a97c8a17484684f206cbb6ba73c1ce6890ac668bcf3838fb/wrapt-1.17.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223db574bb38637e8230eb14b185565023ab624474df94d2af18f1cdb625216f", size = 113094, upload-time = "2025-08-12T05:52:22.618Z" }, + { url = "https://files.pythonhosted.org/packages/25/02/78926c1efddcc7b3aa0bc3d6b33a822f7d898059f7cd9ace8c8318e559ef/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e405adefb53a435f01efa7ccdec012c016b5a1d3f35459990afc39b6be4d5056", size = 110659, upload-time = "2025-08-12T05:52:24.057Z" }, + { url = "https://files.pythonhosted.org/packages/dc/ee/c414501ad518ac3e6fe184753632fe5e5ecacdcf0effc23f31c1e4f7bfcf/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:88547535b787a6c9ce4086917b6e1d291aa8ed914fdd3a838b3539dc95c12804", size = 106946, upload-time = "2025-08-12T05:52:45.976Z" }, + { url = 
"https://files.pythonhosted.org/packages/be/44/a1bd64b723d13bb151d6cc91b986146a1952385e0392a78567e12149c7b4/wrapt-1.17.3-cp314-cp314t-win32.whl", hash = "sha256:41b1d2bc74c2cac6f9074df52b2efbef2b30bdfe5f40cb78f8ca22963bc62977", size = 38717, upload-time = "2025-08-12T05:53:15.214Z" }, + { url = "https://files.pythonhosted.org/packages/79/d9/7cfd5a312760ac4dd8bf0184a6ee9e43c33e47f3dadc303032ce012b8fa3/wrapt-1.17.3-cp314-cp314t-win_amd64.whl", hash = "sha256:73d496de46cd2cdbdbcce4ae4bcdb4afb6a11234a1df9c085249d55166b95116", size = 41334, upload-time = "2025-08-12T05:53:14.178Z" }, + { url = "https://files.pythonhosted.org/packages/46/78/10ad9781128ed2f99dbc474f43283b13fea8ba58723e98844367531c18e9/wrapt-1.17.3-cp314-cp314t-win_arm64.whl", hash = "sha256:f38e60678850c42461d4202739f9bf1e3a737c7ad283638251e79cc49effb6b6", size = 38471, upload-time = "2025-08-12T05:52:57.784Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" }, +] + [[package]] name = "wsproto" version = "1.2.0"