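Check H100/B200 test results.

Changes to the H100/B200 Bazel CUDA workflow in this commit:
- pass `--@cuda_driver//:include_cuda_umd_libs=false` to both `bazel test` invocations;
- switch `--test_output` from `errors` to `all` in both invocations;
- add `--action_env=LD_DEBUG=files,libs` to the first invocation;
- reduce the test targets of both invocations to `//tests:nn_test_gpu`;
- make `run_multiaccelerator_tests` depend on the `changed_files` job and also run when `needs.changed_files.outputs.any_changed == 'true'`.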

ybaturina · Google-ML-Automation · commit b0fb210a6217 · 2025-11-19T10:14:18.000-08:00
PiperOrigin-RevId: 834325957
diff --git a/.github/workflows/bazel_cuda_h100_b200.yml b/.github/workflows/bazel_cuda_h100_b200.yml
@@ -76,12 +76,13 @@ jobs:
             bazel test \
             --config=ci_linux_x86_64_cuda \
             --config=ci_rbe_cache \
+            --@cuda_driver//:include_cuda_umd_libs=false \
             --repo_env=HERMETIC_PYTHON_VERSION="3.14" \
             --repo_env=HERMETIC_CUDNN_VERSION="9.11.0" \
             --repo_env=HERMETIC_CUDA_UMD_VERSION="13.0.0" \
             --test_env=XLA_PYTHON_CLIENT_ALLOCATOR=platform \
             --run_under "$(pwd)/build/parallel_accelerator_execute.sh" \
-            --test_output=errors \
+            --test_output=all \
             --test_tag_filters=-multiaccelerator \
             --test_env=JAX_ACCELERATOR_COUNT=1 \
             --test_env=JAX_TESTS_PER_ACCELERATOR=8 \
@@ -95,14 +96,11 @@ jobs:
             --flaky_test_attempts=1 \
             --test_timeout=420 \
             --color=yes \
-            //tests:cudnn_fusion_test_gpu \
-            //tests:scaled_matmul_stablehlo_test_gpu \
-            //tests:fused_attention_stablehlo_test_gpu \
-            //tests:nn_test_gpu \
-            //tests/pallas:gpu_tests \
-            //tests/mosaic:gpu_tests
+            --action_env=LD_DEBUG=files,libs \
+            //tests:nn_test_gpu
   run_multiaccelerator_tests:
-    if: ${{ github.event.repository.fork == false && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'CI Optional GPU Presubmit')) }}
+    needs: changed_files
+    if: ${{ github.event.repository.fork == false && (github.event_name == 'schedule' || needs.changed_files.outputs.any_changed == 'true' || github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'CI Optional GPU Presubmit')) }}
     runs-on: linux-x86-a3-8g-h100-8gpu
     container: 'us-docker.pkg.dev/ml-oss-artifacts-published/ml-public-container/ml-build:latest'
     name: "Bazel multiple H100 CUDA tests"
@@ -120,11 +118,12 @@ jobs:
             bazel test \
             --config=ci_linux_x86_64_cuda \
             --config=ci_rbe_cache \
+            --@cuda_driver//:include_cuda_umd_libs=false \
             --repo_env=HERMETIC_PYTHON_VERSION="3.14" \
             --repo_env=HERMETIC_CUDNN_VERSION="9.11.0" \
             --repo_env=HERMETIC_CUDA_UMD_VERSION="13.0.0" \
             --test_env=XLA_PYTHON_CLIENT_ALLOCATOR=platform \
-            --test_output=errors \
+            --test_output=all \
             --strategy=TestRunner=local \
             --local_test_jobs=8 \
             --test_env=JAX_EXCLUDE_TEST_TARGETS='PmapTest.testSizeOverflow|.*InterpretTest.*' \
@@ -135,13 +134,4 @@ jobs:
             --action_env=NCCL_DEBUG=WARN \
             --flaky_test_attempts=1 \
             --color=yes \
-            //tests/mosaic:gpu_tests \
-            //tests/pallas:gpu_tests \
-            //tests:array_interoperability_test_gpu \
-            //tests:cudnn_fusion_test_gpu \
-            //tests:fused_attention_stablehlo_test_gpu \
-            //tests:gpu_tests \
-            //tests:python_callback_test_gpu \
-            //tests:ragged_collective_test_gpu \
-            //tests/multiprocess:gpu_tests \
-            //jax/experimental/jax2tf/tests/multiprocess:gpu_tests
+            //tests:nn_test_gpu