Skip to content

Commit b0fb210

Browse files
ybaturina authored and Google-ML-Automation committed
Check H100/B200 test results.
PiperOrigin-RevId: 834325957
1 parent 00d707e commit b0fb210

File tree

1 file changed

+9
-19
lines changed

1 file changed

+9
-19
lines changed

.github/workflows/bazel_cuda_h100_b200.yml

Lines changed: 9 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -76,12 +76,13 @@ jobs:
  bazel test \
  --config=ci_linux_x86_64_cuda \
  --config=ci_rbe_cache \
+ --@cuda_driver//:include_cuda_umd_libs=false \
  --repo_env=HERMETIC_PYTHON_VERSION="3.14" \
  --repo_env=HERMETIC_CUDNN_VERSION="9.11.0" \
  --repo_env=HERMETIC_CUDA_UMD_VERSION="13.0.0" \
  --test_env=XLA_PYTHON_CLIENT_ALLOCATOR=platform \
  --run_under "$(pwd)/build/parallel_accelerator_execute.sh" \
- --test_output=errors \
+ --test_output=all \
  --test_tag_filters=-multiaccelerator \
  --test_env=JAX_ACCELERATOR_COUNT=1 \
  --test_env=JAX_TESTS_PER_ACCELERATOR=8 \
@@ -95,14 +96,11 @@ jobs:
  --flaky_test_attempts=1 \
  --test_timeout=420 \
  --color=yes \
- //tests:cudnn_fusion_test_gpu \
- //tests:scaled_matmul_stablehlo_test_gpu \
- //tests:fused_attention_stablehlo_test_gpu \
- //tests:nn_test_gpu \
- //tests/pallas:gpu_tests \
- //tests/mosaic:gpu_tests
+ --action_env=LD_DEBUG=files,libs \
+ //tests:nn_test_gpu
  run_multiaccelerator_tests:
- if: ${{ github.event.repository.fork == false && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'CI Optional GPU Presubmit')) }}
+ needs: changed_files
+ if: ${{ github.event.repository.fork == false && (github.event_name == 'schedule' || needs.changed_files.outputs.any_changed == 'true' || github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'CI Optional GPU Presubmit')) }}
  runs-on: linux-x86-a3-8g-h100-8gpu
  container: 'us-docker.pkg.dev/ml-oss-artifacts-published/ml-public-container/ml-build:latest'
  name: "Bazel multiple H100 CUDA tests"
@@ -120,11 +118,12 @@ jobs:
  bazel test \
  --config=ci_linux_x86_64_cuda \
  --config=ci_rbe_cache \
+ --@cuda_driver//:include_cuda_umd_libs=false \
  --repo_env=HERMETIC_PYTHON_VERSION="3.14" \
  --repo_env=HERMETIC_CUDNN_VERSION="9.11.0" \
  --repo_env=HERMETIC_CUDA_UMD_VERSION="13.0.0" \
  --test_env=XLA_PYTHON_CLIENT_ALLOCATOR=platform \
- --test_output=errors \
+ --test_output=all \
  --strategy=TestRunner=local \
  --local_test_jobs=8 \
  --test_env=JAX_EXCLUDE_TEST_TARGETS='PmapTest.testSizeOverflow|.*InterpretTest.*' \
@@ -135,13 +134,4 @@ jobs:
  --action_env=NCCL_DEBUG=WARN \
  --flaky_test_attempts=1 \
  --color=yes \
- //tests/mosaic:gpu_tests \
- //tests/pallas:gpu_tests \
- //tests:array_interoperability_test_gpu \
- //tests:cudnn_fusion_test_gpu \
- //tests:fused_attention_stablehlo_test_gpu \
- //tests:gpu_tests \
- //tests:python_callback_test_gpu \
- //tests:ragged_collective_test_gpu \
- //tests/multiprocess:gpu_tests \
- //jax/experimental/jax2tf/tests/multiprocess:gpu_tests
+ //tests:nn_test_gpu

0 commit comments

Comments
 (0)