From a779e8c069244e9f67815277f742904064610d0e Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Tue, 11 Nov 2025 21:29:34 -0600 Subject: [PATCH 1/4] Replaced TEST_WITH_ROCM environment variable with gpu_arch_type argument to run_tests.py. --- .github/workflows/integration_test_8gpu_features.yaml | 3 +-- tests/integration_tests/run_tests.py | 11 +++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index c6e8ed30d5..0de1003192 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -73,8 +73,7 @@ jobs: sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded" sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded" - export TEST_WITH_ROCM=$([[ "${{ matrix.gpu-arch-type }}" == "rocm" ]] && echo 1 || echo 0) - python -m tests.integration_tests.run_tests --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8 + python -m tests.integration_tests.run_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8 rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*/checkpoint rm -rf artifacts-to-be-uploaded/*/checkpoint diff --git a/tests/integration_tests/run_tests.py b/tests/integration_tests/run_tests.py index 011fa25554..b2cb8ea503 100644 --- a/tests/integration_tests/run_tests.py +++ b/tests/integration_tests/run_tests.py @@ -25,9 +25,6 @@ } -TEST_WITH_ROCM = os.getenv("TEST_WITH_ROCM", "0") == "1" - - def _run_cmd(cmd): return subprocess.run([cmd], text=True, shell=True) @@ -92,7 +89,7 @@ def run_tests(args, test_list: list[OverrideDefinitions]): continue # Skip the test for ROCm - if TEST_WITH_ROCM and test_flavor.skip_rocm_test: + if args.gpu_arch_type == "rocm" and test_flavor.skip_rocm_test: continue # Check if we have enough GPUs @@ -110,6 +107,12 @@ def main(): parser.add_argument( 
"output_dir", help="Directory to dump results generated by tests" ) + parser.add_argument( + "--gpu_arch_type", + default="cuda", + choices=["cuda", "rocm"], + help="GPU architecture type. Must be specified as either 'cuda' or 'rocm'.", + ) parser.add_argument( "--test_suite", default="features", From b8467c2bf2d849f642feb40894c4f552dfcaf03c Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Tue, 18 Nov 2025 23:43:01 -0600 Subject: [PATCH 2/4] Generate dynamic matrix and use in the build-test job. --- .../integration_test_8gpu_features.yaml | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index 0de1003192..f1f1edbec2 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -25,26 +25,34 @@ permissions: contents: read jobs: + # Step 1: Dynamically compute the matrix based on conditions + set-matrix: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set.outputs.matrix }} + steps: - id: set run: | # Decide which matrix entries to include based on event type if [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]] || [[ "${{ github.event_name }}" == "schedule" ]]; then + # Include both CUDA and ROCm + echo '{"include":[ + {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"}, + {"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"} + ]}' > matrix.json + else + # Include only CUDA + echo '{"include":[ + 
{"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"} + ]}' > matrix.json + fi + + # Export matrix to job outputs + echo "matrix=$(cat matrix.json)" >> $GITHUB_OUTPUT + + # Step 2: Use the dynamic matrix in the build-test job build-test: + needs: set-matrix uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main strategy: fail-fast: false - matrix: - include: - - name: cuda - runner: linux.g5.48xlarge.nvidia.gpu - gpu-arch-type: cuda - gpu-arch-version: "12.6" - # This image is faster to clone than the default, but it lacks CC needed by triton - # (1m25s vs 2m37s). - docker-image: torchtitan-ubuntu-20.04-clang12 - index-url: https://download.pytorch.org/whl/nightly/cu126 - - name: rocm - runner: linux.rocm.gpu.gfx942.8 - gpu-arch-type: rocm - gpu-arch-version: "7.0" - docker-image: torchtitan-rocm-ubuntu-22.04-clang12 - index-url: https://download.pytorch.org/whl/nightly/rocm7.0 + matrix: ${{ fromJSON(needs.set-matrix.outputs.matrix) }} with: runner: ${{ matrix.runner }} gpu-arch-type: ${{ matrix.gpu-arch-type }} From bb1568d47dedc1000b75a0aa35dd4d472b8cb6b4 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Wed, 19 Nov 2025 00:02:23 -0600 Subject: [PATCH 3/4] Fix indentation in integration_test_8gpu_features.yaml. 
--- .../integration_test_8gpu_features.yaml | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index f1f1edbec2..c407830b1d 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -30,16 +30,20 @@ jobs: runs-on: ubuntu-latest outputs: matrix: ${{ steps.set.outputs.matrix }} - steps: - id: set run: | # Decide which matrix entries to include based on event type if [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]] || [[ "${{ github.event_name }}" == "schedule" ]]; then - # Include both CUDA and ROCm - echo '{"include":[ - {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"}, - {"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"} + steps: + - id: set + run: | + # Decide which matrix entries to include based on event type + if [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]] || [[ "${{ github.event_name }}" == "schedule" ]]; then + # Include both CUDA and ROCm + echo '{"include":[ + {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"}, + {"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"} ]}' > matrix.json else - # Include only CUDA - echo 
'{"include":[ - {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"} + # Include only CUDA + echo '{"include":[ + {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"} ]}' > matrix.json fi From fd594c17769fa0617eec26c4825606d576865d71 Mon Sep 17 00:00:00 2001 From: Akash Verma Date: Wed, 19 Nov 2025 00:09:58 -0600 Subject: [PATCH 4/4] Use the multiline output syntax. Since the output contains spaces, quotes, and newlines, wrap it in a heredoc block. --- .github/workflows/integration_test_8gpu_features.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index c407830b1d..14e185b5e1 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -48,7 +48,12 @@ jobs: fi # Export matrix to job outputs - echo "matrix=$(cat matrix.json)" >> $GITHUB_OUTPUT + { + echo 'matrix<<EOF' + cat matrix.json + echo 'EOF' + } >> $GITHUB_OUTPUT + # Step 2: Use the dynamic matrix in the build-test job build-test: