Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 34 additions & 18 deletions .github/workflows/integration_test_8gpu_features.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,26 +25,43 @@ permissions:
contents: read

jobs:
# Step 1: Dynamically compute the matrix based on conditions.
# Pushes to main and scheduled runs get both the CUDA and ROCm entries; every
# other event gets the CUDA entry only. The JSON is published as the `matrix`
# job output for downstream jobs to consume.
set-matrix:
  runs-on: ubuntu-latest
  outputs:
    matrix: ${{ steps.set.outputs.matrix }}
  steps:
    - id: set
      # Hand the event context to the script through environment variables
      # rather than expanding expressions inside the script text, so the
      # values are treated as data and never parsed as shell code.
      env:
        EVENT_NAME: ${{ github.event_name }}
        GIT_REF: ${{ github.ref }}
      run: |
        # Decide which matrix entries to include based on event type
        if [[ "$EVENT_NAME" == "push" && "$GIT_REF" == "refs/heads/main" ]] || [[ "$EVENT_NAME" == "schedule" ]]; then
          # Include both CUDA and ROCm
          echo '{"include":[
            {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"},
            {"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"}
          ]}' > matrix.json
        else
          # Include only CUDA
          echo '{"include":[
            {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"}
          ]}' > matrix.json
        fi

        # Export matrix to job outputs (multi-line value, so use the
        # name<<DELIMITER ... DELIMITER form)
        {
          echo 'matrix<<EOF'
          cat matrix.json
          echo 'EOF'
        } >> "$GITHUB_OUTPUT"


# Step 2: Use the dynamic matrix in the build-test job
build-test:
needs: set-matrix
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
strategy:
fail-fast: false
matrix:
include:
- name: cuda
runner: linux.g5.48xlarge.nvidia.gpu
gpu-arch-type: cuda
gpu-arch-version: "12.6"
# This image is faster to clone than the default (1m25s vs 2m37s),
# but it lacks CC needed by triton.
docker-image: torchtitan-ubuntu-20.04-clang12
index-url: https://download.pytorch.org/whl/nightly/cu126
- name: rocm
runner: linux.rocm.gpu.gfx942.8
gpu-arch-type: rocm
gpu-arch-version: "7.0"
docker-image: torchtitan-rocm-ubuntu-22.04-clang12
index-url: https://download.pytorch.org/whl/nightly/rocm7.0
matrix: ${{ fromJSON(needs.set-matrix.outputs.matrix) }}
with:
runner: ${{ matrix.runner }}
gpu-arch-type: ${{ matrix.gpu-arch-type }}
Expand Down Expand Up @@ -73,8 +90,7 @@ jobs:
sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded"
sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded"

export TEST_WITH_ROCM=$([[ "${{ matrix.gpu-arch-type }}" == "rocm" ]] && echo 1 || echo 0)
python -m tests.integration_tests.run_tests --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8
python -m tests.integration_tests.run_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8

rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*/checkpoint
rm -rf artifacts-to-be-uploaded/*/checkpoint
11 changes: 7 additions & 4 deletions tests/integration_tests/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,6 @@
}


TEST_WITH_ROCM = os.getenv("TEST_WITH_ROCM", "0") == "1"


def _run_cmd(cmd):
return subprocess.run([cmd], text=True, shell=True)

Expand Down Expand Up @@ -92,7 +89,7 @@ def run_tests(args, test_list: list[OverrideDefinitions]):
continue

# Skip the test for ROCm
if TEST_WITH_ROCM and test_flavor.skip_rocm_test:
if args.gpu_arch_type == "rocm" and test_flavor.skip_rocm_test:
continue

# Check if we have enough GPUs
Expand All @@ -110,6 +107,12 @@ def main():
parser.add_argument(
"output_dir", help="Directory to dump results generated by tests"
)
parser.add_argument(
"--gpu_arch_type",
default="cuda",
choices=["cuda", "rocm"],
help="GPU architecture type. Must be specified as either 'cuda' or 'rocm'.",
)
parser.add_argument(
"--test_suite",
default="features",
Expand Down
Loading