From 7690ce042f258915fe7dc2265363de01d12fa3f1 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 9 Jul 2025 15:24:57 +0000 Subject: [PATCH 01/12] distributed_weekly test --- .github/actions/linux-testenv/action.yml | 4 ++-- .github/actions/linux-uttest/action.yml | 6 ++++-- .github/scripts/build.sh | 1 - .github/workflows/_linux_build.yml | 4 ++-- .github/workflows/_linux_ut.yml | 4 ++-- .github/workflows/pull.yml | 9 ++++----- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index ea876649cc..13054c746d 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -3,11 +3,11 @@ name: Setup Test Environment inputs: pytorch: type: string - default: 'main' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'main' + default: 'daisyden/distributed_2.9' description: Torch-xpu-ops version, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin python: type: string diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 9f3888694b..e18c1e9f39 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -170,11 +170,13 @@ runs: tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log - name: xpu_distributed - shell: timeout 36000 bash -xeu -o pipefail {0} + shell: bash -xeu -o pipefail {0} if: ${{ inputs.ut_name == 'xpu_distributed' }} run: | xpu-smi topology -m mkdir -p ut_log/xpu_distributed + pip install pytest pytest-timeout xmlrunner unittest-xml-reporting zstandard transformers + cp ../pytorch/third_party/torch-xpu-ops/.github/scripts/ut_result_check.sh ut_log/xpu_distributed/ cd pytorch/third_party/torch-xpu-ops/test/xpu XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE,,}" == 'false' ]] || [[ "${XCCL_ENABLE}" == '0' ]]; then @@ -184,7 +186,7 @@ runs: export CCL_ROOT=$(dirname $(which python))/../ export PATH="${CCL_ROOT}/bin/libfabric:${PATH}" export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}" - python run_distributed.py \ + python run_distributed_local.py \ 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log find ../ -type f -name "*.xml" -exec cp {} ${{ github.workspace }}/ut_log/ \; diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index b4f5262979..b0b7f17b2c 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -44,7 +44,6 @@ git remote -v && git branch && git show -s # Pre Build cd ${WORKSPACE}/pytorch python -m pip install requests -python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py git submodule sync && git submodule update --init --recursive python -m pip install -r requirements.txt python -m pip install mkl-static==2025.2.0 mkl-include==2025.2.0 diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 1bed161f2b..24f0f777d1 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -10,11 +10,11 @@ on: description: Runner label pytorch: type: string - default: 'main' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'main' + default: 'daisyden/distributed_2.9' description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin triton: required: false diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index dc10647124..0ede180c26 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -9,11 +9,11 @@ on: description: Runner label pytorch: type: string - default: 'main' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'main' + default: 'daisyden/distributed_2.9' description: Torch-xpu-ops version, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin python: type: string diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 6c26af3c71..67d1fd25ad 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -100,8 +100,8 @@ jobs: build: [build] uses: ./.github/workflows/_linux_build.yml with: - runner: pvc_rolling - pytorch: ${{ needs.conditions-filter.outputs.pytorch }} + pytorch: distributed_2.9 + runner: PVC-7358 linux-ut: needs: [conditions-filter, linux-build] @@ -130,9 +130,8 @@ jobs: ut_name: [xpu_distributed] uses: ./.github/workflows/_linux_ut.yml with: - runner: pvc_rolling - pytorch: ${{ needs.conditions-filter.outputs.pytorch }} - torch_xpu_ops: ${{ needs.conditions-filter.outputs.pytorch == 'nightly_wheel' && 'pinned' || 'main' }} + runner: PVC-7358 + pytorch: distributed_2.9 ut: ${{ matrix.ut_name }} linux-e2e: From 1797096e87bff86aff272f8829049934c52e6218 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 28 Jul 2025 17:23:17 +0800 Subject: [PATCH 02/12] update --- .github/workflows/pull.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 67d1fd25ad..e4234732fa 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -100,8 +100,13 @@ jobs: build: [build] uses: ./.github/workflows/_linux_build.yml with: +<<<<<<< HEAD pytorch: distributed_2.9 runner: PVC-7358 +======= + pytorch: distributed_2.8 + runner: pvc_e2e +>>>>>>> 680c2cce (update) linux-ut: needs: [conditions-filter, linux-build] From b666750ce9001b5f9b0344dbe7b683c98966ef76 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 28 Jul 2025 23:18:35 +0800 Subject: [PATCH 03/12] update --- .github/workflows/pull.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index e4234732fa..67d1fd25ad 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -100,13 +100,8 @@ jobs: build: [build] uses: ./.github/workflows/_linux_build.yml with: -<<<<<<< HEAD pytorch: distributed_2.9 runner: PVC-7358 -======= - pytorch: distributed_2.8 - runner: pvc_e2e ->>>>>>> 680c2cce (update) linux-ut: needs: [conditions-filter, linux-build] From e942f2feaf2c0a8576f98ee7fe438c9ae43d40b6 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 29 Aug 2025 17:15:27 +0800 Subject: [PATCH 04/12] update --- .github/actions/linux-testenv/action.yml | 4 ++-- .github/scripts/build.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 13054c746d..9416a4b9a2 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -71,9 +71,9 @@ runs: fi TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then - PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" + PYTORCH_REPO="https://github.com/daisyden/pytorch.git" else - PYTORCH_REPO="https://github.com/pytorch/pytorch.git" + PYTORCH_REPO="https://github.com/daisyden/pytorch.git" fi git clone ${PYTORCH_REPO} pytorch cd pytorch diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index b0b7f17b2c..44ae14a35c 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -19,7 +19,7 @@ done # Set pytorch rm -rf ${WORKSPACE}/pytorch -git clone ${PYTORCH_REPO} ${WORKSPACE}/pytorch +git clone https://github.com/daisyden/pytorch.git ${WORKSPACE}/pytorch cd ${WORKSPACE}/pytorch git checkout ${PYTORCH_COMMIT} git remote -v && git branch && git show -s From fddf68064aa41d833d78f34de6f83bb137de905d Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 29 Aug 2025 22:34:14 +0800 Subject: [PATCH 05/12] update --- .github/actions/linux-testenv/action.yml | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 9416a4b9a2..0d890a1f4e 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -99,14 +99,9 @@ runs: TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi fi - if [ "${{ github.event_name }}" == "pull_request" ] && [[ "${{ inputs.pytorch }}" != *"_wheel" ]];then - cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops - cd third_party/torch-xpu-ops - else - git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops - cd third_party/torch-xpu-ops - git checkout ${TORCH_XPU_OPS_COMMIT} - fi + git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops + cd third_party/torch-xpu-ops + git checkout ${TORCH_XPU_OPS_COMMIT} git status && git diff && git show -s - name: Install E2E Requirements shell: bash -xe {0} From 60e9689237fc799a8e06295f1843442302455109 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 29 Aug 2025 23:29:04 +0800 Subject: [PATCH 06/12] update --- .github/workflows/_linux_ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 0ede180c26..030f35e3d9 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -104,7 +104,7 @@ jobs: runs-on: ${{ needs.runner.outputs.runner_id }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1 --max-worker-restart 10000 + PYTEST_ADDOPTS: -v steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 From 6d7bf01c80c9e2b531359a0cb9cad7830b8614c2 Mon Sep 17 00:00:00 2001 From: xiangdong <40376367+zxd1997066@users.noreply.github.com> Date: Sat, 30 Aug 2025 12:49:51 +0800 Subject: [PATCH 07/12] Update _linux_ut.yml --- .github/workflows/_linux_ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 030f35e3d9..1398e0310b 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -99,7 +99,7 @@ jobs: test-in-baremetal: needs: runner - timeout-minutes: 600 + timeout-minutes: 1200 if: ${{ contains(inputs.ut, 'distributed') }} runs-on: ${{ needs.runner.outputs.runner_id }} env: From 8e0065ded4f2e6122ba2e517f7741bf398f16d1e Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Thu, 11 Sep 2025 17:26:31 +0800 Subject: [PATCH 08/12] update for rc --- .github/actions/linux-testenv/action.yml | 2 +- .github/workflows/_linux_build.yml | 2 +- .github/workflows/_linux_ut.yml | 2 +- .github/workflows/pull.yml | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 0d890a1f4e..4ff569ef6e 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -3,7 +3,7 @@ name: Setup Test Environment inputs: pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' + default: 'https://github.com/daisyden/pytorch.git@release/2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 24f0f777d1..d534ff22fc 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -10,7 +10,7 @@ on: description: Runner label pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' + default: 'https://github.com/daisyden/pytorch.git@release/2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 1398e0310b..8fec47be91 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -9,7 +9,7 @@ on: description: Runner label pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' + default: 'https://github.com/daisyden/pytorch.git@release/2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 67d1fd25ad..45b49cc60c 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -100,7 +100,7 @@ jobs: build: [build] uses: ./.github/workflows/_linux_build.yml with: - pytorch: distributed_2.9 + pytorch: release/2.9 runner: PVC-7358 linux-ut: @@ -131,7 +131,7 @@ jobs: uses: ./.github/workflows/_linux_ut.yml with: runner: PVC-7358 - pytorch: distributed_2.9 + pytorch: release/2.9 ut: ${{ matrix.ut_name }} linux-e2e: From fadabea457a9cc0dc19ef869d92b9dca04783479 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 12 Sep 2025 17:12:54 +0800 Subject: [PATCH 09/12] update --- .github/actions/linux-testenv/action.yml | 2 +- .github/workflows/_linux_build.yml | 2 +- .github/workflows/_linux_ut.yml | 2 +- .github/workflows/pull.yml | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 4ff569ef6e..0d890a1f4e 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -3,7 +3,7 @@ name: Setup Test Environment inputs: pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@release/2.9' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index d534ff22fc..24f0f777d1 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -10,7 +10,7 @@ on: description: Runner label pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@release/2.9' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 8fec47be91..1398e0310b 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -9,7 +9,7 @@ on: description: Runner label pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@release/2.9' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 45b49cc60c..67d1fd25ad 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -100,7 +100,7 @@ jobs: build: [build] uses: ./.github/workflows/_linux_build.yml with: - pytorch: release/2.9 + pytorch: distributed_2.9 runner: PVC-7358 linux-ut: @@ -131,7 +131,7 @@ jobs: uses: ./.github/workflows/_linux_ut.yml with: runner: PVC-7358 - pytorch: release/2.9 + pytorch: distributed_2.9 ut: ${{ matrix.ut_name }} linux-e2e: From 0e67e844d1948c1896dd17c3db4c0a451a711b24 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 10 Oct 2025 15:24:50 +0800 Subject: [PATCH 10/12] update for 2.10 --- .github/actions/linux-testenv/action.yml | 4 ++-- .github/workflows/_linux_build.yml | 4 ++-- .github/workflows/_linux_ut.yml | 4 ++-- .github/workflows/pull.yml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 0d890a1f4e..09b58195cc 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -3,11 +3,11 @@ name: Setup Test Environment inputs: pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.10' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'daisyden/distributed_2.9' + default: 'daisyden/distributed_2.10' description: Torch-xpu-ops version, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin python: type: string diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 24f0f777d1..7e30e885ac 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -10,11 +10,11 @@ on: description: Runner label pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.10' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'daisyden/distributed_2.9' + default: 'daisyden/distributed_2.10' description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin triton: required: false diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 1398e0310b..da15486e0f 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -9,11 +9,11 @@ on: description: Runner label pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.10' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'daisyden/distributed_2.9' + default: 'daisyden/distributed_2.10' description: Torch-xpu-ops version, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin python: type: string diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 67d1fd25ad..85281b43fc 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -100,7 +100,7 @@ jobs: build: [build] uses: ./.github/workflows/_linux_build.yml with: - pytorch: distributed_2.9 + pytorch: distributed_2.10 runner: PVC-7358 linux-ut: @@ -131,7 +131,7 @@ jobs: uses: ./.github/workflows/_linux_ut.yml with: runner: PVC-7358 - pytorch: distributed_2.9 + pytorch: distributed_2.10 ut: ${{ matrix.ut_name }} linux-e2e: From 0d41dff11d729d84b7f8ea7cb03fb2590c1a530b Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 10 Oct 2025 15:24:50 +0800 Subject: [PATCH 11/12] update for 2.10 --- .github/actions/linux-uttest/action.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index e18c1e9f39..63125b1620 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -170,13 +170,11 @@ runs: tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log - name: xpu_distributed - shell: bash -xeu -o pipefail {0} + shell: timeout 36000 bash -xeu -o pipefail {0} if: ${{ inputs.ut_name == 'xpu_distributed' }} run: | xpu-smi topology -m mkdir -p ut_log/xpu_distributed - pip install pytest pytest-timeout xmlrunner unittest-xml-reporting zstandard transformers - cp ../pytorch/third_party/torch-xpu-ops/.github/scripts/ut_result_check.sh ut_log/xpu_distributed/ cd pytorch/third_party/torch-xpu-ops/test/xpu XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE,,}" == 'false' ]] || [[ "${XCCL_ENABLE}" == '0' ]]; then From 97f02bb7f29f338d9436f60741dcb9b35ff3feef Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 4 Nov 2025 14:51:37 +0800 Subject: [PATCH 12/12] test oneapi 2025.3 --- .github/actions/linux-uttest/action.yml | 3 -- .github/scripts/build.sh | 42 ++++++++++++------------- .github/workflows/_linux_build.yml | 32 ++++++++++--------- .github/workflows/_linux_ut.yml | 1 + 4 files changed, 39 insertions(+), 39 deletions(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 63125b1620..fb321d8671 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -181,9 +181,6 @@ runs: echo -e "[ERROR] XCCL is not enabled" exit 1 fi - export CCL_ROOT=$(dirname $(which python))/../ - export PATH="${CCL_ROOT}/bin/libfabric:${PATH}" - export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}" python run_distributed_local.py \ 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index 44ae14a35c..27f324ed1c 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -46,30 +46,30 @@ cd ${WORKSPACE}/pytorch python -m pip install requests git submodule sync && git submodule update --init --recursive python -m pip install -r requirements.txt -python -m pip install mkl-static==2025.2.0 mkl-include==2025.2.0 +python -m pip install mkl-static mkl-include export USE_STATIC_MKL=1 if [ "${XPU_ONEAPI_PATH}" == "" ];then export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \ - intel-cmplr-lib-rt==2025.2.1 | \ - intel-cmplr-lib-ur==2025.2.1 | \ - intel-cmplr-lic-rt==2025.2.1 | \ - intel-sycl-rt==2025.2.1 | \ - oneccl-devel==2021.16.1 | \ - oneccl==2021.16.1 | \ - impi-rt==2021.16.1 | \ - onemkl-sycl-blas==2025.2.0 | \ - onemkl-sycl-dft==2025.2.0 | \ - onemkl-sycl-lapack==2025.2.0 | \ - onemkl-sycl-rng==2025.2.0 | \ - onemkl-sycl-sparse==2025.2.0 | \ - dpcpp-cpp-rt==2025.2.1 | \ - intel-opencl-rt==2025.2.1 | \ - mkl==2025.2.0 | \ - intel-openmp==2025.2.1 | \ - tbb==2022.2.0 | \ - tcmlib==1.4.0 | \ - umf==0.11.0 | \ - intel-pti==0.13.1 + intel-cmplr-lib-rt | \ + intel-cmplr-lib-ur | \ + intel-cmplr-lic-rt | \ + intel-sycl-rt | \ + oneccl-devel | \ + oneccl | \ + impi-rt | \ + onemkl-sycl-blas | \ + onemkl-sycl-dft | \ + onemkl-sycl-lapack | \ + onemkl-sycl-rng | \ + onemkl-sycl-sparse | \ + dpcpp-cpp-rt | \ + intel-opencl-rt | \ + mkl | \ + intel-openmp | \ + tbb | \ + tcmlib | \ + umf | \ + intel-pti " fi diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 7e30e885ac..31d6572819 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -59,7 +59,7 @@ jobs: if: ${{ ! endsWith(inputs.pytorch, '_wheel') }} runs-on: ${{ needs.runner.outputs.runner_id }} container: - image: 'pytorch/manylinux2_28-builder:xpu-main' + image: 'intelgpu/ubuntu-22.04-lts2:2523.31' volumes: - ${{ github.workspace }}:${{ github.workspace }} env: @@ -72,21 +72,30 @@ jobs: steps: - name: Install gh-cli run: | + rm -rf ./*.whl ./*.log cat /etc/os-release hostname && id # install gh - dnf install -y 'dnf-command(config-manager)' - dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo - dnf install -y gh --repo gh-cli - gh --version + sudo apt update + sudo apt install -y gpg-agent wget curl cmake git unzip zip libgl1 zlib1g-dev numactl \ + libglib2.0-dev rsync jq gcc-11 g++-11 python3-dev python3-venv gh - name: Setup python-${{ inputs.python }} run: | rm -rf /tmp/xpu-tool/myvenv - local_python=$(echo ${{ inputs.python }} |awk -F. '{printf("cp%s%s-cp%s%s", $1, $2, $1, $2)}') - /opt/python/${local_python}/bin/python -m venv /tmp/xpu-tool/myvenv + curl -LsSf https://astral.sh/uv/install.sh | sh + source $HOME/.local/bin/env + uv venv /tmp/xpu-tool/myvenv --python 3.10 --clear + source /tmp/xpu-tool/myvenv/bin/activate which python && python -V which pip && pip list - pip install -U pip wheel setuptools + uv pip install -U pip wheel setuptools + - name: Install oneapi + run: | + rm -rf /opt/intel/oneapi + wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9065c156-58ab-41b0-bbee-9b0e229ffca5/intel-deep-learning-essentials-2025.3.1.15_offline.sh + sudo bash intel-deep-learning-essentials-2025.3.1.15_offline.sh -a -s --eula accept + source /opt/intel/oneapi/setvars.sh + icpx --version - name: Checkout torch-xpu-ops uses: actions/checkout@v4 with: @@ -113,8 +122,6 @@ jobs: TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi - # gcc 13 - source /opt/rh/gcc-toolset-13/enable source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh ${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \ --WORKSPACE="${{ github.workspace }}" \ @@ -129,8 +136,6 @@ jobs: fi - name: Build Torchvision and Torchaudio run: | - # gcc 13 - source /opt/rh/gcc-toolset-13/enable cd ./pytorch TORCHVISION_COMMIT_ID="$(cat .github/ci_commit_pins/vision.txt)" TORCHAUDIO_COMMIT_ID="$(cat .github/ci_commit_pins/audio.txt)" @@ -177,9 +182,6 @@ jobs: curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ grep '__version__' |head -n 1 |awk -F "'" '{print $2}' )" - # gcc 13 - source /opt/rh/gcc-toolset-13/enable - dnf install -y zlib-devel pip install cmake ninja pybind11 python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} \ 2>&1 |tee ${{ github.workspace }}/build_triton_${TRITON_COMMIT_ID}.log diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index da15486e0f..b306317e59 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -175,6 +175,7 @@ jobs: else ut_list="${{ inputs.ut }}" fi + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ for ut_name in ${ut_list} do cp Known_issue.log.tmp Known_issue.log