vllm-project · MingYang119 · Nov 29, 2025
diff --git a/.github/Dockerfile.buildwheel b/.github/Dockerfile.buildwheel
@@ -18,7 +18,7 @@ ARG PY_VERSION=3.11
 FROM quay.io/ascend/manylinux:8.3.rc2-910b-manylinux_2_28-py${PY_VERSION}
 
 ARG COMPILE_CUSTOM_KERNELS=1
-ARG SOC_VERSION
+ARG SOC_VERSION="ascend910b1"
 
 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive

diff --git a/.github/workflows/_e2e_nightly_single_node_models.yaml b/.github/workflows/_e2e_nightly_single_node_models.yaml
@@ -59,7 +59,7 @@ jobs:
     name: ${{inputs.model_list}} accuracy test
     runs-on: ${{ inputs.runner }}
     container:
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
+      image: "${{ inputs.image }}"
       env:
         VLLM_USE_MODELSCOPE: True
         GHA_VLLM_ASCEND_VERSION: ${{ inputs.vllm-ascend }}
@@ -109,7 +109,13 @@ jobs:
         shell: bash -l {0}
         run: |
           . /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
-          python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev20250914-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl"
+          python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl"
+
+      - name: Install tensorflow (for Molmo-7B-D-0924)
+        if: ${{ inputs.runner == 'linux-aarch64-a2-1' && contains(inputs.model_list, 'Molmo-7B-D-0924') }}
+        shell: bash -l {0}
+        run: |
+          pip install tensorflow --no-cache-dir
 
       - name: Resolve vllm-ascend version
         run: |
@@ -172,6 +178,7 @@ jobs:
         id: report
         env:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
+          HF_DATASETS_OFFLINE: True
           VLLM_USE_MODELSCOPE: True
           VLLM_CI_RUNNER: ${{ inputs.runner }}
           VLLM_VERSION: ${{ env.GHA_VLLM_VERSION }}

diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
@@ -94,11 +94,11 @@ jobs:
           pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
           pytest -sv tests/e2e/singlecard/test_bge_model.py
           pytest -sv tests/e2e/singlecard/test_camem.py
-          pytest -sv tests/e2e/singlecard/test_chunked.py
           pytest -sv tests/e2e/singlecard/test_embedding.py
           # pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
           pytest -sv tests/e2e/singlecard/test_guided_decoding.py
-          pytest -sv tests/e2e/singlecard/test_ilama_lora.py
+          # torch 2.8 doesn't work with lora, fix me
+          #pytest -sv tests/e2e/singlecard/test_ilama_lora.py
           pytest -sv tests/e2e/singlecard/test_profile_execute_duration.py
           pytest -sv tests/e2e/singlecard/test_quantization.py
           pytest -sv tests/e2e/singlecard/test_sampler.py
@@ -188,7 +188,8 @@ jobs:
           pytest -sv tests/e2e/multicard/test_external_launcher.py
           pytest -sv tests/e2e/multicard/test_single_request_aclgraph.py
           pytest -sv tests/e2e/multicard/test_fused_moe_allgather_ep.py
-          pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
+          # torch 2.8 doesn't work with lora, fix me
+          #pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
 
           # To avoid oom, we need to run the test in a single process.
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
@@ -266,17 +267,17 @@ jobs:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
           VLLM_USE_MODELSCOPE: True
         run: |
-          pytest -sv \
-            tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe \
-            tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC 
-            # tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_TP2_WITH_EP \
-            # tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_W8A8_WITH_EP
+          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
+          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
+          # pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_TP2_WITH_EP
+          # pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_W8A8_WITH_EP
+          pytest -sv tests/e2e/multicard/test_data_parallel_tp2.py
 
       - name: Install Ascend toolkit & triton_ascend (for Qwen3-Next-80B-A3B-Instruct)
         shell: bash -l {0}
         run: |
           . /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
-          python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27.whl"
+          python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl"
 
       - name: Run vllm-project/vllm-ascend Qwen3 Next test
         working-directory: ./vllm-ascend
@@ -286,4 +287,4 @@ jobs:
           VLLM_USE_MODELSCOPE: True
         run: |
           . /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
-          pytest -sv tests/e2e/multicard/test_qwen3_next.py
+          #pytest -sv tests/e2e/multicard/test_qwen3_next.py
diff --git a/.github/workflows/format_pr_body.yaml b/.github/workflows/format_pr_body.yaml
@@ -43,7 +43,7 @@ jobs:
         uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.2.2
 
       - name: Set up Python
-        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
 
       - name: Get vLLM release version
         run: |

diff --git a/.github/workflows/image_310p_openeuler.yml b/.github/workflows/image_310p_openeuler.yml
@@ -132,5 +132,4 @@ jobs:
         file: Dockerfile.310p.openEuler
         build-args: |
           PIP_INDEX_URL=https://pypi.org/simple
-          SOC_VERSION=ascend310p1
         provenance: false
diff --git a/.github/workflows/image_310p_ubuntu.yml b/.github/workflows/image_310p_ubuntu.yml
@@ -128,5 +128,4 @@ jobs:
         tags: ${{ steps.meta.outputs.tags }}
         build-args: |
           PIP_INDEX_URL=https://pypi.org/simple
-          SOC_VERSION=ascend310p1
         provenance: false
diff --git a/.github/workflows/image_a3_openeuler.yml b/.github/workflows/image_a3_openeuler.yml
@@ -131,6 +131,5 @@ jobs:
         file: Dockerfile.a3.openEuler
         build-args: |
           PIP_INDEX_URL=https://pypi.org/simple
-          SOC_VERSION=ascend910_9391
         provenance: false
 
diff --git a/.github/workflows/image_a3_ubuntu.yml b/.github/workflows/image_a3_ubuntu.yml
@@ -127,6 +127,5 @@ jobs:
         tags: ${{ steps.meta.outputs.tags }}
         build-args: |
           PIP_INDEX_URL=https://pypi.org/simple
-          SOC_VERSION=ascend910_9391
         provenance: false
 
diff --git a/.github/workflows/image_openeuler.yml b/.github/workflows/image_openeuler.yml
@@ -131,5 +131,4 @@ jobs:
         file: Dockerfile.openEuler
         build-args: |
           PIP_INDEX_URL=https://pypi.org/simple
-          SOC_VERSION=ascend910b1
         provenance: false
diff --git a/.github/workflows/image_ubuntu.yml b/.github/workflows/image_ubuntu.yml
@@ -128,5 +128,4 @@ jobs:
         tags: ${{ steps.meta.outputs.tags }}
         build-args: |
           PIP_INDEX_URL=https://pypi.org/simple
-          SOC_VERSION=ascend910b1
         provenance: false
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
@@ -16,7 +16,7 @@ jobs:
     steps:
     - name: Checkout vllm-project/vllm-ascend repo
       uses: actions/checkout@v6
-    - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+    - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
       with:
         python-version: "3.11"
     - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"

diff --git a/.github/workflows/release_code.yml b/.github/workflows/release_code.yml
@@ -50,7 +50,7 @@ jobs:
           lscpu
 
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
         with:
           python-version: ${{ matrix.python-version }}
 

diff --git a/.github/workflows/release_whl.yml b/.github/workflows/release_whl.yml
@@ -69,7 +69,6 @@ jobs:
         ls
         docker build -f ./.github/Dockerfile.buildwheel \
         --build-arg PY_VERSION=${{ matrix.python-version }} \
-        --build-arg SOC_VERSION=ascend910b1 \
         -t wheel:v1 .
         docker run --rm \
         -u $(id -u):$(id -g) \
@@ -80,7 +79,7 @@ jobs:
 
     - name: Set up Python ${{ matrix.python-version }}
       if: startsWith(github.ref, 'refs/tags/')
-      uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+      uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
       with:
         python-version: ${{ matrix.python-version }}
 

diff --git a/.github/workflows/vllm_ascend_test_310p.yaml b/.github/workflows/vllm_ascend_test_310p.yaml
@@ -100,7 +100,6 @@ jobs:
         run: |
           export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
           export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
-          export SOC_VERSION=ASCEND310P3 
           pip install -r requirements-dev.txt
           pip install -v -e .
 

diff --git a/.github/workflows/vllm_ascend_test_nightly_a2.yaml b/.github/workflows/vllm_ascend_test_nightly_a2.yaml
@@ -114,6 +114,15 @@ jobs:
               - Qwen3-VL-8B-Instruct
               - Qwen2.5-Omni-7B
               - Meta-Llama-3.1-8B-Instruct
+          - os: linux-aarch64-a2-1
+            model_list:
+              - ERNIE-4.5-21B-A3B-PT
+              - gemma-2-9b-it
+              - gemma-3-4b-it
+              - internlm-7b
+              - InternVL3_5-8B-hf
+              - llava-1.5-7b-hf
+              - Molmo-7B-D-0924
           - os: linux-aarch64-a2-2
             model_list:
               - Qwen3-30B-A3B
@@ -128,5 +137,5 @@ jobs:
       vllm: v0.11.2
       runner: ${{ matrix.test_config.os }}
       model_list: ${{ toJson(matrix.test_config.model_list) }}
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
+      image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
       upload: false
diff --git a/.github/workflows/vllm_ascend_test_nightly_a3.yaml b/.github/workflows/vllm_ascend_test_nightly_a3.yaml
@@ -134,6 +134,9 @@ jobs:
           - name: deepseek3_2-exp-w8a8
             os: linux-aarch64-a3-16
             tests: tests/e2e/nightly/models/test_deepseek_v3_2_exp_w8a8.py
+          - name: glm-4-5
+            os: linux-aarch64-a3-16
+            tests: tests/e2e/nightly/models/test_glm4_5.py
     uses: ./.github/workflows/_e2e_nightly_single_node.yaml
     with:
       vllm: v0.11.2

diff --git a/.gitignore b/.gitignore
@@ -203,5 +203,9 @@ kernel_meta/
 # benchmark results generated by run-performance-benchmarks.sh
 /benchmarks/results/
 
+# _cann_ops_custom generated by build_aclnn.sh
+/vllm_ascend/_cann_ops_custom/*
+!/vllm_ascend/_cann_ops_custom/.gitkeep
+
 # generated by CANN
 fusion_result.json
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
       args: [
         --toml, pyproject.toml,
         '--skip', 'tests/e2e/multicard/test_torchair_graph_mode.py,csrc/**,tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**,.github/**,typos.toml',
-        '-L', 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,ArchType,AND'
+        '-L', 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,ArchType,AND,ND'
       ]
       additional_dependencies:
         - tomli

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -22,9 +22,9 @@ find_package(Torch REQUIRED)
 
 run_python(TORCH_VERSION
   "import torch; print(torch.__version__)" "Failed to locate torch path")
-# check torch version is 2.7.1
-if(NOT ${TORCH_VERSION} VERSION_EQUAL "2.7.1")
-  message(FATAL_ERROR "Expected PyTorch version 2.7.1, but found ${TORCH_VERSION}")
+# check torch version is 2.8.0
+if(NOT ${TORCH_VERSION} VERSION_EQUAL "2.8.0")
+  message(FATAL_ERROR "Expected PyTorch version 2.8.0, but found ${TORCH_VERSION}")
 endif()
 
 set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
@@ -55,16 +55,36 @@ include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
 file(GLOB KERNEL_FILES
 ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/*.cpp)
 
-ascendc_library(vllm_ascend_kernels SHARED
+set(VLLM_ASCEND_CUSTOM_OP
     ${KERNEL_FILES}
     ${CMAKE_CURRENT_SOURCE_DIR}/csrc/mla_preprocess/op_kernel/mla_preprocess_kernel.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
+)
+
+set(VLLM_ASCEND_CUSTOM_OP_EXCLUDE
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_kernel/batch_matmul_transpose_kernel.cpp
+)
+
+if(SOC_VERSION STREQUAL "ASCEND310P3")
+    list(REMOVE_ITEM VLLM_ASCEND_CUSTOM_OP ${VLLM_ASCEND_CUSTOM_OP_EXCLUDE})
+endif()
+
+ascendc_library(vllm_ascend_kernels SHARED
+    ${VLLM_ASCEND_CUSTOM_OP}
 )
 
 message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")
 
-file(GLOB VLLM_ASCEND_SRC
-${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
-${CMAKE_CURRENT_SOURCE_DIR}/csrc/aclnn_torch_adapter/*.cpp)
+if(SOC_VERSION STREQUAL "ASCEND310P3")
+    file(GLOB VLLM_ASCEND_SRC
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/aclnn_torch_adapter/*.cpp)
+else()
+    file(GLOB VLLM_ASCEND_SRC
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/aclnn_torch_adapter/*.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host/tiling/tiling_data.cpp)
+endif()
 
 include_directories(
   ${pybind11_INCLUDE_DIRS}
@@ -74,6 +94,7 @@ include_directories(
   ${ASCEND_HOME_PATH}/include
   ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform
   ${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/platform
+  ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host
 )
 
 set(

diff --git a/Dockerfile b/Dockerfile
@@ -20,7 +20,7 @@ FROM quay.io/ascend/cann:8.3.rc2-910b-ubuntu22.04-py3.11
 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
 ARG MOONCAKE_TAG="v0.3.7.post2"
-ARG SOC_VERSION
+ARG SOC_VERSION="ascend910b1"
 
 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive

diff --git a/Dockerfile.310p b/Dockerfile.310p
@@ -19,7 +19,7 @@ FROM quay.io/ascend/cann:8.3.rc2-310p-ubuntu22.04-py3.11
 
 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
-ARG SOC_VERSION
+ARG SOC_VERSION="ascend310p1"
 
 # Define environments
 ENV DEBIAN_FRONTEND=noninteractive

diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler
@@ -19,7 +19,7 @@ FROM quay.io/ascend/cann:8.3.rc2-310p-openeuler24.03-py3.11
 
 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
-ARG SOC_VERSION
+ARG SOC_VERSION="ascend310p1"
 
 ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
 ENV SOC_VERSION=$SOC_VERSION

diff --git a/Dockerfile.a3 b/Dockerfile.a3
@@ -20,7 +20,7 @@ FROM quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
 ARG MOONCAKE_TAG=v0.3.7.post2
-ARG SOC_VERSION
+ARG SOC_VERSION="ascend910_9391"
 
 COPY . /vllm-workspace/vllm-ascend/
 # Define environments

diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler
@@ -20,7 +20,7 @@ FROM quay.io/ascend/cann:8.3.rc2-a3-openeuler24.03-py3.11
 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
 ARG MOONCAKE_TAG="v0.3.7.post2"
-ARG SOC_VERSION
+ARG SOC_VERSION="ascend910_9391"
 
 ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
 ENV SOC_VERSION=$SOC_VERSION

diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler
@@ -20,7 +20,7 @@ FROM quay.io/ascend/cann:8.3.rc2-910b-openeuler24.03-py3.11
 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 ARG COMPILE_CUSTOM_KERNELS=1
 ARG MOONCAKE_TAG="v0.3.7.post2"
-ARG SOC_VERSION
+ARG SOC_VERSION="ascend910b1"
 
 ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
 ENV SOC_VERSION=$SOC_VERSION

diff --git a/README.md b/README.md
@@ -43,7 +43,7 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
 - Software:
   * Python >= 3.10, < 3.12
   * CANN >= 8.3.rc1 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/releasenote/releasenote_0000.html))
-  * PyTorch == 2.7.1, torch-npu == 2.7.1
+  * PyTorch == 2.8.0, torch-npu == 2.8.0
   * vLLM (the same version as vllm-ascend)
 
 ## Getting Started

diff --git a/README.zh.md b/README.zh.md
@@ -44,7 +44,7 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP
 - 软件：
   * Python >= 3.10, < 3.12
   * CANN >= 8.3.rc1 (Ascend HDK 版本参考[这里](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/releasenote/releasenote_0000.html))
-  * PyTorch == 2.7.1, torch-npu == 2.7.1
+  * PyTorch == 2.8.0, torch-npu == 2.8.0
   * vLLM (与vllm-ascend版本一致)
 
 ## 开始使用