2 changes: 1 addition & 1 deletion .github/workflows/_e2e_nightly_multi_node.yaml
@@ -32,7 +32,7 @@ on:
description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
vllm_version:
required: false
default: "v0.11.2"
default: "86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24"
type: string
description: vllm version to use
vllm_ascend_remote_url:
2 changes: 1 addition & 1 deletion .github/workflows/format_pr_body.yaml
@@ -36,7 +36,7 @@ jobs:

- name: Get vLLM version
run: |
- VLLM_COMMIT=v0.11.2
+ VLLM_COMMIT=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
- name: Checkout repository
2 changes: 1 addition & 1 deletion .github/workflows/nightly_benchmarks.yaml
@@ -51,7 +51,7 @@ jobs:
strategy:
matrix:
include:
- - vllm_branch: v0.11.2
+ - vllm_branch: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
vllm_ascend_branch: main
max-parallel: 1
container:
4 changes: 2 additions & 2 deletions .github/workflows/vllm_ascend_test_nightly_a2.yaml
@@ -86,7 +86,7 @@ jobs:
tests: tests/e2e/nightly/ops
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
with:
- vllm: v0.11.2
+ vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
runner: ${{ matrix.test_config.os }}
tests: ${{ matrix.test_config.tests }}
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
@@ -134,7 +134,7 @@ jobs:
- Qwen3-Next-80B-A3B-Instruct
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
with:
- vllm: v0.11.2
+ vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
runner: ${{ matrix.test_config.os }}
model_list: ${{ toJson(matrix.test_config.model_list) }}
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_nightly_a3.yaml
@@ -139,7 +139,7 @@ jobs:
tests: tests/e2e/nightly/models/test_glm4_5.py
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
with:
- vllm: v0.11.2
+ vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
runner: ${{ matrix.test_config.os }}
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
tests: ${{ matrix.test_config.tests }}
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_pr_full.yaml
@@ -69,7 +69,7 @@ jobs:
name: e2e-full
strategy:
matrix:
- vllm_version: [v0.11.2]
+ vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/vllm_ascend_test_pr_light.yaml
@@ -42,7 +42,7 @@ jobs:
lint:
uses: ./.github/workflows/pre-commit.yml
with:
- vllm: v0.11.2
+ vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
changes:
runs-on: ubuntu-latest
outputs:
@@ -84,7 +84,7 @@ jobs:
SOC_VERSION: ascend910b1
strategy:
matrix:
- vllm_version: [v0.11.2]
+ vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
steps:
- name: Install packages
run: |
@@ -142,7 +142,7 @@ jobs:
name: e2e-light
strategy:
matrix:
- vllm_version: [v0.11.2]
+ vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
2 changes: 1 addition & 1 deletion .github/workflows/vllm_ascend_test_report.yaml
@@ -72,7 +72,7 @@ jobs:
- DeepSeek-V2-Lite
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
with:
- vllm: v0.11.2
+ vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
runner: ${{ matrix.runner }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
model_list: ${{ toJson(matrix.model_list) }}
6 changes: 4 additions & 2 deletions Dockerfile
@@ -48,8 +48,10 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.11.2
- RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
+ ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+ # Revert this change once VLLM_TAG is specified to branch or tag
+ # RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
+ RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
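Note on the clone change above (the same edit is repeated in every Dockerfile that follows): git clone --depth 1 --branch accepts only branch or tag names, so a bare commit SHA such as 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24 cannot be fetched that way; the PR therefore does a full clone followed by git checkout, to be reverted once VLLM_TAG points at a branch or tag again. A minimal Python sketch of that constraint, using a hypothetical helper (clone_vllm is illustrative only, not part of this PR):

import re
import subprocess


def clone_vllm(repo: str, ref: str, dest: str) -> None:
    # Shallow clones only work for named refs; commit SHAs need a full clone.
    looks_like_sha = re.fullmatch(r"[0-9a-f]{7,40}", ref) is not None
    if looks_like_sha:
        subprocess.run(["git", "clone", repo, dest], check=True)
        subprocess.run(["git", "checkout", ref], cwd=dest, check=True)
    else:
        subprocess.run(
            ["git", "clone", "--depth", "1", "--branch", ref, repo, dest],
            check=True)


# Example mirroring the new Dockerfile commands:
# clone_vllm("https://github.com/vllm-project/vllm.git",
#            "86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24", "/vllm-workspace/vllm")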
6 changes: 4 additions & 2 deletions Dockerfile.310p
@@ -39,8 +39,10 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.11.2
- RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
+ ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+ # Revert this change once VLLM_TAG is specified to branch or tag
+ # RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
+ RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
6 changes: 4 additions & 2 deletions Dockerfile.310p.openEuler
@@ -36,8 +36,10 @@ COPY . /vllm-workspace/vllm-ascend/

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.11.2
- RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
+ ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+ # Revert this change once VLLM_TAG is specified to branch or tag
+ # RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
+ RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
6 changes: 4 additions & 2 deletions Dockerfile.a3
@@ -47,8 +47,10 @@ RUN apt-get update -y && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.11.2
- RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
+ ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+ # Revert this change once VLLM_TAG is specified to branch or tag
+ # RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
+ RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
6 changes: 4 additions & 2 deletions Dockerfile.a3.openEuler
@@ -50,8 +50,10 @@ RUN yum update -y && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.11.2
- RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
+ ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+ # Revert this change once VLLM_TAG is specified to branch or tag
+ # RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
+ RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
6 changes: 4 additions & 2 deletions Dockerfile.openEuler
@@ -50,8 +50,10 @@ RUN yum update -y && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
- ARG VLLM_TAG=v0.11.2
- RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
+ ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+ # Revert this change once VLLM_TAG is specified to branch or tag
+ # RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
+ RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -77,7 +77,7 @@
# CANN image tag
'cann_image_tag': "8.3.rc2-910b-ubuntu22.04-py3.11",
# vllm version in ci
- 'ci_vllm_version': 'v0.11.2',
+ 'ci_vllm_version': '86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24',
}

# For cross-file header anchors
45 changes: 32 additions & 13 deletions tests/ut/attention/test_mla_v1.py
@@ -191,7 +191,7 @@ def test_ascend_mla_metadata_builder_default(self, mock_get_dcp_size,
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.max_num_seqs = 4
mock_vllm_config.scheduler_config.decode_max_num_seqs = 4
- mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
+ mock_vllm_config.scheduler_config.enable_chunked_prefill = False
mock_device = 'cpu'

mock_dcp.world_size = 1
@@ -213,7 +213,7 @@ def test_ascend_mla_metadata_builder_default(self, mock_get_dcp_size,
mock_vllm_config.cache_config.block_size)
self.assertEqual(
builder.chunked_prefill_enabled,
- mock_vllm_config.scheduler_config.chunked_prefill_enabled)
+ mock_vllm_config.scheduler_config.enable_chunked_prefill)

@patch('vllm.distributed.parallel_state.get_dcp_group')
@patch('vllm.distributed.parallel_state._DCP',
@@ -230,7 +230,7 @@ def test_ascend_mla_metadata_builder_spec_decode(self, mock_get_dcp_size,
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.max_num_seqs = 4
mock_vllm_config.scheduler_config.decode_max_num_seqs = 4
- mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
+ mock_vllm_config.scheduler_config.enable_chunked_prefill = False
mock_device = 'cpu'

mock_dcp.world_size = 1
@@ -254,7 +254,7 @@ def test_ascend_mla_metadata_builder_spec_decode(self, mock_get_dcp_size,
mock_vllm_config.cache_config.block_size)
self.assertEqual(
builder.chunked_prefill_enabled,
- mock_vllm_config.scheduler_config.chunked_prefill_enabled)
+ mock_vllm_config.scheduler_config.enable_chunked_prefill)

@patch('vllm.distributed.parallel_state.get_dcp_group')
@patch('vllm.distributed.parallel_state._DCP',
@@ -321,7 +321,7 @@ def test_reorder_batch(self, mock_get_dcp_size, mock_dcp,
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.max_num_seqs = 4
mock_vllm_config.scheduler_config.decode_max_num_seqs = 4
- mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
+ mock_vllm_config.scheduler_config.enable_chunked_prefill = False
mock_device = 'cpu'

mock_dcp.world_size = 1
@@ -440,8 +440,10 @@ def setUp(self):
self.mock_vllm_config.model_config = ModelConfig(max_model_len=2048)
self.mock_vllm_config.model_config.hf_text_config.qk_rope_head_dim = 32
self.mock_vllm_config.cache_config = CacheConfig(block_size=32)
- self.mock_vllm_config.scheduler_config = SchedulerConfig(
- max_num_seqs=8, chunked_prefill_enabled=True)
+ mock_scheduler_config = MagicMock(spec=SchedulerConfig)
+ mock_scheduler_config.max_num_seqs = 8
+ mock_scheduler_config.chunked_prefill_enabled = True
+ self.mock_vllm_config.scheduler_config = mock_scheduler_config
self.mock_vllm_config.speculative_config = None
self.mock_device = torch.device("cpu")

@@ -454,12 +456,20 @@ def setUp(self):
"vllm_ascend.attention.mla_v1.get_decode_context_model_parallel_world_size"
)
@patch("vllm_ascend.attention.mla_v1.get_ascend_config")
- def test_build_prefix_no_cache_metadata(self, mock_get_ascend_config,
+ @patch("vllm_ascend.attention.mla_v1.torch.zeros", wraps=torch.zeros)
+ @patch("torch.Tensor.npu", new=lambda self: self)
+ @patch("torch.npu.is_available")
+ def test_build_prefix_no_cache_metadata(self, mock_npu_available,
+ mock_zeros, mock_get_ascend_config,
mock_dcp_world_size):
- if not torch.npu.is_available():
- self.skipTest("NPU not available, skipping NPU-dependent tests")
+ mock_npu_available.return_value = False
mock_dcp_world_size.return_value = 1

+ def zeros_override(*args, **kwargs):
+ kwargs.pop('pin_memory', None)
+ return mock_zeros._mock_wraps(*args, **kwargs)

+ mock_zeros.side_effect = zeros_override
common_attn_metadata = AscendCommonAttentionMetadata(
query_start_loc=torch.tensor([0, 3, 7]),
query_start_loc_cpu=torch.tensor([0, 3, 7]),
@@ -506,12 +516,21 @@ def test_build_prefix_no_cache_metadata(self, mock_get_ascend_config,
"vllm_ascend.attention.mla_v1.get_decode_context_model_parallel_world_size"
)
@patch("vllm_ascend.attention.mla_v1.get_ascend_config")
- def test_build_chunked_prefix_metadata(self, mock_get_ascend_config,
+ @patch("vllm_ascend.attention.mla_v1.torch.zeros", wraps=torch.zeros)
+ @patch("torch.Tensor.npu", new=lambda self: self)
+ @patch("torch.npu.is_available")
+ def test_build_chunked_prefix_metadata(self, mock_npu_available,
+ mock_zeros, mock_get_ascend_config,
mock_dcp_world_size):
- if not torch.npu.is_available():
- self.skipTest("NPU not available, skipping NPU-dependent tests")
+ mock_npu_available.return_value = False
mock_dcp_world_size.return_value = 1

+ def zeros_override(*args, **kwargs):
+ kwargs.pop('pin_memory', None)
+ return mock_zeros._mock_wraps(*args, **kwargs)

+ mock_zeros.side_effect = zeros_override

common_attn_metadata = AscendCommonAttentionMetadata(
query_start_loc=torch.tensor([0, 2, 5, 9]),
query_start_loc_cpu=torch.tensor([0, 2, 5, 9]),
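The test_mla_v1.py changes above combine two patterns so the MLA metadata-builder tests can run on CPU-only CI against the pinned vLLM commit: the real SchedulerConfig is swapped for MagicMock(spec=SchedulerConfig), so the tests can keep setting chunked_prefill_enabled regardless of upstream constructor changes, and torch.zeros is patched with wraps= plus a side_effect that drops the pin_memory argument while torch.Tensor.npu and torch.npu.is_available are stubbed out. A condensed, self-contained sketch of those two patterns, with assumed stand-in names (FakeSchedulerConfig, MockPatternSketch) rather than the real vllm_ascend code:

import unittest
from unittest.mock import MagicMock, patch

import torch


class FakeSchedulerConfig:
    # Stand-in for vLLM's SchedulerConfig; only here to give the mock a spec.
    max_num_seqs: int = 0
    chunked_prefill_enabled: bool = False


class MockPatternSketch(unittest.TestCase):

    @patch("torch.zeros", wraps=torch.zeros)
    def test_zeros_without_pin_memory(self, mock_zeros):
        # The side_effect strips pin_memory and defers to the real torch.zeros
        # via _mock_wraps, the same trick the updated tests use to stay CPU-only.
        def zeros_override(*args, **kwargs):
            kwargs.pop("pin_memory", None)
            return mock_zeros._mock_wraps(*args, **kwargs)

        mock_zeros.side_effect = zeros_override
        # pin_memory=True would normally require a pinned-memory allocator.
        out = torch.zeros(4, dtype=torch.int32, pin_memory=True)
        self.assertEqual(tuple(out.shape), (4,))

    def test_spec_mocked_scheduler_config(self):
        # MagicMock(spec=...) still catches attribute typos on reads, but lets
        # the test set whatever flags it needs independent of the real constructor.
        cfg = MagicMock(spec=FakeSchedulerConfig)
        cfg.max_num_seqs = 8
        cfg.chunked_prefill_enabled = True
        self.assertTrue(cfg.chunked_prefill_enabled)


if __name__ == "__main__":
    unittest.main()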
6 changes: 3 additions & 3 deletions tests/ut/compilation/test_acl_graph.py
@@ -32,7 +32,7 @@ def test_aclgraph_entry_initialization(self):
"""Test ACLGraphEntry initialization with default values"""
batch_descriptor = BatchDescriptor(
num_tokens=30,
- uniform_decode=False,
+ uniform=False,
)

entry = ACLGraphEntry(batch_descriptor=batch_descriptor)
@@ -46,7 +46,7 @@ def test_aclgraph_entry_with_values(self):
"""Test ACLGraphEntry initialization with specified values"""
batch_descriptor = BatchDescriptor(
num_tokens=30,
- uniform_decode=False,
+ uniform=False,
)

mock_graph = MagicMock()
@@ -89,7 +89,7 @@ def setUp(self):
# Mock BatchDescriptor
self.mock_batch_descriptor = BatchDescriptor(
num_tokens=30,
- uniform_decode=False,
+ uniform=False,
)

# Mock ForwardContext