Fix mla_v1, acl_graph, and scheduler unit tests

zhangxinyuehfad · zhangxinyuehfad · commit 9d7da9111206 · 2025-12-02T00:36:35.000+08:00
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
diff --git a/tests/ut/attention/test_mla_v1.py b/tests/ut/attention/test_mla_v1.py
@@ -440,8 +440,10 @@ def setUp(self):
         self.mock_vllm_config.model_config = ModelConfig(max_model_len=2048)
         self.mock_vllm_config.model_config.hf_text_config.qk_rope_head_dim = 32
         self.mock_vllm_config.cache_config = CacheConfig(block_size=32)
-        self.mock_vllm_config.scheduler_config = SchedulerConfig(
-            max_num_seqs=8, chunked_prefill_enabled=True)
+        mock_scheduler_config = MagicMock(spec=SchedulerConfig)
+        mock_scheduler_config.max_num_seqs = 8  # set to a real int, not a MagicMock
+        mock_scheduler_config.chunked_prefill_enabled = True
+        self.mock_vllm_config.scheduler_config = mock_scheduler_config
         self.mock_vllm_config.speculative_config = None
         self.mock_device = torch.device("cpu")
 
@@ -454,12 +456,20 @@ def setUp(self):
         "vllm_ascend.attention.mla_v1.get_decode_context_model_parallel_world_size"
     )
     @patch("vllm_ascend.attention.mla_v1.get_ascend_config")
-    def test_build_prefix_no_cache_metadata(self, mock_get_ascend_config,
+    @patch("vllm_ascend.attention.mla_v1.torch.zeros", wraps=torch.zeros)
+    @patch("torch.Tensor.npu", new=lambda self: self)
+    @patch("torch.npu.is_available")
+    def test_build_prefix_no_cache_metadata(self, mock_npu_available,
+                                            mock_zeros, mock_get_ascend_config,
                                             mock_dcp_world_size):
-        if not torch.npu.is_available():
-            self.skipTest("NPU not available, skipping NPU-dependent tests")
+        mock_npu_available.return_value = False
         mock_dcp_world_size.return_value = 1
 
+        def zeros_override(*args, **kwargs):
+            kwargs.pop('pin_memory', None)
+            return mock_zeros._mock_wraps(*args, **kwargs)
+
+        mock_zeros.side_effect = zeros_override
         common_attn_metadata = AscendCommonAttentionMetadata(
             query_start_loc=torch.tensor([0, 3, 7]),
             query_start_loc_cpu=torch.tensor([0, 3, 7]),
@@ -506,12 +516,21 @@ def test_build_prefix_no_cache_metadata(self, mock_get_ascend_config,
         "vllm_ascend.attention.mla_v1.get_decode_context_model_parallel_world_size"
     )
     @patch("vllm_ascend.attention.mla_v1.get_ascend_config")
-    def test_build_chunked_prefix_metadata(self, mock_get_ascend_config,
+    @patch("vllm_ascend.attention.mla_v1.torch.zeros", wraps=torch.zeros)
+    @patch("torch.Tensor.npu", new=lambda self: self)
+    @patch("torch.npu.is_available")
+    def test_build_chunked_prefix_metadata(self, mock_npu_available,
+                                           mock_zeros, mock_get_ascend_config,
                                            mock_dcp_world_size):
-        if not torch.npu.is_available():
-            self.skipTest("NPU not available, skipping NPU-dependent tests")
+        mock_npu_available.return_value = False
         mock_dcp_world_size.return_value = 1
 
+        def zeros_override(*args, **kwargs):
+            kwargs.pop('pin_memory', None)
+            return mock_zeros._mock_wraps(*args, **kwargs)
+
+        mock_zeros.side_effect = zeros_override
+
         common_attn_metadata = AscendCommonAttentionMetadata(
             query_start_loc=torch.tensor([0, 2, 5, 9]),
             query_start_loc_cpu=torch.tensor([0, 2, 5, 9]),
diff --git a/tests/ut/compilation/test_acl_graph.py b/tests/ut/compilation/test_acl_graph.py
@@ -32,7 +32,7 @@ def test_aclgraph_entry_initialization(self):
         """Test ACLGraphEntry initialization with default values"""
         batch_descriptor = BatchDescriptor(
             num_tokens=30,
-            uniform_decode=False,
+            uniform=False,
         )
 
         entry = ACLGraphEntry(batch_descriptor=batch_descriptor)
@@ -46,7 +46,7 @@ def test_aclgraph_entry_with_values(self):
         """Test ACLGraphEntry initialization with specified values"""
         batch_descriptor = BatchDescriptor(
             num_tokens=30,
-            uniform_decode=False,
+            uniform=False,
         )
 
         mock_graph = MagicMock()
@@ -89,7 +89,7 @@ def setUp(self):
         # Mock BatchDescriptor
         self.mock_batch_descriptor = BatchDescriptor(
             num_tokens=30,
-            uniform_decode=False,
+            uniform=False,
         )
 
         # Mock ForwardContext
diff --git a/tests/ut/core/test_scheduler.py b/tests/ut/core/test_scheduler.py
@@ -81,9 +81,7 @@ def make_output(scheduler):
         req.request_id: i
         for i, req in enumerate(scheduler.running)
     }
-    sampled_token_ids = [
-        np.array([1000], dtype=np.int64) for _ in scheduler.running
-    ]
+    sampled_token_ids = [[1000]] * len(scheduler.running)
 
     logprobs = None
 
@@ -372,8 +370,7 @@ def test_stop_via_update_from_output(self):
                 req.request_id: i
                 for i, req in enumerate(requests)
             },
-            sampled_token_ids=[np.array([EOS_TOKEN_ID]),
-                               np.array([10, 11])
+            sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
                                ],  # First request hits EOS, second continues
             logprobs=None,
             prompt_logprobs_dict={},
@@ -424,9 +421,8 @@ def test_stop_via_update_from_output(self):
                 req.request_id: i
                 for i, req in enumerate(requests)
             },
-            sampled_token_ids=[np.array([10, 42, 12]),
-                               np.array([13, 14])
-                               ],  # First request hits stop token
+            sampled_token_ids=[[10, 42, 12],
+                               [13, 14]],  # First request hits stop token
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -475,9 +471,8 @@ def test_stop_via_update_from_output(self):
                 req.request_id: i
                 for i, req in enumerate(requests)
             },
-            sampled_token_ids=[np.array([10, 11, 12]),
-                               np.array([13])
-                               ],  # First request exceeds max_tokens
+            sampled_token_ids=[[10, 11, 12],
+                               [13]],  # First request exceeds max_tokens
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -516,7 +511,7 @@ def test_stop_via_update_from_output(self):
         model_output = ModelRunnerOutput(
             req_ids=[requests[0].request_id],
             req_id_to_index={requests[0].request_id: 0},
-            sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])],
+            sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -573,7 +568,7 @@ def test_schedule_concurrent_batches(self):
             model_runner_output = ModelRunnerOutput(
                 req_ids=[requests[0].request_id],
                 req_id_to_index={requests[0].request_id: 0},
-                sampled_token_ids=[np.array([0], dtype=np.int64)],
+                sampled_token_ids=[[0]],
                 logprobs=None,
                 prompt_logprobs_dict={},
                 pooler_output=[])
@@ -589,7 +584,7 @@ def test_schedule_concurrent_batches(self):
             model_runner_output = ModelRunnerOutput(
                 req_ids=[requests[1].request_id],
                 req_id_to_index={requests[1].request_id: 0},
-                sampled_token_ids=[np.array([0], dtype=np.int64)],
+                sampled_token_ids=[[0]],
                 logprobs=None,
                 prompt_logprobs_dict={},
                 pooler_output=[])
@@ -607,12 +602,10 @@ def test_schedule_spec_decoding_stats(self):
         spec_tokens_list: List[List[List[int]]] = [[[1, 2, 3]], [[1, 2, 3]],
                                                    [[1, 2], [3]], [[1]], [[]],
                                                    [[1, 2, 3], [4, 5, 6]]]
-        output_tokens_list: List[List[List[int]]] = [
-            [np.array([1, 2, 3, 4])], [np.array([1, 5])],
-            [np.array([1, 2, 5]), np.array([3, 4])], [np.array([1, 2])],
-            [np.array([5])], [np.array([1, 2, 7]),
-                              np.array([4, 8])]
-        ]
+        output_tokens_list: List[List[List[int]]] = [[[1, 2, 3, 4]], [[1, 5]],
+                                                     [[1, 2, 5], [3, 4]],
+                                                     [[1, 2]], [[5]],
+                                                     [[1, 2, 7], [4, 8]]]
         expected_list: List[Tuple[int, int,
                                   int, List[int]]] = [(1, 3, 3, [1, 1, 1]),
                                                       (1, 3, 1, [1, 0, 0]),
@@ -650,9 +643,7 @@ def test_schedule_spec_decoding_stats(self):
             model_runner_output = ModelRunnerOutput(
                 req_ids=req_ids,
                 req_id_to_index=req_to_index,
-                sampled_token_ids=[
-                    np.array([0]) for _ in range(len(requests))
-                ],
+                sampled_token_ids=[[0] for _ in range(len(requests))],
                 logprobs=None,
                 prompt_logprobs_dict={},
                 pooler_output=[])
@@ -892,11 +883,13 @@ def create_scheduler(self, mock_compute_encoder_budget):
                                                    torch.float32, False))
             ],
         )
+        kv_cache_config.hash_block_size = block_size
         cache_config.num_gpu_blocks = 10000
 
         scheduler = SchedulerDynamicBatch(
             vllm_config=vllm_config,
             kv_cache_config=kv_cache_config,
+            block_size=block_size,
             log_stats=True,
             structured_output_manager=MagicMock(spec=StructuredOutputManager),
         )
@@ -1064,8 +1057,7 @@ def test_stop_via_update_from_output(self):
                 req.request_id: i
                 for i, req in enumerate(requests)
             },
-            sampled_token_ids=[np.array([EOS_TOKEN_ID]),
-                               np.array([10, 11])
+            sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
                                ],  # First request hits EOS, second continues
             logprobs=None,
             prompt_logprobs_dict={},
@@ -1116,9 +1108,8 @@ def test_stop_via_update_from_output(self):
                 req.request_id: i
                 for i, req in enumerate(requests)
             },
-            sampled_token_ids=[np.array([10, 42, 12]),
-                               np.array([13, 14])
-                               ],  # First request hits stop token
+            sampled_token_ids=[[10, 42, 12],
+                               [13, 14]],  # First request hits stop token
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -1167,9 +1158,8 @@ def test_stop_via_update_from_output(self):
                 req.request_id: i
                 for i, req in enumerate(requests)
             },
-            sampled_token_ids=[np.array([10, 11, 12]),
-                               np.array([13])
-                               ],  # First request exceeds max_tokens
+            sampled_token_ids=[[10, 11, 12],
+                               [13]],  # First request exceeds max_tokens
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -1208,7 +1198,7 @@ def test_stop_via_update_from_output(self):
         model_output = ModelRunnerOutput(
             req_ids=[requests[0].request_id],
             req_id_to_index={requests[0].request_id: 0},
-            sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])],
+            sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
             logprobs=None,
             prompt_logprobs_dict={},
             pooler_output=[])
@@ -1265,7 +1255,7 @@ def test_schedule_concurrent_batches(self):
             model_runner_output = ModelRunnerOutput(
                 req_ids=[requests[0].request_id],
                 req_id_to_index={requests[0].request_id: 0},
-                sampled_token_ids=[np.array([0])],
+                sampled_token_ids=[[0]],
                 logprobs=None,
                 prompt_logprobs_dict={},
                 pooler_output=[])
@@ -1281,7 +1271,7 @@ def test_schedule_concurrent_batches(self):
             model_runner_output = ModelRunnerOutput(
                 req_ids=[requests[1].request_id],
                 req_id_to_index={requests[1].request_id: 0},
-                sampled_token_ids=[np.array([0])],
+                sampled_token_ids=[[0]],
                 logprobs=None,
                 prompt_logprobs_dict={},
                 pooler_output=[])
@@ -1299,12 +1289,10 @@ def test_schedule_spec_decoding_stats(self):
         spec_tokens_list: List[List[List[int]]] = [[[1, 2, 3]], [[1, 2, 3]],
                                                    [[1, 2], [3]], [[1]], [[]],
                                                    [[1, 2, 3], [4, 5, 6]]]
-        output_tokens_list: List[List[List[int]]] = [
-            [np.array([1, 2, 3, 4])], [np.array([1, 5])],
-            [np.array([1, 2, 5]), np.array([3, 4])], [np.array([1, 2])],
-            [np.array([5])], [np.array([1, 2, 7]),
-                              np.array([4, 8])]
-        ]
+        output_tokens_list: List[List[List[int]]] = [[[1, 2, 3, 4]], [[1, 5]],
+                                                     [[1, 2, 5], [3, 4]],
+                                                     [[1, 2]], [[5]],
+                                                     [[1, 2, 7], [4, 8]]]
         expected_list: List[Tuple[int, int,
                                   int, List[int]]] = [(1, 3, 3, [1, 1, 1]),
                                                       (1, 3, 1, [1, 0, 0]),
@@ -1342,9 +1330,7 @@ def test_schedule_spec_decoding_stats(self):
             model_runner_output = ModelRunnerOutput(
                 req_ids=req_ids,
                 req_id_to_index=req_to_index,
-                sampled_token_ids=[
-                    np.array([0]) for _ in range(len(requests))
-                ],
+                sampled_token_ids=[[0] for _ in range(len(requests))],
                 logprobs=None,
                 prompt_logprobs_dict={},
                 pooler_output=[])