@@ -454,43 +454,6 @@ def test_forward_decode_only_swa_seq_len_mismatch(
 
         assert output.shape == (10, 8 * 64)
 
-    @patch('vllm_ascend.attention.attention_v1.get_forward_context')
-    @patch('vllm_ascend.utils.get_ascend_device_type',
-           return_value=AscendDeviceType._910_93)
-    @patch('torch_npu._npu_reshape_and_cache')
-    @patch('vllm_ascend.attention.attention_v1.vanilla_chunked_prefill')
-    def test_forward_head_size_192(self, mock_vanilla_prefill,
-                                   mock_npu_reshape_and_cache,
-                                   mock_soc_version, mock_get_forward_context):
-        """Test forward pass when head_size is 192"""
-
-        self.impl.head_size = 192
-        query = torch.randn(10, 8 * 192)
-        key = torch.randn(10, 8 * 192)
-        value = torch.randn(10, 8 * 192)
-        kv_cache = torch.empty(2, 5, 128, 8, 192)
-        output = torch.empty_like(query)
-
-        mock_get_forward_context.return_value = MagicMock(capturing=False)
-
-        metadata = self.attn_metadata
-        metadata.attn_mask = torch.randn(1, 1, 10, 10)
-        metadata.query_lens = torch.tensor([10])
-        metadata.seq_lens = torch.tensor([10])
-        metadata.block_tables = torch.zeros(1, 5, dtype=torch.long)
-        metadata.num_actual_tokens = 10
-        metadata.slot_mapping = torch.zeros(10, dtype=torch.long)
-        metadata.num_decodes = 10
-        metadata.num_prefills = 0
-        layer = self.layer_no_quant
-        mock_vanilla_prefill.return_value = MagicMock()
-
-        output = self.impl_192.forward(layer, query, key, value, kv_cache,
-                                       metadata, output)
-
-        mock_vanilla_prefill.assert_called_once()
-        assert output.shape == (10, 8 * 192)
-
     @patch('vllm_ascend.attention.attention_v1.get_forward_context')
     @patch('torch_npu.npu_fused_infer_attention_score')
     @patch('torch_npu._npu_reshape_and_cache')