
Commit 136ea9f

[refact] unified soc_version code (#4359)
### What this PR does / why we need it?

Currently there are two paths for judging the chip type in code: `get_ascend_soc_version` uses the `get_soc_version` API in torch_npu, while `is_310p` uses `_build_info.__soc_version__`, which is generated at install time. These two paths need to be unified, for the following reasons:

1. Chip-type judgment must be consistent between compile time and runtime.
2. At compile time we need the full chip type to compile the ops, but at runtime we only need the device type (910B/910_93/310P/910_95/etc.) to choose code branches.
3. At compile time torch_npu may not be installed yet, so we cannot rely on torch_npu APIs.

Based on the above points, this PR makes the following changes:

1. If the user sets the env `SOC_VERSION`, use it; otherwise query the SOC version via `npu-smi`.
2. At compile time, derive the device type from the SOC version and write `__device_type__` instead of `__soc_version__` into `_build_info.py`.
3. At runtime, use `__device_type__` for code-branch decisions.

### Does this PR introduce _any_ user-facing change?

When the env `SOC_VERSION` is not set, it no longer defaults to `ASCEND910B1`; the SOC version is queried via `npu-smi` instead. When `SOC_VERSION` is set, it must be one of the keys in the `soc_to_device` mapping in `setup.py`.

- vLLM version: v0.11.0
- vLLM main: vllm-project/vllm@2918c1b

Signed-off-by: zzzzwwjj <1183291235@qq.com>
1 parent a91e76c commit 136ea9f


42 files changed, +361 −243 lines
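To make the build-time / runtime split described in the commit message concrete, here is a minimal sketch of the runtime half. It is not part of this diff: the enum member names and the `get_ascend_device_type()` signature are inferred from how the tests below patch them, and the `UNKNOWN` fallback plus the `is_a3_cluster` helper are hypothetical; the real implementation lives in `vllm_ascend/utils.py` and may differ.

```python
# Illustrative sketch only -- not the code from this commit.
# Assumes: _build_info.py is generated by setup.py with __device_type__, and
# AscendDeviceType / get_ascend_device_type exist in vllm_ascend.utils (as the
# patched tests below suggest). The UNKNOWN fallback is an assumption.
from enum import Enum


class AscendDeviceType(Enum):
    _910B = "_910B"
    _910_93 = "_910_93"
    _310P = "_310P"
    UNKNOWN = "unknown"


def get_ascend_device_type() -> AscendDeviceType:
    """Read the device type baked into _build_info.py at build time."""
    try:
        from vllm_ascend._build_info import __device_type__  # generated by setup.py
    except ImportError:
        return AscendDeviceType.UNKNOWN
    return AscendDeviceType(__device_type__)


def is_a3_cluster() -> bool:
    """Hypothetical helper showing the branch pattern used at the updated call sites."""
    return get_ascend_device_type() == AscendDeviceType._910_93
```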

examples/disaggregated_prefill_v1/gen_ranktable.py

Lines changed: 4 additions & 5 deletions
```diff
@@ -4,7 +4,7 @@

 import torch.distributed as dist

-from vllm_ascend.utils import AscendSocVersion, init_ascend_soc_version, get_ascend_soc_version
+from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type

 parser = argparse.ArgumentParser(
     description="Arguments of rank table generator", )
@@ -42,8 +42,7 @@
 # and is different from WORLD_SIZE in gen_rank_table.sh.
 world_size = os.environ.get("WORLD_SIZE")

-init_ascend_soc_version()
-soc_info = get_ascend_soc_version()
+device_type = get_ascend_device_type()


 def get_cmd_stdout(cmd):
@@ -83,7 +82,7 @@ def get_cmd_stdout(cmd):
         device_id = local_device_ids[idx]
         chip_id = device_id % chips_per_card
         card_id = device_id // chips_per_card
-        if soc_info == AscendSocVersion.A3:
+        if device_type == AscendDeviceType._910_93:
             device_ip = get_cmd_stdout(
                 f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr"
             ).split(":")[1].strip()
@@ -103,7 +102,7 @@ def get_cmd_stdout(cmd):
             "device_id": str(device_id),
             "device_ip": str(device_ip),
         }
-        if soc_info == AscendSocVersion.A3:
+        if device_type == AscendDeviceType._910_93:
            device_info.update({
                "super_pod_id": str(super_pod_id),
                "super_device_id": str(super_device_id)
```
setup.py

Lines changed: 83 additions & 5 deletions
```diff
@@ -65,25 +65,103 @@ def check_or_set_default_env(cmake_args,
     return cmake_args


+def get_value_from_lines(lines: List[str], key: str) -> str:
+    for line in lines:
+        line = ' '.join(line.split())
+        if key in line:
+            return line.split(':')[-1].strip()
+    return ""
+
+
+def get_chip_info() -> str:
+    try:
+        npu_info_lines = subprocess.check_output(
+            ['npu-smi', 'info', '-l']).decode().strip().split('\n')
+        npu_id = int(get_value_from_lines(npu_info_lines, 'NPU ID'))
+        chip_info_lines = subprocess.check_output(
+            ['npu-smi', 'info', '-t', 'board', '-i',
+             str(npu_id), '-c', '0']).decode().strip().split('\n')
+        chip_name = get_value_from_lines(chip_info_lines, 'Chip Name')
+        chip_type = get_value_from_lines(chip_info_lines, 'Chip Type')
+        npu_name = get_value_from_lines(chip_info_lines, 'NPU Name')
+
+        if "310" in chip_name:
+            # 310P case
+            assert chip_type
+            return (chip_type + chip_name).lower()
+        elif "910" in chip_name:
+            if chip_type:
+                # A2 case
+                assert not npu_name
+                return (chip_type + chip_name).lower()
+            else:
+                # A3 case
+                assert npu_name
+                return (chip_name + '_' + npu_name).lower()
+        else:
+            # TODO(zzzzwwjj): Currently, A5's chip name has not determined yet.
+            raise ValueError(
+                f"Unable to recognize chip name: {chip_name}, please manually set env SOC_VERSION"
+            )
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError(f"Get chip info failed: {e}")
+    except FileNotFoundError:
+        # cpu envir, release code case, return `ascend910b1` by default
+        return "ascend910b1"
+
+
 envs = load_module_from_path("envs",
                              os.path.join(ROOT_DIR, "vllm_ascend", "envs.py"))

+soc_version = get_chip_info()
+
+if not envs.SOC_VERSION:
+    envs.SOC_VERSION = soc_version
+else:
+    if envs.SOC_VERSION != soc_version:
+        logging.warning(
+            f"env SOC_VERSION: {envs.SOC_VERSION} is not equal to soc_version from npu-smi: {soc_version}"
+        )
+

 def gen_build_info():
     soc_version = envs.SOC_VERSION
-    if not soc_version:
-        raise ValueError(
-            "SOC version is not set. Please set SOC_VERSION environment variable."
-        )
     if "310" in soc_version and not envs.COMPILE_CUSTOM_KERNELS:
         raise ValueError(
             "SOC version 310 only supports custom kernels. Please set COMPILE_CUSTOM_KERNELS=1 to enable custom kernels."
         )

+    # TODO(zzzzwwjj): Add A5 case
+    soc_to_device = {
+        "ascend910b1": "_910B",
+        "ascend910b2": "_910B",
+        "ascend910b2c": "_910B",
+        "ascend910b3": "_910B",
+        "ascend910b4": "_910B",
+        "ascend910b4-1": "_910B",
+        "ascend910_9391": "_910_93",
+        "ascend910_9381": "_910_93",
+        "ascend910_9372": "_910_93",
+        "ascend910_9392": "_910_93",
+        "ascend910_9382": "_910_93",
+        "ascend910_9362": "_910_93",
+        "ascend310p1": "_310P",
+        "ascend310p3": "_310P",
+        "ascend310p5": "_310P",
+        "ascend310p7": "_310P",
+        "ascend310p3vir01": "_310P",
+        "ascend310p3vir02": "_310P",
+        "ascend310p3vir04": "_310P",
+        "ascend310p3vir08": "_310P",
+    }
+
+    assert soc_version in soc_to_device, f"Undefined soc_version: {soc_version}. Please file an issue to vllm-ascend."
+    device_type = soc_to_device[soc_version]
+
     package_dir = os.path.join(ROOT_DIR, "vllm_ascend", "_build_info.py")
     with open(package_dir, "w+") as f:
         f.write('# Auto-generated file\n')
-        f.write(f"__soc_version__ = '{soc_version}'\n")
+        f.write(f"__device_type__ = '{device_type}'\n")
         f.write(f"__sleep_mode_enabled__ = {envs.COMPILE_CUSTOM_KERNELS}\n")
     logging.info(f"Generated _build_info.py with SOC version: {soc_version}")
```
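As a quick illustration of the `npu-smi` parsing added above, the standalone snippet below runs the same `get_value_from_lines` logic against made-up board output. The sample lines and values are assumptions for demonstration only, not real `npu-smi` output.

```python
# Illustration of the get_value_from_lines() helper from the setup.py hunk above.
# The sample lines below are fabricated; real `npu-smi info -t board` output may differ.
from typing import List


def get_value_from_lines(lines: List[str], key: str) -> str:
    # Same logic as the helper added in setup.py: normalize whitespace,
    # then take the text after the last ':' on the first matching line.
    for line in lines:
        line = ' '.join(line.split())
        if key in line:
            return line.split(':')[-1].strip()
    return ""


sample = [
    "        NPU ID                         : 0",
    "        Chip Name                      : 910B3",
    "        Chip Type                      : Ascend",
]
print(get_value_from_lines(sample, "NPU ID"))     # -> "0"
print(get_value_from_lines(sample, "Chip Name"))  # -> "910B3"
# For an A2-style board this would yield soc_version "ascend910b3",
# which gen_build_info() then maps to device_type "_910B".
```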

tests/ut/attention/test_attention_v1.py

Lines changed: 24 additions & 16 deletions
```diff
@@ -9,6 +9,7 @@
                                                 AscendAttentionMetadataBuilder,
                                                 AscendAttentionState)
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
+from vllm_ascend.utils import AscendDeviceType


 class TestAscendAttentionBackend(TestBase):
@@ -24,14 +25,15 @@ def test_get_builder_cls(self):
         self.assertEqual(AscendAttentionBackend.get_builder_cls(),
                          AscendAttentionMetadataBuilder)

-    @patch('vllm_ascend.attention.attention_v1.is_310p')
-    def test_get_kv_cache_shape_310p(self, mock_is_310p):
-        mock_is_310p.return_value = True
+    @patch('vllm_ascend.attention.attention_v1.get_ascend_device_type',
+           return_value=AscendDeviceType._310P)
+    def test_get_kv_cache_shape_310p(self, mock_soc_version):
         result = AscendAttentionBackend.get_kv_cache_shape(10, 20, 30, 40)
         self.assertEqual(result, (2, 10, 30 * 40 // 16, 20, 16))

-    @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False)
-    def test_get_kv_cache_shape_not_310p(self, mock_is_310p):
+    @patch('vllm_ascend.utils.get_ascend_device_type',
+           return_value=AscendDeviceType._910_93)
+    def test_get_kv_cache_shape_not_310p(self, mock_soc_version):
         result = AscendAttentionBackend.get_kv_cache_shape(10, 20, 30, 40)
         self.assertEqual(result, (2, 10, 20, 30, 40))

@@ -96,8 +98,9 @@ def test_reorder_batch(self):
     @patch('vllm_ascend.attention.attention_v1.AscendMetadata')
     @patch('torch_npu.npu_format_cast')
     @patch('vllm_ascend.utils.nd_to_nz_2d')
-    @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=True)
-    def test_build_prefill_no_cache(self, mock_is_310p, mock_nd_to_nz_2d,
+    @patch('vllm_ascend.utils.get_ascend_device_type',
+           return_value=AscendDeviceType._310P)
+    def test_build_prefill_no_cache(self, mock_soc_version, mock_nd_to_nz_2d,
                                     mock_npu_format_cast,
                                     mock_ascend_metadata):
         common_attn_metadata = AscendCommonAttentionMetadata(
@@ -128,10 +131,11 @@ def test_build_prefill_no_cache(self, mock_is_310p, mock_nd_to_nz_2d,
     @patch('vllm_ascend.attention.attention_v1.AscendMetadata')
     @patch('torch_npu.npu_format_cast')
     @patch('vllm_ascend.utils.nd_to_nz_spec')
-    @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=True)
+    @patch('vllm_ascend.utils.get_ascend_device_type',
+           return_value=AscendDeviceType._310P)
     @patch('vllm_ascend.attention.attention_v1.AscendAttentionState')
     def test_build_chunked_prefill(self, mock_ascend_attention_state,
-                                   mock_is_310p, mock_nd_to_nz_spec,
+                                   mock_soc_version, mock_nd_to_nz_spec,
                                    mock_npu_format_cast, mock_ascend_metadata):
         common_attn_metadata = AscendCommonAttentionMetadata(
             query_start_loc=torch.tensor([0, 2, 5, 9]),
@@ -162,8 +166,9 @@ def test_build_chunked_prefill(self, mock_ascend_attention_state,
         self.builder.build(1, common_attn_metadata, mock_model)

     @patch('vllm_ascend.attention.attention_v1.AscendMetadata')
-    @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False)
-    def test_build_non_310p(self, mock_is_310p, mock_ascend_metadata):
+    @patch('vllm_ascend.utils.get_ascend_device_type',
+           return_value=AscendDeviceType._910_93)
+    def test_build_non_310p(self, mock_soc_version, mock_ascend_metadata):
         common_attn_metadata = AscendCommonAttentionMetadata(
             query_start_loc=torch.tensor([0, 2, 5, 9]),
             query_start_loc_cpu=torch.tensor([0, 2, 5, 9]),
@@ -450,12 +455,13 @@ def test_forward_decode_only_swa_seq_len_mismatch(
         assert output.shape == (10, 8 * 64)

     @patch('vllm_ascend.attention.attention_v1.get_forward_context')
-    @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False)
+    @patch('vllm_ascend.utils.get_ascend_device_type',
+           return_value=AscendDeviceType._910_93)
     @patch('torch_npu._npu_reshape_and_cache')
     @patch('vllm_ascend.attention.attention_v1.vanilla_chunked_prefill')
     def test_forward_head_size_192(self, mock_vanilla_prefill,
-                                   mock_npu_reshape_and_cache, mock_is_310p,
-                                   mock_get_forward_context):
+                                   mock_npu_reshape_and_cache,
+                                   mock_soc_version, mock_get_forward_context):
         """Test forward pass when head_size is 192"""

         self.impl.head_size = 192
@@ -522,9 +528,11 @@ def test_forward_normal_v1_situation(self, mock_npu_reshape_and_cache,
     @patch('torch_npu.npu_format_cast')
     @patch('torch_npu._npu_reshape_and_cache')
     @patch('torch_npu.npu_fused_infer_attention_score')
-    @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=True)
+    @patch('vllm_ascend.utils.get_ascend_device_type',
+           return_value=AscendDeviceType._310P)
     @patch('vllm_ascend.attention.attention_v1.get_forward_context')
-    def test_forward_310p_device(self, mock_get_forward_context, mock_is_310p,
+    def test_forward_310p_device(self, mock_get_forward_context,
+                                 mock_soc_version,
                                  mock_npu_fused_infer_attention_score,
                                  mock_npu_reshape_and_cache,
                                  mock_npu_format_cast):
```

tests/ut/models/conftest.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -92,7 +92,7 @@ def mock_distributed():

     with patch("vllm_ascend.ops.fused_moe.fused_moe.get_current_vllm_config", return_value=mock_vllm_config), \
          patch("vllm_ascend.ops.fused_moe.token_dispatcher.torch.distributed.get_rank", return_value=0), \
-         patch("vllm_ascend.ops.fused_moe.token_dispatcher.get_ascend_soc_version", return_value=None), \
+         patch("vllm_ascend.ops.fused_moe.token_dispatcher.get_ascend_device_type", return_value=None), \
          patch.dict("vllm.distributed.parallel_state.__dict__", _TP=tp_group, _EP=ep_group, _DP=dp_group,
                     _PP=pp_group), \
          patch.dict("vllm_ascend.distributed.parallel_state.__dict__", _MC2=ep_group), \
```

tests/ut/ops/test_activation.py

Lines changed: 8 additions & 4 deletions
```diff
@@ -19,6 +19,8 @@
 import torch
 from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul

+from vllm_ascend.utils import AscendDeviceType
+

 @pytest.fixture
 def dummy_tensor():
@@ -36,20 +38,22 @@ def test_QuickGELU_forward(mock_gelu, dummy_tensor):
     mock_gelu.assert_called_once()


-@pytest.mark.parametrize("is_310p_return", [True, False])
+@pytest.mark.parametrize("is_310p", [True, False])
 @patch("torch_npu.npu_swiglu", side_effect=lambda x: x + 1)
 @patch("torch.ops.vllm.maybe_wait_prefetch_done", side_effect=lambda x: None)
 @patch("torch.ops.vllm.maybe_prefetch_mlp_down_proj",
        side_effect=lambda x: None)
 def test_SiluAndMul_forward(mock_maybe_prefetch_mlp_down_proj,
                             mock_maybe_wait_prefetch_done, mock_swiglu,
-                            is_310p_return, dummy_tensor):
+                            is_310p, dummy_tensor):

-    with patch("vllm_ascend.utils.is_310p", return_value=is_310p_return):
+    with patch("vllm_ascend.utils.get_ascend_device_type",
+               return_value=AscendDeviceType._310P
+               if is_310p else AscendDeviceType._910_93):
         layer = SiluAndMul()
         out = layer.forward(dummy_tensor)

-    if is_310p_return:
+    if is_310p:
         expected_arg = dummy_tensor.to(torch.float32)
     else:
         expected_arg = dummy_tensor
```

tests/ut/ops/test_fused_moe.py

Lines changed: 11 additions & 16 deletions
```diff
@@ -29,7 +29,7 @@
                                              AscendFusedMoE, AscendUnquantizedFusedMoEMethod)
 from vllm_ascend.ops.fused_moe.moe_mlp import (cumsum_group_list,
                                                unified_apply_mlp)
-from vllm_ascend.utils import AscendSocVersion, adapt_patch
+from vllm_ascend.utils import AscendDeviceType, adapt_patch

 adapt_patch(True)

@@ -129,7 +129,7 @@ def mock_finalize(hidden_states, **kwargs):
              return_value=mock_forward_context_obj), \
         patch('vllm_ascend.ops.fused_moe.prepare_finalize.get_forward_context',
              return_value=mock_forward_context_obj), \
-        patch("vllm_ascend.utils.get_ascend_soc_version", return_value=AscendSocVersion.A3), \
+        patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType._910_93), \
         patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context',
              return_value=mock_forward_context_obj), \
         patch('vllm_ascend.ops.fused_moe.moe_comm_method.MC2CommImpl._get_token_dispatcher',
@@ -323,22 +323,21 @@ def test_cumsum_group_list_with_type_2(self):
 class TestUnifiedApplyMLP(TestBase):

     @patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context')
-    @patch('vllm_ascend.ops.fused_moe.moe_mlp.is_310p')
+    @patch('vllm_ascend.utils.get_ascend_device_type',
+           return_value=AscendDeviceType._910_93)
     @patch('torch_npu.npu_grouped_matmul')
     @patch('torch_npu.npu_dynamic_quant')
     @patch('torch_npu.npu_dequant_swiglu_quant')
     def test_unified_apply_mlp_with_quantization_mc2(self, mock_npu_dequant,
                                                      mock_npu_dynamic_quant,
                                                      mock_npu_grouped_matmul,
-                                                     mock_is_310p,
+                                                     mock_soc_version,
                                                      mock_get_forward_context):

         mock_forward_context = MagicMock()
         mock_forward_context.moe_comm_type = MoECommType.MC2
         mock_get_forward_context.return_value = mock_forward_context

-        mock_is_310p.return_value = False
-
         mock_npu_dynamic_quant.return_value = (torch.randint(-128,
                                                              127, (10, 20),
                                                              dtype=torch.int8),
@@ -387,17 +386,16 @@ def test_unified_apply_mlp_with_quantization_mc2(self, mock_npu_dequant,

         self.assertEqual(result.dtype, torch.bfloat16)

-    @patch('vllm_ascend.ops.fused_moe.moe_mlp.is_310p')
+    @patch('vllm_ascend.utils.get_ascend_device_type',
+           return_value=AscendDeviceType._910_93)
     @patch('torch_npu.npu_grouped_matmul')
     @patch('torch_npu.npu_swiglu')
     @patch('torch_npu.npu_dynamic_quant')
     def test_unified_apply_mlp_without_quantization(self,
                                                     mock_npu_dynamic_quant,
                                                     mock_npu_swiglu,
                                                     mock_npu_grouped_matmul,
-                                                    mock_is_310p):
-        mock_is_310p.return_value = False
-
+                                                    mock_soc_version):
         mock_npu_grouped_matmul.side_effect = [[
             torch.randn(10, 40, dtype=torch.float16)
         ], [torch.randn(10, 20, dtype=torch.float16)]]
@@ -490,15 +488,14 @@ def test_unified_apply_mlp_with_quantization_and_dynamic_scale(
         self.assertEqual(result.shape, hidden_states_shape)
         self.assertEqual(result.dtype, torch.bfloat16)

-    @patch('vllm_ascend.ops.fused_moe.moe_mlp.is_310p')
+    @patch('vllm_ascend.utils.get_ascend_device_type',
+           return_value=AscendDeviceType._310P)
     @patch('torch_npu.npu_grouped_matmul')
     @patch('torch_npu.npu_swiglu')
     @patch('torch_npu.npu_dynamic_quant')
     def test_unified_apply_mlp_without_quantization_310p(
             self, mock_npu_dynamic_quant, mock_npu_swiglu,
-            mock_npu_grouped_matmul, mock_is_310p):
-        mock_is_310p.return_value = True
-
+            mock_npu_grouped_matmul, mock_soc_version):
         mock_gmm1_out = torch.randn(10, 40, dtype=torch.float16)
         mock_gmm2_out = torch.randn(10, 20, dtype=torch.float16)
         mock_npu_grouped_matmul.side_effect = [[mock_gmm1_out],
@@ -527,8 +524,6 @@ def test_unified_apply_mlp_without_quantization_310p(
                                       topk_scales=topk_scales,
                                       with_quant=False)

-        mock_is_310p.assert_called_once()
-
         self.assertEqual(mock_npu_grouped_matmul.call_count, 2)
         mock_npu_swiglu.assert_called_once()
```
