66from torchair import patch_for_hcom
77from vllm .config import (CUDAGraphMode , VllmConfig ,
88 get_layers_from_vllm_config , set_current_vllm_config )
9- from vllm .forward_context import BatchDescriptor , get_forward_context
9+ from vllm .forward_context import get_forward_context
1010from vllm .model_executor .layers .attention_layer_base import AttentionLayerBase
1111from vllm .model_executor .model_loader import get_model_loader
1212from vllm .model_executor .model_loader .utils import \
@@ -343,12 +343,7 @@ def _propose_torchair(
343343 # torchair mode can reuse self.runner.num_tokens_across_dp
344344 num_tokens_across_dp = self .runner .num_tokens_across_dp
345345 with_prefill = self .runner .with_prefill
346-
347346 moe_comm_type = self .runner ._select_moe_comm_method (num_input_tokens )
348- batch_descriptor = BatchDescriptor (num_tokens = num_input_tokens ,
349- uniform_decode = False )
350- aclgraph_runtime_mode , batch_descriptor = \
351- self .runner .aclgraph_dispatcher .dispatch (batch_descriptor )
352347
353348 for step in range (self .num_speculative_tokens ):
354349 with set_ascend_forward_context (
@@ -359,7 +354,6 @@ def _propose_torchair(
359354 num_tokens_across_dp = num_tokens_across_dp ,
360355 reserved_mc2_mask = self .runner .reserved_mc2_mask ,
361356 moe_comm_type = moe_comm_type ,
362- aclgraph_runtime_mode = aclgraph_runtime_mode ,
363357 in_profile_run = self .runner .in_profile_run ,
364358 num_actual_tokens = num_tokens ):
365359 with ProfileExecuteDuration ().capture_async ('mtp_forward' ):
0 commit comments