@@ -273,7 +273,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
273273
274274 def __init__ (self , vllm_config : VllmConfig , device : torch .device ):
275275 self .vllm_config = vllm_config
276- self .model_config = vllm_config .model_config
276+ self .model_config = vllm_config .model_config
277277 self .cache_config = vllm_config .cache_config
278278 self .compilation_config = vllm_config .compilation_config
279279 self .load_config = vllm_config .load_config
@@ -1943,26 +1943,27 @@ def _generate_process_reqs_hidden_states(self, attn_metadata, with_prefill,
19431943 def _build_attn_state (self , num_reqs , num_scheduled_tokens ,
19441944 num_valid_tokens ):
19451945 ascend_config = get_ascend_config ()
1946- if np .array_equal (self .seq_lens_np [:num_reqs ], num_scheduled_tokens ):
1947- attn_state = AscendAttentionState .PrefillNoCache
1948- # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
1949- elif np .all (num_scheduled_tokens == 1 ):
1950- attn_state = AscendAttentionState .DecodeOnly
1951- if self .speculative_config and self .speculative_config .method == 'deepseek_mtp' :
1952- # SpecDecoding now supports seq_len=1 and seq_len=2
1953- # In Prefilling Decoding Disaggregation scenario, SpecDecoding need to supports seq_len=1
1954- attn_state = AscendAttentionState .SpecDecoding
1955- # Speculative decoding.
1956- elif np .all (num_valid_tokens == 1 ):
1957- if self .speculative_config and self .speculative_config .method == 'deepseek_mtp' :
1958- attn_state = AscendAttentionState .SpecDecoding
1959- else :
1960- attn_state = AscendAttentionState .ChunkedPrefill
1961- # splitfuse
1962- elif not ascend_config .ascend_scheduler_config .enabled or self .chunked_prefill_enabled :
1963- attn_state = AscendAttentionState .ChunkedPrefill
1964- else :
1965- attn_state = AscendAttentionState .PrefillCacheHit
1946+ attn_state = AscendAttentionState .ChunkedPrefill
1947+ # if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
1948+ # attn_state = AscendAttentionState.PrefillNoCache
1949+ # # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
1950+ # elif np.all(num_scheduled_tokens == 1):
1951+ # attn_state = AscendAttentionState.DecodeOnly
1952+ # if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
1953+ # # SpecDecoding now supports seq_len=1 and seq_len=2
1954+ # # In Prefilling Decoding Disaggregation scenario, SpecDecoding need to supports seq_len=1
1955+ # attn_state = AscendAttentionState.SpecDecoding
1956+ # # Speculative decoding.
1957+ # elif np.all(num_valid_tokens == 1):
1958+ # if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
1959+ # attn_state = AscendAttentionState.SpecDecoding
1960+ # else:
1961+ # attn_state = AscendAttentionState.ChunkedPrefill
1962+ # # splitfuse
1963+ # elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
1964+ # attn_state = AscendAttentionState.ChunkedPrefill
1965+ # else:
1966+ # attn_state = AscendAttentionState.PrefillCacheHit
19661967 return attn_state
19671968
19681969 def _update_graph_pad_size (self , with_prefill , graph_pad_size ):
0 commit comments