Skip to content

Commit 7e0b1d1

Browse files
committed
qwen3-omni online reference server bugfix
Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com>
1 parent 84d7f5a commit 7e0b1d1

File tree

1 file changed

+22
-21
lines changed

1 file changed

+22
-21
lines changed

vllm_ascend/worker/model_runner_v1.py

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
273273

274274
def __init__(self, vllm_config: VllmConfig, device: torch.device):
275275
self.vllm_config = vllm_config
276-
self.model_config = vllm_config.model_config
276+
self.model_config = vllm_config.model_config
277277
self.cache_config = vllm_config.cache_config
278278
self.compilation_config = vllm_config.compilation_config
279279
self.load_config = vllm_config.load_config
@@ -1943,26 +1943,27 @@ def _generate_process_reqs_hidden_states(self, attn_metadata, with_prefill,
19431943
def _build_attn_state(self, num_reqs, num_scheduled_tokens,
19441944
num_valid_tokens):
19451945
ascend_config = get_ascend_config()
1946-
if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
1947-
attn_state = AscendAttentionState.PrefillNoCache
1948-
# We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
1949-
elif np.all(num_scheduled_tokens == 1):
1950-
attn_state = AscendAttentionState.DecodeOnly
1951-
if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
1952-
# SpecDecoding now supports seq_len=1 and seq_len=2
1953-
# In Prefilling Decoding Disaggregation scenario, SpecDecoding needs to support seq_len=1
1954-
attn_state = AscendAttentionState.SpecDecoding
1955-
# Speculative decoding.
1956-
elif np.all(num_valid_tokens == 1):
1957-
if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
1958-
attn_state = AscendAttentionState.SpecDecoding
1959-
else:
1960-
attn_state = AscendAttentionState.ChunkedPrefill
1961-
# splitfuse
1962-
elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
1963-
attn_state = AscendAttentionState.ChunkedPrefill
1964-
else:
1965-
attn_state = AscendAttentionState.PrefillCacheHit
1946+
attn_state = AscendAttentionState.ChunkedPrefill
1947+
# if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
1948+
# attn_state = AscendAttentionState.PrefillNoCache
1949+
# # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
1950+
# elif np.all(num_scheduled_tokens == 1):
1951+
# attn_state = AscendAttentionState.DecodeOnly
1952+
# if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
1953+
# # SpecDecoding now supports seq_len=1 and seq_len=2
1954+
# # In Prefilling Decoding Disaggregation scenario, SpecDecoding needs to support seq_len=1
1955+
# attn_state = AscendAttentionState.SpecDecoding
1956+
# # Speculative decoding.
1957+
# elif np.all(num_valid_tokens == 1):
1958+
# if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
1959+
# attn_state = AscendAttentionState.SpecDecoding
1960+
# else:
1961+
# attn_state = AscendAttentionState.ChunkedPrefill
1962+
# # splitfuse
1963+
# elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
1964+
# attn_state = AscendAttentionState.ChunkedPrefill
1965+
# else:
1966+
# attn_state = AscendAttentionState.PrefillCacheHit
19661967
return attn_state
19671968

19681969
def _update_graph_pad_size(self, with_prefill, graph_pad_size):

0 commit comments

Comments
 (0)