From 5bda65841375fa36d4342308e130cfb894f71b3e Mon Sep 17 00:00:00 2001
From: Meihan-chen
Date: Mon, 1 Dec 2025 10:46:38 +0800
Subject: [PATCH 1/3] qwen3-omni online inference server bugfix

Signed-off-by: Meihan-chen
---
 vllm_ascend/worker/model_runner_v1.py | 43 ++++++++++++++-------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index ff55d1d1897..cf90584b281 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -273,7 +273,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
 
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.vllm_config = vllm_config
-        self.model_config = vllm_config.model_config
+        self.model_config = vllm_config.model_configi
         self.cache_config = vllm_config.cache_config
         self.compilation_config = vllm_config.compilation_config
         self.load_config = vllm_config.load_config
@@ -1943,26 +1943,27 @@ def _generate_process_reqs_hidden_states(self, attn_metadata, with_prefill,
     def _build_attn_state(self, num_reqs, num_scheduled_tokens,
                           num_valid_tokens):
         ascend_config = get_ascend_config()
-        if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
-            attn_state = AscendAttentionState.PrefillNoCache
-        # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
-        elif np.all(num_scheduled_tokens == 1):
-            attn_state = AscendAttentionState.DecodeOnly
-            if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
-                # SpecDecoding now supports seq_len=1 and seq_len=2
-                # In Prefilling Decoding Disaggregation scenario, SpecDecoding need to supports seq_len=1
-                attn_state = AscendAttentionState.SpecDecoding
-        # Speculative decoding.
-        elif np.all(num_valid_tokens == 1):
-            if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
-                attn_state = AscendAttentionState.SpecDecoding
-            else:
-                attn_state = AscendAttentionState.ChunkedPrefill
-        # splitfuse
-        elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
-            attn_state = AscendAttentionState.ChunkedPrefill
-        else:
-            attn_state = AscendAttentionState.PrefillCacheHit
+        attn_state = AscendAttentionState.ChunkedPrefill
+        # if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
+        #     attn_state = AscendAttentionState.PrefillNoCache
+        # # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
+        # elif np.all(num_scheduled_tokens == 1):
+        #     attn_state = AscendAttentionState.DecodeOnly
+        #     if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
+        #         # SpecDecoding now supports seq_len=1 and seq_len=2
+        #         # In Prefilling Decoding Disaggregation scenario, SpecDecoding need to supports seq_len=1
+        #         attn_state = AscendAttentionState.SpecDecoding
+        # # Speculative decoding.
+        # elif np.all(num_valid_tokens == 1):
+        #     if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
+        #         attn_state = AscendAttentionState.SpecDecoding
+        #     else:
+        #         attn_state = AscendAttentionState.ChunkedPrefill
+        # # splitfuse
+        # elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
+        #     attn_state = AscendAttentionState.ChunkedPrefill
+        # else:
+        #     attn_state = AscendAttentionState.PrefillCacheHit
         return attn_state
 
     def _update_graph_pad_size(self, with_prefill, graph_pad_size):

From 0a07c6105510f0df5c533540dba362492f2f0f1b Mon Sep 17 00:00:00 2001
From: Meihan-chen
Date: Mon, 1 Dec 2025 10:47:07 +0800
Subject: [PATCH 2/3] qwen3-omni online inference server bugfix

Signed-off-by: Meihan-chen
---
 vllm_ascend/worker/model_runner_v1.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index cf90584b281..95ffb994fc4 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -273,7 +273,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
 
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.vllm_config = vllm_config
-        self.model_config = vllm_config.model_configi
+        self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
         self.compilation_config = vllm_config.compilation_config
         self.load_config = vllm_config.load_config

From c11a23bccc095f3a15050756a85af496153d2a7f Mon Sep 17 00:00:00 2001
From: leo-pony
Date: Mon, 1 Dec 2025 21:04:19 +0800
Subject: [PATCH 3/3] Add Qwen3-Omni-30B-A3B test cases

Signed-off-by: leo-pony
---
 tests/e2e/multicard/test_qwen_omni.py | 68 +++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 tests/e2e/multicard/test_qwen_omni.py

diff --git a/tests/e2e/multicard/test_qwen_omni.py b/tests/e2e/multicard/test_qwen_omni.py
new file mode 100644
index 00000000000..e4daf10f14a
--- /dev/null
+++ b/tests/e2e/multicard/test_qwen_omni.py
@@ -0,0 +1,68 @@
+import pytest
+from vllm import LLM, SamplingParams
+
+MODEL_NAME = "Qwen/Qwen3-Omni-30B-A3B-Thinking"
+
+
+@pytest.fixture(scope="function")
+def llm_engine():
+    llm = LLM(
+        model=MODEL_NAME,
+        tensor_parallel_size=4,
+        enable_expert_parallel=True,
+        trust_remote_code=True,
+        gpu_memory_utilization=0.90,
+        max_model_len=20480,
+    )
+    return llm
+
+
+def test_qwen_omni_multimodal_inputs(llm_engine):
+    # Sampling parameters
+    sampling_params = SamplingParams(temperature=0.7,
+                                     top_p=0.8,
+                                     repetition_penalty=1.05,
+                                     max_tokens=512)
+
+    # Multimodal inputs (OpenAI chat format)
+    messages = [{
+        "role":
+        "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url":
+                "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cars.jpg"
+            },
+        }, {
+            "type": "audio_url",
+            "audio_url": {
+                "url":
+                "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav"
+            }
+        }, {
+            "type": "video_url",
+            "video_url": {
+                "url":
+                "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
+            }
+        }, {
+            "type": "text",
+            "text": "Analyze this audio, image, and video together."
+        }]
+    }]
+
+    outputs = llm_engine.chat(messages=messages,
+                              sampling_params=sampling_params)
+
+    assert outputs is not None, "Output should not be None"
+    assert len(outputs) > 0, "Should return at least one output"
+
+    output_text = outputs[0].outputs[0].text
+    print(
+        f"\n[Output] Model generated text:\n{'-'*20}\n{output_text}\n{'-'*20}")
+
+    # Check whether the audio, image, and video content are correctly understood.
+    assert len(output_text.strip()) > 0, "Generated text should not be empty"
+    assert "cough" in output_text.lower() and "mercedes" in output_text.lower(
+    ) and "drawing" in output_text.lower()
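
Testing note (not part of the patch series above): the sketch below is one assumed way to launch the new multicard e2e case locally through pytest's programmatic entry point. The test path comes from PATCH 3/3; the need for 4 devices follows from tensor_parallel_size=4 in the fixture, and running it this way rather than via the project's CI scripts is an assumption for illustration only.

    # Hypothetical local runner for tests/e2e/multicard/test_qwen_omni.py.
    # pytest.main() returns a non-zero exit code when the multimodal assertions fail;
    # "-s" disables output capturing so the printed model output stays visible.
    import sys

    import pytest

    if __name__ == "__main__":
        sys.exit(pytest.main(["-s", "tests/e2e/multicard/test_qwen_omni.py"]))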