From 5bda65841375fa36d4342308e130cfb894f71b3e Mon Sep 17 00:00:00 2001
From: Meihan-chen
Date: Mon, 1 Dec 2025 10:46:38 +0800
Subject: [PATCH 1/3] qwen3-omni online inference server bugfix

Signed-off-by: Meihan-chen
---
 vllm_ascend/worker/model_runner_v1.py | 43 ++++++++++++++-------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index ff55d1d1897..cf90584b281 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -273,7 +273,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
 
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.vllm_config = vllm_config
-        self.model_config = vllm_config.model_config
+        self.model_config = vllm_config.model_configi
         self.cache_config = vllm_config.cache_config
         self.compilation_config = vllm_config.compilation_config
         self.load_config = vllm_config.load_config
@@ -1943,26 +1943,27 @@ def _generate_process_reqs_hidden_states(self, attn_metadata, with_prefill,
     def _build_attn_state(self, num_reqs, num_scheduled_tokens,
                           num_valid_tokens):
         ascend_config = get_ascend_config()
-        if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
-            attn_state = AscendAttentionState.PrefillNoCache
-        # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
-        elif np.all(num_scheduled_tokens == 1):
-            attn_state = AscendAttentionState.DecodeOnly
-            if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
-                # SpecDecoding now supports seq_len=1 and seq_len=2
-                # In Prefilling Decoding Disaggregation scenario, SpecDecoding need to supports seq_len=1
-                attn_state = AscendAttentionState.SpecDecoding
-        # Speculative decoding.
-        elif np.all(num_valid_tokens == 1):
-            if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
-                attn_state = AscendAttentionState.SpecDecoding
-            else:
-                attn_state = AscendAttentionState.ChunkedPrefill
-        # splitfuse
-        elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
-            attn_state = AscendAttentionState.ChunkedPrefill
-        else:
-            attn_state = AscendAttentionState.PrefillCacheHit
+        attn_state = AscendAttentionState.ChunkedPrefill
+        # if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
+        #     attn_state = AscendAttentionState.PrefillNoCache
+        # # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
+        # elif np.all(num_scheduled_tokens == 1):
+        #     attn_state = AscendAttentionState.DecodeOnly
+        #     if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
+        #         # SpecDecoding now supports seq_len=1 and seq_len=2
+        #         # In Prefilling Decoding Disaggregation scenario, SpecDecoding need to supports seq_len=1
+        #         attn_state = AscendAttentionState.SpecDecoding
+        # # Speculative decoding.
+        # elif np.all(num_valid_tokens == 1):
+        #     if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
+        #         attn_state = AscendAttentionState.SpecDecoding
+        #     else:
+        #         attn_state = AscendAttentionState.ChunkedPrefill
+        # # splitfuse
+        # elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
+        #     attn_state = AscendAttentionState.ChunkedPrefill
+        # else:
+        #     attn_state = AscendAttentionState.PrefillCacheHit
         return attn_state
 
     def _update_graph_pad_size(self, with_prefill, graph_pad_size):

From 0a07c6105510f0df5c533540dba362492f2f0f1b Mon Sep 17 00:00:00 2001
From: Meihan-chen
Date: Mon, 1 Dec 2025 10:47:07 +0800
Subject: [PATCH 2/3] qwen3-omni online inference server bugfix

Signed-off-by: Meihan-chen
---
 vllm_ascend/worker/model_runner_v1.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index cf90584b281..95ffb994fc4 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -273,7 +273,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
 
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.vllm_config = vllm_config
-        self.model_config = vllm_config.model_configi
+        self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
         self.compilation_config = vllm_config.compilation_config
         self.load_config = vllm_config.load_config

From c11a23bccc095f3a15050756a85af496153d2a7f Mon Sep 17 00:00:00 2001
From: leo-pony
Date: Mon, 1 Dec 2025 21:04:19 +0800
Subject: [PATCH 3/3] Add Qwen3-Omni-30B-A3B test cases

Signed-off-by: leo-pony
---
 tests/e2e/multicard/test_qwen_omni.py | 68 +++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 tests/e2e/multicard/test_qwen_omni.py

diff --git a/tests/e2e/multicard/test_qwen_omni.py b/tests/e2e/multicard/test_qwen_omni.py
new file mode 100644
index 00000000000..e4daf10f14a
--- /dev/null
+++ b/tests/e2e/multicard/test_qwen_omni.py
@@ -0,0 +1,68 @@
+import pytest
+from vllm import LLM, SamplingParams
+
+MODEL_NAME = "Qwen/Qwen3-Omni-30B-A3B-Thinking"
+
+
+@pytest.fixture(scope="function")
+def llm_engine():
+    llm = LLM(
+        model=MODEL_NAME,
+        tensor_parallel_size=4,
+        enable_expert_parallel=True,
+        trust_remote_code=True,
+        gpu_memory_utilization=0.90,
+        max_model_len=20480,
+    )
+    return llm
+
+
+def test_qwen_omni_multimodal_inputs(llm_engine):
+    # Sampling parameters
+    sampling_params = SamplingParams(temperature=0.7,
+                                     top_p=0.8,
+                                     repetition_penalty=1.05,
+                                     max_tokens=512)
+
+    # Multimodal inputs (OpenAI chat format)
+    messages = [{
+        "role":
+        "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url":
+                "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cars.jpg"
+            },
+        }, {
+            "type": "audio_url",
+            "audio_url": {
+                "url":
+                "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav"
+            }
+        }, {
+            "type": "video_url",
+            "video_url": {
+                "url":
+                "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
+            }
+        }, {
+            "type": "text",
+            "text": "Analyze this audio, image, and video together."
+        }]
+    }]
+
+    outputs = llm_engine.chat(messages=messages,
+                              sampling_params=sampling_params)
+
+    assert outputs is not None, "Output should not be None"
+    assert len(outputs) > 0, "Should return at least one output"
+
+    output_text = outputs[0].outputs[0].text
+    print(
+        f"\n[Output] Model generated text:\n{'-'*20}\n{output_text}\n{'-'*20}")
+
+    # Check whether the audio, image, and video content are correctly understood.
+    assert len(output_text.strip()) > 0, "Generated text should not be empty"
+    assert "cough" in output_text.lower() and "mercedes" in output_text.lower(
+    ) and "drawing" in output_text.lower()
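
Testing note (not part of the patch series above): the sketch below is one assumed way to launch the new multicard e2e case locally through pytest's programmatic entry point. The test path comes from PATCH 3/3; the need for 4 devices follows from tensor_parallel_size=4 in the fixture, and running it this way rather than via the project's CI scripts is an assumption for illustration only.

    # Hypothetical local runner for tests/e2e/multicard/test_qwen_omni.py.
    # pytest.main() returns a non-zero exit code when the multimodal assertions fail;
    # "-s" disables output capturing so the printed model output stays visible.
    import sys

    import pytest

    if __name__ == "__main__":
        sys.exit(pytest.main(["-s", "tests/e2e/multicard/test_qwen_omni.py"]))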