diff --git a/tests/e2e/multicard/test_qwen_omni.py b/tests/e2e/multicard/test_qwen_omni.py
new file mode 100644
index 00000000000..e4daf10f14a
--- /dev/null
+++ b/tests/e2e/multicard/test_qwen_omni.py
@@ -0,0 +1,68 @@
+import pytest
+from vllm import LLM, SamplingParams
+
+MODEL_NAME = "Qwen/Qwen3-Omni-30B-A3B-Thinking"
+
+
+@pytest.fixture(scope="function")
+def llm_engine():
+    llm = LLM(
+        model=MODEL_NAME,
+        tensor_parallel_size=4,
+        enable_expert_parallel=True,
+        trust_remote_code=True,
+        gpu_memory_utilization=0.90,
+        max_model_len=20480,
+    )
+    return llm
+
+
+def test_qwen_omni_multimodal_inputs(llm_engine):
+    # Sampling parameters
+    sampling_params = SamplingParams(temperature=0.7,
+                                     top_p=0.8,
+                                     repetition_penalty=1.05,
+                                     max_tokens=512)
+
+    # Multi-modal inputs (OpenAI Chat format)
+    messages = [{
+        "role":
+        "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url":
+                "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cars.jpg"
+            },
+        }, {
+            "type": "audio_url",
+            "audio_url": {
+                "url":
+                "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav"
+            }
+        }, {
+            "type": "video_url",
+            "video_url": {
+                "url":
+                "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
+            }
+        }, {
+            "type": "text",
+            "text": "Analyze this audio, image, and video together."
+        }]
+    }]
+
+    outputs = llm_engine.chat(messages=messages,
+                              sampling_params=sampling_params)
+
+    assert outputs is not None, "Output should not be None"
+    assert len(outputs) > 0, "Should return at least one output"
+
+    output_text = outputs[0].outputs[0].text
+    print(
+        f"\n[Output] Model generated text:\n{'-'*20}\n{output_text}\n{'-'*20}")
+
+    # Check whether the audio, image, and video content are correctly understood.
+    assert len(output_text.strip()) > 0, "Generated text should not be empty"
+    assert "cough" in output_text.lower() and "mercedes" in output_text.lower(
+    ) and "drawing" in output_text.lower()
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 1f46b9d40ab..80ee8cee7a7 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1939,26 +1939,27 @@ def _generate_process_reqs_hidden_states(self, attn_metadata, with_prefill,
     def _build_attn_state(self, num_reqs, num_scheduled_tokens,
                           num_valid_tokens):
         ascend_config = get_ascend_config()
-        if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
-            attn_state = AscendAttentionState.PrefillNoCache
-        # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
-        elif np.all(num_scheduled_tokens == 1):
-            attn_state = AscendAttentionState.DecodeOnly
-            if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
-                # SpecDecoding now supports seq_len=1 and seq_len=2
-                # In Prefilling Decoding Disaggregation scenario, SpecDecoding need to supports seq_len=1
-                attn_state = AscendAttentionState.SpecDecoding
-        # Speculative decoding.
-        elif np.all(num_valid_tokens == 1):
-            if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
-                attn_state = AscendAttentionState.SpecDecoding
-            else:
-                attn_state = AscendAttentionState.ChunkedPrefill
-        # splitfuse
-        elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
-            attn_state = AscendAttentionState.ChunkedPrefill
-        else:
-            attn_state = AscendAttentionState.PrefillCacheHit
+        attn_state = AscendAttentionState.ChunkedPrefill
+        # if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
+        #     attn_state = AscendAttentionState.PrefillNoCache
+        # # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
+        # elif np.all(num_scheduled_tokens == 1):
+        #     attn_state = AscendAttentionState.DecodeOnly
+        #     if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
+        #         # SpecDecoding now supports seq_len=1 and seq_len=2
+        #         # In Prefilling Decoding Disaggregation scenario, SpecDecoding need to supports seq_len=1
+        #         attn_state = AscendAttentionState.SpecDecoding
+        # # Speculative decoding.
+        # elif np.all(num_valid_tokens == 1):
+        #     if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
+        #         attn_state = AscendAttentionState.SpecDecoding
+        #     else:
+        #         attn_state = AscendAttentionState.ChunkedPrefill
+        # # splitfuse
+        # elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
+        #     attn_state = AscendAttentionState.ChunkedPrefill
+        # else:
+        #     attn_state = AscendAttentionState.PrefillCacheHit
         return attn_state
 
     def _update_graph_pad_size(self, with_prefill, graph_pad_size):