68 changes: 68 additions & 0 deletions tests/e2e/multicard/test_qwen_omni.py
@@ -0,0 +1,68 @@
import pytest
from vllm import LLM, SamplingParams

MODEL_NAME = "Qwen/Qwen3-Omni-30B-A3B-Thinking"


@pytest.fixture(scope="function")
def llm_engine():
    # Build a fresh engine per test: 4-way tensor parallel with expert parallel enabled.
    llm = LLM(
        model=MODEL_NAME,
        tensor_parallel_size=4,
        enable_expert_parallel=True,
        trust_remote_code=True,
        gpu_memory_utilization=0.90,
        max_model_len=20480,
    )
    return llm


def test_qwen_omni_multimodal_inputs(llm_engine):
    # Sampling parameters
    sampling_params = SamplingParams(temperature=0.7,
                                     top_p=0.8,
                                     repetition_penalty=1.05,
                                     max_tokens=512)

    # Multimodal inputs (OpenAI chat format): one image, one audio clip, one video.
    messages = [{
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cars.jpg"
                },
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav"
                },
            },
            {
                "type": "video_url",
                "video_url": {
                    "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
                },
            },
            {
                "type": "text",
                "text": "Analyze this audio, image, and video together."
            },
        ],
    }]

    outputs = llm_engine.chat(messages=messages,
                              sampling_params=sampling_params)

    assert outputs is not None, "Output should not be None"
    assert len(outputs) > 0, "Should return at least one output"

    output_text = outputs[0].outputs[0].text
    print(f"\n[Output] Model generated text:\n{'-' * 20}\n{output_text}\n{'-' * 20}")

    # Check whether the audio, image, and video content are correctly understood.
    assert len(output_text.strip()) > 0, "Generated text should not be empty"
    assert ("cough" in output_text.lower()
            and "mercedes" in output_text.lower()
            and "drawing" in output_text.lower())
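Note that with temperature=0.7 the response wording can vary between runs, so requiring the literal keywords "cough", "mercedes", and "drawing" all at once may be flaky. Below is a minimal sketch of a looser check that accepts any of a few plausible keywords per modality; the keyword lists are illustrative assumptions, not taken from the test above.

# Hypothetical helper (not part of the PR): pass as long as at least one
# expected keyword per modality appears in the generated text.
def assert_covers_all_modalities(output_text: str) -> None:
    text = output_text.lower()
    expected = {
        "audio": ["cough"],
        "image": ["mercedes", "car"],
        "video": ["draw", "drawing"],
    }
    for modality, keywords in expected.items():
        assert any(k in text for k in keywords), (
            f"{modality}: none of {keywords} found in the output")

Since the fixture sets tensor_parallel_size=4, the test targets a 4-card environment, consistent with its location under tests/e2e/multicard.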
41 changes: 21 additions & 20 deletions vllm_ascend/worker/model_runner_v1.py
@@ -1938,27 +1938,28 @@

def _build_attn_state(self, num_reqs, num_scheduled_tokens,
num_valid_tokens):
        ascend_config = get_ascend_config()

Check failure on line 1941 in vllm_ascend/worker/model_runner_v1.py (GitHub Actions / lint / pre-commit, Ruff F841):
vllm_ascend/worker/model_runner_v1.py:1941:9: F841 Local variable `ascend_config` is assigned to but never used
(The variable becomes unused because the only branch that reads ascend_config.ascend_scheduler_config.enabled is commented out below.)
-        if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
-            attn_state = AscendAttentionState.PrefillNoCache
-        # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
-        elif np.all(num_scheduled_tokens == 1):
-            attn_state = AscendAttentionState.DecodeOnly
-            if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
-                # SpecDecoding now supports seq_len=1 and seq_len=2
-                # In Prefilling Decoding Disaggregation scenario, SpecDecoding need to supports seq_len=1
-                attn_state = AscendAttentionState.SpecDecoding
-        # Speculative decoding.
-        elif np.all(num_valid_tokens == 1):
-            if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
-                attn_state = AscendAttentionState.SpecDecoding
-            else:
-                attn_state = AscendAttentionState.ChunkedPrefill
-        # splitfuse
-        elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
-            attn_state = AscendAttentionState.ChunkedPrefill
-        else:
-            attn_state = AscendAttentionState.PrefillCacheHit
+        attn_state = AscendAttentionState.ChunkedPrefill
+        # if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
+        #     attn_state = AscendAttentionState.PrefillNoCache
+        # # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
+        # elif np.all(num_scheduled_tokens == 1):
+        #     attn_state = AscendAttentionState.DecodeOnly
+        #     if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
+        #         # SpecDecoding now supports seq_len=1 and seq_len=2
+        #         # In Prefilling Decoding Disaggregation scenario, SpecDecoding need to supports seq_len=1
+        #         attn_state = AscendAttentionState.SpecDecoding
+        # # Speculative decoding.
+        # elif np.all(num_valid_tokens == 1):
+        #     if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
+        #         attn_state = AscendAttentionState.SpecDecoding
+        #     else:
+        #         attn_state = AscendAttentionState.ChunkedPrefill
+        # # splitfuse
+        # elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
+        #     attn_state = AscendAttentionState.ChunkedPrefill
+        # else:
+        #     attn_state = AscendAttentionState.PrefillCacheHit
Comment on lines +1942 to +1962
Contributor
critical

The logic for determining the attention state (attn_state) has been commented out and hardcoded to AscendAttentionState.ChunkedPrefill. This will force all attention computations to use the chunked prefill path, which is incorrect for decode, speculative decoding, and other states. This change will likely lead to incorrect behavior and performance issues. This seems like a temporary debugging change that should be reverted before merging.

Suggested change (restore the original attention-state selection):

        if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
            attn_state = AscendAttentionState.PrefillNoCache
        # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
        elif np.all(num_scheduled_tokens == 1):
            attn_state = AscendAttentionState.DecodeOnly
            if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
                # SpecDecoding now supports seq_len=1 and seq_len=2
                # In Prefilling Decoding Disaggregation scenario, SpecDecoding need to supports seq_len=1
                attn_state = AscendAttentionState.SpecDecoding
        # Speculative decoding.
        elif np.all(num_valid_tokens == 1):
            if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
                attn_state = AscendAttentionState.SpecDecoding
            else:
                attn_state = AscendAttentionState.ChunkedPrefill
        # splitfuse
        elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
            attn_state = AscendAttentionState.ChunkedPrefill
        else:
            attn_state = AscendAttentionState.PrefillCacheHit

return attn_state

def _update_graph_pad_size(self, with_prefill, graph_pad_size):
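If a forced chunked-prefill path is still useful for local debugging, one option, purely a sketch and not part of this PR, is to gate the override behind an explicit opt-in so the normal state selection stays intact by default; the environment variable name below is hypothetical.

import os


def resolve_attn_state(selected_state, forced_state):
    # Hypothetical debug-only override (the env var name is invented for this
    # sketch): return the forced state only when explicitly requested, otherwise
    # keep the state picked by the regular selection logic so decode and
    # speculative-decoding paths keep working.
    if os.getenv("VLLM_ASCEND_DEBUG_FORCE_ATTN_STATE", "0") == "1":
        return forced_state
    return selected_state

With something like this, _build_attn_state could end with return resolve_attn_state(attn_state, AscendAttentionState.ChunkedPrefill) instead of hardcoding the value, though reverting to the original logic as suggested above remains the cleanest option before merging.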