1 parent aa5053e commit 5f2cacd
vllm/v1/attention/backends/rocm_attn.py
@@ -83,6 +83,14 @@ def build_for_cudagraph_capture(
         # max_model_len will cause graph capture to be extremely
         # slow, so here we set it to 1.
         attn_metadata.seq_lens.fill_(1)
+
+        if envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION:
+            # Here we set the query start locs to 0. This is to
+            # cover up an invalid memory access in the prefix_prefill kernel
+            # that we run into during graph capture (#25985)
+            common_attn_metadata.query_start_loc.zero_()
+            common_attn_metadata.query_start_loc_cpu.zero_()
+
         return attn_metadata
 
     def build(self,
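
As a minimal illustration of what this workaround does to the metadata (a toy example with assumed shapes and values, not vLLM code), zeroing query_start_loc collapses every per-sequence query length to zero, which presumably keeps the prefix_prefill kernel from indexing past the buffers that exist at capture time:

import torch

# Toy example with assumed batch size and query lengths; not taken from vLLM.
batch_size = 4

# Typical prefill layout: cumulative query start offsets, here for queries of
# length 3 each, giving batch_size + 1 entries.
query_start_loc = torch.arange(0, 3 * (batch_size + 1), 3, dtype=torch.int32)
print(query_start_loc)  # tensor([ 0,  3,  6,  9, 12], dtype=torch.int32)

# The workaround applied during CUDA graph capture: all start offsets become 0,
# so each per-sequence query length (start_loc[i + 1] - start_loc[i]) is 0 and
# the kernel has nothing to read out of bounds.
query_start_loc.zero_()
print(query_start_loc)  # tensor([0, 0, 0, 0, 0], dtype=torch.int32)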