1 parent aa5053e commit 5f2cacd
vllm/v1/attention/backends/rocm_attn.py
@@ -83,6 +83,14 @@ def build_for_cudagraph_capture(
         # max_model_len will cause graph capture to be extremely
         # slow, so here we set it to 1.
         attn_metadata.seq_lens.fill_(1)
+
+        if envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION:
+            # Here we set the query start locs to 0. This is to
+            # cover up an invalid memory access in the prefix_prefill kernel
+            # that we run into during graph capture (#25985)
+            common_attn_metadata.query_start_loc.zero_()
+            common_attn_metadata.query_start_loc_cpu.zero_()
+
         return attn_metadata
 
     def build(self,
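
As a minimal illustration of what this workaround does to the metadata (a toy example with assumed shapes and values, not vLLM code), zeroing query_start_loc collapses every per-sequence query length to zero, which presumably keeps the prefix_prefill kernel from indexing past the buffers that exist at capture time:

import torch

# Toy example with assumed batch size and query lengths; not taken from vLLM.
batch_size = 4

# Typical prefill layout: cumulative query start offsets, here for queries of
# length 3 each, giving batch_size + 1 entries.
query_start_loc = torch.arange(0, 3 * (batch_size + 1), 3, dtype=torch.int32)
print(query_start_loc)  # tensor([ 0,  3,  6,  9, 12], dtype=torch.int32)

# The workaround applied during CUDA graph capture: all start offsets become 0,
# so each per-sequence query length (start_loc[i + 1] - start_loc[i]) is 0 and
# the kernel has nothing to read out of bounds.
query_start_loc.zero_()
print(query_start_loc)  # tensor([0, 0, 0, 0, 0], dtype=torch.int32)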