Merged
15 changes: 0 additions & 15 deletions vllm/config/vllm.py
@@ -483,21 +483,6 @@ def __post_init__(self):
                     "Overriding cudagraph_mode to PIECEWISE."
                 )
                 self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
-            elif (
-                current_platform.is_cuda()
-                and current_platform.is_device_capability(100)
-                and self.model_config.max_model_len > 131072
-                and not self.model_config.use_mla
-            ):
-                # Refer to vllm/utils/flashinfer.py::use_trtllm_attention()
-                logger.warning_once(
-                    "NVIDIA Blackwell TRTLLM attention cannot support "
-                    "max_model_len >= 131072 (found "
-                    f"{self.model_config.max_model_len}), causing dynamic "
-                    "dispatching that breaks full cudagraphs. "
-                    "Overriding cudagraph_mode to PIECEWISE."
-                )
-                self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
 
         # disable cudagraph when enforce eager execution
         if self.model_config is not None and self.model_config.enforce_eager:
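For context, a minimal standalone sketch of the condition removed above (illustrative only; the boolean parameters are stand-ins for the current_platform and model_config attributes, not a real vLLM API). With the 131072-token cap gone from use_trtllm_attention(), this guard no longer needs to exist, so a user-selected full cudagraph mode is kept on Blackwell long-context, non-MLA models.

# Illustrative stand-in for the removed __post_init__ guard (not vLLM code).
def forced_piecewise_before_this_change(
    is_cuda: bool, is_sm100: bool, max_model_len: int, use_mla: bool
) -> bool:
    # Previously: CUDA + Blackwell (device capability 100) + a context longer
    # than 131072 tokens + non-MLA attention downgraded cudagraph_mode to
    # PIECEWISE, because TRTLLM attention was capped at max_seq_len 131072.
    return is_cuda and is_sm100 and max_model_len > 131072 and not use_mla

# Example: a 200K-context non-MLA model on SM100 used to trigger the override.
assert forced_piecewise_before_this_change(True, True, 200_000, False)
assert not forced_piecewise_before_this_change(True, True, 65_536, False)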
6 changes: 2 additions & 4 deletions vllm/utils/flashinfer.py
@@ -319,14 +319,12 @@ def use_trtllm_attention(
     # Environment variable not set - use auto-detection
     if is_prefill:
         # Prefill auto-detection
-        use_trtllm = max_seq_len <= 131072 and kv_cache_dtype == "auto"
+        use_trtllm = kv_cache_dtype == "auto"
         if use_trtllm:
             logger.warning_once("Using TRTLLM prefill attention (auto-detected).")
     else:
         # Decode auto-detection
-        use_trtllm = (
-            num_tokens <= 256 and max_seq_len <= 131072 and kv_cache_dtype == "auto"
-        )
+        use_trtllm = num_tokens <= 256 and kv_cache_dtype == "auto"
         if use_trtllm:
             logger.warning_once("Using TRTLLM decode attention (auto-detected).")
     return use_trtllm
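Below is a small self-contained sketch of the auto-detection behavior after this change (plain Python for illustration; parameter names mirror the diff, and max_seq_len is kept in the signature only to show it no longer affects the result). Prefill is now gated solely by the KV-cache dtype, while decode additionally keeps the num_tokens <= 256 cap.

# Self-contained sketch of the simplified auto-detection (not the real
# use_trtllm_attention(), which also handles the env-var override path).
def trtllm_auto_detect_sketch(
    is_prefill: bool, num_tokens: int, max_seq_len: int, kv_cache_dtype: str
) -> bool:
    if is_prefill:
        # Prefill: any sequence length qualifies when the KV cache dtype is
        # left on "auto"; the old max_seq_len <= 131072 cap is gone.
        return kv_cache_dtype == "auto"
    # Decode: the token-count cap remains, but the sequence-length cap is gone.
    return num_tokens <= 256 and kv_cache_dtype == "auto"

# Example: a 200K-token prefill now auto-selects TRTLLM attention.
assert trtllm_auto_detect_sketch(True, 1, 200_000, "auto")
# Decode with a large token batch still falls back to the non-TRTLLM path.
assert not trtllm_auto_detect_sketch(False, 512, 200_000, "auto")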