From 0b01516224859a124db3e65309cdad7d9f50ec08 Mon Sep 17 00:00:00 2001 From: mingyuanm Date: Tue, 25 Nov 2025 15:49:52 -0800 Subject: [PATCH 01/11] remove upstream fa checks Signed-off-by: mingyuanm --- vllm/attention/layer.py | 51 +--------------------- vllm/attention/ops/vit_attn_wrappers.py | 10 +---- vllm/model_executor/models/dots_ocr.py | 2 - vllm/model_executor/models/ernie45_vl.py | 2 - vllm/model_executor/models/glm4_1v.py | 2 - vllm/model_executor/models/keye.py | 1 - vllm/model_executor/models/paddleocr_vl.py | 9 ---- vllm/model_executor/models/qwen2_5_vl.py | 16 ------- vllm/model_executor/models/qwen2_vl.py | 2 - vllm/model_executor/models/qwen3_vl.py | 11 ----- vllm/model_executor/models/siglip2navit.py | 2 - 11 files changed, 3 insertions(+), 105 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index f1d57ac50fb9..f631efa74f45 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -53,53 +53,20 @@ logger = init_logger(__name__) -def check_upstream_fa_availability(dtype: torch.dtype): - if ( - dtype in (torch.float16, torch.bfloat16) - and current_platform.is_cuda() - and current_platform.has_device_capability(80) - ): - from transformers.utils import is_flash_attn_2_available - - return is_flash_attn_2_available() - if current_platform.is_rocm(): - from importlib.util import find_spec - - return find_spec("flash_attn") is not None - return False - def maybe_get_vit_flash_attn_backend( attn_backend: AttentionBackendEnum, - use_upstream_fa: bool, attn_backend_override: AttentionBackendEnum | None = None, ) -> tuple[AttentionBackendEnum, Callable | None]: if current_platform.is_rocm(): if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9(): attn_backend = AttentionBackendEnum.ROCM_AITER_FA - - elif ( - check_upstream_fa_availability(torch.get_default_dtype()) - and on_gfx9() - and attn_backend_override is None - ): - attn_backend = AttentionBackendEnum.FLASH_ATTN - use_upstream_fa = True else: return AttentionBackendEnum.TORCH_SDPA, None - - elif current_platform.is_cuda(): - if ( - attn_backend != AttentionBackendEnum.FLASH_ATTN - and check_upstream_fa_availability(torch.get_default_dtype()) - ): - attn_backend = AttentionBackendEnum.FLASH_ATTN - use_upstream_fa = True elif current_platform.is_xpu(): assert attn_backend == AttentionBackendEnum.FLASH_ATTN, ( "XPU platform only supports FLASH_ATTN as vision attention backend." ) - use_upstream_fa = False else: return AttentionBackendEnum.TORCH_SDPA, None @@ -110,10 +77,7 @@ def maybe_get_vit_flash_attn_backend( if attn_backend == AttentionBackendEnum.ROCM_AITER_FA: from aiter import flash_attn_varlen_func else: - if use_upstream_fa: - from flash_attn import flash_attn_varlen_func - else: - from vllm.attention.utils.fa_utils import flash_attn_varlen_func + from vllm.attention.utils.fa_utils import flash_attn_varlen_func else: flash_attn_varlen_func = None @@ -498,10 +462,6 @@ def __init__( attn_backend_override=attn_backend_override, ) - # Some auto-selected backends can be upgraded - # to upstream flash attention if available. - # If vllm native fa is selected, we use it directly. 
- use_upstream_fa = False self.attn_backend = ( backend @@ -518,7 +478,6 @@ def __init__( self.attn_backend, self._flash_attn_varlen_func = ( maybe_get_vit_flash_attn_backend( self.attn_backend, - use_upstream_fa, attn_backend_override=attn_backend_override, ) ) @@ -528,17 +487,9 @@ def __init__( AttentionBackendEnum.ROCM_AITER_FA, } - # this condition is just to make sure that the - # use_upstream_fa in the log is correct - if ( - current_platform.is_rocm() - and self.attn_backend == AttentionBackendEnum.FLASH_ATTN - ): - use_upstream_fa = True logger.info_once( f"MultiHeadAttention attn_backend: {self.attn_backend}, " - f"use_upstream_fa: {use_upstream_fa}" ) def forward( diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py index 46f8f5117f7a..d9f15f1e4285 100644 --- a/vllm/attention/ops/vit_attn_wrappers.py +++ b/vllm/attention/ops/vit_attn_wrappers.py @@ -27,15 +27,11 @@ def flash_attn_maxseqlen_wrapper( max_seqlen: torch.Tensor, batch_size: int, is_rocm_aiter: bool, - use_upstream_fa: bool, ) -> torch.Tensor: if is_rocm_aiter: from aiter import flash_attn_varlen_func else: - if use_upstream_fa: - from flash_attn import flash_attn_varlen_func - else: - from vllm.attention.utils.fa_utils import flash_attn_varlen_func + from vllm.attention.utils.fa_utils import flash_attn_varlen_func q, k, v = (einops.rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) output = flash_attn_varlen_func( q, @@ -62,7 +58,6 @@ def flash_attn_maxseqlen_wrapper_fake( max_seqlen: torch.Tensor, batch_size: int, is_rocm_aiter: bool, - use_upstream_fa: bool, ) -> torch.Tensor: b, s, h, d = q.shape return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device) @@ -83,10 +78,9 @@ def vit_flash_attn_wrapper( max_seqlen: torch.Tensor, batch_size: int, is_rocm_aiter: bool, - use_upstream_fa: bool, ) -> torch.Tensor: return torch.ops.vllm.flash_attn_maxseqlen_wrapper( - q, k, v, cu_seqlens, max_seqlen, batch_size, is_rocm_aiter, use_upstream_fa + q, k, v, cu_seqlens, max_seqlen, batch_size, is_rocm_aiter ) diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index 5460018d0d67..2364fa11bf6d 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -294,12 +294,10 @@ def __init__( torch.get_default_dtype(), attn_backend_override=attn_backend_override, ) - self.use_upstream_fa = False self.attn_backend, self.flash_attn_varlen_func = ( maybe_get_vit_flash_attn_backend( self.attn_backend, - self.use_upstream_fa, attn_backend_override=attn_backend_override, ) ) diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index 07b34fbc8add..d0e42e9320c3 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -201,12 +201,10 @@ def __init__( attn_backend_override=attn_backend_override, ) - self.use_upstream_fa = False self.attn_backend, self.flash_attn_varlen_func = ( maybe_get_vit_flash_attn_backend( self.attn_backend, - self.use_upstream_fa, attn_backend_override=attn_backend_override, ) ) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 7e0370886884..e5c141fc00fb 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -296,12 +296,10 @@ def __init__( dtype=torch.get_default_dtype(), attn_backend_override=attn_backend_override, ) - self.use_upstream_fa = False self.attn_backend, self.flash_attn_varlen_func = ( 
maybe_get_vit_flash_attn_backend( self.attn_backend, - self.use_upstream_fa, attn_backend_override=attn_backend_override, ) ) diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 302260b95299..881760155814 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -418,7 +418,6 @@ def __init__( self.attn_backend, self.flash_attn_varlen_func = ( maybe_get_vit_flash_attn_backend( self.attn_backend, - use_upstream_fa=False, attn_backend_override=attn_backend_override, ) ) diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index 74bb868492da..a3ea8fd065fa 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -582,7 +582,6 @@ def __init__( prefix: str = "", attn_backend: AttentionBackendEnum = AttentionBackendEnum.TORCH_SDPA, attn_backend_override: AttentionBackendEnum | None = None, - use_upstream_fa: bool = False, ) -> None: super().__init__() @@ -612,11 +611,9 @@ def __init__( ) self.attn_backend = attn_backend - self.use_upstream_fa = use_upstream_fa self.attn_backend, self.flash_attn_varlen_func = ( maybe_get_vit_flash_attn_backend( self.attn_backend, - self.use_upstream_fa, attn_backend_override=attn_backend_override, ) ) @@ -680,7 +677,6 @@ def forward( max_seqlen, batch_size, self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA, - self.use_upstream_fa, ) elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA: outputs = [] @@ -783,7 +779,6 @@ def __init__( *, attn_backend: AttentionBackendEnum = AttentionBackendEnum.TORCH_SDPA, attn_backend_override: AttentionBackendEnum | None = None, - use_upstream_fa: bool = False, ): super().__init__() self.embed_dim = config.hidden_size @@ -796,7 +791,6 @@ def __init__( prefix=f"{prefix}.self_attn", attn_backend=attn_backend, attn_backend_override=attn_backend_override, - use_upstream_fa=use_upstream_fa, ) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) self.mlp = SiglipMLP( @@ -852,13 +846,11 @@ def __init__( dtype=torch.get_default_dtype(), attn_backend_override=attn_backend_override, ) - self.use_upstream_fa = False if self.attn_backend not in { AttentionBackendEnum.FLASH_ATTN, AttentionBackendEnum.ROCM_AITER_FA, } and check_upstream_fa_availability(torch.get_default_dtype()): self.attn_backend = AttentionBackendEnum.FLASH_ATTN - self.use_upstream_fa = True if self.attn_backend not in { AttentionBackendEnum.FLASH_ATTN, AttentionBackendEnum.TORCH_SDPA, @@ -875,7 +867,6 @@ def __init__( prefix=f"{prefix}.layers.{layer_idx}", attn_backend=self.attn_backend, attn_backend_override=attn_backend_override, - use_upstream_fa=self.use_upstream_fa, ) for layer_idx in range(config.num_hidden_layers) ] diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 8c707c2561af..d8a4972b3186 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -307,7 +307,6 @@ def __init__( prefix: str = "", use_data_parallel: bool = False, attn_backend: AttentionBackendEnum = AttentionBackendEnum.TORCH_SDPA, - use_upstream_fa: bool = False, attn_backend_override: AttentionBackendEnum | None = None, ) -> None: super().__init__() @@ -344,24 +343,15 @@ def __init__( disable_tp=use_data_parallel, ) self.attn_backend = attn_backend - self.use_upstream_fa = use_upstream_fa self.attn_backend, self.flash_attn_varlen_func = ( maybe_get_vit_flash_attn_backend( self.attn_backend, - 
self.use_upstream_fa, attn_backend_override=attn_backend_override, ) ) # On ROCm with FLASH_ATTN backend, upstream flash_attn is used from vllm.platforms import current_platform - if ( - current_platform.is_rocm() - and self.attn_backend == AttentionBackendEnum.FLASH_ATTN - ): - self.use_upstream_fa = True - if current_platform.is_xpu(): - self.use_upstream_fa = False self.is_flash_attn_backend = self.attn_backend in { AttentionBackendEnum.FLASH_ATTN, AttentionBackendEnum.ROCM_AITER_FA, @@ -415,7 +405,6 @@ def forward( max_seqlen, batch_size, self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA, - self.use_upstream_fa, ) elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA: # Execute attention entry by entry for speed & less VRAM. @@ -459,7 +448,6 @@ def __init__( prefix: str = "", use_data_parallel: bool = False, attn_backend: AttentionBackendEnum = AttentionBackendEnum.TORCH_SDPA, - use_upstream_fa: bool = False, attn_backend_override: AttentionBackendEnum | None = None, ) -> None: super().__init__() @@ -475,7 +463,6 @@ def __init__( prefix=f"{prefix}.attn", use_data_parallel=use_data_parallel, attn_backend=attn_backend, - use_upstream_fa=use_upstream_fa, attn_backend_override=attn_backend_override, ) self.mlp = Qwen2_5_VisionMLP( @@ -644,7 +631,6 @@ def __init__( is_neox_style=True, ) - use_upstream_fa = False self.attn_backend = get_vit_attn_backend( head_size=head_dim, dtype=torch.get_default_dtype(), @@ -654,7 +640,6 @@ def __init__( self.attn_backend, self.flash_attn_varlen_func = ( maybe_get_vit_flash_attn_backend( self.attn_backend, - use_upstream_fa, attn_backend_override=attn_backend_override, ) ) @@ -681,7 +666,6 @@ def __init__( prefix=f"{prefix}.blocks.{layer_idx}", use_data_parallel=use_data_parallel, attn_backend=self.attn_backend, - use_upstream_fa=use_upstream_fa, attn_backend_override=attn_backend_override, ) for layer_idx in range(depth) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 9d1d023aed17..740963aa5c0b 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -335,12 +335,10 @@ def __init__( dtype=torch.get_default_dtype(), attn_backend_override=attn_backend_override, ) - self.use_upstream_fa = False self.attn_backend, self.flash_attn_varlen_func = ( maybe_get_vit_flash_attn_backend( self.attn_backend, - self.use_upstream_fa, attn_backend_override=attn_backend_override, ) ) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 4cd6fa14c32d..8c894e2126cf 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -201,7 +201,6 @@ def __init__( prefix: str = "", use_data_parallel: bool = False, attn_backend: AttentionBackendEnum = AttentionBackendEnum.TORCH_SDPA, - use_upstream_fa: bool = False, ) -> None: super().__init__() if norm_layer is None: @@ -216,7 +215,6 @@ def __init__( prefix=f"{prefix}.attn", use_data_parallel=use_data_parallel, attn_backend=attn_backend, - use_upstream_fa=use_upstream_fa, ) self.mlp = Qwen3_VisionMLP( dim, @@ -377,14 +375,6 @@ def __init__( dtype=torch.get_default_dtype(), attn_backend_override=attn_backend_override, ) - use_upstream_fa = False - if ( - self.attn_backend != AttentionBackendEnum.FLASH_ATTN - and self.attn_backend != AttentionBackendEnum.ROCM_AITER_FA - and check_upstream_fa_availability(torch.get_default_dtype()) - ): - self.attn_backend = AttentionBackendEnum.FLASH_ATTN - use_upstream_fa = True if self.attn_backend not in { 
AttentionBackendEnum.FLASH_ATTN, @@ -406,7 +396,6 @@ def __init__( prefix=f"{prefix}.blocks.{layer_idx}", use_data_parallel=use_data_parallel, attn_backend=self.attn_backend, - use_upstream_fa=use_upstream_fa, ) for layer_idx in range(vision_config.depth) ] diff --git a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py index c185b45345bd..bbce01995412 100644 --- a/vllm/model_executor/models/siglip2navit.py +++ b/vllm/model_executor/models/siglip2navit.py @@ -255,12 +255,10 @@ def __init__( dtype=torch.get_default_dtype(), attn_backend_override=attn_backend_override, ) - self.use_upstream_fa = False self.attn_backend, self.flash_attn_varlen_func = ( maybe_get_vit_flash_attn_backend( self.attn_backend, - self.use_upstream_fa, attn_backend_override=attn_backend_override, ) ) From ae44624e5ff4fbff4542bc07e4ae7a54939722b5 Mon Sep 17 00:00:00 2001 From: mingyuanm Date: Tue, 25 Nov 2025 18:42:36 -0800 Subject: [PATCH 02/11] update the logic in maybe_get_vit_flash_attn_backend Signed-off-by: mingyuanm --- vllm/attention/layer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index f631efa74f45..eb347883f24e 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -61,8 +61,12 @@ def maybe_get_vit_flash_attn_backend( if current_platform.is_rocm(): if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9(): attn_backend = AttentionBackendEnum.ROCM_AITER_FA + elif attn_backend_override is None and attn_backend == AttentionBackendEnum.FLASH_ATTN: + return AttentionBackendEnum.FLASH_ATTN else: return AttentionBackendEnum.TORCH_SDPA, None + elif current_platform.is_cuda(): + return attn_backend # keep the original selection elif current_platform.is_xpu(): assert attn_backend == AttentionBackendEnum.FLASH_ATTN, ( "XPU platform only supports FLASH_ATTN as vision attention backend." 
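
Net effect of PATCH 01 on the call sites: every vision attention module now resolves its backend with a single call and no longer threads a use_upstream_fa flag through its constructor. The sketch below is illustrative only; the call pattern and the import path (vllm.attention.layer) are taken from the model hunks in this series, while the class name and constructor shape are stand-ins, not code from the diff.

# Minimal sketch of the caller-side contract after this series.
# Only the selection call and the backend check mirror the hunks above;
# the surrounding class is illustrative.
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.layer import maybe_get_vit_flash_attn_backend


class VisionAttentionSketch:
    def __init__(
        self,
        attn_backend: AttentionBackendEnum,
        attn_backend_override: AttentionBackendEnum | None = None,
    ) -> None:
        self.attn_backend = attn_backend
        # Single selection step: the resolved backend and, when applicable,
        # the flash_attn_varlen_func callable come back together.
        self.attn_backend, self.flash_attn_varlen_func = (
            maybe_get_vit_flash_attn_backend(
                self.attn_backend,
                attn_backend_override=attn_backend_override,
            )
        )
        self.is_flash_attn_backend = self.attn_backend in {
            AttentionBackendEnum.FLASH_ATTN,
            AttentionBackendEnum.ROCM_AITER_FA,
        }
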
From 2eaf2a44b0657ab1aed20c88085229b383274d53 Mon Sep 17 00:00:00 2001 From: mingyuanm Date: Tue, 25 Nov 2025 18:47:01 -0800 Subject: [PATCH 03/11] Remove deleted functions Signed-off-by: mingyuanm --- vllm/model_executor/models/dots_ocr.py | 6 ------ vllm/model_executor/models/ernie45_vl.py | 7 +------ vllm/model_executor/models/glm4_1v.py | 10 +--------- vllm/model_executor/models/paddleocr_vl.py | 7 +------ vllm/model_executor/models/qwen2_vl.py | 6 ------ vllm/model_executor/models/qwen3_omni_moe_thinker.py | 6 ------ vllm/model_executor/models/qwen3_vl.py | 1 - 7 files changed, 3 insertions(+), 40 deletions(-) diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index 2364fa11bf6d..5cc2a48f26d6 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -11,7 +11,6 @@ from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.layer import ( - check_upstream_fa_availability, maybe_get_vit_flash_attn_backend, ) from vllm.config import VllmConfig @@ -567,11 +566,6 @@ def __init__( dtype=torch.get_default_dtype(), attn_backend_override=attn_backend_override, ) - if ( - self.attn_backend != AttentionBackendEnum.FLASH_ATTN - and check_upstream_fa_availability(torch.get_default_dtype()) - ): - self.attn_backend = AttentionBackendEnum.FLASH_ATTN self.out_hidden_size = config.hidden_size # Keep blocks for compatibility with other vision towers num_layers = ( diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index d0e42e9320c3..257c2110cfb0 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -38,7 +38,6 @@ from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.layer import ( - check_upstream_fa_availability, maybe_get_vit_flash_attn_backend, ) from vllm.config import VllmConfig @@ -496,11 +495,7 @@ def __init__( dtype=torch.get_default_dtype(), attn_backend_override=attn_backend_override, ) - if ( - self.attn_backend != AttentionBackendEnum.FLASH_ATTN - and check_upstream_fa_availability(torch.get_default_dtype()) - ): - self.attn_backend = AttentionBackendEnum.FLASH_ATTN + @property def dtype(self) -> torch.dtype: diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index e5c141fc00fb..fe238861ecce 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -47,10 +47,7 @@ from transformers.video_utils import VideoMetadata from vllm.attention.backends.registry import AttentionBackendEnum -from vllm.attention.layer import ( - check_upstream_fa_availability, - maybe_get_vit_flash_attn_backend, -) +from vllm.attention.layer import maybe_get_vit_flash_attn_backend from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size, parallel_state @@ -728,11 +725,6 @@ def __init__( dtype=torch.get_default_dtype(), attn_backend_override=attn_backend_override, ) - if ( - self.attn_backend != AttentionBackendEnum.FLASH_ATTN - and check_upstream_fa_availability(torch.get_default_dtype()) - ): - self.attn_backend = AttentionBackendEnum.FLASH_ATTN @property def dtype(self) -> torch.dtype: diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index a3ea8fd065fa..57f36c28bd19 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ 
b/vllm/model_executor/models/paddleocr_vl.py @@ -33,7 +33,7 @@ from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.layer import ( - check_upstream_fa_availability, + maybe_get_vit_flash_attn_backend, ) from vllm.attention.ops.vit_attn_wrappers import ( @@ -846,11 +846,6 @@ def __init__( dtype=torch.get_default_dtype(), attn_backend_override=attn_backend_override, ) - if self.attn_backend not in { - AttentionBackendEnum.FLASH_ATTN, - AttentionBackendEnum.ROCM_AITER_FA, - } and check_upstream_fa_availability(torch.get_default_dtype()): - self.attn_backend = AttentionBackendEnum.FLASH_ATTN if self.attn_backend not in { AttentionBackendEnum.FLASH_ATTN, AttentionBackendEnum.TORCH_SDPA, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 740963aa5c0b..672659aa6042 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -45,7 +45,6 @@ from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.layer import ( - check_upstream_fa_availability, maybe_get_vit_flash_attn_backend, ) from vllm.config import VllmConfig @@ -655,11 +654,6 @@ def __init__( dtype=torch.get_default_dtype(), attn_backend_override=attn_backend_override, ) - if ( - self.attn_backend != AttentionBackendEnum.FLASH_ATTN - and check_upstream_fa_availability(torch.get_default_dtype()) - ): - self.attn_backend = AttentionBackendEnum.FLASH_ATTN @property def dtype(self) -> torch.dtype: diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index f5f88f66eff9..39dd42552ae8 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -47,7 +47,6 @@ from transformers.models.whisper import WhisperFeatureExtractor from vllm.attention.backends.registry import AttentionBackendEnum -from vllm.attention.layer import check_upstream_fa_availability from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.distributed import get_pp_group @@ -381,11 +380,6 @@ def __init__( dtype=torch.get_default_dtype(), attn_backend_override=attn_backend_override, ) - if ( - self.attn_backend != AttentionBackendEnum.FLASH_ATTN - and check_upstream_fa_availability(torch.get_default_dtype()) - ): - self.attn_backend = AttentionBackendEnum.FLASH_ATTN @property def dtype(self) -> torch.dtype: diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 8c894e2126cf..fa8698795245 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -49,7 +49,6 @@ from transformers.video_utils import VideoMetadata from vllm.attention.backends.registry import AttentionBackendEnum -from vllm.attention.layer import check_upstream_fa_availability from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions From 2bf56d35f850e8d5cf6b817f1626a2c6baed6ca8 Mon Sep 17 00:00:00 2001 From: mingyuanm Date: Tue, 25 Nov 2025 18:53:19 -0800 Subject: [PATCH 04/11] Format Signed-off-by: mingyuanm --- vllm/attention/layer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index eb347883f24e..f1dc3402b241 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -61,12 +61,13 @@ def 
maybe_get_vit_flash_attn_backend( if current_platform.is_rocm(): if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9(): attn_backend = AttentionBackendEnum.ROCM_AITER_FA - elif attn_backend_override is None and attn_backend == AttentionBackendEnum.FLASH_ATTN: - return AttentionBackendEnum.FLASH_ATTN + elif attn_backend_override is None \ + and attn_backend == AttentionBackendEnum.FLASH_ATTN: + return AttentionBackendEnum.FLASH_ATTN, None else: return AttentionBackendEnum.TORCH_SDPA, None elif current_platform.is_cuda(): - return attn_backend # keep the original selection + return attn_backend, None # keep the original selection elif current_platform.is_xpu(): assert attn_backend == AttentionBackendEnum.FLASH_ATTN, ( "XPU platform only supports FLASH_ATTN as vision attention backend." From 0e8901535ff607b2a1214f01d3999eed9229993a Mon Sep 17 00:00:00 2001 From: mingyuanm Date: Tue, 25 Nov 2025 19:27:41 -0800 Subject: [PATCH 05/11] add back on_fgx9() check Signed-off-by: mingyuanm --- vllm/attention/layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index f1dc3402b241..b79c476dd6b3 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -61,7 +61,7 @@ def maybe_get_vit_flash_attn_backend( if current_platform.is_rocm(): if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9(): attn_backend = AttentionBackendEnum.ROCM_AITER_FA - elif attn_backend_override is None \ + elif attn_backend_override is None and on_gfx9() \ and attn_backend == AttentionBackendEnum.FLASH_ATTN: return AttentionBackendEnum.FLASH_ATTN, None else: From 10c34ad463baa73b8dea8e3cfcc204c8fdd9359e Mon Sep 17 00:00:00 2001 From: mingyuanm Date: Tue, 25 Nov 2025 21:13:12 -0800 Subject: [PATCH 06/11] Add flash_attn_varlen_func import to fa_utils for rocm platform Signed-off-by: mingyuanm --- vllm/attention/layer.py | 5 +++-- vllm/attention/utils/fa_utils.py | 9 +++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index b79c476dd6b3..c754c7c3086f 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -63,15 +63,16 @@ def maybe_get_vit_flash_attn_backend( attn_backend = AttentionBackendEnum.ROCM_AITER_FA elif attn_backend_override is None and on_gfx9() \ and attn_backend == AttentionBackendEnum.FLASH_ATTN: - return AttentionBackendEnum.FLASH_ATTN, None + pass else: return AttentionBackendEnum.TORCH_SDPA, None elif current_platform.is_cuda(): - return attn_backend, None # keep the original selection + pass elif current_platform.is_xpu(): assert attn_backend == AttentionBackendEnum.FLASH_ATTN, ( "XPU platform only supports FLASH_ATTN as vision attention backend." ) + pass else: return AttentionBackendEnum.TORCH_SDPA, None diff --git a/vllm/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py index adb9b08a6573..30831371f3b4 100644 --- a/vllm/attention/utils/fa_utils.py +++ b/vllm/attention/utils/fa_utils.py @@ -18,6 +18,15 @@ reshape_and_cache_flash = ops.reshape_and_cache_flash flash_attn_varlen_func = ops.flash_attn_varlen_func get_scheduler_metadata = ops.get_scheduler_metadata +elif current_platform.is_rocm(): + try: + from flash_attn import flash_attn_varlen_func + except ImportError as e: + raise ImportError( + "Rocm platform requires upstream flash-attn" + "to be installed. Please install flash_attn first." 
+ ) from e + def get_flash_attn_version(requires_alibi: bool = False) -> int | None: From 8dbf627bf3d1b679302ee50686094c7c38701d54 Mon Sep 17 00:00:00 2001 From: mingyuanm Date: Tue, 25 Nov 2025 21:19:25 -0800 Subject: [PATCH 07/11] typo fix Signed-off-by: mingyuanm --- vllm/attention/utils/fa_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py index 30831371f3b4..4be87735d56c 100644 --- a/vllm/attention/utils/fa_utils.py +++ b/vllm/attention/utils/fa_utils.py @@ -24,7 +24,7 @@ except ImportError as e: raise ImportError( "Rocm platform requires upstream flash-attn" - "to be installed. Please install flash_attn first." + "to be installed. Please install flash-attn first." ) from e From e2a5e5c7d5356e4c5a5adda6187916a17f78cc5e Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 27 Nov 2025 17:33:07 -0800 Subject: [PATCH 08/11] update Signed-off-by: Roger Wang --- vllm/attention/utils/fa_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py index 4be87735d56c..71b343654393 100644 --- a/vllm/attention/utils/fa_utils.py +++ b/vllm/attention/utils/fa_utils.py @@ -20,10 +20,10 @@ get_scheduler_metadata = ops.get_scheduler_metadata elif current_platform.is_rocm(): try: - from flash_attn import flash_attn_varlen_func + from flash_attn import flash_attn_varlen_func # noqa: F401 except ImportError as e: raise ImportError( - "Rocm platform requires upstream flash-attn" + "Rocm platform requires upstream flash-attn " "to be installed. Please install flash-attn first." ) from e From 7e973ffa7516e6e062cd8ccf85c6ae4da34753fa Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 27 Nov 2025 17:41:17 -0800 Subject: [PATCH 09/11] precommit Signed-off-by: Roger Wang --- vllm/model_executor/models/qwen2_5_vl.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index d8a4972b3186..6ca490f46763 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -349,8 +349,6 @@ def __init__( attn_backend_override=attn_backend_override, ) ) - # On ROCm with FLASH_ATTN backend, upstream flash_attn is used - from vllm.platforms import current_platform self.is_flash_attn_backend = self.attn_backend in { AttentionBackendEnum.FLASH_ATTN, From 966854bbc5fee53283fb7ae0eaaa9130efbc71e5 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 27 Nov 2025 17:46:59 -0800 Subject: [PATCH 10/11] precommit Signed-off-by: Roger Wang --- vllm/attention/layer.py | 12 ++++++------ vllm/attention/utils/fa_utils.py | 1 - vllm/model_executor/models/paddleocr_vl.py | 1 - 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 382a01905fa9..da5a62617129 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -56,7 +56,6 @@ logger = init_logger(__name__) - def maybe_get_vit_flash_attn_backend( attn_backend: AttentionBackendEnum, attn_backend_override: AttentionBackendEnum | None = None, @@ -64,8 +63,11 @@ def maybe_get_vit_flash_attn_backend( if current_platform.is_rocm(): if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9(): attn_backend = AttentionBackendEnum.ROCM_AITER_FA - elif attn_backend_override is None and on_gfx9() \ - and attn_backend == AttentionBackendEnum.FLASH_ATTN: + elif ( + attn_backend_override is None + and on_gfx9() + and 
attn_backend == AttentionBackendEnum.FLASH_ATTN + ): pass else: return AttentionBackendEnum.TORCH_SDPA, None @@ -471,7 +473,6 @@ def __init__( attn_backend_override=attn_backend_override, ) - self.attn_backend = ( backend if backend @@ -496,9 +497,8 @@ def __init__( AttentionBackendEnum.ROCM_AITER_FA, } - logger.info_once( - f"MultiHeadAttention attn_backend: {self.attn_backend}, " + f"Using {self.attn_backend} for MultiHeadAttention in multimodal encoder." ) def forward( diff --git a/vllm/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py index 71b343654393..8a46587473e4 100644 --- a/vllm/attention/utils/fa_utils.py +++ b/vllm/attention/utils/fa_utils.py @@ -28,7 +28,6 @@ ) from e - def get_flash_attn_version(requires_alibi: bool = False) -> int | None: # import here to avoid circular dependencies from vllm.platforms import current_platform diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index 57f36c28bd19..5256d8ba7fd8 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -33,7 +33,6 @@ from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.layer import ( - maybe_get_vit_flash_attn_backend, ) from vllm.attention.ops.vit_attn_wrappers import ( From db4de68201452526c64c97020af08e7f84820d09 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 27 Nov 2025 18:19:03 -0800 Subject: [PATCH 11/11] precommit Signed-off-by: Roger Wang --- vllm/model_executor/models/ernie45_vl.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index 257c2110cfb0..81663dd7bbb4 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -200,7 +200,6 @@ def __init__( attn_backend_override=attn_backend_override, ) - self.attn_backend, self.flash_attn_varlen_func = ( maybe_get_vit_flash_attn_backend( self.attn_backend, @@ -496,7 +495,6 @@ def __init__( attn_backend_override=attn_backend_override, ) - @property def dtype(self) -> torch.dtype: return self.patch_embed.proj.weight.dtype
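
For reference, the control flow of maybe_get_vit_flash_attn_backend after the whole series (PATCH 01, 02, 04, 05, 06, 10) reads roughly as follows. This is a sketch assembled from the hunks above, not an authoritative copy: the guard around the varlen import block and the final return are not visible in any hunk and are reconstructed, and the module-level names (envs, current_platform, on_gfx9, AttentionBackendEnum, Callable) are assumed to be those already imported in vllm/attention/layer.py.

# Sketch: maybe_get_vit_flash_attn_backend as it reads after this series.
# Assumes the module-level imports already present in vllm/attention/layer.py.
def maybe_get_vit_flash_attn_backend(
    attn_backend: AttentionBackendEnum,
    attn_backend_override: AttentionBackendEnum | None = None,
) -> tuple[AttentionBackendEnum, Callable | None]:
    if current_platform.is_rocm():
        if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9():
            attn_backend = AttentionBackendEnum.ROCM_AITER_FA
        elif (
            attn_backend_override is None
            and on_gfx9()
            and attn_backend == AttentionBackendEnum.FLASH_ATTN
        ):
            # Keep FLASH_ATTN; flash_attn_varlen_func now comes from fa_utils,
            # which falls back to upstream flash-attn on ROCm (PATCH 06-08).
            pass
        else:
            return AttentionBackendEnum.TORCH_SDPA, None
    elif current_platform.is_cuda():
        # Keep whatever backend was already selected.
        pass
    elif current_platform.is_xpu():
        assert attn_backend == AttentionBackendEnum.FLASH_ATTN, (
            "XPU platform only supports FLASH_ATTN as vision attention backend."
        )
    else:
        return AttentionBackendEnum.TORCH_SDPA, None

    # The guard below and the final return are reconstructed from context,
    # not shown verbatim in the hunks.
    if attn_backend in {
        AttentionBackendEnum.FLASH_ATTN,
        AttentionBackendEnum.ROCM_AITER_FA,
    }:
        if attn_backend == AttentionBackendEnum.ROCM_AITER_FA:
            from aiter import flash_attn_varlen_func
        else:
            from vllm.attention.utils.fa_utils import flash_attn_varlen_func
    else:
        flash_attn_varlen_func = None

    return attn_backend, flash_attn_varlen_func
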