|
13 | 13 | from vllm.platforms import current_platform |
14 | 14 | from vllm.utils.mem_utils import get_max_shared_memory_bytes |
15 | 15 |
|
16 | | -if not current_platform.is_rocm(): |
17 | | - from xformers import ops as xops |
18 | | - from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask |
19 | | - |
20 | | - from tests.kernels.utils import make_alibi_bias |
21 | | - |
22 | 16 | FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 |
23 | 17 | # This will change depending on the compute capability. |
24 | 18 | # - 512 as a buffer |
@@ -448,129 +442,6 @@ def ref_multi_query_kv_attention( |
448 | 442 | return torch.cat(ref_outputs, dim=0) |
449 | 443 |
|
450 | 444 |
|
451 | | -@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS) |
452 | | -@pytest.mark.parametrize("num_heads", NUM_HEADS) |
453 | | -@pytest.mark.parametrize("head_size", HEAD_SIZES) |
454 | | -@pytest.mark.parametrize("dtype", DTYPES) |
455 | | -@pytest.mark.parametrize("seed", SEEDS) |
456 | | -@pytest.mark.parametrize("device", CUDA_DEVICES) |
457 | | -@pytest.mark.skipif( |
458 | | - current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." |
459 | | -) |
460 | | -@torch.inference_mode() |
461 | | -def test_multi_query_kv_attention( |
462 | | - num_seqs: int, |
463 | | - num_heads: tuple[int, int], |
464 | | - head_size: int, |
465 | | - dtype: torch.dtype, |
466 | | - seed: int, |
467 | | - device: str, |
468 | | - use_alibi: bool = False, |
469 | | -) -> None: |
470 | | - current_platform.seed_everything(seed) |
471 | | - torch.set_default_device(device) |
472 | | - # MAX_SEQ_LEN sometimes causes OOM in the reference implementation. |
473 | | - # As the xformers library is already tested with its own tests, we can use |
474 | | - # a smaller MAX_SEQ_LEN here. |
475 | | - max_len = min(MAX_SEQ_LEN, 4096) |
476 | | - seq_lens = random.sample(range(1, max_len), num_seqs) |
477 | | - num_tokens = sum(seq_lens) |
478 | | - |
479 | | - scale = float(1.0 / (head_size**0.5)) |
480 | | - num_query_heads, num_kv_heads = num_heads |
481 | | - qkv = torch.empty( |
482 | | - num_tokens, num_query_heads + 2 * num_kv_heads, head_size, dtype=dtype |
483 | | - ) |
484 | | - qkv.uniform_(-scale, scale) |
485 | | - query, key, value = qkv.split([num_query_heads, num_kv_heads, num_kv_heads], dim=1) |
486 | | - |
487 | | - num_queries_per_kv = num_query_heads // num_kv_heads |
488 | | - if num_queries_per_kv > 1: |
489 | | - # Handle MQA and GQA |
490 | | - key = torch.repeat_interleave(key, num_queries_per_kv, dim=1) |
491 | | - value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) |
492 | | - alibi_bias = None |
493 | | - if use_alibi: |
494 | | - alibi_slopes = torch.randn(num_query_heads, dtype=torch.float) |
495 | | - attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens) |
496 | | - output = torch.empty_like(query) |
497 | | - start = 0 |
498 | | - # Dynamic sequence length not supported with custom attn_bias. |
499 | | - for i, seq_len in enumerate(seq_lens): |
500 | | - end = start + seq_len |
501 | | - out = xops.memory_efficient_attention_forward( |
502 | | - query[None, start:end], |
503 | | - key[None, start:end], |
504 | | - value[None, start:end], |
505 | | - attn_bias=attn_bias[i], |
506 | | - p=0.0, |
507 | | - scale=scale, |
508 | | - ) |
509 | | - output[start:end].copy_(out.view_as(query[start:end])) |
510 | | - start += seq_len |
511 | | - # xformers.AttentionBias to Tensor for use in reference impl. |
512 | | - alibi_bias = [ |
513 | | - b.materialize((1, num_query_heads, i, i), device=device).squeeze() |
514 | | - for b, i in zip(attn_bias, seq_lens) |
515 | | - ] |
516 | | - else: |
517 | | - attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens) |
518 | | - output = xops.memory_efficient_attention_forward( |
519 | | - query.unsqueeze(0), |
520 | | - key.unsqueeze(0), |
521 | | - value.unsqueeze(0), |
522 | | - attn_bias=attn_bias, |
523 | | - p=0.0, |
524 | | - scale=scale, |
525 | | - ) |
526 | | - output = output.squeeze(0) |
527 | | - |
528 | | - cu_seq_lens = [0] |
529 | | - for seq_len in seq_lens: |
530 | | - cu_seq_lens.append(cu_seq_lens[-1] + seq_len) |
531 | | - ref_output = ref_multi_query_kv_attention( |
532 | | - cu_seq_lens, |
533 | | - query, |
534 | | - key, |
535 | | - value, |
536 | | - scale, |
537 | | - alibi_bias, |
538 | | - dtype, |
539 | | - ) |
540 | | - atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3 |
541 | | - rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5 |
542 | | - torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol) |
543 | | - |
544 | | - |
545 | | -@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS) |
546 | | -@pytest.mark.parametrize("num_heads", NUM_HEADS) |
547 | | -@pytest.mark.parametrize("head_size", [64]) |
548 | | -@pytest.mark.parametrize("dtype", DTYPES) |
549 | | -@pytest.mark.parametrize("seed", SEEDS) |
550 | | -@pytest.mark.parametrize("device", CUDA_DEVICES) |
551 | | -@pytest.mark.skipif( |
552 | | - current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm." |
553 | | -) |
554 | | -@torch.inference_mode() |
555 | | -def test_multi_query_kv_attention_with_alibi( |
556 | | - num_seqs: int, |
557 | | - num_heads: tuple[int, int], |
558 | | - head_size: int, |
559 | | - dtype: torch.dtype, |
560 | | - seed: int, |
561 | | - device: str, |
562 | | -) -> None: |
563 | | - return test_multi_query_kv_attention( |
564 | | - num_seqs, |
565 | | - num_heads, |
566 | | - head_size, |
567 | | - dtype, |
568 | | - seed, |
569 | | - device, |
570 | | - use_alibi=True, |
571 | | - ) |
572 | | - |
573 | | - |
574 | 445 | @pytest.mark.parametrize("attention_cls", [Attention, MultiHeadAttention]) |
575 | 446 | def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None: |
576 | 447 | head_size = 64 |