
Commit 5195741

Merge branch 'main' into docker-switch-to-runtime
2 parents: 9c07bfb + 20e4497

File tree

185 files changed: +3576 −1934 lines

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
#!/bin/bash

# This script builds the CPU Docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Allow binding to different cores
CORE_RANGE=${CORE_RANGE:-0-16}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
NUMA_NODE=${NUMA_NODE:-0}

export CMAKE_BUILD_PARALLEL_LEVEL=32

# Set up cleanup
remove_docker_container() {
  set -e;
  docker rm -f cpu-test-"$NUMA_NODE" || true;
}
trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"

function cpu_tests() {
  set -e
  export NUMA_NODE=$2

  # Sanity-check the installed packages
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pip list"

  # offline inference
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run kernel tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -v -s tests/kernels/test_onednn.py
    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"

  # basic online serving
  docker exec cpu-test-"$NUMA_NODE" bash -c '
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 &
    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model meta-llama/Llama-3.2-3B-Instruct \
      --num-prompts 20 \
      --endpoint /v1/completions
    kill -s SIGTERM $server_pid &'
}

# All CPU tests are expected to finish within 40 minutes.
export -f cpu_tests
timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

.buildkite/test-amd.yaml

Lines changed: 2 additions & 3 deletions
@@ -1323,21 +1323,20 @@ steps:
   - vllm/
   - tests/weight_loading
   commands:
-  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt

 - label: Weight Loading Multiple GPU Test - Large Models # optional
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_2
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
-  gpu: a100
   optional: true
   source_file_dependencies:
   - vllm/
   - tests/weight_loading
   commands:
-  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt

 - label: NixlConnector PD accuracy tests (Distributed) # 30min
   mirror_hardwares: [amdexperimental]

.buildkite/test-pipeline.yaml

Lines changed: 26 additions & 3 deletions
@@ -550,6 +550,26 @@ steps:
   commands:
   - pytest -v -s kernels/mamba

+- label: Kernels DeepGEMM Test (H100)
+  timeout_in_minutes: 45
+  gpu: h100
+  num_gpus: 1
+  optional: true
+  source_file_dependencies:
+  - tools/install_deepgemm.sh
+  - vllm/utils/deep_gemm.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
+  commands:
+  - pytest -v -s tests/kernels/quantization/test_block_fp8.py -k deep_gemm
+  - pytest -v -s tests/kernels/moe/test_deepgemm.py
+  - pytest -v -s tests/kernels/moe/test_batched_deepgemm.py
+  - pytest -v -s tests/kernels/attention/test_deepgemm_attention.py
+
 - label: Model Executor Test # 23min
   timeout_in_minutes: 35
   torch_nightly: true
@@ -872,12 +892,12 @@ steps:
   optional: true
   commands:
   - pip install --upgrade git+https://github.com/huggingface/transformers
-  - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
+  - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)'
   - pytest -v -s tests/models/test_transformers.py
   # - pytest -v -s tests/models/multimodal/processing/
-  - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
+  - pytest -v -s tests/models/multimodal/test_mapping.py
   - python3 examples/offline_inference/basic/chat.py
-  # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

@@ -921,6 +941,7 @@ steps:
   - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
   - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
+  - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

 - label: Blackwell Fusion and Compile Tests # 30 min
   timeout_in_minutes: 40
@@ -930,6 +951,8 @@
   - csrc/quantization/fp4/
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
   - vllm/compilation/
   # can affect pattern matching
   - vllm/model_executor/layers/layernorm.py

benchmarks/kernels/benchmark_mrope.py

Lines changed: 7 additions & 12 deletions
@@ -6,7 +6,7 @@
 #
 # The CSV file (named with current date/time) contains these columns:
 # model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
-# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
+# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99,
 # torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
 # speedup
 #
@@ -86,9 +86,8 @@ def benchmark_mrope(
     num_heads: int,
     num_kv_heads: int,
     max_position: int = 8192,
-    rope_theta: float = 10000,
     is_neox_style: bool = True,
-    rope_scaling: dict[str, Any] = None,
+    rope_parameters: dict[str, Any] | None = None,
     dtype: torch.dtype = torch.bfloat16,
     seed: int = 0,
     warmup_iter: int = 10,
@@ -102,9 +101,8 @@ def benchmark_mrope(
         head_size=head_dim,
         rotary_dim=head_dim,
         max_position=max_position,
-        base=rope_theta,
         is_neox_style=is_neox_style,
-        rope_scaling=rope_scaling,
+        rope_parameters=rope_parameters,
         dtype=dtype,
     ).to(device=device)

@@ -203,9 +201,8 @@ def benchmark_mrope(
             num_kv_heads,
             head_dim,
             max_position,
-            rope_theta,
             is_neox_style,
-            str(rope_scaling),
+            str(rope_parameters),
             str(dtype).split(".")[-1],
             torch_stats["mean"],
             torch_stats["median"],
@@ -255,9 +252,8 @@ def benchmark_mrope(
         "num_kv_heads",
         "head_dim",
         "max_position",
-        "rope_theta",
         "is_neox_style",
-        "rope_scaling",
+        "rope_parameters",
         "dtype",
         "torch_mean",
         "torch_median",
@@ -303,7 +299,7 @@ def benchmark_mrope(
     q_size = num_heads * head_dim
     kv_size = num_kv_heads * head_dim
     is_neox_style = True
-    rope_theta = config.rope_theta
+    rope_parameters = config.rope_parameters
     max_position = config.max_position_embeddings

     for num_tokens in num_tokens_list:
@@ -315,9 +311,8 @@ def benchmark_mrope(
             num_heads=num_heads,
             num_kv_heads=num_kv_heads,
             max_position=max_position,
-            rope_theta=rope_theta,
             is_neox_style=is_neox_style,
-            rope_scaling=config.rope_scaling,
+            rope_parameters=rope_parameters,
             dtype=getattr(torch, args.dtype),
             seed=args.seed,
             warmup_iter=args.warmup_iter,
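Taken together, these hunks fold the old rope_theta and rope_scaling arguments into a single rope_parameters dict. As a rough orientation only, a sketch of the dict shape, using the key names that appear in this commit (the values are made up for illustration):

    # Illustrative shape of a rope_parameters dict; values are invented.
    rope_parameters = {
        "rope_type": "yarn",          # or "default" for plain RoPE
        "rope_theta": 1000000.0,      # previously the separate rope_theta/base argument
        "factor": 4.0,
        "original_max_position_embeddings": 32768,
    }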

docker/Dockerfile.cpu

Lines changed: 10 additions & 0 deletions
@@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
     && curl -LsSf https://astral.sh/uv/install.sh | sh

+ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12
 ENV CCACHE_DIR=/root/.cache/ccache
 ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache

@@ -122,6 +123,15 @@ WORKDIR /workspace/vllm
 RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
     cp requirements/test.in requirements/cpu-test.in && \
     sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
+    remove_packages_not_supported_on_aarch64() { \
+        case "$(uname -m)" in \
+            aarch64|arm64) \
+                sed -i '/decord/d' requirements/cpu-test.in; \
+                sed -i '/terratorch/d' requirements/cpu-test.in; \
+                ;; \
+        esac; \
+    }; \
+    remove_packages_not_supported_on_aarch64 && \
     sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \
     sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
     sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \

examples/offline_inference/context_extension.py

Lines changed: 3 additions & 3 deletions
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This script demonstrates how to extend the context length
-of a Qwen model using the YARN method (rope_scaling)
+of a Qwen model using the YARN method (rope_parameters)
 and run a simple chat example.

 Usage:
@@ -19,8 +19,8 @@ def create_llm():

     # Use yarn to extend context
     hf_overrides = {
-        "rope_theta": rope_theta,
-        "rope_scaling": {
+        "rope_parameters": {
+            "rope_theta": rope_theta,
             "rope_type": "yarn",
             "factor": factor,
             "original_max_position_embeddings": original_max_position_embeddings,

tests/compile/distributed/test_fusions_e2e.py

Lines changed: 4 additions & 8 deletions
@@ -47,12 +47,8 @@ class ModelBackendTestCase(NamedTuple):
     ModelBackendTestCase(
         # Use smaller model for L40s in CI
         model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
-        # TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell
-        # so FI attention+fp8_quant is at least tested once
         model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-        backend=AttentionBackendEnum.FLASHINFER
-        if is_blackwell()
-        else AttentionBackendEnum.TRITON_ATTN,
+        backend=AttentionBackendEnum.TRITON_ATTN,
         matches=Matches(
             attention_fusion=32,
             allreduce_fusion=65,
@@ -65,9 +61,9 @@ class ModelBackendTestCase(NamedTuple):
         model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
         # TODO FlashInfer attn broken on Hopper with kvcache=fp8:
         # https://github.com/vllm-project/vllm/issues/28568
-        # TODO FlashInfer attn broken on Blackwell for llama4:
-        # https://github.com/vllm-project/vllm/issues/28604
-        backend=AttentionBackendEnum.TRITON_ATTN,
+        backend=AttentionBackendEnum.FLASHINFER
+        if is_blackwell()
+        else AttentionBackendEnum.TRITON_ATTN,
         matches=Matches(
             attention_fusion=48,
             allreduce_fusion=96,

tests/compile/test_functionalization.py

Lines changed: 2 additions & 2 deletions
@@ -137,7 +137,7 @@ def __init__(self, head_dim=64, rotary_dim=None, max_position=2048, base=10000):
             self.head_dim,
             rotary_dim=self.rotary_dim,
             max_position=max_position,
-            base=base,
+            rope_parameters={"rope_type": "default", "rope_theta": base},
         )

     def forward(self, positions, q, k):
@@ -172,7 +172,7 @@ def __init__(self, head_dim=64, num_heads=4, max_position=2048, base=10000):
             self.head_dim,
             rotary_dim=self.head_dim,
             max_position=max_position,
-            base=base,
+            rope_parameters={"rope_type": "default", "rope_theta": base},
         )

     def forward(self, positions, hidden_states):

tests/distributed/test_context_parallel.py

Lines changed: 6 additions & 6 deletions
@@ -31,7 +31,7 @@ class ParallelSetup(NamedTuple):
     tp_size: int
     pp_size: int
     dcp_size: int
-    dcp_kv_cache_interleave_size: int
+    cp_kv_cache_interleave_size: int
     eager_mode: bool
     chunked_prefill: bool

@@ -55,7 +55,7 @@ def detailed(
         tp_base: int = 4,
         pp_base: int = 1,
         dcp_base: int = 1,
-        dcp_kv_cache_interleave_size: int = 1,
+        cp_kv_cache_interleave_size: int = 1,
         multi_node_only: bool = False,
         runner: RunnerOption = "auto",
         load_format: str | None = None,
@@ -71,7 +71,7 @@ def detailed(
             tp_size=tp_base,
             pp_size=pp_multiplier * pp_base,
             dcp_size=int(dcp_multiplier * tp_base),
-            dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size,
+            cp_kv_cache_interleave_size=cp_kv_cache_interleave_size,
             eager_mode=eager_mode_val,
             chunked_prefill=chunked_prefill_val,
         )
@@ -116,7 +116,7 @@ def _compare_cp_with_tp(
         tp_size,
         pp_size,
         dcp_size,
-        dcp_kv_cache_interleave_size,
+        cp_kv_cache_interleave_size,
         eager_mode,
         chunked_prefill,
     ) = parallel_setup
@@ -197,7 +197,7 @@ def _compare_cp_with_tp(
         "--decode-context-parallel-size",
         str(dcp_size),
         "--dcp-kv-cache-interleave-size",
-        str(dcp_kv_cache_interleave_size),
+        str(cp_kv_cache_interleave_size),
         "--distributed-executor-backend",
         distributed_backend,
     ]
@@ -227,7 +227,7 @@ def _compare_cp_with_tp(
     "deepseek-ai/DeepSeek-V2-Lite-Chat": [
         CPTestSettings.detailed(),
         CPTestSettings.detailed(tp_base=2),
-        CPTestSettings.detailed(tp_base=2, dcp_kv_cache_interleave_size=64),
+        CPTestSettings.detailed(tp_base=2, cp_kv_cache_interleave_size=64),
     ],
     "bigcode/gpt_bigcode-santacoder": [
         CPTestSettings.detailed(),

tests/kernels/attention/test_prefix_prefill.py

Lines changed: 6 additions & 6 deletions
@@ -174,11 +174,11 @@ def test_contexted_kv_attention(
     block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request)
     b_seq_len = torch.tensor(seq_lens, dtype=torch.int32)
     b_ctx_len = torch.tensor(ctx_lens, dtype=torch.int32)
-    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.int32), dim=0)
+    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens), dim=0).to(torch.int32)
     max_input_len = MAX_SEQ_LEN
     # copy kv to cache
-    b_seq_start_loc = torch.cumsum(
-        torch.tensor([0] + seq_lens[:-1], dtype=torch.int32), dim=0
+    b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1]), dim=0).to(
+        torch.int32
     )
     for i in range(BS):
         for j in range(query_lens[i]):
@@ -417,11 +417,11 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
     block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request)
     b_seq_len = torch.tensor(seq_lens, dtype=torch.int32)
     b_ctx_len = torch.tensor(ctx_lens, dtype=torch.int32)
-    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.int32), dim=0)
+    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens), dim=0).to(torch.int32)
     max_input_len = MAX_SEQ_LEN
     # copy kv to cache
-    b_seq_start_loc = torch.cumsum(
-        torch.tensor([0] + seq_lens[:-1], dtype=torch.int32), dim=0
+    b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1]), dim=0).to(
+        torch.int32
     )
     for i in range(BS):
         for j in range(query_lens[i]):
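The two hunks above change how the int32 start-offset tensors are built: the cast to int32 now happens after the cumsum instead of at tensor construction. A minimal standalone check of the rewritten form, with made-up lengths, just to show it still yields int32 offsets:

    import torch

    query_lens = [3, 5, 2]  # illustrative per-request query lengths
    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens), dim=0).to(torch.int32)
    print(b_start_loc)        # tensor([ 0,  3,  8, 10], dtype=torch.int32)
    print(b_start_loc.dtype)  # torch.int32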
