
Commit 5195741

Merge branch 'main' into docker-switch-to-runtime
2 parents: 9c07bfb + 20e4497

File tree

185 files changed: +3576 −1934 lines

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
#!/bin/bash

# This script builds the CPU Docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Allow binding to different cores
CORE_RANGE=${CORE_RANGE:-0-16}
OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
NUMA_NODE=${NUMA_NODE:-0}

export CMAKE_BUILD_PARALLEL_LEVEL=32

# Set up cleanup
remove_docker_container() {
  set -e;
  docker rm -f cpu-test-"$NUMA_NODE" || true;
}
trap remove_docker_container EXIT
remove_docker_container

# Try building the docker image
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"

function cpu_tests() {
  set -e
  export NUMA_NODE=$2

  # Sanity-check the installed packages
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pip list"

  # offline inference
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

  # Run kernel tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -v -s tests/kernels/test_onednn.py
    pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"

  # basic online serving
  docker exec cpu-test-"$NUMA_NODE" bash -c '
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 &
    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model meta-llama/Llama-3.2-3B-Instruct \
      --num-prompts 20 \
      --endpoint /v1/completions
    kill -s SIGTERM $server_pid &'
}

# All CPU tests are expected to finish within 40 minutes.
export -f cpu_tests
timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

.buildkite/test-amd.yaml

Lines changed: 2 additions & 3 deletions
@@ -1323,21 +1323,20 @@ steps:
   - vllm/
   - tests/weight_loading
   commands:
-  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt

 - label: Weight Loading Multiple GPU Test - Large Models # optional
   mirror_hardwares: [amdexperimental]
   agent_pool: mi325_2
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
-  gpu: a100
   optional: true
   source_file_dependencies:
   - vllm/
   - tests/weight_loading
   commands:
-  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt

 - label: NixlConnector PD accuracy tests (Distributed) # 30min
   mirror_hardwares: [amdexperimental]

.buildkite/test-pipeline.yaml

Lines changed: 26 additions & 3 deletions
@@ -550,6 +550,26 @@ steps:
   commands:
   - pytest -v -s kernels/mamba

+- label: Kernels DeepGEMM Test (H100)
+  timeout_in_minutes: 45
+  gpu: h100
+  num_gpus: 1
+  optional: true
+  source_file_dependencies:
+  - tools/install_deepgemm.sh
+  - vllm/utils/deep_gemm.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization/test_block_fp8.py
+  - tests/kernels/moe/test_deepgemm.py
+  - tests/kernels/moe/test_batched_deepgemm.py
+  - tests/kernels/attention/test_deepgemm_attention.py
+  commands:
+  - pytest -v -s tests/kernels/quantization/test_block_fp8.py -k deep_gemm
+  - pytest -v -s tests/kernels/moe/test_deepgemm.py
+  - pytest -v -s tests/kernels/moe/test_batched_deepgemm.py
+  - pytest -v -s tests/kernels/attention/test_deepgemm_attention.py
+
 - label: Model Executor Test # 23min
   timeout_in_minutes: 35
   torch_nightly: true
@@ -872,12 +892,12 @@ steps:
   optional: true
   commands:
   - pip install --upgrade git+https://github.com/huggingface/transformers
-  - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
+  - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)'
   - pytest -v -s tests/models/test_transformers.py
   # - pytest -v -s tests/models/multimodal/processing/
-  - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
+  - pytest -v -s tests/models/multimodal/test_mapping.py
   - python3 examples/offline_inference/basic/chat.py
-  # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

@@ -921,6 +941,7 @@ steps:
   - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
   - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
   - pytest -v -s tests/kernels/moe/test_flashinfer.py
+  - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

 - label: Blackwell Fusion and Compile Tests # 30 min
   timeout_in_minutes: 40
@@ -930,6 +951,8 @@
   - csrc/quantization/fp4/
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
   - vllm/compilation/
   # can affect pattern matching
   - vllm/model_executor/layers/layernorm.py

benchmarks/kernels/benchmark_mrope.py

Lines changed: 7 additions & 12 deletions
@@ -6,7 +6,7 @@
 #
 # The CSV file (named with current date/time) contains these columns:
 # model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
-# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
+# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99,
 # torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
 # speedup
 #
@@ -86,9 +86,8 @@ def benchmark_mrope(
     num_heads: int,
     num_kv_heads: int,
     max_position: int = 8192,
-    rope_theta: float = 10000,
     is_neox_style: bool = True,
-    rope_scaling: dict[str, Any] = None,
+    rope_parameters: dict[str, Any] | None = None,
     dtype: torch.dtype = torch.bfloat16,
     seed: int = 0,
     warmup_iter: int = 10,
@@ -102,9 +101,8 @@ def benchmark_mrope(
         head_size=head_dim,
         rotary_dim=head_dim,
         max_position=max_position,
-        base=rope_theta,
         is_neox_style=is_neox_style,
-        rope_scaling=rope_scaling,
+        rope_parameters=rope_parameters,
         dtype=dtype,
     ).to(device=device)

@@ -203,9 +201,8 @@ def benchmark_mrope(
             num_kv_heads,
             head_dim,
             max_position,
-            rope_theta,
             is_neox_style,
-            str(rope_scaling),
+            str(rope_parameters),
             str(dtype).split(".")[-1],
             torch_stats["mean"],
             torch_stats["median"],
@@ -255,9 +252,8 @@ def benchmark_mrope(
         "num_kv_heads",
         "head_dim",
         "max_position",
-        "rope_theta",
         "is_neox_style",
-        "rope_scaling",
+        "rope_parameters",
         "dtype",
         "torch_mean",
         "torch_median",
@@ -303,7 +299,7 @@ def benchmark_mrope(
     q_size = num_heads * head_dim
     kv_size = num_kv_heads * head_dim
     is_neox_style = True
-    rope_theta = config.rope_theta
+    rope_parameters = config.rope_parameters
     max_position = config.max_position_embeddings

     for num_tokens in num_tokens_list:
@@ -315,9 +311,8 @@ def benchmark_mrope(
             num_heads=num_heads,
             num_kv_heads=num_kv_heads,
             max_position=max_position,
-            rope_theta=rope_theta,
             is_neox_style=is_neox_style,
-            rope_scaling=config.rope_scaling,
+            rope_parameters=rope_parameters,
             dtype=getattr(torch, args.dtype),
             seed=args.seed,
             warmup_iter=args.warmup_iter,
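Taken together, these hunks fold the old rope_theta and rope_scaling arguments into a single rope_parameters dict. As a rough orientation only, a sketch of the dict shape, using the key names that appear in this commit (the values are made up for illustration):

    # Illustrative shape of a rope_parameters dict; values are invented.
    rope_parameters = {
        "rope_type": "yarn",          # or "default" for plain RoPE
        "rope_theta": 1000000.0,      # previously the separate rope_theta/base argument
        "factor": 4.0,
        "original_max_position_embeddings": 32768,
    }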

docker/Dockerfile.cpu

Lines changed: 10 additions & 0 deletions
@@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
     && curl -LsSf https://astral.sh/uv/install.sh | sh

+ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12
 ENV CCACHE_DIR=/root/.cache/ccache
 ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache

@@ -122,6 +123,15 @@ WORKDIR /workspace/vllm
 RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
     cp requirements/test.in requirements/cpu-test.in && \
     sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
+    remove_packages_not_supported_on_aarch64() { \
+        case "$(uname -m)" in \
+            aarch64|arm64) \
+                sed -i '/decord/d' requirements/cpu-test.in; \
+                sed -i '/terratorch/d' requirements/cpu-test.in; \
+                ;; \
+        esac; \
+    }; \
+    remove_packages_not_supported_on_aarch64 && \
     sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \
     sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
     sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \

examples/offline_inference/context_extension.py

Lines changed: 3 additions & 3 deletions
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This script demonstrates how to extend the context length
-of a Qwen model using the YARN method (rope_scaling)
+of a Qwen model using the YARN method (rope_parameters)
 and run a simple chat example.

 Usage:
@@ -19,8 +19,8 @@ def create_llm():

     # Use yarn to extend context
     hf_overrides = {
-        "rope_theta": rope_theta,
-        "rope_scaling": {
+        "rope_parameters": {
+            "rope_theta": rope_theta,
             "rope_type": "yarn",
             "factor": factor,
             "original_max_position_embeddings": original_max_position_embeddings,

tests/compile/distributed/test_fusions_e2e.py

Lines changed: 4 additions & 8 deletions
@@ -47,12 +47,8 @@ class ModelBackendTestCase(NamedTuple):
     ModelBackendTestCase(
         # Use smaller model for L40s in CI
         model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
-        # TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell
-        # so FI attention+fp8_quant is at least tested once
         model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-        backend=AttentionBackendEnum.FLASHINFER
-        if is_blackwell()
-        else AttentionBackendEnum.TRITON_ATTN,
+        backend=AttentionBackendEnum.TRITON_ATTN,
         matches=Matches(
             attention_fusion=32,
             allreduce_fusion=65,
@@ -65,9 +61,9 @@ class ModelBackendTestCase(NamedTuple):
         model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
         # TODO FlashInfer attn broken on Hopper with kvcache=fp8:
         # https://github.com/vllm-project/vllm/issues/28568
-        # TODO FlashInfer attn broken on Blackwell for llama4:
-        # https://github.com/vllm-project/vllm/issues/28604
-        backend=AttentionBackendEnum.TRITON_ATTN,
+        backend=AttentionBackendEnum.FLASHINFER
+        if is_blackwell()
+        else AttentionBackendEnum.TRITON_ATTN,
         matches=Matches(
             attention_fusion=48,
             allreduce_fusion=96,

tests/compile/test_functionalization.py

Lines changed: 2 additions & 2 deletions
@@ -137,7 +137,7 @@ def __init__(self, head_dim=64, rotary_dim=None, max_position=2048, base=10000):
             self.head_dim,
             rotary_dim=self.rotary_dim,
             max_position=max_position,
-            base=base,
+            rope_parameters={"rope_type": "default", "rope_theta": base},
         )

     def forward(self, positions, q, k):
@@ -172,7 +172,7 @@ def __init__(self, head_dim=64, num_heads=4, max_position=2048, base=10000):
             self.head_dim,
             rotary_dim=self.head_dim,
             max_position=max_position,
-            base=base,
+            rope_parameters={"rope_type": "default", "rope_theta": base},
         )

     def forward(self, positions, hidden_states):

tests/distributed/test_context_parallel.py

Lines changed: 6 additions & 6 deletions
@@ -31,7 +31,7 @@ class ParallelSetup(NamedTuple):
     tp_size: int
     pp_size: int
     dcp_size: int
-    dcp_kv_cache_interleave_size: int
+    cp_kv_cache_interleave_size: int
     eager_mode: bool
     chunked_prefill: bool

@@ -55,7 +55,7 @@ def detailed(
         tp_base: int = 4,
         pp_base: int = 1,
         dcp_base: int = 1,
-        dcp_kv_cache_interleave_size: int = 1,
+        cp_kv_cache_interleave_size: int = 1,
         multi_node_only: bool = False,
         runner: RunnerOption = "auto",
         load_format: str | None = None,
@@ -71,7 +71,7 @@ def detailed(
             tp_size=tp_base,
             pp_size=pp_multiplier * pp_base,
             dcp_size=int(dcp_multiplier * tp_base),
-            dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size,
+            cp_kv_cache_interleave_size=cp_kv_cache_interleave_size,
             eager_mode=eager_mode_val,
             chunked_prefill=chunked_prefill_val,
         )
@@ -116,7 +116,7 @@ def _compare_cp_with_tp(
         tp_size,
         pp_size,
         dcp_size,
-        dcp_kv_cache_interleave_size,
+        cp_kv_cache_interleave_size,
         eager_mode,
         chunked_prefill,
     ) = parallel_setup
@@ -197,7 +197,7 @@ def _compare_cp_with_tp(
         "--decode-context-parallel-size",
         str(dcp_size),
         "--dcp-kv-cache-interleave-size",
-        str(dcp_kv_cache_interleave_size),
+        str(cp_kv_cache_interleave_size),
         "--distributed-executor-backend",
         distributed_backend,
     ]
@@ -227,7 +227,7 @@ def _compare_cp_with_tp(
     "deepseek-ai/DeepSeek-V2-Lite-Chat": [
         CPTestSettings.detailed(),
         CPTestSettings.detailed(tp_base=2),
-        CPTestSettings.detailed(tp_base=2, dcp_kv_cache_interleave_size=64),
+        CPTestSettings.detailed(tp_base=2, cp_kv_cache_interleave_size=64),
     ],
     "bigcode/gpt_bigcode-santacoder": [
         CPTestSettings.detailed(),

tests/kernels/attention/test_prefix_prefill.py

Lines changed: 6 additions & 6 deletions
@@ -174,11 +174,11 @@ def test_contexted_kv_attention(
     block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request)
     b_seq_len = torch.tensor(seq_lens, dtype=torch.int32)
     b_ctx_len = torch.tensor(ctx_lens, dtype=torch.int32)
-    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.int32), dim=0)
+    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens), dim=0).to(torch.int32)
     max_input_len = MAX_SEQ_LEN
     # copy kv to cache
-    b_seq_start_loc = torch.cumsum(
-        torch.tensor([0] + seq_lens[:-1], dtype=torch.int32), dim=0
+    b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1]), dim=0).to(
+        torch.int32
     )
     for i in range(BS):
         for j in range(query_lens[i]):
@@ -417,11 +417,11 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
     block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request)
     b_seq_len = torch.tensor(seq_lens, dtype=torch.int32)
     b_ctx_len = torch.tensor(ctx_lens, dtype=torch.int32)
-    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.int32), dim=0)
+    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens), dim=0).to(torch.int32)
     max_input_len = MAX_SEQ_LEN
     # copy kv to cache
-    b_seq_start_loc = torch.cumsum(
-        torch.tensor([0] + seq_lens[:-1], dtype=torch.int32), dim=0
+    b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1]), dim=0).to(
+        torch.int32
     )
     for i in range(BS):
         for j in range(query_lens[i]):
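The two hunks above change how the int32 start-offset tensors are built: the cast to int32 now happens after the cumsum instead of at tensor construction. A minimal standalone check of the rewritten form, with made-up lengths, just to show it still yields int32 offsets:

    import torch

    query_lens = [3, 5, 2]  # illustrative per-request query lengths
    b_start_loc = torch.cumsum(torch.tensor([0] + query_lens), dim=0).to(torch.int32)
    print(b_start_loc)        # tensor([ 0,  3,  8, 10], dtype=torch.int32)
    print(b_start_loc.dtype)  # torch.int32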
