
Commit 9c07bfb

Merge branch 'main' into docker-switch-to-runtime

2 parents 5bc5dd6 + d44e9df

140 files changed: +3945 −1130 lines


.buildkite/scripts/hardware_ci/run-cpu-test.sh
Lines changed: 5 additions & 6 deletions

@@ -73,12 +73,11 @@ function cpu_tests() {
     pytest -x -s -v \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
 
-  # Note: disable it until supports V1
-  # Run AWQ test
-  # docker exec cpu-test-"$NUMA_NODE" bash -c "
-  #   set -e
-  #   pytest -x -s -v \
-  #   tests/quantization/test_ipex_quant.py"
+  # Run AWQ/GPTQ test
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -x -s -v \
+    tests/quantization/test_cpu_wna16.py"
 
   # Run multi-lora tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "
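To replay the re-enabled AWQ/GPTQ check outside CI, the same command can be pointed at a local CPU test container. A minimal sketch, assuming a container named cpu-test-0 started by this script is already running with the vLLM checkout at its working directory (the name stands in for cpu-test-"$NUMA_NODE"):

    # Hypothetical local replay of the AWQ/GPTQ CPU test above.
    docker exec cpu-test-0 bash -c "
      set -e
      pytest -x -s -v \
        tests/quantization/test_cpu_wna16.py"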

.buildkite/test-amd.yaml
Lines changed: 27 additions & 30 deletions

@@ -187,7 +187,7 @@ steps:
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
   - tests/distributed/test_events
-  - tests/compile/test_basic_correctness
+  - tests/compile/fullgraph/test_basic_correctness.py
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
@@ -215,7 +215,7 @@ steps:
   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
   - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
-  - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s compile/fullgraph/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s distributed/test_events.py
   - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -493,17 +493,12 @@ steps:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/test_pass_manager.py
-  - pytest -v -s compile/test_fusion.py
-  - pytest -v -s compile/test_fusion_attn.py
-  - pytest -v -s compile/test_functionalization.py
-  - pytest -v -s compile/test_silu_mul_quant_fusion.py
-  # - pytest -v -s compile/test_sequence_parallelism.py
-  # - pytest -v -s compile/test_async_tp.py
-  - pytest -v -s compile/test_fusion_all_reduce.py
-  - pytest -v -s compile/test_decorator.py
-  - pytest -v -s compile/test_noop_elimination.py
-  - pytest -v -s compile/test_aot_compile.py
+  # Run unit tests defined directly under compile/,
+  # not including subdirectories, which are usually heavier
+  # tests covered elsewhere.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
 
 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30
@@ -515,9 +510,11 @@ steps:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/test_basic_correctness.py
-  - pytest -v -s compile/test_multimodal_compile.py
-  - pytest -v -s compile/piecewise/
+  # Run smoke tests under fullgraph directory, except test_full_graph.py
+  # as it is a heavy test that is covered in other steps.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
 
 - label: PyTorch Fullgraph Test # 27min
   timeout_in_minutes: 40
@@ -529,10 +526,10 @@ steps:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
   # Limit to no custom ops to reduce running time
   # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
 
 - label: Cudagraph test
   timeout_in_minutes: 20
@@ -1066,10 +1063,10 @@ steps:
   - pytest -v -s tests/compile/test_fusion_attn.py
   - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
   # this runner has 2 GPUs available even though num_gpus=2 is not set
-  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
   # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
   # Wrap with quotes to escape yaml
-  - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
 
 - label: Blackwell Fusion E2E Tests # 30 min
   timeout_in_minutes: 40
@@ -1086,14 +1083,14 @@ steps:
   - vllm/model_executor/layers/layernorm.py
   - vllm/model_executor/layers/activation.py
   - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/test_fusions_e2e.py
-  - tests/compile/test_full_graph.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
   commands:
   - nvidia-smi
   # Run all e2e fusion tests
-  - pytest -v -s tests/compile/test_fusions_e2e.py
+  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
   # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-  - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
+  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
 
 - label: ROCm GPT-OSS Eval
   timeout_in_minutes: 60
@@ -1198,7 +1195,7 @@ steps:
   - vllm/worker/worker_base.py
   - vllm/v1/engine/
   - vllm/v1/worker/
-  - tests/compile/test_basic_correctness.py
+  - tests/compile/fullgraph/test_basic_correctness.py
   - tests/compile/test_wrapper.py
   - tests/distributed/
   - tests/entrypoints/llm/test_collective_rpc.py
@@ -1211,7 +1208,7 @@ steps:
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
-  - pytest -v -s ./compile/test_basic_correctness.py
+  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1417,10 +1414,10 @@ steps:
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
-  - pytest -v -s tests/compile/test_async_tp.py
-  - pytest -v -s tests/compile/test_sequence_parallelism.py
-  - pytest -v -s tests/compile/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+  - pytest -v -s tests/compile/distributed/test_async_tp.py
+  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
   - pytest -v -s tests/v1/distributed/test_dbo.py
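A note on the `find ... -exec pytest` pattern introduced above: launching one pytest process per file gives every test file a fresh interpreter, so module-level state left behind by one file cannot poison the next, which is the interference mode tracked in vllm-project/vllm#28965. A minimal sketch of the two collection modes, run from the tests/ directory (paths as in the pipeline):

    # Single process: every compile/ test file shares one interpreter,
    # so module-level caches and globals persist across files.
    pytest -s -v compile/

    # One pytest process per file: each file starts from a clean
    # interpreter. -maxdepth 1 also keeps the heavier fullgraph/ and
    # distributed/ subsuites (covered by other steps) out of this run.
    find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \;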

.buildkite/test-pipeline.yaml
Lines changed: 31 additions & 31 deletions

@@ -167,7 +167,7 @@ steps:
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
   - tests/distributed/test_events
-  - tests/compile/test_basic_correctness
+  - tests/compile/fullgraph/test_basic_correctness.py
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
@@ -197,7 +197,7 @@ steps:
   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
   - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
-  - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s compile/fullgraph/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s distributed/test_events.py
   - pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -445,18 +445,12 @@ steps:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/test_graph_partition.py
-  - pytest -v -s compile/test_config.py
-  - pytest -v -s compile/test_pass_manager.py
-  - pytest -v -s compile/test_fusion.py
-  - pytest -v -s compile/test_fusion_attn.py
-  - pytest -v -s compile/test_functionalization.py
-  - pytest -v -s compile/test_silu_mul_quant_fusion.py
-  - pytest -v -s compile/test_fusion_all_reduce.py
-  - pytest -v -s compile/test_decorator.py
-  - pytest -v -s compile/test_noop_elimination.py
-  - pytest -v -s compile/test_aot_compile.py
-  - pytest -v -s compile/test_qk_norm_rope_fusion.py
+  # Run unit tests defined directly under compile/,
+  # not including subdirectories, which are usually heavier
+  # tests covered elsewhere.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
 
 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30
@@ -466,9 +460,11 @@ steps:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/test_basic_correctness.py
-  - pytest -v -s compile/test_multimodal_compile.py
-  - pytest -v -s compile/piecewise/
+  # Run smoke tests under fullgraph directory, except test_full_graph.py
+  # as it is a heavy test that is covered in other steps.
+  # Use `find` to launch multiple instances of pytest so that
+  # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
 
 - label: PyTorch Fullgraph Test # 27min
   timeout_in_minutes: 40
@@ -479,10 +475,10 @@ steps:
   - tests/compile
   commands:
   # fp8 kv scales not supported on sm89, tested on Blackwell instead
-  - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
   # Limit to no custom ops to reduce running time
   # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
 
 - label: Cudagraph test
   timeout_in_minutes: 20
@@ -939,17 +935,22 @@ steps:
   - vllm/model_executor/layers/layernorm.py
   - vllm/model_executor/layers/activation.py
   - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_fusions_e2e.py
+  - tests/compile/fullgraph/test_full_graph.py
   commands:
   - nvidia-smi
   - pytest -v -s tests/compile/test_fusion_attn.py
   - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
   # this runner has 2 GPUs available even though num_gpus=2 is not set
-  - pytest -v -s tests/compile/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
   # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
   # Wrap with quotes to escape yaml
-  - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
   # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-  - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
+  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
 
 - label: Blackwell Fusion E2E Tests # 30 min
   timeout_in_minutes: 40
@@ -966,12 +967,11 @@ steps:
   - vllm/model_executor/layers/layernorm.py
   - vllm/model_executor/layers/activation.py
   - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/test_fusions_e2e.py
-  - tests/compile/test_full_graph.py
+  - tests/compile/distributed/test_fusions_e2e.py
   commands:
   - nvidia-smi
   # Run all e2e fusion tests
-  - pytest -v -s tests/compile/test_fusions_e2e.py
+  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
 
 - label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
@@ -1069,7 +1069,7 @@ steps:
   - vllm/worker/worker_base.py
   - vllm/v1/engine/
   - vllm/v1/worker/
-  - tests/compile/test_basic_correctness.py
+  - tests/compile/fullgraph/test_basic_correctness.py
   - tests/compile/test_wrapper.py
   - tests/distributed/
   - tests/entrypoints/llm/test_collective_rpc.py
@@ -1084,7 +1084,7 @@ steps:
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
-  - pytest -v -s ./compile/test_basic_correctness.py
+  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1264,10 +1264,10 @@ steps:
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
-  - pytest -v -s tests/compile/test_async_tp.py
-  - pytest -v -s tests/compile/test_sequence_parallelism.py
-  - pytest -v -s tests/compile/test_fusion_all_reduce.py
-  - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
+  - pytest -v -s tests/compile/distributed/test_async_tp.py
+  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
   - pytest -v -s tests/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
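The quoted `-k` expressions in these steps select parametrized cases by substring match against pytest test IDs, composed with `and`/`not`; the quotes both keep YAML from reinterpreting the line and avoid handing pytest an argument that begins with a dash. An illustrative standalone invocation mirroring the pipeline command:

    # Runs only TRITON-backend cases, skipping the +quant_fp8 and
    # Llama-4 parametrizations (substrings of the generated test IDs).
    pytest -v -s compile/distributed/test_fusions_e2e.py \
      -k 'TRITON and not +quant_fp8 and not Llama-4'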

.github/workflows/macos-smoke-test.yml
Lines changed: 3 additions & 4 deletions

@@ -9,7 +9,7 @@ on:
 jobs:
   macos-m1-smoke-test:
     runs-on: macos-latest
-    timeout-minutes: 20
+    timeout-minutes: 30
 
     steps:
       - uses: actions/checkout@v4
@@ -37,15 +37,14 @@ jobs:
       - name: Verify installation
         run: |
           python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
-          python -c "import torch; print(f'PyTorch: {torch.__version__}')"
 
       - name: Smoke test vllm serve
-        timeout-minutes: 10
         run: |
           # Start server in background
           vllm serve Qwen/Qwen3-0.6B \
-            --max-model-len=2048 \
+            --max-model-len=2K \
             --load-format=dummy \
+            --hf-overrides '{"num_hidden_layers": 2}' \
             --enforce-eager \
             --port 8000 &
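The two serve flags added here keep the macOS smoke test light: --hf-overrides '{"num_hidden_layers": 2}' truncates the model config to two transformer layers, and --load-format=dummy skips loading real weights entirely. A sketch of the full smoke loop such a step relies on, assuming vLLM's standard /health and /v1/completions routes (poll counts and timings illustrative):

    vllm serve Qwen/Qwen3-0.6B \
      --max-model-len=2K \
      --load-format=dummy \
      --hf-overrides '{"num_hidden_layers": 2}' \
      --enforce-eager \
      --port 8000 &
    SERVER_PID=$!

    # Poll until the server reports healthy (up to ~5 minutes).
    for _ in $(seq 1 60); do
      curl -sf http://localhost:8000/health && break
      sleep 5
    done

    # One tiny completion exercises the whole serve path end to end;
    # with dummy weights the generated text is meaningless by design.
    curl -s http://localhost:8000/v1/completions \
      -H 'Content-Type: application/json' \
      -d '{"model": "Qwen/Qwen3-0.6B", "prompt": "Hi", "max_tokens": 8}'

    kill "$SERVER_PID"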

.gitignore
Lines changed: 3 additions & 0 deletions

@@ -4,6 +4,9 @@
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*
 
+# OpenAI triton kernels copied from source
+vllm/third_party/triton_kernels/*
+
 # triton jit
 .triton
 

CMakeLists.txt
Lines changed: 12 additions & 7 deletions

@@ -512,9 +512,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
   # require CUDA 12.8 or later
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
   else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
@@ -619,9 +619,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # FP4 Archs and flags
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
   else()
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
@@ -695,7 +695,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
   else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
@@ -741,9 +741,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
   else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
@@ -1030,6 +1030,11 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
     WITH_SOABI)
 endif()
 
+# For CUDA and HIP builds also build the triton_kernels external package.
+if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
+  include(cmake/external_projects/triton_kernels.cmake)
+endif()
+
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
   include(cmake/external_projects/flashmla.cmake)
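These arch-list edits only change which variants cuda_archs_loose_intersection keeps from the configured ${CUDA_ARCHS}; dropping the 12.x entries matters only for builds that actually request those architectures. A sketch of constraining a source build to a single Blackwell arch so the SM100 branches above are the ones exercised (environment variable per vLLM's build convention; the value is illustrative):

    # Target only SM100 (Blackwell); the intersection helpers then keep
    # the 10.0a/10.0f kernel variants and drop the rest.
    export TORCH_CUDA_ARCH_LIST="10.0a"
    pip install -e .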

cmake/cpu_extension.cmake
Lines changed: 1 addition & 0 deletions

@@ -375,6 +375,7 @@ set(VLLM_EXT_SRC
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
   set(VLLM_EXT_SRC
     "csrc/cpu/shm.cpp"
+    "csrc/cpu/cpu_wna16.cpp"
     ${VLLM_EXT_SRC})
   if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
     set(VLLM_EXT_SRC
