
Commit 39776d5

Merge remote-tracking branch 'github/main' into continous_batching_mamba_from_scratch

2 parents: a663fa8 + 04de905

File tree: 110 files changed, +1840 -859 lines


.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
+# pip install lm-eval==0.4.4
 
 usage() {
     echo``

.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.3
+# pip install lm-eval==0.4.4
 
 usage() {
     echo``
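
Both baseline scripts now pin the published lm-eval release instead of a git snapshot of lm-evaluation-harness. A minimal local setup sketch, assuming you consult each script's own usage() output for the exact flags (the model and batch-size arguments below are illustrative placeholders, not taken from this commit):

pip install lm-eval==0.4.4
bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m <model> -b <batch-size>  # flags per the script's usage()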

.buildkite/run-cpu-test.sh
Lines changed: 8 additions & 2 deletions

@@ -27,13 +27,19 @@ docker exec cpu-test bash -c "
   pytest -v -s tests/models/decoder_only/language \
   --ignore=tests/models/test_fp8.py \
   --ignore=tests/models/decoder_only/language/test_jamba.py \
+  --ignore=tests/models/decoder_only/language/test_granitemoe.py \
   --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
 
 # Run compressed-tensor test
+# docker exec cpu-test bash -c "
+#   pytest -s -v \
+#   tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+#   tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
+
+# Run AWQ test
 docker exec cpu-test bash -c "
   pytest -s -v \
-  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
-  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
+  tests/quantization/test_ipex_quant.py"
 
 # online inference
 docker exec cpu-test bash -c "

.buildkite/test-pipeline.yaml
Lines changed: 13 additions & 10 deletions

@@ -77,8 +77,8 @@ steps:
   - vllm/
   - tests/basic_correctness/test_chunked_prefill
   commands:
-  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=XFORMERS VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
 
 - label: Core Test # 10min
   mirror_hardwares: [amd]

@@ -88,7 +88,11 @@ steps:
   - vllm/distributed
   - tests/core
   commands:
-  - pytest -v -s core
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_scheduler.py
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/test_chunked_prefill_scheduler.py
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness.py
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness_sliding_window.py
+  - pytest -v -s core --ignore=core/block/e2e/test_correctness.py --ignore=core/test_scheduler.py --ignore=core/test_chunked_prefill_scheduler.py --ignore=core/block/e2e/test_correctness.py --ignore=core/block/e2e/test_correctness_sliding_window.py
 
 - label: Entrypoints Test # 40min
   working_dir: "/vllm-workspace/tests"

@@ -98,7 +102,6 @@ steps:
   - vllm/
   commands:
   - pip install -e ./plugins/vllm_add_dummy_model
-  - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
   - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process

@@ -186,7 +189,8 @@ steps:
   - vllm/
   - tests/prefix_caching
   commands:
-  - pytest -v -s prefix_caching
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s prefix_caching/test_prefix_caching.py
+  - pytest -v -s prefix_caching --ignore=prefix_caching/test_prefix_caching.py
 
 - label: Samplers Test # 36min
   source_file_dependencies:

@@ -210,7 +214,8 @@ steps:
   - tests/spec_decode
   commands:
   - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s spec_decode/e2e/test_compatibility.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_compatibility.py
 
 - label: LoRA Test %N # 15min each
   mirror_hardwares: [amd]

@@ -270,15 +275,14 @@ steps:
   - csrc/
   - vllm/model_executor/layers/quantization
   - tests/quantization
-  command: pytest -v -s quantization
+  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 
 - label: LM Eval Small Models # 53min
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
   commands:
-  - pip install lm-eval
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - bash ./run-tests.sh -c configs/models-small.txt -t 1

@@ -393,7 +397,7 @@ steps:
   - pytest -v -s ./compile/test_full_graph_multi_gpu.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
-  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  - TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus
   # Avoid importing model tests that cause CUDA reinitialization error
   - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
   - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus

@@ -492,6 +496,5 @@ steps:
   - csrc/
   - vllm/model_executor/layers/quantization
   commands:
-  - pip install lm-eval
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - bash ./run-tests.sh -c configs/models-large.txt -t 4
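
The recurring change in this pipeline is gating tests that still exercise the deprecated v1 block manager behind the VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 environment variable. A sketch of reproducing one such gated run locally, assuming a vLLM development checkout with requirements-dev.txt installed and commands issued from the repository's tests/ directory (CI uses /vllm-workspace/tests):

cd tests  # assumed checkout layout
VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 \
  pytest -v -s basic_correctness/test_chunked_prefill.py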

.github/dependabot.yml
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+version: 2
+updates:
+  # Maintain dependencies for GitHub Actions
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"

.github/workflows/mypy.yaml
Lines changed: 2 additions & 13 deletions

@@ -11,7 +11,7 @@ on:
     - main
 
 jobs:
-  ruff:
+  mypy:
     runs-on: ubuntu-latest
     strategy:
       matrix:

@@ -32,15 +32,4 @@ jobs:
         pip install types-setuptools
     - name: Mypy
       run: |
-        mypy
-        mypy tests --follow-imports skip
-        mypy vllm/attention --follow-imports skip
-        mypy vllm/distributed --follow-imports skip
-        mypy vllm/engine --follow-imports skip
-        mypy vllm/executor --follow-imports skip
-        mypy vllm/lora --follow-imports skip
-        mypy vllm/model_executor --follow-imports skip
-        mypy vllm/prompt_adapter --follow-imports skip
-        mypy vllm/spec_decode --follow-imports skip
-        mypy vllm/worker --follow-imports skip
-
+        tools/mypy.sh
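
The workflow now delegates to tools/mypy.sh, whose contents are not part of this diff. A minimal sketch of such a wrapper, assuming it simply replays the per-package invocations that were previously inlined in the workflow step:

#!/usr/bin/env bash
# Hypothetical tools/mypy.sh sketch -- not taken from this commit.
set -e  # stop at the first mypy failure
mypy
mypy tests --follow-imports skip
mypy vllm/attention --follow-imports skip
# ...one invocation per package, as in the removed workflow lines above...
mypy vllm/worker --follow-imports skip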

CONTRIBUTING.md
Lines changed: 15 additions & 21 deletions

@@ -1,30 +1,23 @@
 # Contributing to vLLM
 
-Thank you for your interest in contributing to vLLM!
-Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
-There are several ways you can contribute to the project:
+Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project:
 
 - Identify and report any issues or bugs.
-- Request or add a new model.
+- Request or add support for a new model.
 - Suggest or implement new features.
+- Improve documentation or contribute a how-to guide.
 
-However, remember that contributions aren't just about code.
-We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
+We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions.
 
-Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
-Talk about it in your blog posts, highlighting how it's driving your incredible projects.
-Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
+Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository!
 
 
-## Setup for development
+## Developing
 
-### Build from source
+Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details.
 
-```bash
-pip install -e . # This may take several minutes.
-```
 
-### Testing
+## Testing
 
 ```bash
 pip install -r requirements-dev.txt

@@ -36,15 +29,16 @@ mypy
 # Unit tests
 pytest tests/
 ```
-**Note:** Currently, the repository does not pass the mypy tests.
+**Note:** Currently, the repository does not pass the ``mypy`` tests.
 
+## Contribution Guidelines
 
-## Contributing Guidelines
+### Issues
 
-### Issue Reporting
+If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
 
-If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
-If not, please file a new issue, providing as much relevant information as possible.
+> [!IMPORTANT]
+> If you discover a security vulnerability, please follow the instructions [here](/SECURITY.md#reporting-a-vulnerability).
 
 ### Pull Requests & Code Reviews
 

@@ -53,4 +47,4 @@ Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE
 ### Thank You
 
 Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
-Your contributions make vLLM a great tool for everyone!
+All of your contributions help make vLLM a great tool and community for everyone!
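
For anyone who relied on the removed "Build from source" snippet, the editable install it described is still the basic path and is covered in more depth by the linked installation guide; from the repository root:

pip install -e .  # editable install; builds the native extensions, so this may take several minutes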

Dockerfile
Lines changed: 2 additions & 1 deletion

@@ -144,7 +144,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 #################### DEV IMAGE ####################
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
+FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace

@@ -182,6 +182,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 RUN --mount=type=cache,target=/root/.cache/pip \
     . /etc/environment && \
     python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
+COPY examples examples
 #################### vLLM installation IMAGE ####################
 

Dockerfile.cpu
Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li
 
 RUN echo 'ulimit -c 0' >> ~/.bashrc
 
-RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl
+RUN pip install intel_extension_for_pytorch==2.4.0
 
 WORKDIR /workspace
 
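
Pinning the PyPI release instead of a dated dev wheel makes this layer reproducible from package metadata alone. A quick sanity check, assuming a running CPU container such as the cpu-test one the CI scripts use:

docker exec cpu-test python3 -c "import intel_extension_for_pytorch as ipex; print(ipex.__version__)"  # should report 2.4.0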

SECURITY.md
Lines changed: 4 additions & 5 deletions

@@ -2,11 +2,10 @@
 
 ## Reporting a Vulnerability
 
-If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away.
-We will investigate all legitimate reports and do our best to quickly fix the problem.
+If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
 
-Please report security issues using https://github.com/vllm-project/vllm/security/advisories/new
+Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
 
 ---
-Please see PyTorch Security for more information how to securely interact with models: https://github.com/pytorch/pytorch/blob/main/SECURITY.md
-This document mostly references the recommendation from PyTorch, thank you!
+
+Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
