2 changes: 0 additions & 2 deletions tests/kernels/moe/test_batched_moe.py
@@ -40,8 +40,6 @@
 TOP_KS = [1, 2, 6]
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 @dataclass
2 changes: 0 additions & 2 deletions tests/kernels/moe/test_block_fp8.py
@@ -33,8 +33,6 @@
     pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 # Test configurations
 DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]
2 changes: 0 additions & 2 deletions tests/kernels/moe/test_block_int8.py
@@ -18,8 +18,6 @@
     pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True)
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 DTYPES = [torch.bfloat16]
 
2 changes: 0 additions & 2 deletions tests/kernels/moe/test_cutlass_moe.py
@@ -42,8 +42,6 @@
 ]
 
 vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 @dataclasses.dataclass
2 changes: 0 additions & 2 deletions tests/kernels/moe/test_flashinfer.py
@@ -45,8 +45,6 @@
 ]
 
 vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 def quant_fp8_per_tensor_batches(a):
2 changes: 0 additions & 2 deletions tests/kernels/moe/test_moe.py
@@ -81,8 +81,6 @@
 ]
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 def run_moe_test(
2 changes: 0 additions & 2 deletions tests/kernels/moe/test_pplx_cutlass_moe.py
@@ -192,8 +192,6 @@ def pplx_cutlass_moe(
 
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 def _pplx_moe(
2 changes: 0 additions & 2 deletions tests/kernels/moe/test_pplx_moe.py
@@ -81,8 +81,6 @@
 DTYPES = [torch.float8_e4m3fn, torch.bfloat16]
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 def torch_prepare(
2 changes: 0 additions & 2 deletions tests/kernels/moe/test_triton_moe_ptpc_fp8.py
@@ -18,8 +18,6 @@
     pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):
2 changes: 0 additions & 2 deletions tests/kernels/quantization/test_block_fp8.py
@@ -29,8 +29,6 @@
     pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 # Test configurations
 DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]
2 changes: 0 additions & 2 deletions tests/kernels/quantization/test_block_int8.py
@@ -18,8 +18,6 @@
     pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True)
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 DTYPES = [torch.half, torch.bfloat16]
 M = [1, 33, 64, 222]
31 changes: 23 additions & 8 deletions tests/lora/test_lora_manager.py
@@ -8,8 +8,10 @@
 from safetensors.torch import load_file
 from torch import nn
 
-from vllm.config import ModelConfig, VllmConfig
+from vllm.config import VllmConfig
 from vllm.config.lora import LoRAConfig
+from vllm.config.model import ModelConfig
+from vllm.config.scheduler import SchedulerConfig
 from vllm.lora.layers import (
     ColumnParallelLinearWithLoRA,
     MergedColumnParallelLinearWithLoRA,
@@ -441,10 +443,17 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa
     )
 
     model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=4,
+        max_num_batched_tokens=2,
+        enable_chunked_prefill=True,
+    )
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        lora_config=lora_config,
+        scheduler_config=scheduler_config,
+    )
 
-    vllm_config.scheduler_config.max_num_seqs = 4
-    vllm_config.scheduler_config.max_num_batched_tokens = 2
     worker_adapter_manager = LRUCacheWorkerLoRAManager(
         vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES
     )
@@ -544,10 +553,16 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
     )
 
     model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
-
-    vllm_config.scheduler_config.max_num_seqs = 4
-    vllm_config.scheduler_config.max_num_batched_tokens = 2
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=4,
+        max_num_batched_tokens=2,
+        enable_chunked_prefill=True,
+    )
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        lora_config=lora_config,
+        scheduler_config=scheduler_config,
+    )
 
     worker_adapter_manager = WorkerLoRAManager(
         vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES
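The `enable_chunked_prefill=True` argument added above matters because `SchedulerConfig.__post_init__` now runs `verify_max_model_len` (see the `vllm/config/scheduler.py` hunks below), and the `max_model_len` InitVar defaults to 8192 while these tests use `max_num_batched_tokens=2`. A standalone sketch of that rule, using a hypothetical helper name rather than the real vLLM code:

```python
# Hypothetical, simplified restatement of the verify_max_model_len rule shown in
# the scheduler.py hunks below; the function name and signature are illustrative.
def check_batched_tokens(
    max_num_batched_tokens: int,
    max_model_len: int,
    enable_chunked_prefill: bool,
) -> None:
    if max_num_batched_tokens < max_model_len and not enable_chunked_prefill:
        raise ValueError(
            f"max_num_batched_tokens ({max_num_batched_tokens}) is smaller "
            f"than max_model_len ({max_model_len}); enable chunked prefill "
            "or increase max_num_batched_tokens."
        )


# With the values used in these tests (max_num_batched_tokens=2 against the
# default max_model_len of 8192), the rule passes only with chunked prefill on.
check_batched_tokens(2, 8192, enable_chunked_prefill=True)
```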
34 changes: 17 additions & 17 deletions vllm/config/scheduler.py
@@ -6,7 +6,7 @@
 from dataclasses import InitVar
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, cast
 
-from pydantic import Field, field_validator, model_validator
+from pydantic import Field, field_validator
 from pydantic.dataclasses import dataclass
 from typing_extensions import Self
 
@@ -48,13 +48,6 @@ class SchedulerConfig:
     In real usage, this should be set in `EngineArgs.create_engine_config`.
     """
 
-    max_model_len: int = Field(default=8192, ge=1)
-    """Maximum length of a sequence (including prompt and generated text).
-
-    The default value here is mainly for convenience when testing.
-    In real usage, this should duplicate `ModelConfig.max_model_len` via
-    `EngineArgs`."""
-
     max_num_partial_prefills: int = Field(default=1, ge=1)
     """For chunked prefill, the maximum number of sequences that can be
     partially prefilled concurrently."""
@@ -89,6 +82,12 @@ class SchedulerConfig:
     is_multimodal_model: bool = False
     """True if the model is multimodal."""
 
+    max_model_len: InitVar[int] = 8192
+    """Maximum length of a sequence (including prompt and generated text).
+
+    Note: This is stored in the ModelConfig, and is used only here to
+    provide fallbacks and validate other attributes."""
+
     is_encoder_decoder: InitVar[bool] = False
     """True if the model is an encoder-decoder model.
 
@@ -199,7 +198,7 @@ def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
             return value
         return handler(value)
 
-    def __post_init__(self, is_encoder_decoder: bool) -> None:
+    def __post_init__(self, max_model_len: int, is_encoder_decoder: bool) -> None:
         if is_encoder_decoder:
             # Chunked prefill should be disabled for encoder-decoder models.
             self.disable_chunked_mm_input = True
@@ -232,6 +231,8 @@ def __post_init__(self, is_encoder_decoder: bool) -> None:
                 self.long_prefill_token_threshold,
             )
 
+        self.verify_max_model_len(max_model_len)
+
     @property
     def chunked_prefill_enabled(self) -> bool:
         return self.enable_chunked_prefill
@@ -240,15 +241,14 @@ def chunked_prefill_enabled(self) -> bool:
     def chunked_prefill_enabled(self, value: bool):
         self.enable_chunked_prefill = value
 
-    @model_validator(mode="after")
-    def _verify_args(self) -> Self:
+    def verify_max_model_len(self, max_model_len: int) -> Self:
         if (
-            self.max_num_batched_tokens < self.max_model_len
+            self.max_num_batched_tokens < max_model_len
             and not self.chunked_prefill_enabled
         ):
             raise ValueError(
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
-                f"smaller than max_model_len ({self.max_model_len}). "
+                f"smaller than max_model_len ({max_model_len}). "
                 "This effectively limits the maximum sequence length to "
                 "max_num_batched_tokens and makes vLLM reject longer "
                 "sequences. Please increase max_num_batched_tokens or "
@@ -262,12 +262,12 @@ def _verify_args(self) -> Self:
                 f"({self.max_num_seqs})."
             )
 
-        if self.max_num_batched_tokens > self.max_num_seqs * self.max_model_len:
+        if self.max_num_batched_tokens > self.max_num_seqs * max_model_len:
             logger.warning(
                 "max_num_batched_tokens (%d) exceeds max_num_seqs "
                 "* max_model_len (%d). This may lead to unexpected behavior.",
                 self.max_num_batched_tokens,
-                self.max_num_seqs * self.max_model_len,
+                self.max_num_seqs * max_model_len,
             )
 
         if self.max_num_partial_prefills > 1:
@@ -277,11 +277,11 @@ def _verify_args(self) -> Self:
                     "max_num_partial_prefills > 1."
                 )
 
-            if self.long_prefill_token_threshold > self.max_model_len:
+            if self.long_prefill_token_threshold > max_model_len:
                 raise ValueError(
                     "long_prefill_token_threshold "
                     f"({self.long_prefill_token_threshold}) cannot be greater "
-                    f"than the max_model_len ({self.max_model_len})."
+                    f"than the max_model_len ({max_model_len})."
                 )
 
         if self.max_long_partial_prefills > self.max_num_partial_prefills:
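For readers unfamiliar with the pattern above: a `dataclasses.InitVar` field is accepted by the generated `__init__` and forwarded to `__post_init__`, but it is not stored on the instance, which is what lets `max_model_len` be validated here while `ModelConfig` remains its single owner. A minimal sketch of the mechanics with an illustrative toy class, not the real `SchedulerConfig`:

```python
# Minimal illustration of InitVar on a pydantic dataclass (toy class, not the
# real SchedulerConfig): max_model_len participates in construction and
# validation but is not kept as a field afterwards.
from dataclasses import InitVar

from pydantic.dataclasses import dataclass


@dataclass
class ToySchedulerConfig:
    max_num_batched_tokens: int = 2048
    enable_chunked_prefill: bool = True
    max_model_len: InitVar[int] = 8192  # handed to __post_init__, not stored

    def __post_init__(self, max_model_len: int) -> None:
        if (
            self.max_num_batched_tokens < max_model_len
            and not self.enable_chunked_prefill
        ):
            raise ValueError(
                "max_num_batched_tokens is smaller than max_model_len; "
                "increase it or enable chunked prefill"
            )


# The InitVar is consumed during __init__/__post_init__ and is not a field of
# the resulting instance.
cfg = ToySchedulerConfig(max_model_len=4096)
```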
1 change: 0 additions & 1 deletion vllm/config/vllm.py
@@ -929,7 +929,6 @@ def recalculate_max_model_len(self, max_model_len: int):
         model_config = self.model_config
         max_model_len = model_config.get_and_verify_max_len(max_model_len)
         self.model_config.max_model_len = max_model_len
-        self.scheduler_config.max_model_len = max_model_len
 
     def try_verify_and_update_config(self):
         if self.model_config is None:
2 changes: 1 addition & 1 deletion vllm/platforms/cpu.py
@@ -339,7 +339,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             )
             vllm_config.scheduler_config.enable_chunked_prefill = False
             vllm_config.scheduler_config.max_num_batched_tokens = max(
-                vllm_config.scheduler_config.max_model_len,
+                vllm_config.model_config.max_model_len,
                 vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
             )
 
2 changes: 1 addition & 1 deletion vllm/platforms/tpu.py
@@ -186,7 +186,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             )
             vllm_config.scheduler_config.enable_chunked_prefill = False
             vllm_config.scheduler_config.max_num_batched_tokens = max(
-                vllm_config.scheduler_config.max_model_len,
+                vllm_config.model_config.max_model_len,
                 vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
            )
 
2 changes: 1 addition & 1 deletion vllm/platforms/xpu.py
@@ -185,7 +185,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             )
             vllm_config.scheduler_config.enable_chunked_prefill = False
             vllm_config.scheduler_config.max_num_batched_tokens = max(
-                vllm_config.scheduler_config.max_model_len,
+                vllm_config.model_config.max_model_len,
                 vllm_config.scheduler_config.DEFAULT_MAX_NUM_BATCHED_TOKENS,
            )
 
2 changes: 1 addition & 1 deletion vllm/v1/core/sched/scheduler.py
@@ -83,7 +83,7 @@ def __init__(
         # Scheduling constraints.
         self.max_num_running_reqs = self.scheduler_config.max_num_seqs
         self.max_num_scheduled_tokens = self.scheduler_config.max_num_batched_tokens
-        self.max_model_len = self.scheduler_config.max_model_len
+        self.max_model_len = vllm_config.model_config.max_model_len
         self.enable_kv_cache_events = (
             self.kv_events_config is not None
             and self.kv_events_config.enable_kv_cache_events
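Taken together with the platform changes above, call sites now read `max_model_len` from `ModelConfig`, its single source of truth, while batching limits stay on `SchedulerConfig`. A sketch of the resulting access pattern, assuming an already constructed `VllmConfig` (the helper function is illustrative, not part of vLLM):

```python
# Illustrative helper mirroring the attribute accesses introduced in the hunks
# above; not part of vLLM itself.
from vllm.config import VllmConfig


def scheduling_limits(vllm_config: VllmConfig) -> tuple[int, int, int]:
    scheduler_config = vllm_config.scheduler_config
    return (
        scheduler_config.max_num_seqs,  # max concurrently running requests
        scheduler_config.max_num_batched_tokens,  # max tokens scheduled per step
        vllm_config.model_config.max_model_len,  # now read from ModelConfig
    )
```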