
Commit 638e419

[Misc] Make SchedulerConfig.max_model_len init-only (#28733)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Parent commit: 1ec978c
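Every hunk below removes the same post-init mutation of the scheduler config. With max_model_len now init-only, values like it must be provided when the config is constructed rather than assigned afterwards. A minimal sketch of the new pattern, assuming SchedulerConfig and VllmConfig accept these values as constructor keywords (the keyword style for VllmConfig is visible in the test_cutlass_moe.py hunk below):

from vllm.config import SchedulerConfig, VllmConfig

# Previously (removed in this commit): mutate fields after construction.
# vllm_config = VllmConfig()
# vllm_config.scheduler_config.max_num_seqs = 128
# vllm_config.scheduler_config.max_model_len = 8192

# Now: supply init-only values at construction time.
vllm_config = VllmConfig(
    scheduler_config=SchedulerConfig(max_num_seqs=128, max_model_len=8192)
)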

17 files changed (+22, -45 lines)


tests/kernels/moe/test_batched_moe.py

Lines changed: 0 additions & 2 deletions
@@ -40,8 +40,6 @@
 TOP_KS = [1, 2, 6]
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 @dataclass
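The removed assignments are not replaced in these tests, presumably because the defaults suffice for the kernels under test. As a note on the mechanism: an init-only field is accepted by __init__ but not kept as an ordinary mutable field afterwards. A hypothetical illustration using dataclasses.InitVar (not necessarily how this commit implements it):

from dataclasses import InitVar, dataclass

@dataclass
class ExampleConfig:
    # Accepted as a constructor argument and handed to __post_init__,
    # but not stored as a regular dataclass field.
    max_model_len: InitVar[int] = 8192

    def __post_init__(self, max_model_len: int) -> None:
        # Derive internal state from the init-only value here.
        self.derived_max_len = max_model_len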

tests/kernels/moe/test_block_fp8.py

Lines changed: 0 additions & 2 deletions
@@ -33,8 +33,6 @@
 pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 # Test configurations
 DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]

tests/kernels/moe/test_block_int8.py

Lines changed: 0 additions & 2 deletions
@@ -18,8 +18,6 @@
 pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True)
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 DTYPES = [torch.bfloat16]
 
tests/kernels/moe/test_cutlass_moe.py

Lines changed: 0 additions & 2 deletions
@@ -42,8 +42,6 @@
 ]
 
 vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 @dataclasses.dataclass

tests/kernels/moe/test_flashinfer.py

Lines changed: 0 additions & 2 deletions
@@ -45,8 +45,6 @@
 ]
 
 vllm_config = VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 def quant_fp8_per_tensor_batches(a):

tests/kernels/moe/test_moe.py

Lines changed: 0 additions & 2 deletions
@@ -81,8 +81,6 @@
 ]
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 def run_moe_test(

tests/kernels/moe/test_pplx_cutlass_moe.py

Lines changed: 0 additions & 2 deletions
@@ -192,8 +192,6 @@ def pplx_cutlass_moe(
 
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 def _pplx_moe(

tests/kernels/moe/test_pplx_moe.py

Lines changed: 0 additions & 2 deletions
@@ -81,8 +81,6 @@
 DTYPES = [torch.float8_e4m3fn, torch.bfloat16]
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 def torch_prepare(

tests/kernels/moe/test_triton_moe_ptpc_fp8.py

Lines changed: 0 additions & 2 deletions
@@ -18,8 +18,6 @@
 pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
 
 vllm_config = VllmConfig()
-vllm_config.scheduler_config.max_num_seqs = 128
-vllm_config.scheduler_config.max_model_len = 8192
 
 
 def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16):

tests/kernels/quantization/test_block_fp8.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,6 @@
2929
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True)
3030

3131
vllm_config = VllmConfig()
32-
vllm_config.scheduler_config.max_num_seqs = 128
33-
vllm_config.scheduler_config.max_model_len = 8192
3432

3533
# Test configurations
3634
DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]
