Skip to content

Commit 20e4497

Browse files
DarkLight1337 and mgoin authored
[V0 Deprecation] Remove num_lookahead_slots (#29000)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Co-authored-by: Michael Goin <mgoin64@gmail.com>
1 parent 1c7bcc5 commit 20e4497

File tree

3 files changed

+0
-30
lines changed

3 files changed

+0
-30
lines changed

vllm/config/scheduler.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -62,15 +62,6 @@ class SchedulerConfig:
6262
"""For chunked prefill, a request is considered long if the prompt is
6363
longer than this number of tokens."""
6464

65-
num_lookahead_slots: int = Field(default=0, ge=0)
66-
"""The number of slots to allocate per sequence per
67-
step, beyond the known token ids. This is used in speculative
68-
decoding to store KV activations of tokens which may or may not be
69-
accepted.
70-
71-
NOTE: This will be replaced by speculative config in the future; it is
72-
present to enable correctness tests until then."""
73-
7465
enable_chunked_prefill: bool = True
7566
"""If True, prefill requests can be chunked based
7667
on the remaining `max_num_batched_tokens`.

vllm/config/speculative.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -634,16 +634,6 @@ def _verify_args(self) -> Self:
634634

635635
return self
636636

637-
@property
638-
def num_lookahead_slots(self) -> int:
639-
"""The number of additional slots the scheduler should allocate per
640-
step, in addition to the slots allocated for each known token.
641-
642-
This is equal to the number of speculative tokens, as each speculative
643-
token must be scored.
644-
"""
645-
return self.num_speculative_tokens
646-
647637
def use_eagle(self) -> bool:
648638
return self.method in ("eagle", "eagle3", "mtp")
649639

vllm/engine/arg_utils.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -488,7 +488,6 @@ class EngineArgs:
488488

489489
ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
490490
num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
491-
num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots
492491
model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config")
493492
ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns")
494493

@@ -1081,9 +1080,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
10811080
"--long-prefill-token-threshold",
10821081
**scheduler_kwargs["long_prefill_token_threshold"],
10831082
)
1084-
scheduler_group.add_argument(
1085-
"--num-lookahead-slots", **scheduler_kwargs["num_lookahead_slots"]
1086-
)
10871083
# multi-step scheduling has been removed; corresponding arguments
10881084
# are no longer supported.
10891085
scheduler_group.add_argument(
@@ -1653,18 +1649,11 @@ def create_engine_config(
16531649
target_parallel_config=parallel_config,
16541650
)
16551651

1656-
# make sure num_lookahead_slots is set appropriately depending on
1657-
# whether speculative decoding is enabled
1658-
num_lookahead_slots = self.num_lookahead_slots
1659-
if speculative_config is not None:
1660-
num_lookahead_slots = speculative_config.num_lookahead_slots
1661-
16621652
scheduler_config = SchedulerConfig(
16631653
runner_type=model_config.runner_type,
16641654
max_num_batched_tokens=self.max_num_batched_tokens,
16651655
max_num_seqs=self.max_num_seqs,
16661656
max_model_len=model_config.max_model_len,
1667-
num_lookahead_slots=num_lookahead_slots,
16681657
enable_chunked_prefill=self.enable_chunked_prefill,
16691658
disable_chunked_mm_input=self.disable_chunked_mm_input,
16701659
is_multimodal_model=model_config.is_multimodal_model,

0 commit comments

Comments (0)