From fcfcf6fdeac25b34421f6cf50a9ab76b8403297d Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Mon, 17 Nov 2025 18:45:46 -0800 Subject: [PATCH] [BugFix] Fix PP/async scheduling with pooling models Signed-off-by: Nick Hill --- vllm/v1/engine/core.py | 3 ++- vllm/v1/executor/ray_executor.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 97286c6e2e5e..d49eb752d56a 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -184,6 +184,7 @@ def __init__( vllm_config.ec_transfer_config is not None and vllm_config.ec_transfer_config.is_ec_producer ) + self.is_pooling_model = vllm_config.model_config.runner_type == "pooling" self.request_block_hasher: Callable[[Request], list[BlockHash]] | None = None if vllm_config.cache_config.enable_prefix_caching or kv_connector is not None: @@ -392,7 +393,7 @@ def step_with_batch_queue( if not self.ec_producer: model_executed = scheduler_output.total_num_scheduled_tokens > 0 - if not model_executed: + if self.is_pooling_model or not model_executed: # No sampling required (no requests scheduled). future = cast(Future[ModelRunnerOutput], exec_future) else: diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py index 55db7445c9c7..406eafcd339b 100644 --- a/vllm/v1/executor/ray_executor.py +++ b/vllm/v1/executor/ray_executor.py @@ -99,9 +99,9 @@ def _init_executor(self) -> None: # KV connector setup self.has_connector = self.vllm_config.kv_transfer_config is not None - self.ec_producer = ( - self.vllm_config.ec_transfer_config is not None - and self.vllm_config.ec_transfer_config.is_ec_producer + self.uses_sampler = self.vllm_config.model_config.runner_type != "pooling" and ( + self.vllm_config.ec_transfer_config is None + or not self.vllm_config.ec_transfer_config.is_ec_producer ) self.scheduler_output: SchedulerOutput | None = None @@ -401,7 +401,7 @@ def execute_model( # type: ignore[override] "after execute_model() returns None." ) - if self.ec_producer or not scheduler_output.total_num_scheduled_tokens: + if not self.uses_sampler or not scheduler_output.total_num_scheduled_tokens: # Model will not execute, call model runner immediately. return self._execute_dag(scheduler_output, None, non_block)