 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
 from vllm.v1.structured_output import StructuredOutputManager
-from vllm.v1.utils import record_function_or_nullcontext
 from vllm.version import __version__ as VLLM_VERSION
 
 logger = init_logger(__name__)
@@ -181,11 +180,13 @@ def __init__(
             logger.info("Batch queue is enabled with size %d", self.batch_queue_size)
             self.batch_queue = deque(maxlen=self.batch_queue_size)
 
+        self.ec_producer = (
+            vllm_config.ec_transfer_config is not None
+            and vllm_config.ec_transfer_config.is_ec_producer
+        )
+
         self.request_block_hasher: Callable[[Request], list[BlockHash]] | None = None
-        if (
-            self.vllm_config.cache_config.enable_prefix_caching
-            or kv_connector is not None
-        ):
+        if vllm_config.cache_config.enable_prefix_caching or kv_connector is not None:
             caching_hash_fn = get_hash_fn_by_name(
                 vllm_config.cache_config.prefix_caching_hash_algo
             )
@@ -246,7 +247,7 @@ def _initialize_kv_caches(
 
         elapsed = time.time() - start
         logger.info_once(
-            ("init engine (profile, create kv cache, warmup model) took %.2f seconds"),
+            "init engine (profile, create kv cache, warmup model) took %.2f seconds",
             elapsed,
             scope="local",
         )
@@ -312,6 +313,16 @@ def log_error_detail(self, scheduler_output: SchedulerOutput):
             )
             raise err
 
+    def _log_err_callback(self, scheduler_output: SchedulerOutput):
+        """Log error details of a future that's not expected to return a result."""
+
+        def callback(f, sched_output=scheduler_output):
+            with self.log_error_detail(sched_output):
+                result = f.result()
+                assert result is None
+
+        return callback
+
     def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
         """Schedule, execute, and make output.
 
@@ -323,21 +334,17 @@ def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
         # or finished and not yet removed from the batch.
         if not self.scheduler.has_requests():
             return {}, False
-        with record_function_or_nullcontext("core step: schedule"):
-            scheduler_output = self.scheduler.schedule()
-
-        with record_function_or_nullcontext("core step: execute_model"):
-            future = self.model_executor.execute_model(scheduler_output, non_block=True)
-            grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output)
-            with self.log_error_detail(scheduler_output):
-                model_output = future.result()
-            if model_output is None:
-                model_output = self.model_executor.sample_tokens(grammar_output)
-
-        with record_function_or_nullcontext("core step: update_from_output"):
-            engine_core_outputs = self.scheduler.update_from_output(
-                scheduler_output, model_output
-            )
+        scheduler_output = self.scheduler.schedule()
+        future = self.model_executor.execute_model(scheduler_output, non_block=True)
+        grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output)
+        with self.log_error_detail(scheduler_output):
+            model_output = future.result()
+        if model_output is None:
+            model_output = self.model_executor.sample_tokens(grammar_output)
+
+        engine_core_outputs = self.scheduler.update_from_output(
+            scheduler_output, model_output
+        )
 
         return engine_core_outputs, scheduler_output.total_num_scheduled_tokens > 0
 
@@ -378,52 +385,34 @@ def step_with_batch_queue(
         model_executed = False
         deferred_scheduler_output = None
         if self.scheduler.has_requests():
-            with record_function_or_nullcontext("core step_with_batch_queue: schedule"):
-                scheduler_output = self.scheduler.schedule()
-            with record_function_or_nullcontext(
-                "core step_with_batch_queue: execute_model"
-            ):
-                exec_future = self.model_executor.execute_model(
-                    scheduler_output, non_block=True
-                )
-            model_executed = scheduler_output.total_num_scheduled_tokens > 0
+            scheduler_output = self.scheduler.schedule()
+            exec_future = self.model_executor.execute_model(
+                scheduler_output, non_block=True
+            )
+            if not self.ec_producer:
+                model_executed = scheduler_output.total_num_scheduled_tokens > 0
 
-            if scheduler_output.pending_structured_output_tokens:
-                with record_function_or_nullcontext(
-                    "core step_with_batch_queue: pending_structured_output_tokens"
-                ):
-                    # We need to defer sampling until we have processed the model output
-                    # from the prior step.
-                    deferred_scheduler_output = scheduler_output
-                    # Block-wait for execute to return
-                    # (continues running async on the GPU).
-                    with self.log_error_detail(scheduler_output):
-                        exec_result = exec_future.result()
-                    assert exec_result is None
+            if not model_executed:
+                # No sampling required (no requests scheduled).
+                future = cast(Future[ModelRunnerOutput], exec_future)
             else:
-                with record_function_or_nullcontext(
-                    "core step_with_batch_queue: get_grammar_bitmask"
-                ):
-                    # We aren't waiting for any tokens, get any grammar
-                    # output immediately.
+                exec_future.add_done_callback(self._log_err_callback(scheduler_output))
+
+                if not scheduler_output.pending_structured_output_tokens:
+                    # We aren't waiting for any tokens, get any grammar output
+                    # and sample immediately.
                     grammar_output = self.scheduler.get_grammar_bitmask(
                         scheduler_output
                     )
-                # Block-wait for execute to return (continues running async on the GPU).
-                with self.log_error_detail(scheduler_output):
-                    exec_result = exec_future.result()
-
-                if exec_result is None:
-                    with record_function_or_nullcontext(
-                        "core step_with_batch_queue: sample_tokens"
-                    ):
-                        # Call sample tokens.
-                        future = self.model_executor.sample_tokens(
-                            grammar_output, non_block=True
-                        )
+                    future = self.model_executor.sample_tokens(
+                        grammar_output, non_block=True
+                    )
                 else:
-                    # No sampling required (e.g. all requests finished).
-                    future = cast(Future[ModelRunnerOutput], exec_future)
+                    # We need to defer sampling until we have processed the model output
+                    # from the prior step.
+                    deferred_scheduler_output = scheduler_output
+
+            if not deferred_scheduler_output:
                 # Add this step's future to the queue.
                 batch_queue.appendleft((future, scheduler_output))
                 if (
@@ -440,34 +429,27 @@ def step_with_batch_queue(
             # only be called when the scheduler contains requests or the queue
             # is non-empty.
            return None, False
-        with record_function_or_nullcontext("core step_with_batch_queue: model_output"):
-            # Block until the next result is available.
-            future, scheduler_output = batch_queue.pop()
-            with self.log_error_detail(scheduler_output):
-                model_output = future.result()
-        with record_function_or_nullcontext(
-            "core step_with_batch_queue: update_from_output"
-        ):
-            engine_core_outputs = self.scheduler.update_from_output(
-                scheduler_output, model_output
-            )
+
+        # Block until the next result is available.
+        future, scheduler_output = batch_queue.pop()
+        with self.log_error_detail(scheduler_output):
+            model_output = future.result()
+
+        engine_core_outputs = self.scheduler.update_from_output(
+            scheduler_output, model_output
+        )
 
         # NOTE(nick): We can either handle the deferred tasks here or save
         # in a field and do it immediately once step_with_batch_queue is
         # re-called. The latter slightly favors TTFT over TPOT/throughput.
         if deferred_scheduler_output:
-            with record_function_or_nullcontext(
-                "core step_with_batch_queue: deferred_scheduler_output"
-            ):
-                # We now have the tokens needed to compute the bitmask for the
-                # deferred request. Get the bitmask and call sample tokens.
-                grammar_output = self.scheduler.get_grammar_bitmask(
-                    deferred_scheduler_output
-                )
-                future = self.model_executor.sample_tokens(
-                    grammar_output, non_block=True
-                )
-                batch_queue.appendleft((future, deferred_scheduler_output))
+            # We now have the tokens needed to compute the bitmask for the
+            # deferred request. Get the bitmask and call sample tokens.
+            grammar_output = self.scheduler.get_grammar_bitmask(
+                deferred_scheduler_output
+            )
+            future = self.model_executor.sample_tokens(grammar_output, non_block=True)
+            batch_queue.appendleft((future, deferred_scheduler_output))
 
         return engine_core_outputs, model_executed
 
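For reference, the new _log_err_callback above swaps a blocking exec_future.result() for a completion callback, so a failure inside execute_model still surfaces even though the successful result is discarded (it is expected to be None). Below is a minimal standalone sketch of that pattern using concurrent.futures; the log_error_detail context manager and run_batch task are illustrative stand-ins, not vLLM's API.

from concurrent.futures import Future, ThreadPoolExecutor
from contextlib import contextmanager


@contextmanager
def log_error_detail(tag: str):
    # Toy stand-in for EngineCore.log_error_detail: annotate the failure, then re-raise.
    try:
        yield
    except Exception:
        print(f"error while executing batch {tag!r}")
        raise


def make_err_callback(tag: str):
    """Build a done-callback for a future whose success value should be None."""

    def callback(f: Future, tag=tag):
        with log_error_detail(tag):
            result = f.result()  # re-raises the task's exception, if any
            assert result is None

    return callback


def run_batch(tag: str) -> None:
    # Pretend model-execution task: returns nothing on success.
    if tag == "bad":
        raise RuntimeError("simulated execute_model failure")


with ThreadPoolExecutor(max_workers=1) as pool:
    ok = pool.submit(run_batch, "good")
    ok.add_done_callback(make_err_callback("good"))  # silent: result is None
    bad = pool.submit(run_batch, "bad")
    bad.add_done_callback(make_err_callback("bad"))  # logs the failure detail

The callback approach matters in the deferred-sampling branch, which in this diff no longer block-waits on exec_future before returning control to the batch queue.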