@@ -138,12 +138,12 @@ def __init__(self,
         # schedule and execute batches, and is required by pipeline parallelism
         # to eliminate pipeline bubbles.
         self.batch_queue_size = self.model_executor.max_concurrent_batches
-        self.batch_queue: Optional[queue.Queue[tuple[Future[ModelRunnerOutput],
-                                                     SchedulerOutput]]] = None
+        self.batch_queue: Optional[deque[tuple[Future[ModelRunnerOutput],
+                                               SchedulerOutput]]] = None
         if self.batch_queue_size > 1:
             logger.info("Batch queue is enabled with size %d",
                         self.batch_queue_size)
-            self.batch_queue = queue.Queue(self.batch_queue_size)
+            self.batch_queue = deque(maxlen=self.batch_queue_size)

         self.request_block_hasher: Optional[Callable[[Request],
                                                      list[BlockHash]]] = None
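
The switch from `queue.Queue` to `collections.deque` is what enables the peek in `step_with_batch_queue` below: a deque can be indexed (`batch_queue[-1]`) to inspect the oldest in-flight batch without dequeuing it, whereas `queue.Queue` exposes no safe peek. Here is a minimal standalone sketch of the pattern, using stand-in futures and labels rather than vLLM's real types:

```python
# Minimal sketch of the deque-as-bounded-FIFO pattern: appendleft enqueues
# the newest batch, pop dequeues the oldest, and [-1] peeks at the oldest
# entry without removing it. Names here are illustrative only.
from collections import deque
from concurrent.futures import Future

batch_queue: deque[tuple[Future, str]] = deque(maxlen=4)

# Enqueue two in-flight "batches" (a Future paired with a label).
for name in ("batch-0", "batch-1"):
    fut: Future = Future()
    batch_queue.appendleft((fut, name))

# Peek at the oldest in-flight batch without dequeuing it.
oldest_future, oldest_name = batch_queue[-1]
print(oldest_name, oldest_future.done())  # batch-0 False

# Dequeue the oldest batch once we decide to block on its result.
future, name = batch_queue.pop()
future.set_result("output")
print(name, future.result())  # batch-0 output
```

One trade-off: `deque(maxlen=n)` silently discards from the far end on overflow instead of blocking like `queue.Queue(n)`, which is presumably why the new code asserts `len(batch_queue) < self.batch_queue_size` before appending.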
@@ -319,41 +319,43 @@ def step_with_batch_queue(
         batch in the job queue is finished.
         3. Update the scheduler from the output.
         """
-        assert self.batch_queue is not None
+        batch_queue = self.batch_queue
+        assert batch_queue is not None

-        engine_core_outputs = None
-        scheduler_output = None
         # Try to schedule a new batch if the batch queue is not full, but
         # the scheduler may return an empty batch if all requests are scheduled.
         # Note that this is not blocking.
-        if not self.batch_queue.full():
-            scheduler_output = self.scheduler.schedule()
-            if scheduler_output.total_num_scheduled_tokens > 0:
-                future = self.model_executor.execute_model(scheduler_output)
-                self.batch_queue.put_nowait(
-                    (future, scheduler_output))  # type: ignore
-
-        scheduled_batch = (scheduler_output is not None
-                           and scheduler_output.total_num_scheduled_tokens > 0)
-
-        # If no more requests can be scheduled and the job queue is not empty,
-        # block until the first batch in the job queue is finished.
-        # TODO(comaniac): Ideally we should peek the first batch in the
-        # job queue to check if it's finished before scheduling a new batch,
-        # but peeking the first element in a queue is not thread-safe,
-        # so we need more work.
-        if not scheduled_batch and not self.batch_queue.empty():
-            future, scheduler_output = self.batch_queue.get_nowait()
+        assert len(batch_queue) < self.batch_queue_size

-            # Blocking until the first result is available.
-            model_output = self.execute_model_with_error_logging(
-                lambda _: future.result(), scheduler_output)
+        model_executed = False
+        if self.scheduler.has_requests():
+            scheduler_output = self.scheduler.schedule()
+            future = self.model_executor.execute_model(scheduler_output)
+            batch_queue.appendleft(
+                (future, scheduler_output))  # type: ignore[arg-type]
+
+            model_executed = scheduler_output.total_num_scheduled_tokens > 0
+            if model_executed and len(batch_queue) < self.batch_queue_size \
+                    and not batch_queue[-1][0].done():
+                # Don't block on next worker response unless the queue is full
+                # or there are no more requests to schedule.
+                return None, True
+
+        elif not batch_queue:
+            # Queue is empty. We should not reach here since this method should
+            # only be called when the scheduler contains requests or the queue
+            # is non-empty.
+            return None, False
+
+        # Block until the next result is available.
+        future, scheduler_output = batch_queue.pop()
+        model_output = self.execute_model_with_error_logging(
+            lambda _: future.result(), scheduler_output)

-            self.batch_queue.task_done()
-            engine_core_outputs = (self.scheduler.update_from_output(
-                scheduler_output, model_output))
+        engine_core_outputs = self.scheduler.update_from_output(
+            scheduler_output, model_output)

-        return engine_core_outputs, scheduled_batch
+        return engine_core_outputs, model_executed

     def shutdown(self):
         self.structured_output_manager.clear_backend()
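
The reshaped control flow amounts to: keep filling the pipeline while there is room and the oldest batch is still running, and only block on the oldest result when the queue is full, the scheduler is idle, or the oldest batch has already finished. Below is a standalone sketch of that loop, with a thread pool standing in for the model executor (`run_batch`, `work_items`, and `QUEUE_SIZE` are illustrative stand-ins, not vLLM APIs):

```python
# Standalone sketch of the pipelining control flow in step_with_batch_queue,
# with a thread pool playing the role of the model executor.
import time
from collections import deque
from concurrent.futures import Future, ThreadPoolExecutor

QUEUE_SIZE = 2
executor = ThreadPoolExecutor(max_workers=QUEUE_SIZE)
batch_queue: deque[tuple[Future, int]] = deque(maxlen=QUEUE_SIZE)
work_items = list(range(5))

def run_batch(batch_id: int) -> str:
    time.sleep(0.01)  # pretend to run the model
    return f"output-{batch_id}"

while work_items or batch_queue:
    if work_items:
        batch_id = work_items.pop(0)
        batch_queue.appendleft((executor.submit(run_batch, batch_id), batch_id))
        # Keep filling the pipeline while there is room and the oldest
        # batch has not finished yet; otherwise fall through and block.
        if len(batch_queue) < QUEUE_SIZE and not batch_queue[-1][0].done():
            continue
    # Block on the oldest in-flight batch (FIFO order keeps outputs ordered).
    future, batch_id = batch_queue.pop()
    print(batch_id, future.result())
```

Popping from the right while appending on the left keeps strict FIFO order, so outputs are applied back to the scheduler in the same order the batches were scheduled.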
@@ -388,7 +390,7 @@ def is_sleeping(self) -> bool:
         return self.model_executor.is_sleeping

     def execute_dummy_batch(self):
-        self.model_executor.collective_rpc("execute_dummy_batch")
+        self.model_executor.execute_dummy_batch()

     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_executor.add_lora(lora_request)
@@ -733,7 +735,8 @@ def _process_input_queue(self):
733735 """Exits when an engine step needs to be performed."""
734736
735737 waited = False
736- while not self .engines_running and not self .scheduler .has_requests ():
738+ while not self .engines_running and not self .scheduler .has_requests () \
739+ and not self .batch_queue :
737740 if logger .isEnabledFor (DEBUG ) and self .input_queue .empty ():
738741 logger .debug ("EngineCore waiting for work." )
739742 waited = True
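
The extra `and not self.batch_queue` term matters because, with the queue-driven flow above, the engine may still have in-flight batches to drain even when the scheduler holds no requests; the wait loop must wake up to collect those outputs. A tiny sketch of the expanded predicate (field names mirror the ones above; this is illustrative only):

```python
# Illustrative sketch: the loop should keep waiting only while there is
# truly nothing to do. Without the batch_queue term, a non-empty queue
# with an idle scheduler could leave finished batches undrained.
from collections import deque
from concurrent.futures import Future

def should_keep_waiting(engines_running: bool, has_requests: bool,
                        batch_queue: deque) -> bool:
    return not engines_running and not has_requests and not batch_queue

pending: deque[Future] = deque([Future()])
# A queued in-flight batch now forces the loop to exit and step the engine.
assert not should_keep_waiting(False, False, pending)
```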