@@ -3410,7 +3410,9 @@ def sample_tokens(self, grammar_output: "GrammarOutput | None") -> ModelRunnerOutput:
         for req_id in self.input_batch.req_ids[:num_reqs]:
             req_state = self.requests[req_id]
             i = self.input_batch.req_id_to_index[req_id]
-            seq_len = (req_state.num_computed_tokens + scheduler_output.num_scheduled_tokens[req_id])
+            # Cannot use num_computed_tokens + num_scheduled_tokens here
+            # as it may include rejected spec decode tokens
+            seq_len = self.input_batch.num_tokens_no_spec[i]
             token_ids = postprocessed_sampled_token_ids[i]
             num_tokens = len(token_ids)
             self.input_batch.token_ids_cpu[i, seq_len:seq_len + num_tokens] = token_ids
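Why the write offset changes: with speculative decoding, `num_scheduled_tokens` counts every draft token scheduled for verification, including the ones the sampler later rejects, so `num_computed_tokens + num_scheduled_tokens` can point past the last valid token. A toy sketch of the off-by-rejected-drafts problem, with made-up numbers (none of these values come from the actual batch):

```python
import numpy as np

# Made-up state for one request: 100 verified tokens, then a step that
# scheduled 4 tokens (1 regular + 3 eagle drafts) of which 2 drafts
# were rejected by the verifier.
num_computed_tokens = 100
num_scheduled_tokens = 4      # includes the rejected drafts
num_tokens_no_spec = 102      # sequence length counting only accepted tokens

token_ids_cpu = np.zeros((1, 256), dtype=np.int64)
sampled = [7, 8]              # tokens to append after this step

wrong = num_computed_tokens + num_scheduled_tokens  # 104: leaves a 2-token gap
right = num_tokens_no_spec                          # 102: appends contiguously
token_ids_cpu[0, right:right + len(sampled)] = sampled
```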
@@ -3996,7 +3998,16 @@ def _add_dummy_request(self,
                            scheduled_tokens,
                            is_prompt,
                            block_id=0):
-        num_blocks = round_up(total_tokens, self.block_size) // self.block_size
+        # Spec decode: blocks should include lookahead tokens (eagle)
+        total_tokens_for_blocks = total_tokens
+        if self.speculative_config and self.speculative_config.use_eagle():
+            # Reserve block space for the draft tokens to propose
+            total_tokens_for_blocks += self.speculative_config.num_speculative_tokens
+        # Cap at the max model length
+        if total_tokens_for_blocks > self.max_model_len:
+            total_tokens_for_blocks = self.max_model_len
+
+        num_blocks = round_up(total_tokens_for_blocks, self.block_size) // self.block_size
         prompt_token_ids = list(range(total_tokens))

         req_id = f'{len(requests)}'
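The block-count math in context: eagle's draft tokens live in the same block table as the real tokens, so a dummy request whose token count exactly fills its blocks would leave no room for drafts. A minimal standalone sketch of the arithmetic (`round_up` and `num_blocks_for` are hypothetical helpers re-implementing the lines above, not the runner's API):

```python
def round_up(x: int, multiple: int) -> int:
    return (x + multiple - 1) // multiple * multiple

def num_blocks_for(total_tokens: int, block_size: int,
                   num_speculative_tokens: int, max_model_len: int) -> int:
    # Reserve block space for eagle drafts, capped at the model length.
    total_for_blocks = min(total_tokens + num_speculative_tokens, max_model_len)
    return round_up(total_for_blocks, block_size) // block_size

# 96 tokens fill exactly 3 blocks of 32; 3 draft slots spill into a
# 4th block, which is precisely the case the patch accounts for.
assert num_blocks_for(96, 32, 0, 4096) == 3
assert num_blocks_for(96, 32, 3, 4096) == 4
```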
@@ -4075,14 +4086,20 @@ def _add_dummy_unified_request(self, requests, is_prompt, is_unique, block_num,
         requests.append(req)
         scheduled_tokens[req_id] = num_scheduled_tokens

-    @staticmethod
-    def _generate_seq_lengths(num_samples, num_blocks, block_size):
+    def _generate_seq_lengths(self, num_samples, num_blocks, block_size):
         assert num_samples <= num_blocks
         blocks = [num_blocks // num_samples] * num_samples
         missing_blocks = num_blocks - sum(blocks)
         for i in range(missing_blocks):
             blocks[i] += 1
-        seq_lengths = [b * block_size - 1 for b in blocks]
+
+        # Leave space for the output token and draft tokens to propose
+        num_lookahead_tokens = 1
+        if self.speculative_config and self.speculative_config.use_eagle():
+            # Reserve token space for the draft tokens to propose;
+            # the draft tokens for eagle consume block table space
+            num_lookahead_tokens += self.speculative_config.num_speculative_tokens
+        seq_lengths = [b * block_size - num_lookahead_tokens for b in blocks]
         return seq_lengths

     def distribute_sum_evenly(self, total_sum, max_length):
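Worked through with concrete numbers, the new `_generate_seq_lengths` logic distributes blocks evenly and then trims each length so the sampled output token plus any eagle drafts still fit inside the allocated blocks. A self-contained sketch (the function below is a hypothetical flattened copy of the method, with the speculative config reduced to a plain argument):

```python
def generate_seq_lengths(num_samples, num_blocks, block_size,
                         num_speculative_tokens=0):
    # Spread num_blocks as evenly as possible over num_samples sequences.
    assert num_samples <= num_blocks
    blocks = [num_blocks // num_samples] * num_samples
    for i in range(num_blocks - sum(blocks)):
        blocks[i] += 1
    # One slot for the output token, plus one per eagle draft token.
    num_lookahead_tokens = 1 + num_speculative_tokens
    return [b * block_size - num_lookahead_tokens for b in blocks]

# 10 blocks over 3 sequences -> [4, 3, 3] blocks; with 2 eagle drafts
# each length is trimmed by 3 tokens instead of the old fixed 1.
print(generate_seq_lengths(3, 10, 128, num_speculative_tokens=2))
# [509, 381, 381]
```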
@@ -4222,6 +4239,12 @@ def _prepare_dummy_scenario(self, prompt_cfg, decode_cfg):
                                               prompt_num_blocks)
         for _ in range(prompt_bs):
             for tokens, context_len in zip(prompt_total_tokens, prompt_num_context_blocks):
+                if self.speculative_config and self.speculative_config.use_eagle():
+                    # Leave block space for the draft tokens to propose;
+                    # the draft tokens for eagle consume block table space
+                    num_speculative_tokens = self.speculative_config.num_speculative_tokens
+                    tokens -= num_speculative_tokens
+                    prompt_query_len -= num_speculative_tokens
                 self._add_dummy_request(requests,
                                         scheduled_tokens,
                                         num_computed_tokens=(context_len * self.block_size),
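The same reservation applies to prompt warmup: shrinking both the token count and the query length by `num_speculative_tokens` keeps prompt plus drafts within the blocks the dummy request will allocate. A tiny numeric trace under assumed values (block size, token counts, and draft count are all made up):

```python
block_size = 128
tokens, prompt_query_len = 256, 256   # would exactly fill 2 blocks
num_speculative_tokens = 3            # eagle drafts to leave room for

tokens -= num_speculative_tokens             # 253
prompt_query_len -= num_speculative_tokens   # 253
assert tokens + num_speculative_tokens <= 2 * block_size
```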