
Commit 2b997ab

Spec decode warmup support
Signed-off-by: Chen Haifeng <haifeng.chen@intel.com>
1 parent e18a075

1 file changed: +28 -5 lines changed

vllm_gaudi/v1/worker/hpu_model_runner.py

@@ -3410,7 +3410,9 @@ def sample_tokens(self, grammar_output: "GrammarOutput | None") -> ModelRunnerOutput
         for req_id in self.input_batch.req_ids[:num_reqs]:
             req_state = self.requests[req_id]
             i = self.input_batch.req_id_to_index[req_id]
-            seq_len = (req_state.num_computed_tokens + scheduler_output.num_scheduled_tokens[req_id])
+            # Cannot use num_computed_tokens + num_scheduled_tokens here
+            # as it may include rejected spec decode tokens
+            seq_len = self.input_batch.num_tokens_no_spec[i]
             token_ids = postprocessed_sampled_token_ids[i]
             num_tokens = len(token_ids)
             self.input_batch.token_ids_cpu[i, seq_len:seq_len + num_tokens] = token_ids
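For intuition, a minimal sketch with made-up numbers of why num_computed_tokens + num_scheduled_tokens can overshoot the true sequence length once draft tokens are rejected, while num_tokens_no_spec already excludes them:

# Hypothetical per-request numbers; only the relationships matter.
num_computed_tokens = 100   # tokens already in the KV cache
num_scheduled_tokens = 4    # e.g. 1 bonus token + 3 eagle draft tokens this step
num_rejected = 2            # drafts the target model did not accept

naive_seq_len = num_computed_tokens + num_scheduled_tokens  # 104: counts rejected drafts
true_seq_len = naive_seq_len - num_rejected                 # 102: what num_tokens_no_spec tracks

# Writing the sampled tokens at naive_seq_len would leave a 2-slot gap of stale ids.
assert true_seq_len < naive_seq_len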
@@ -3996,7 +3998,16 @@ def _add_dummy_request(self,
                           scheduled_tokens,
                           is_prompt,
                           block_id=0):
-        num_blocks = round_up(total_tokens, self.block_size) // self.block_size
+        # Spec decode: blocks should include lookahead tokens (eagle)
+        total_tokens_for_blocks = total_tokens
+        if self.speculative_config and self.speculative_config.use_eagle():
+            # Consider the block space for draft tokens to propose
+            total_tokens_for_blocks += self.speculative_config.num_speculative_tokens
+            # Check the limit of the max model length
+            if total_tokens_for_blocks > self.max_model_len:
+                total_tokens_for_blocks = self.max_model_len
+
+        num_blocks = round_up(total_tokens_for_blocks, self.block_size) // self.block_size
         prompt_token_ids = list(range(total_tokens))

         req_id = f'{len(requests)}'
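As a sanity check, a minimal standalone sketch of the block computation above; round_up mirrors the helper used in the diff, and the block size, model length, and speculative-token count are assumed values:

def round_up(x: int, multiple: int) -> int:
    # Round x up to the next multiple.
    return (x + multiple - 1) // multiple * multiple

block_size = 128
max_model_len = 2048
num_speculative_tokens = 3

total_tokens = 1024                                              # warmup request length
total_tokens_for_blocks = total_tokens + num_speculative_tokens  # eagle lookahead
total_tokens_for_blocks = min(total_tokens_for_blocks, max_model_len)

num_blocks = round_up(total_tokens_for_blocks, block_size) // block_size
print(num_blocks)  # 9: the 1027 tokens spill one block past 8 * 128 = 1024

Without the lookahead the same request would get 8 blocks, so the drafts proposed during warmup would have no block-table space.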
@@ -4075,14 +4086,20 @@ def _add_dummy_unified_request(self, requests, is_prompt, is_unique, block_num,
         requests.append(req)
         scheduled_tokens[req_id] = num_scheduled_tokens

-    @staticmethod
-    def _generate_seq_lengths(num_samples, num_blocks, block_size):
+    def _generate_seq_lengths(self, num_samples, num_blocks, block_size):
         assert num_samples <= num_blocks
         blocks = [num_blocks // num_samples] * num_samples
         missing_blocks = num_blocks - sum(blocks)
         for i in range(missing_blocks):
             blocks[i] += 1
-        seq_lengths = [b * block_size - 1 for b in blocks]
+
+        # Leave space for the output token and draft tokens to propose
+        num_lookahead_tokens = 1
+        if self.speculative_config and self.speculative_config.use_eagle():
+            # Consider the token space for draft tokens to propose
+            # The draft tokens for eagle consume block table space
+            num_lookahead_tokens += self.speculative_config.num_speculative_tokens
+        seq_lengths = [b * block_size - num_lookahead_tokens for b in blocks]
         return seq_lengths

     def distribute_sum_evenly(self, total_sum, max_length):
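A self-contained sketch of the reworked helper, with the eagle branch folded into plain parameters; use_eagle and num_speculative_tokens stand in for the speculative_config fields:

def generate_seq_lengths(num_samples, num_blocks, block_size,
                         use_eagle=False, num_speculative_tokens=0):
    assert num_samples <= num_blocks
    blocks = [num_blocks // num_samples] * num_samples
    for i in range(num_blocks - sum(blocks)):  # spread the remainder blocks
        blocks[i] += 1
    # Reserve room for the sampled output token, plus eagle draft tokens,
    # so the lookahead never overflows the last allocated block.
    num_lookahead_tokens = 1 + (num_speculative_tokens if use_eagle else 0)
    return [b * block_size - num_lookahead_tokens for b in blocks]

print(generate_seq_lengths(2, 5, 128))  # [383, 255]: one slot kept per sequence
print(generate_seq_lengths(2, 5, 128, use_eagle=True,
                           num_speculative_tokens=3))  # [380, 252]: four slots kept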
@@ -4222,6 +4239,12 @@ def _prepare_dummy_scenario(self, prompt_cfg, decode_cfg):
                                                              prompt_num_blocks)
             for _ in range(prompt_bs):
                 for tokens, context_len in zip(prompt_total_tokens, prompt_num_context_blocks):
+                    if self.speculative_config and self.speculative_config.use_eagle():
+                        # Leave the block space for draft tokens to propose
+                        # The draft tokens for eagle consume block table space
+                        num_speculative_tokens = self.speculative_config.num_speculative_tokens
+                        tokens -= num_speculative_tokens
+                        prompt_query_len -= num_speculative_tokens
                     self._add_dummy_request(requests,
                                             scheduled_tokens,
                                             num_computed_tokens=(context_len * self.block_size),
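Illustrative arithmetic for the subtraction above, assuming block_size=128 and 3 eagle draft tokens; without it, a block-aligned dummy prompt plus its proposed drafts would overflow the blocks allocated for it:

block_size = 128
num_speculative_tokens = 3
tokens = 512  # hypothetical block-aligned dummy prompt: exactly 4 blocks

# 512 + 3 = 515 slots needed, but 4 blocks hold only 512.
assert tokens + num_speculative_tokens > (tokens // block_size) * block_size

tokens -= num_speculative_tokens  # 509: drafts now fit inside the same 4 blocks
assert tokens + num_speculative_tokens <= 4 * block_size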
