@@ -125,6 +125,8 @@ def add_req_new(
125125 True if self .num_prompt_tokens > SEG_PREFILL_THRESHOLD else False
126126 )
127127 self ._init_slot (offset )
128+ if len (self .repre_slot_mapping ) > len (self .blocks ):
129+ self .repre_slot_mapping = self .repre_slot_mapping [: len (self .blocks )]
128130
129131 def updata_req_state (
130132 self , num_scheduled_tokens , add_req_state , index_in_batch
@@ -147,6 +149,8 @@ def updata_req_state(
147149 else :
148150 self .calc_block_table = []
149151 self .calc_repre_slot_mapping = []
152+ if len (self .repre_slot_mapping ) > len (self .blocks ):
153+ self .repre_slot_mapping = self .repre_slot_mapping [: len (self .blocks )]
150154
151155 def _get_sparse_and_free_block (self ):
152156 if self .num_prompt_tokens == self .num_computed_tokens :
@@ -400,7 +404,7 @@ def init_topk_cal(
400404 max_model_len = vllm_config .model_config .max_model_len
401405 max_num_seqs = vllm_config .scheduler_config .max_num_seqs
402406 self .gsa_offload_ops = gsa_offload_ops .CalKpreAndTopk (
403- self .layer_num , block_size , MAX_BS , att_num_heads , head_size
407+ self .layer_num , block_size , max_num_seqs , att_num_heads , head_size
404408 )
405409 self .gsa_offload_ops .set_kpre_method_param (
406410 int (max_model_len / block_size ) * MAX_BS , kv_num_heads , 1
0 commit comments