
Commit b7869e3

Authored by HaoLi980405, HelenJia98, and zbb200819
[bugfix]gsa fix running reqs exceed 30 bug (#195)
* increase q cache num
* deal with preemption bug
* ci

---------
Co-authored-by: xujia <42216276@qq.com>
Co-authored-by: zbb200819 <1130072360@qq.com>
1 parent aef5f3a commit b7869e3

File tree

1 file changed: +5 −1 lines


ucm/ucm_sparse/gsa.py

Lines changed: 5 additions & 1 deletion
@@ -125,6 +125,8 @@ def add_req_new(
             True if self.num_prompt_tokens > SEG_PREFILL_THRESHOLD else False
         )
         self._init_slot(offset)
+        if len(self.repre_slot_mapping) > len(self.blocks):
+            self.repre_slot_mapping = self.repre_slot_mapping[: len(self.blocks)]
 
     def updata_req_state(
         self, num_scheduled_tokens, add_req_state, index_in_batch
@@ -147,6 +149,8 @@ def updata_req_state(
         else:
             self.calc_block_table = []
             self.calc_repre_slot_mapping = []
+        if len(self.repre_slot_mapping) > len(self.blocks):
+            self.repre_slot_mapping = self.repre_slot_mapping[: len(self.blocks)]
 
     def _get_sparse_and_free_block(self):
         if self.num_prompt_tokens == self.num_computed_tokens:
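The two hunks above add the same guard in both code paths. Below is a minimal, self-contained sketch of that guard; the field names blocks and repre_slot_mapping mirror the diff, while the surrounding class and setup are illustrative assumptions, not code from the repository. The idea, per the commit message, is that after a request is preempted and rescheduled the representation-slot mapping can end up longer than the request's block table, so it is truncated to match.

# Sketch only: names mirror the diff, everything else is assumed for illustration.
class ReqStateSketch:
    def __init__(self, blocks, repre_slot_mapping):
        self.blocks = blocks                          # block table for this request
        self.repre_slot_mapping = repre_slot_mapping  # one representation slot per block

    def clamp_repre_slots(self):
        # Drop trailing slots so the mapping never outgrows the block table,
        # e.g. after a preempted request comes back with fewer blocks.
        if len(self.repre_slot_mapping) > len(self.blocks):
            self.repre_slot_mapping = self.repre_slot_mapping[: len(self.blocks)]


state = ReqStateSketch(blocks=[0, 1, 2], repre_slot_mapping=[10, 11, 12, 13, 14])
state.clamp_repre_slots()
print(state.repre_slot_mapping)  # [10, 11, 12]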
@@ -400,7 +404,7 @@ def init_topk_cal(
         max_model_len = vllm_config.model_config.max_model_len
         max_num_seqs = vllm_config.scheduler_config.max_num_seqs
         self.gsa_offload_ops = gsa_offload_ops.CalKpreAndTopk(
-            self.layer_num, block_size, MAX_BS, att_num_heads, head_size
+            self.layer_num, block_size, max_num_seqs, att_num_heads, head_size
         )
         self.gsa_offload_ops.set_kpre_method_param(
             int(max_model_len / block_size) * MAX_BS, kv_num_heads, 1
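The last hunk sizes the top-k calculator by the scheduler's configured max_num_seqs instead of the hard-coded MAX_BS, which matches the commit title about running requests exceeding 30. The sketch below only illustrates why a fixed cap fails once the running batch grows past it; the value MAX_BS = 30 and the helper are assumptions for illustration and are not taken from gsa.py.

# Sketch only: MAX_BS = 30 and make_topk_buffer are hypothetical.
MAX_BS = 30        # hard-coded cap (old behaviour)
max_num_seqs = 64  # value read from the vLLM scheduler config (new behaviour)

def make_topk_buffer(batch_cap: int, topk: int):
    # One row of top-k block indices per running request slot.
    return [[0] * topk for _ in range(batch_cap)]

buf_old = make_topk_buffer(MAX_BS, topk=8)
buf_new = make_topk_buffer(max_num_seqs, topk=8)

req_index = 31  # a running request beyond the old cap
print(req_index < len(buf_old))  # False: the fixed-size buffer has no slot for it
print(req_index < len(buf_new))  # True: sizing by max_num_seqs covers all running reqs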
