[bugfix] gsa: fix bug when the number of running requests exceeds 30 (#195)

HaoLi980405 · HelenJia98 · zbb200819 · web-flow · commit b7869e3dffb8 · 2025-09-20T21:34:08.000+08:00
* increase q cache num

* handle preemption (抢占) bug

* ci

---------

Co-authored-by: xujia <42216276@qq.com>
Co-authored-by: zbb200819 <1130072360@qq.com>
diff --git a/ucm/ucm_sparse/gsa.py b/ucm/ucm_sparse/gsa.py
@@ -125,6 +125,8 @@ def add_req_new(
             True if self.num_prompt_tokens > SEG_PREFILL_THRESHOLD else False
         )
         self._init_slot(offset)
+        if len(self.repre_slot_mapping) > len(self.blocks):
+            self.repre_slot_mapping = self.repre_slot_mapping[: len(self.blocks)]
 
     def updata_req_state(
         self, num_scheduled_tokens, add_req_state, index_in_batch
@@ -147,6 +149,8 @@ def updata_req_state(
             else:
                 self.calc_block_table = []
                 self.calc_repre_slot_mapping = []
+        if len(self.repre_slot_mapping) > len(self.blocks):
+            self.repre_slot_mapping = self.repre_slot_mapping[: len(self.blocks)]
 
     def _get_sparse_and_free_block(self):
         if self.num_prompt_tokens == self.num_computed_tokens:
@@ -400,7 +404,7 @@ def init_topk_cal(
         max_model_len = vllm_config.model_config.max_model_len
         max_num_seqs = vllm_config.scheduler_config.max_num_seqs
         self.gsa_offload_ops = gsa_offload_ops.CalKpreAndTopk(
-            self.layer_num, block_size, MAX_BS, att_num_heads, head_size
+            self.layer_num, block_size, max_num_seqs, att_num_heads, head_size
         )
         self.gsa_offload_ops.set_kpre_method_param(
             int(max_model_len / block_size) * MAX_BS, kv_num_heads, 1