
Commit aef5f3a

HaoLi980405, zbb200819, and HelenJia98 authored

[bugfix]gsa fix reslotmapping bug (#194)

* deal bug
* gpu kpre and bug fixed
* deal bug
* deal bug
* deal bug
* clean code
* CI
* ci

Co-authored-by: zbb200819 <1130072360@qq.com>
Co-authored-by: xujia <42216276@qq.com>

1 parent 3ac5926 commit aef5f3a

File tree: 5 files changed, +70 −47 lines

examples/offline_inference.py (4 additions, 4 deletions)
```diff
@@ -27,7 +27,7 @@ def build_llm_with_uc(module_path: str, name: str, model: str):
         kv_connector_extra_config={
             "ucm_connector_name": "UcmDram",
             "ucm_connector_config": {
-                "max_cache_size": 5368709120,
+                "max_cache_size": 53687091200,
                 "kv_block_size": 262144,
             },
             "ucm_sparse_method": "GSA",
@@ -37,8 +37,8 @@ def build_llm_with_uc(module_path: str, name: str, model: str):
     llm_args = EngineArgs(
         model=model,
         kv_transfer_config=ktc,
-        max_model_len=8000,
-        gpu_memory_utilization=0.8,
+        max_model_len=40960,
+        gpu_memory_utilization=0.87,
         block_size=128,
     )

@@ -81,7 +81,7 @@ def main():
         "Write a detailed letter to the leaders of Earth, explaining the most urgent global issue of the 21st "
         "century, the root sauses behind it, and a set of scientifically grounded, morally sound, and globally "
         "cooperative solutions that transcend culturak and national boundaries. Include both immediate actions "
-        "and long-term strategies."
+        "and long-term strategies." * 200
     ]

     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=100)
```
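For scale, both cache-size constants are exact GiB multiples, so the connector's DRAM cache grows tenfold to accommodate the much longer context and the 200x-repeated prompt. A quick sanity check (illustrative only):

```python
# Sanity check on the constants in the diff: both are exact GiB multiples,
# so the DRAM connector's cache grows from 5 GiB to 50 GiB.
GiB = 1024**3
assert 5368709120 == 5 * GiB     # old max_cache_size
assert 53687091200 == 50 * GiB   # new max_cache_size
```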

ucm/csrc/gsaoffloadops/src/cal_kpre_and_topk.cpp (9 additions, 9 deletions)
```diff
@@ -32,10 +32,10 @@ void CalKpreAndTopk::SetKpreMethodParam(uint32_t maxBlockNum, uint32_t numHeads,
     m_kNumHeads = numHeads;
     m_numKpre = numKpre;
     auto optionsForKCache = torch::TensorOptions().device("cpu").dtype(torch::kFloat32);
-    for (uint32_t i = 0; i < m_layerNum; i++) {
-        torch::Tensor layerKCache = torch::zeros({maxBlockNum, m_kNumHeads, m_blockSize, m_headSize}, optionsForKCache);
-        m_kCache.push_back(layerKCache);
-    }
+    // for (uint32_t i = 0; i < m_layerNum; i++) {
+    //     torch::Tensor layerKCache = torch::zeros({maxBlockNum, m_kNumHeads, m_blockSize, m_headSize}, optionsForKCache);
+    //     m_kCache.push_back(layerKCache);
+    // }
 }

 void CalKpreAndTopk::SetKpreCache(std::vector<torch::Tensor>& kpreCache)
@@ -152,10 +152,10 @@ void CalKpreAndTopk::CopyData()
         }
         SetTopkDataReady(curReq.layerId);
     } else {
-        torch::Tensor kNeeded = curReq.srcTensor.index({curReq.ids}).cpu();
-        torch::Tensor kCache = kNeeded.to(torch::kFloat32).permute({0, 2, 1, 3});
-        auto targetTensor = m_kCache[curReq.layerId].slice(0, 0, curReq.ids.size(0));
-        targetTensor.copy_(kCache);
+        // torch::Tensor kNeeded = curReq.srcTensor.index({curReq.ids}).cpu();
+        // torch::Tensor kCache = kNeeded.to(torch::kFloat32).permute({0, 2, 1, 3});
+        // auto targetTensor = m_kCache[curReq.layerId].slice(0, 0, curReq.ids.size(0));
+        // targetTensor.copy_(kCache);
         SetKpreDataReady(curReq.layerId);
     }
     if (!m_running) {
@@ -195,7 +195,7 @@ void CalKpreAndTopk::CalForOneLayer(uint32_t curLayer)
 {
     if (m_needCalPre) {
         while(!m_kReady[curLayer].load(std::memory_order_acquire));
-        CalculateKpre(curLayer);
+        // CalculateKpre(curLayer);
     }
     if (m_needCalTopk) {
         while(!m_qReady[curLayer].load(std::memory_order_acquire));
```

ucm/csrc/gsaoffloadops/src/select_topk_block.cpp (1 addition, 1 deletion)
```diff
@@ -5,7 +5,6 @@
 #include <cmath>
 #include "select_topk_block.h"

-
 namespace SelectTopkBlock {
 #define OMP_THREAD_NUM 16u

@@ -48,6 +47,7 @@ void TopkBlockSelector::TopKImpl(const float* scores, uint32_t numScores, uint32
     for (uint32_t i = 0; i < endWindow_; ++i) {
         topkIndices[idx++] = numScores - endWindow_ + i;
     }
+    std::sort(topkIndices, topkIndices + k);
 }

 float TopkBlockSelector::ComputeBlockScore(float* qMean, const float* blockBase,
```
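The added `std::sort` puts the selected indices back into ascending block order after the unordered top-k fill, so downstream block tables stay monotonic. A minimal NumPy sketch of the same pattern (function name and values are illustrative, not from the repo):

```python
import numpy as np

def topk_sorted(scores: np.ndarray, k: int) -> np.ndarray:
    """Illustrative only: unordered top-k via argpartition, then sorted ids."""
    idx = np.argpartition(scores, -k)[-k:]  # k best blocks, arbitrary order
    return np.sort(idx)                     # ascending, as std::sort ensures

scores = np.array([0.1, 0.9, 0.3, 0.8, 0.2])
print(topk_sorted(scores, 2))  # [1 3]
```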

ucm/ucm_sparse/gsa.py (55 additions, 32 deletions)
```diff
@@ -117,14 +117,14 @@ def add_req_new(
     ) -> None:
         self.blocks = [x for x in add_req_state.block_ids[0]]
         self.index_in_batch = index_in_batch
-        self._init_slot(offset)
         self.num_computed_tokens = add_req_state.num_computed_tokens
         self.num_scheduled_tokens = num_scheduled_tokens
         self.num_prompt_tokens = len(add_req_state.prompt_token_ids)
         self.num_output_tokens = len(add_req_state.output_token_ids)
         self.is_use_gsa = (
             True if self.num_prompt_tokens > SEG_PREFILL_THRESHOLD else False
         )
+        self._init_slot(offset)

     def updata_req_state(
         self, num_scheduled_tokens, add_req_state, index_in_batch
@@ -134,31 +134,24 @@ def updata_req_state(
         self.num_output_tokens = len(add_req_state.output_token_ids)
         self.index_in_batch = index_in_batch
         if self.stage() == SequenceStage.PREFILL:
-            if self.is_last_chunk():
-                add_blocks = [
-                    x for x in add_req_state.block_ids[0][:-1] if x not in self.blocks
-                ]
-            else:
-                add_blocks = [
-                    x for x in add_req_state.block_ids[0] if x not in self.blocks
-                ]
+            add_blocks = [x for x in add_req_state.block_ids[0] if x not in self.blocks]
             self.blocks = [x for x in add_req_state.block_ids[0]]
             self._update_slot(add_blocks)
         else:
             self._get_sparse_and_free_block()
             if len(add_req_state.block_ids[0]) != self.sparse_len:
-                add_blocks = [add_req_state.block_ids[0][-2]]
-                self._update_slot(add_blocks)
+                add_blocks = [add_req_state.block_ids[0][-1]]
                 self.blocks += [add_req_state.block_ids[0][-1]]
                 self.sparse_len = len(add_req_state.block_ids[0])
+                self._update_slot(add_blocks)
             else:
                 self.calc_block_table = []
                 self.calc_repre_slot_mapping = []

     def _get_sparse_and_free_block(self):
         if self.num_prompt_tokens == self.num_computed_tokens:
             blocks_len = len(self.blocks)
-            if self.num_prompt_tokens > SEG_PREFILL_THRESHOLD:
+            if self.num_prompt_tokens > SEG_PREFILL_THRESHOLD and PTOPK_PREFETCH_ENABLE:
                 remain_len = compute_topk_len(blocks_len)
                 if remain_len > MAX_TOPK_LEN:
                     prefetch_len = 0
@@ -176,10 +169,7 @@ def _get_sparse_and_free_block(self):
                 self.prefetch_idx = remain_blocks_idx[
                     remain_len - LOCAL_WINDOW_SZ : -LOCAL_WINDOW_SZ
                 ]
-                if PTOPK_PREFETCH_ENABLE:
-                    self.sparse_len = remain_len + prefetch_len
-                else:
-                    self.sparse_len = blocks_len
+                self.sparse_len = remain_len + prefetch_len
             else:
                 self.remain_idx = list(range(blocks_len))
                 self.prefetch_idx = []
```
```diff
@@ -190,14 +180,14 @@ def _get_sparse_and_free_block(self):
             self.prefetch_idx = None

     def _init_slot(self, offset: int) -> None:
+        self.repre_slot_mapping = list(range(len(self.blocks)))
+        self.repre_slot_mapping = [x + offset for x in self.repre_slot_mapping]
         if self.is_last_chunk():
-            self.repre_slot_mapping = list(range(len(self.blocks) - 1))
             self.calc_block_table = [x for x in self.blocks[:-1]]
+            self.calc_repre_slot_mapping = [x for x in self.repre_slot_mapping[:-1]]
         else:
-            self.repre_slot_mapping = list(range(len(self.blocks)))
             self.calc_block_table = [x for x in self.blocks]
-        self.repre_slot_mapping = [x + offset for x in self.repre_slot_mapping]
-        self.calc_repre_slot_mapping = [x for x in self.repre_slot_mapping]
+            self.calc_repre_slot_mapping = [x for x in self.repre_slot_mapping]

         value = len(self.blocks)
         one_mask = [False] * value
```
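After this change, every block, including the trailing partial one, gets an offset-adjusted representation slot up front; only the calc lists skip the last block on the final chunk. A minimal sketch of the resulting values (block ids and offset are made up):

```python
# Hypothetical values: three blocks, slot offset 100, last chunk of prefill.
blocks = [10, 11, 12]
offset = 100
repre_slot_mapping = [i + offset for i in range(len(blocks))]  # [100, 101, 102]

is_last_chunk = True
if is_last_chunk:
    calc_block_table = blocks[:-1]                     # [10, 11]: skip partial tail
    calc_repre_slot_mapping = repre_slot_mapping[:-1]  # [100, 101]
else:
    calc_block_table = blocks[:]
    calc_repre_slot_mapping = repre_slot_mapping[:]
```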
```diff
@@ -224,8 +214,20 @@ def _update_slot(
             self.include_mask.append(True)
             self.exclude_mask.append(False)
         if add_len > 0:
-            self.calc_block_table = [x for x in add_blocks]
-            self.calc_repre_slot_mapping = self.repre_slot_mapping[add_len * -1 :]
+            if self.stage() == SequenceStage.PREFILL:
+                if self.is_last_chunk():
+                    self.calc_block_table = [x for x in add_blocks[:-1]]
+                    self.calc_repre_slot_mapping = self.repre_slot_mapping[
+                        add_len * -1 : -1
+                    ]
+                else:
+                    self.calc_block_table = [x for x in add_blocks]
+                    self.calc_repre_slot_mapping = self.repre_slot_mapping[
+                        add_len * -1 :
+                    ]
+            else:
+                self.calc_block_table = [self.blocks[-1]]
+                self.calc_repre_slot_mapping = [self.repre_slot_mapping[-1]]
         else:
            self.calc_block_table = []
            self.calc_repre_slot_mapping = []
```
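The reworked branch mirrors `_init_slot`: on the last prefill chunk the partial tail block is excluded from the calc tables, while decode recomputes only the newest block/slot pair. A worked example with hypothetical values:

```python
# Hypothetical prefill step: two new blocks arrive on the last chunk.
repre_slot_mapping = [100, 101, 102, 103, 104]
add_blocks = [13, 14]
add_len = len(add_blocks)

calc_block_table = add_blocks[:-1]                         # [13]: partial tail skipped
calc_repre_slot_mapping = repre_slot_mapping[-add_len:-1]  # [103]

# Decode appends exactly one block, so only the newest pair is touched:
blocks = [10, 11, 12, 13, 14]
decode_table = [blocks[-1]]              # [14]
decode_slots = [repre_slot_mapping[-1]]  # [104]
```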
```diff
@@ -269,15 +271,20 @@ def get_model_input(
     def trans_input_tensor(self, scheduler_output: SchedulerOutput):
         calc_block_table = []
         model_input = {}
+        calc_repre_slot_mappings = []
         query_locals = [0]
         for req_id, _ in scheduler_output.num_scheduled_tokens.items():
             calc_block_table += self.gsa_stats[req_id].calc_block_table
+            calc_repre_slot_mappings += self.gsa_stats[req_id].calc_repre_slot_mapping
             query_locals.append(
                 query_locals[-1] + scheduler_output.num_scheduled_tokens[req_id]
             )
         model_input["calc_block_table"] = torch.tensor(
             calc_block_table, dtype=torch.int32, device="cpu"
         )
+        model_input["calc_repre_slot_mapping"] = torch.tensor(
+            calc_repre_slot_mappings, dtype=torch.int32, device="cpu"
+        )
         model_input["query_locals"] = query_locals
         return model_input

```
```diff
@@ -544,7 +551,7 @@ def copy_q(self, query: torch.Tensor, current_layer_id: int) -> None:
             if req_meta.stage() == SequenceStage.DECODE:
                 index_in_batch = req_meta.index_in_batch
                 ids[index_in_batch] = (
-                    self.model_input["query_locals"][index_in_batch] - 1
+                    self.model_input["query_locals"][index_in_batch + 1] - 1
                 )
                 self.gsa_q_cache[current_layer_id][index_in_batch].copy_(
                     query[ids[index_in_batch]]
```
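`query_locals` is a cumulative prefix of scheduled token counts, so a request's last query row is its own end offset minus one; indexing with `index_in_batch` alone landed on the previous request's boundary. A worked example with made-up counts:

```python
# Hypothetical batch: three requests scheduled for 5, 1, and 1 tokens.
query_locals = [0, 5, 6, 7]  # cumulative offsets, as built in trans_input_tensor

index_in_batch = 1           # the second (decode) request
last_row = query_locals[index_in_batch + 1] - 1  # 6 - 1 = 5: its own last token
old_row = query_locals[index_in_batch] - 1       # 5 - 1 = 4: previous request's token
assert (last_row, old_row) == (5, 4)
```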
```diff
@@ -560,12 +567,27 @@ def copy_q(self, query: torch.Tensor, current_layer_id: int) -> None:
     def copy_k(self, layer_name: str, forward_context: ForwardContext) -> None:
         current_layer_id = int(layer_name.split(".")[2])
         block_ids = self.model_input["calc_block_table"]
+        calc_repre_slot_mappings = self.model_input["calc_repre_slot_mapping"]
         if len(block_ids) > 0:
             attn = forward_context.no_compile_layers
-            k_needed = attn[layer_name].kv_cache[forward_context.virtual_engine][0]
-            result = self.gsa_offload_ops.add_copy_req(
-                True, current_layer_id, [], k_needed
+            key_cache_mean_out = (
+                attn[layer_name]
+                .kv_cache[forward_context.virtual_engine][0][block_ids]
+                .mean(dim=1, keepdim=True)
+                .cpu()
             )
+            self.prefetch_engine.kpre_caches[current_layer_id][
+                calc_repre_slot_mappings
+            ].copy_(key_cache_mean_out)
+            k_needed = attn[layer_name].kv_cache[forward_context.virtual_engine][0]
+            self.gsa_offload_ops.add_copy_req(True, current_layer_id, [], k_needed)
+
+        # if len(block_ids) > 0:
+        #     attn = forward_context.no_compile_layers
+        #     k_needed = attn[layer_name].kv_cache[forward_context.virtual_engine][0]
+        #     self.gsa_offload_ops.add_copy_req(
+        #         True, current_layer_id, [], k_needed
+        #     )

     def attention_begin(
         self,
```
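With the CPU staging in `cal_kpre_and_topk.cpp` commented out, the per-block key representative is now a mean taken directly on the GPU cache and written into the prefetch engine's kpre slots. A self-contained sketch of that reduction, assuming a kv-cache layout of `[num_blocks, block_size, num_heads, head_size]` (the layout and sizes here are assumptions, not stated in the diff):

```python
import torch

# Hypothetical shapes; the real layout comes from the attention backend.
num_blocks, block_size, num_heads, head_size = 8, 128, 4, 64
key_cache = torch.randn(num_blocks, block_size, num_heads, head_size)

block_ids = torch.tensor([0, 3, 5])  # plays the role of calc_block_table
kpre = key_cache[block_ids].mean(dim=1, keepdim=True).cpu()
print(kpre.shape)  # torch.Size([3, 1, 4, 64]): one representative row per block
```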
```diff
@@ -588,20 +610,20 @@ def attention_begin(

         if isinstance(forward_context.attn_metadata, dict):
             attn_metadata = forward_context.attn_metadata[layer_name]
-            block_tables = attn_metadata.block_table
         else:
             attn_metadata = forward_context.attn_metadata
-            block_tables = attn_metadata.block_tables
         if self.prefetch_engine.atb_gsa_enable:
             if torch.cuda.is_available():
-                block_tables = self.model_input["block_tables_mp"][current_layer_id]
+                attn_metadata.block_table = self.model_input["block_tables_mp"][
+                    current_layer_id
+                ]
                 attn_metadata.seq_lens = self.model_input["gsa_seq_len"][
                     current_layer_id
                 ]
             else:
-                block_tables[: len(self.prefetch_engine.req_ids_bs)].copy_(
-                    self.model_input["block_tables_mp"][current_layer_id]
-                )
+                attn_metadata.block_tables[
+                    : len(self.prefetch_engine.req_ids_bs)
+                ].copy_(self.model_input["block_tables_mp"][current_layer_id])
                 attn_metadata.seq_lens.copy_(
                     self.model_input["gsa_seq_len"][current_layer_id]
                 )
```
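The CUDA branch previously assigned the per-layer table to the local name `block_tables`, which only rebinds the variable and never touches `attn_metadata`; writing through the attribute (or via `copy_` in the in-place branch) is what actually reaches the kernel. A minimal illustration of the difference (the class and values are hypothetical):

```python
class AttnMetadata:  # stand-in for the real metadata object
    def __init__(self):
        self.block_table = [1, 2, 3]

meta = AttnMetadata()
bt = meta.block_table
bt = [7, 8, 9]                        # rebinds the local name only
assert meta.block_table == [1, 2, 3]  # metadata unchanged: the old bug

meta.block_table = [7, 8, 9]          # the fix: assign through the attribute
assert meta.block_table == [7, 8, 9]
```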
```diff
@@ -734,6 +756,7 @@ def execute_begin(self, scheduler_output: SchedulerOutput):
             self.gsa_metadata,
             is_topk_done,
         )
+        self.gsa_stats = self.gsa_metadata.gsa_stats
         self._start_topk_cal()

     def execute_finished(self):
```

ucm/ucm_sparse/prefetch_engine.py (1 addition, 1 deletion)
```diff
@@ -153,7 +153,7 @@ def model_input_del(
         list_topk_buf = list(topk_buf_tmp.unbind(dim=0))
         list_block_table = list(block_table_tmp.unbind(dim=0))
         gsa_len_list = list(gen_len_tmp.unbind(dim=0))
-        self.is_topk_cal = is_topk_done and self.prefetch_space == 3
+        self.is_topk_cal = is_topk_done and self.num_token % 3 == 0
         gsa_model_input["topk_caches"] = list_topk_buf
         gsa_model_input["kpre_caches"] = self.kpre_caches
         gsa_model_input["is_topk"] = self.is_topk_cal
```
