
Commit 8c82207

Adding Compute-Context-Length (CCL)
Signed-off-by: Vahid Janfaza <vjanfaza@qti.qualcomm.com>
1 parent 0182d95 commit 8c82207

32 files changed: +2099 −211 lines

QEfficient/customop/ctx_scatter_gather.py

Lines changed: 11 additions & 5 deletions
@@ -115,8 +115,14 @@ def symbolic(g: torch.Graph, data: torch.Value, ctx_indices: torch.Value) -> torch.Value:
 
 
 @onnxscript.script(onnxscript.values.Opset("com.qualcomm.cloud", 1))
-def CtxGather(data: onnxscript.FLOAT, ctx_indices: onnxscript.INT32) -> onnxscript.FLOAT:
-    ctx_indices = ops.Expand(ctx_indices, ops.Slice(ops.Shape(data), starts=[0], ends=[3], axes=[0]))
+def CtxGather(
+    data: onnxscript.FLOAT, ctx_indices: onnxscript.INT32, comp_ctx_len: onnxscript.INT32
+) -> onnxscript.FLOAT:
+    # Create a shape tensor based on comp_ctx_len
+    shape_tensor = ops.Concat(ops.Shape(data)[:2], ops.Reshape(comp_ctx_len, [1]), axis=0)
+
+    # Directly use the shape tensor without validation
+    ctx_indices = ops.Expand(ctx_indices, shape_tensor)
     ctx_indices = ops.Unsqueeze(ctx_indices, [-1])
     return ops.GatherND(data, ctx_indices, batch_dims=2)

@@ -127,7 +133,7 @@ class CtxGatherFunc(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(data: torch.Tensor, ctx_indices: torch.Tensor):
+    def forward(data: torch.Tensor, ctx_indices: torch.Tensor, comp_ctx_len: int):
         batch_indices = torch.arange(data.shape[0]).view(-1, 1, 1)
         head_indices = torch.arange(data.shape[1]).view(1, -1, 1)
         return data[batch_indices, head_indices, ctx_indices]
@@ -137,5 +143,5 @@ def setup_context(ctx, inputs, outputs):
         pass
 
     @staticmethod
-    def symbolic(g: torch.Graph, data: torch.Value, ctx_indices: torch.Value) -> torch.Value:
-        return g.onnxscript_op(CtxGather, data, ctx_indices).setTypeAs(data)
+    def symbolic(g: torch.Graph, data: torch.Value, ctx_indices: torch.Value, comp_ctx_len: int) -> torch.Value:
+        return g.onnxscript_op(CtxGather, data, ctx_indices, comp_ctx_len).setTypeAs(data)
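For intuition, here is a minimal eager-mode sketch (ours, not part of the commit) of the gather that `CtxGatherFunc.forward` performs when `ctx_indices` covers only the first `comp_ctx_len` cache positions; the tensor shapes are illustrative assumptions.

```python
# Minimal sketch with assumed shapes; mirrors the indexing in CtxGatherFunc.forward above.
import torch

batch, heads, ctx_len, head_dim = 1, 2, 8, 4
comp_ctx_len = 4  # compute-context-length: attend over only the first 4 cache slots

data = torch.randn(batch, heads, ctx_len, head_dim)  # full KV cache
ctx_indices = torch.arange(comp_ctx_len).view(1, 1, -1).expand(batch, heads, comp_ctx_len)

batch_indices = torch.arange(batch).view(-1, 1, 1)
head_indices = torch.arange(heads).view(1, -1, 1)
gathered = data[batch_indices, head_indices, ctx_indices]  # CCL-sized slice of the cache

assert gathered.shape == (batch, heads, comp_ctx_len, head_dim)
```

The ONNX path builds the same result with `Expand`/`GatherND`, but now sizes the expanded index tensor from `comp_ctx_len` rather than from `ops.Shape(data)`, which is what lets the compiled graph run attention over a shorter, cheaper window.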

QEfficient/customop/ctx_scatter_gather_cb.py

Lines changed: 12 additions & 6 deletions
@@ -97,16 +97,20 @@ def symbolic(
 
 @onnxscript.script(onnxscript.values.Opset("com.qualcomm.cloud", 1))
 def CtxGatherCB(
-    data: onnxscript.FLOAT, batch_index: onnxscript.INT32, ctx_indices: onnxscript.INT32
+    data: onnxscript.FLOAT, batch_index: onnxscript.INT32, ctx_indices: onnxscript.INT32, comp_ctx_len: onnxscript.INT32
 ) -> onnxscript.FLOAT:
     batch_size = ops.Gather(ops.Shape(batch_index), [0])
     num_heads = ops.Gather(ops.Shape(data), [1])
-    ctx_len = ops.Gather(ops.Shape(data), [2])
+    # Use the compute-context-length (CCL) instead of the full context length, so both this
+    # gather and the attention computed on its output cover only comp_ctx_len positions.
+    ctx_len = ops.Reshape(comp_ctx_len, [1])
 
     # Expanded shape to create indices
     zero = ops.Constant(value_ints=[0])
     one = ops.Constant(value_ints=[1])
-    exp_shape = ops.Concat(batch_size, num_heads, ctx_len, one, axis=0)
+    exp_shape = ops.Concat(
+        ops.Reshape(batch_size, [1]), ops.Reshape(num_heads, [1]), ops.Reshape(ctx_len, [1]), one, axis=0
+    )
 
     # Create indices
     batch_idx = ops.Expand(ops.Unsqueeze(batch_index, [2, 3]), exp_shape)
@@ -119,7 +123,7 @@ def CtxGatherCB(
 
 class CtxGatherFuncCB(torch.autograd.Function):
     @staticmethod
-    def forward(data: torch.Tensor, batch_index: torch.Tensor, ctx_indices: torch.Tensor):
+    def forward(data: torch.Tensor, batch_index: torch.Tensor, ctx_indices: torch.Tensor, comp_ctx_len: int):
         batch_indices = batch_index.view(-1, 1, 1)
         head_indices = torch.arange(data.shape[1]).view(1, -1, 1)
         return data[batch_indices, head_indices, ctx_indices]
@@ -129,8 +133,10 @@ def setup_context(ctx, inputs, outputs):
         pass
 
     @staticmethod
-    def symbolic(g: torch.Graph, data: torch.Value, batch_index: torch.Value, ctx_indices: torch.Value) -> torch.Value:
-        return g.onnxscript_op(CtxGatherCB, data, batch_index, ctx_indices).setTypeAs(data)
+    def symbolic(
+        g: torch.Graph, data: torch.Value, batch_index: torch.Value, ctx_indices: torch.Value, comp_ctx_len: int
+    ) -> torch.Value:
+        return g.onnxscript_op(CtxGatherCB, data, batch_index, ctx_indices, comp_ctx_len).setTypeAs(data)
 
 
 @onnxscript.script(onnxscript.values.Opset("com.qualcomm.cloud", 1))
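The continuous-batching variant adds a `batch_index` indirection: each active decode slot reads its own row of the shared cache before the CCL-sized gather. A small sketch (ours, not from the commit; the shapes and the slot-to-row mapping are made up for illustration):

```python
# Sketch assuming a full_batch_size-4 cache and two active decode slots.
import torch

full_batch, heads, ctx_len, head_dim = 4, 2, 8, 3
comp_ctx_len = 4
data = torch.randn(full_batch, heads, ctx_len, head_dim)

batch_index = torch.tensor([[2], [0]])  # active slots read cache rows 2 and 0
ctx_indices = torch.arange(comp_ctx_len).view(1, 1, -1).expand(2, heads, comp_ctx_len)

batch_indices = batch_index.view(-1, 1, 1)  # as in CtxGatherFuncCB.forward
head_indices = torch.arange(heads).view(1, -1, 1)
out = data[batch_indices, head_indices, ctx_indices]
assert out.shape == (2, heads, comp_ctx_len, head_dim)
```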

QEfficient/generation/text_generation_inference.py

Lines changed: 78 additions & 0 deletions
@@ -318,6 +318,8 @@ def cloud_ai_100_exec_kv(
     prompts_txt_file_path: Optional[str] = None,
     device_id: Optional[List[int]] = None,
     generation_len: Optional[int] = None,
+    comp_ctx_lengths: Optional[List[int]] = None,
+    prefill_ccl_len: Optional[int] = 1,
     enable_debug_logs: bool = False,
     stream: bool = True,
     write_io_dir: Optional[str] = None,
@@ -382,6 +384,8 @@ def cloud_ai_100_exec_kv(
         qpc_path=qpc_path,
         device_id=device_id,
         ctx_len=ctx_len,
+        comp_ctx_lengths=comp_ctx_lengths,
+        prefill_ccl_len=prefill_ccl_len,
         enable_debug_logs=enable_debug_logs,
         write_io_dir=write_io_dir,
         full_batch_size=full_batch_size,
@@ -424,6 +428,8 @@ def __init__(
         qpc_path: str,
         full_batch_size: Optional[int] = None,
         ctx_len: Optional[int] = None,
+        comp_ctx_lengths: Optional[List[int]] = None,
+        prefill_ccl_len: Optional[int] = 1,
         device_id: Optional[List[int]] = None,
         enable_debug_logs: bool = False,
         write_io_dir: Optional[str] = None,
@@ -433,6 +439,8 @@ def __init__(
         sampling_params: Optional[Dict[str, Any]] = None,
     ) -> None:
         self._ctx_len = ctx_len
+        self.comp_ctx_lengths = comp_ctx_lengths
+        self.prefill_ccl_len = prefill_ccl_len
         self._write_io_dir = write_io_dir
         self.is_tlm = is_tlm
         self.return_pdfs = return_pdfs
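Putting the plumbing above together, a caller opts in by passing the two new keywords through `cloud_ai_100_exec_kv`. A hedged usage sketch: the tokenizer, QPC path, prompt, and bucket values below are placeholders, and the surrounding keywords are assumed to keep their existing behavior.

```python
# Hypothetical usage sketch: model name, qpc path, and CCL buckets are placeholders.
from transformers import AutoTokenizer

from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer
exec_info = cloud_ai_100_exec_kv(
    tokenizer,
    qpc_path="/path/to/qpc",            # placeholder QPC directory
    prompt=["My name is"],
    generation_len=128,
    comp_ctx_lengths=[256, 512, 1024],  # CCL buckets, ascending; presumably the largest matches ctx_len
    prefill_ccl_len=1,                  # how many leading buckets prefill may use
)
```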
@@ -791,7 +799,23 @@ def run_prefill(self, prompt, generation_len, prefill_logit_bs=1, decode_batch_id=None):
             batch_lora_ids = [self._prompt_to_lora_id_mapping_prefill.popleft() for i in range(self.batch_size)]
             inputs["lora_ids"] = np.array(batch_lora_ids, dtype=np.int64).reshape(self.batch_size, 1)
 
+        if self.comp_ctx_lengths is not None:
+            self.list_of_comp_ctx_lengths = [np.zeros(length) for length in self.comp_ctx_lengths]
+            prefill_ccl_id = 0
+            inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths[prefill_ccl_id]
+
         for i in range(num_chunks):
+            if (i + 1) * self._prefill_seq_len > self.comp_ctx_lengths[prefill_ccl_id]:
+                prefill_ccl_id += 1
+                if prefill_ccl_id >= self.prefill_ccl_len:
+                    prefill_ccl_id = (
+                        (self.prefill_ccl_len - 1)
+                        if self.prefill_ccl_len != 0
+                        else min(prefill_ccl_id, len(self.comp_ctx_lengths) - 1)
+                    )
+
+            inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths[prefill_ccl_id]
+
             chunk_inputs = inputs.copy()
             chunk_inputs["input_ids"] = inputs["input_ids"][
                 :, i * self._prefill_seq_len : (i + 1) * self._prefill_seq_len
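The chunk loop above walks prefill through the CCL buckets: a chunk that would write past the current bucket's edge bumps `prefill_ccl_id`, and `prefill_ccl_len` caps how far prefill may climb. A standalone trace (ours) with assumed sizes:

```python
# Standalone trace with assumed values; mirrors the bucket-stepping logic above.
prefill_seq_len = 128
comp_ctx_lengths = [256, 512, 1024]  # CCL buckets
prefill_ccl_len = 2                  # prefill may climb through buckets 0 and 1 only
num_chunks = 6                       # prompt padded to 6 * 128 = 768 tokens

prefill_ccl_id = 0
for i in range(num_chunks):
    if (i + 1) * prefill_seq_len > comp_ctx_lengths[prefill_ccl_id]:
        prefill_ccl_id += 1
        if prefill_ccl_id >= prefill_ccl_len:
            prefill_ccl_id = (
                prefill_ccl_len - 1
                if prefill_ccl_len != 0
                else min(prefill_ccl_id, len(comp_ctx_lengths) - 1)
            )
    print(f"chunk {i}: bucket {prefill_ccl_id} ({comp_ctx_lengths[prefill_ccl_id]} positions)")
# chunks 0-1 run with the 256 bucket; chunks 2-5 are capped at the 512 bucket
```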
@@ -810,6 +834,18 @@ def run_prefill(self, prompt, generation_len, prefill_logit_bs=1, decode_batch_id=None):
             generation_len,
         )
 
+    def initialize_ccl(self, decode_inputs):
+        max_ccl_id = len(self.comp_ctx_lengths) - 1
+        max_position_id = np.max(decode_inputs["position_ids"])
+        ccl_id_initial = self.prefill_ccl_len
+        ccl_id = ccl_id_initial
+        for i in range(ccl_id_initial, len(self.comp_ctx_lengths)):
+            if max_position_id < self.comp_ctx_lengths[i]:
+                ccl_id = i
+                break
+
+        return ccl_id, max_ccl_id
+
     def run_continuous_batching_decode(self, prompt_queue, generation_len):
         """
         Runs continuous batching decode for the given prompt queue and generation length.
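`initialize_ccl` picks the first decode-eligible bucket (index `prefill_ccl_len` onward) large enough to hold the longest position seen so far. A sketch (ours) with assumed values:

```python
# Sketch with assumed values; same selection rule as initialize_ccl above.
import numpy as np

comp_ctx_lengths = [256, 512, 1024]
prefill_ccl_len = 1                      # buckets below this index are prefill-only
position_ids = np.array([[300], [700]])  # next positions for two sequences after prefill

max_ccl_id = len(comp_ctx_lengths) - 1
max_position_id = np.max(position_ids)   # 700
ccl_id = prefill_ccl_len
for i in range(prefill_ccl_len, len(comp_ctx_lengths)):
    if max_position_id < comp_ctx_lengths[i]:
        ccl_id = i
        break

print(ccl_id, max_ccl_id)  # 2 2 -> decoding starts with the 1024 bucket
```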
@@ -841,6 +877,10 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len):
         # Prepare decode inputs.
         decode_inputs = self.prepare_decode_inputs()
 
+        if self.comp_ctx_lengths is not None:
+            ccl_id, max_ccl_id = self.initialize_ccl(decode_inputs)
+            decode_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths[ccl_id]
+
         while prompt_queue or current_decode_ongoing.any():
             outputs = self._session.run(decode_inputs)
 
@@ -878,6 +918,20 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len):
                        batch_id_map[decode_batch_id]
                    ]
 
+                    if self.comp_ctx_lengths is not None:
+                        # Recalculate ccl_id based on position_ids:
+                        # determine the maximum value of position_ids across all batch elements
+                        max_position_id = np.max(decode_inputs["position_ids"])
+
+                        # Update ccl_id and comp_ctx_lengths based on the maximum position id
+                        ccl_id_initial = self.prefill_ccl_len
+                        ccl_id = ccl_id_initial
+                        for i in range(ccl_id_initial, len(self.comp_ctx_lengths)):
+                            if max_position_id < self.comp_ctx_lengths[i]:
+                                ccl_id = i
+                                break
+                        decode_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths[ccl_id]
+
                 else:
                     current_decode_ongoing[decode_batch_id] = False
             else:
@@ -890,6 +944,12 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len):
             if self.include_sampler:
                 decode_inputs["last_accepted_output_tokens"] = decode_inputs["input_ids"]
 
+            if self.comp_ctx_lengths is not None:
+                # Update ccl_id and comp_ctx_lengths based on the maximum position id
+                if decode_inputs["position_ids"][decode_batch_id, -1] >= self.comp_ctx_lengths[ccl_id] - 1:
+                    ccl_id = min(ccl_id + 1, max_ccl_id)
+                    decode_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths[ccl_id]
+
             generated_id_current_index[decode_batch_id] += 1
 
         return decode_pause_time
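During decode, both paths grow the window the same way: once a sequence's position reaches the last usable slot of the current bucket, the bucket index is promoted, clamped at the largest bucket. A compact trace (ours, with assumed values):

```python
# Sketch with assumed values; same promotion rule as the decode loops above.
comp_ctx_lengths = [256, 512, 1024]
max_ccl_id = len(comp_ctx_lengths) - 1

ccl_id = 1  # currently decoding against the 512 bucket
for position in range(509, 514):
    if position >= comp_ctx_lengths[ccl_id] - 1:
        ccl_id = min(ccl_id + 1, max_ccl_id)
    print(position, comp_ctx_lengths[ccl_id])
# positions 509-510 stay at 512; from position 511 onward the 1024 bucket is used
```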
@@ -914,7 +974,18 @@ def run_decode(self, decode_inputs, generation_len, streamer: Optional[transformers.TextStreamer] = None):
         self._session.set_buffers({"logits": logits_out_placeholder})
         finished_sequences = decode_inputs["input_ids"] == self.tokenizer.eos_token_id
         num_token = 0
+
+        if self.comp_ctx_lengths is not None:
+            ccl_id, max_ccl_id = self.initialize_ccl(decode_inputs)
+            decode_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths[ccl_id]
+
+        cache_index = np.max(decode_inputs["position_ids"])
         for num_token in range(1, generation_len):
+            if self.comp_ctx_lengths is not None:
+                if cache_index >= self.comp_ctx_lengths[ccl_id] - 1:
+                    ccl_id = min(ccl_id + 1, max_ccl_id)
+                    decode_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths[ccl_id]
+
             if streamer:
                 streamer.put(decode_inputs["input_ids"][0])
             outputs = self._session.run(decode_inputs)
@@ -926,6 +997,7 @@ def run_decode(self, decode_inputs, generation_len, streamer: Optional[transformers.TextStreamer] = None):
             # Prepare inputs for next iteration
             decode_inputs["input_ids"] = self._fetch_next_token_id(outputs)
             decode_inputs["position_ids"][:, -1] += 1
+            cache_index += 1
             self.generated_ids[:, num_token] = decode_inputs["input_ids"][:, -1]
             finished_sequences |= decode_inputs["input_ids"] == self.tokenizer.eos_token_id
             if self.include_sampler:
@@ -975,6 +1047,8 @@ def __init__(
         qpc_path: str,
         full_batch_size: Optional[int] = None,
         ctx_len: Optional[int] = None,
+        comp_ctx_lengths: Optional[List[int]] = None,
+        prefill_ccl_len: Optional[int] = 1,
         device_id: Optional[List[int]] = None,
         enable_debug_logs: bool = False,
         write_io_dir: Optional[str] = None,
@@ -988,6 +1062,8 @@ def __init__(
             qpc_path=qpc_path,
             full_batch_size=full_batch_size,
             ctx_len=ctx_len,
+            comp_ctx_lengths=comp_ctx_lengths,
+            prefill_ccl_len=prefill_ccl_len,
             device_id=device_id,
             enable_debug_logs=enable_debug_logs,
             write_io_dir=write_io_dir,
@@ -999,6 +1075,8 @@ def __init__(
         self._full_batch_size = self._qaic_model.full_batch_size
         self._tokenizer = self._qaic_model.tokenizer
         self._ctx_len = ctx_len
+        self.comp_ctx_lengths = comp_ctx_lengths
+        self.prefill_ccl_len = prefill_ccl_len
         self._perf_metrics = None
         self._prompt_queue = None
         self._text_streamer = None
