Commit 916fe13

Adding Compute-Context-Length (CCL)

Signed-off-by: Vahid Janfaza <vjanfaza@qti.qualcomm.com>
1 parent 19061c6

File tree

2 files changed (+6, -6 lines)


QEfficient/generation/text_generation_inference.py

Lines changed: 1 addition & 1 deletion

@@ -924,7 +924,7 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len):
             max_position_id = np.max(decode_inputs["position_ids"])

             # Update ccl_id and comp_ctx_lengths_decode based on the maximum position id
-            ccl_id_initial = self.prefill_ccl_len
+            ccl_id_initial = 0
             ccl_id = ccl_id_initial
             for i in range(ccl_id_initial, len(self.comp_ctx_lengths_decode)):
                 if max_position_id < self.comp_ctx_lengths_decode[i]:
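The decode-side bucket selection in this hunk can be sketched as a standalone helper. This is a minimal illustration of the loop shown in the diff, not QEfficient API: the function name `select_ccl_id` and its signature are hypothetical, and after this commit the search starts at index 0 instead of `self.prefill_ccl_len`.

```python
def select_ccl_id(comp_ctx_lengths_decode, max_position_id):
    """Pick the index of the smallest compute-context-length bucket that
    still covers the largest position id in the current decode batch.

    Mirrors the loop in run_continuous_batching_decode (hypothetical
    helper, written for illustration only).
    """
    ccl_id_initial = 0  # after this commit, the search starts at index 0
    ccl_id = ccl_id_initial
    for i in range(ccl_id_initial, len(comp_ctx_lengths_decode)):
        if max_position_id < comp_ctx_lengths_decode[i]:
            ccl_id = i
            break  # first bucket large enough; stop searching
    return ccl_id


# With decode buckets [512, 1024, 2048]: a max position id of 600 falls
# past the 512 bucket but inside the 1024 bucket, so index 1 is chosen.
print(select_ccl_id([512, 1024, 2048], 600))  # → 1
```

As in the original loop, if `max_position_id` exceeds every bucket the index falls back to `ccl_id_initial`.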

examples/compute_context_length.py

Lines changed: 5 additions & 5 deletions

@@ -12,15 +12,15 @@
 from QEfficient import QEFFAutoModelForCausalLM

 ## Using the optional comp_ctx_lengths variables you can pass lists of context lengths. The model runs with its default context length if comp_ctx_lengths=None. ##
-## - The first Prefill_ccl_len numbers in this list are the context lengths that will be used during prefilling. ##
-## - During the decoding process, based on the position_id or cache index it will work with the specific compute-context-length in the list. It will start from a proper compute-context-length in the list based on input prompt length and will gradually increase the compute-context-length if the cache index passes the current compute-context-length. ##
+## - The first list, comp_ctx_lengths_prefill, holds the compute-context-lengths used during prefill. ##
+## - The second list, comp_ctx_lengths_decode, is used during decoding. Based on the position_id or cache index, decoding starts from the appropriate compute-context-length for the input prompt length and gradually increases it whenever the cache index passes the current compute-context-length. ##


-ctx_len = 2048
+ctx_len = 1024
 comp_ctx_lengths_prefill = [256]
-comp_ctx_lengths_decode = [512, 1024, ctx_len]
+comp_ctx_lengths_decode = [512, ctx_len]

-model_name = "Qwen/Qwen2.5-7B"
+model_name = "ibm-granite/granite-3.2-8b-instruct"
 model = QEFFAutoModelForCausalLM.from_pretrained(
     model_name,
     continuous_batching=True,
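The example's lists follow a convention worth checking up front: decode buckets increase and end at the full context length. The sketch below is a hypothetical sanity-check helper (not part of QEfficient) that validates CCL lists against that convention before passing them to the model.

```python
def check_ccl_lists(ctx_len, prefill_ccls, decode_ccls):
    """Validate CCL lists against the example's conventions.

    Hypothetical helper, written for illustration: the convention is
    inferred from examples/compute_context_length.py, where the decode
    list increases and its last entry is the full ctx_len.
    """
    if decode_ccls != sorted(decode_ccls):
        raise ValueError("decode CCLs must be in increasing order")
    if decode_ccls[-1] != ctx_len:
        raise ValueError("last decode CCL should equal the full ctx_len")
    if any(c > ctx_len for c in prefill_ccls):
        raise ValueError("prefill CCLs must not exceed ctx_len")
    return True


# Matches the values in the updated example above.
check_ccl_lists(1024, prefill_ccls=[256], decode_ccls=[512, 1024])
```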
