Skip to content

Commit 0407d34

Browse files
committed
Adding Compute-Context-Length(CCL)
Signed-off-by: Vahid Janfaza <vjanfaza@qti.qualcomm.com>
1 parent 1b22bf3 commit 0407d34

File tree

18 files changed

+372
-273
lines changed

18 files changed

+372
-273
lines changed

QEfficient/generation/text_generation_inference.py

Lines changed: 44 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -318,8 +318,8 @@ def cloud_ai_100_exec_kv(
318318
prompts_txt_file_path: Optional[str] = None,
319319
device_id: Optional[List[int]] = None,
320320
generation_len: Optional[int] = None,
321-
comp_ctx_lengths: Optional[List[int]] = None,
322-
prefill_ccl_len: Optional[int] = 1,
321+
comp_ctx_lengths_prefill: Optional[List[int]] = None,
322+
comp_ctx_lengths_decode: Optional[List[int]] = None,
323323
enable_debug_logs: bool = False,
324324
stream: bool = True,
325325
write_io_dir: Optional[str] = None,
@@ -384,8 +384,8 @@ def cloud_ai_100_exec_kv(
384384
qpc_path=qpc_path,
385385
device_id=device_id,
386386
ctx_len=ctx_len,
387-
comp_ctx_lengths=comp_ctx_lengths,
388-
prefill_ccl_len=prefill_ccl_len,
387+
comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
388+
comp_ctx_lengths_decode=comp_ctx_lengths_decode,
389389
enable_debug_logs=enable_debug_logs,
390390
write_io_dir=write_io_dir,
391391
full_batch_size=full_batch_size,
@@ -428,8 +428,8 @@ def __init__(
428428
qpc_path: str,
429429
full_batch_size: Optional[int] = None,
430430
ctx_len: Optional[int] = None,
431-
comp_ctx_lengths: Optional[List[int]] = None,
432-
prefill_ccl_len: Optional[int] = 1,
431+
comp_ctx_lengths_prefill: Optional[List[int]] = None,
432+
comp_ctx_lengths_decode: Optional[List[int]] = None,
433433
device_id: Optional[List[int]] = None,
434434
enable_debug_logs: bool = False,
435435
write_io_dir: Optional[str] = None,
@@ -439,8 +439,8 @@ def __init__(
439439
sampling_params: Optional[Dict[str, Any]] = None,
440440
) -> None:
441441
self._ctx_len = ctx_len
442-
self.comp_ctx_lengths = comp_ctx_lengths
443-
self.prefill_ccl_len = prefill_ccl_len
442+
self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill
443+
self.comp_ctx_lengths_decode = comp_ctx_lengths_decode
444444
self._write_io_dir = write_io_dir
445445
self.is_tlm = is_tlm
446446
self.return_pdfs = return_pdfs
@@ -799,22 +799,15 @@ def run_prefill(self, prompt, generation_len, prefill_logit_bs=1, decode_batch_i
799799
batch_lora_ids = [self._prompt_to_lora_id_mapping_prefill.popleft() for i in range(self.batch_size)]
800800
inputs["lora_ids"] = np.array(batch_lora_ids, dtype=np.int64).reshape(self.batch_size, 1)
801801

802-
if self.comp_ctx_lengths is not None:
803-
self.list_of_comp_ctx_lengths = [np.zeros(length) for length in self.comp_ctx_lengths]
802+
if self.comp_ctx_lengths_prefill is not None:
803+
self.list_of_comp_ctx_lengths_prefill = [np.zeros(length) for length in self.comp_ctx_lengths_prefill]
804804
prefill_ccl_id = 0
805-
inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths[prefill_ccl_id]
805+
inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths_prefill[prefill_ccl_id]
806806

807807
for i in range(num_chunks):
808-
if (i + 1) * self._prefill_seq_len > self.comp_ctx_lengths[prefill_ccl_id]:
809-
prefill_ccl_id += 1
810-
if prefill_ccl_id >= self.prefill_ccl_len:
811-
prefill_ccl_id = (
812-
(self.prefill_ccl_len - 1)
813-
if self.prefill_ccl_len != 0
814-
else min(prefill_ccl_id, len(self.comp_ctx_lengths) - 1)
815-
)
816-
817-
inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths[prefill_ccl_id]
808+
if (i + 1) * self._prefill_seq_len > self.comp_ctx_lengths_prefill[prefill_ccl_id]:
809+
prefill_ccl_id = min(prefill_ccl_id + 1, len(self.comp_ctx_lengths_prefill) - 1)
810+
inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths_prefill[prefill_ccl_id]
818811

819812
chunk_inputs = inputs.copy()
820813
chunk_inputs["input_ids"] = inputs["input_ids"][
@@ -835,12 +828,13 @@ def run_prefill(self, prompt, generation_len, prefill_logit_bs=1, decode_batch_i
835828
)
836829

837830
def initialize_ccl(self, decode_inputs):
838-
max_ccl_id = len(self.comp_ctx_lengths) - 1
831+
self.list_of_comp_ctx_lengths_decode = [np.zeros(length) for length in self.comp_ctx_lengths_decode]
832+
max_ccl_id = len(self.comp_ctx_lengths_decode) - 1
839833
max_position_id = np.max(decode_inputs["position_ids"])
840-
ccl_id_initial = self.prefill_ccl_len
834+
ccl_id_initial = 0
841835
ccl_id = ccl_id_initial
842-
for i in range(ccl_id_initial, len(self.comp_ctx_lengths)):
843-
if max_position_id < self.comp_ctx_lengths[i]:
836+
for i in range(ccl_id_initial, len(self.comp_ctx_lengths_decode)):
837+
if max_position_id < self.comp_ctx_lengths_decode[i]:
844838
ccl_id = i
845839
break
846840

@@ -877,9 +871,9 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len):
877871
# Prepare decode inputs.
878872
decode_inputs = self.prepare_decode_inputs()
879873

880-
if self.comp_ctx_lengths is not None:
874+
if self.comp_ctx_lengths_decode is not None:
881875
ccl_id, max_ccl_id = self.initialize_ccl(decode_inputs)
882-
decode_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths[ccl_id]
876+
decode_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths_decode[ccl_id]
883877

884878
while prompt_queue or current_decode_ongoing.any():
885879
outputs = self._session.run(decode_inputs)
@@ -918,19 +912,19 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len):
918912
batch_id_map[decode_batch_id]
919913
]
920914

921-
if self.comp_ctx_lengths is not None:
915+
if self.comp_ctx_lengths_decode is not None:
922916
###Recalculate ccl_id based on position ids###
923917
# Determine the maximum value of position_ids across all batch elements
924918
max_position_id = np.max(decode_inputs["position_ids"])
925919

926-
# Update ccl_id and comp_ctx_lengths based on the maximum position id
920+
# Update ccl_id and comp_ctx_lengths_decode based on the maximum position id
927921
ccl_id_initial = self.prefill_ccl_len
928922
ccl_id = ccl_id_initial
929-
for i in range(ccl_id_initial, len(self.comp_ctx_lengths)):
930-
if max_position_id < self.comp_ctx_lengths[i]:
923+
for i in range(ccl_id_initial, len(self.comp_ctx_lengths_decode)):
924+
if max_position_id < self.comp_ctx_lengths_decode[i]:
931925
ccl_id = i
932926
break
933-
decode_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths[ccl_id]
927+
decode_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths_decode[ccl_id]
934928

935929
else:
936930
current_decode_ongoing[decode_batch_id] = False
@@ -944,11 +938,14 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len):
944938
if self.include_sampler:
945939
decode_inputs["last_accepted_output_tokens"] = decode_inputs["input_ids"]
946940

947-
if self.comp_ctx_lengths is not None:
948-
# Update ccl_id and comp_ctx_lengths based on the maximum position id
949-
if decode_inputs["position_ids"][decode_batch_id, -1] >= self.comp_ctx_lengths[ccl_id] - 1:
941+
if self.comp_ctx_lengths_decode is not None:
942+
# Update ccl_id and comp_ctx_lengths_decode based on the maximum position id
943+
if (
944+
decode_inputs["position_ids"][decode_batch_id, -1]
945+
>= self.comp_ctx_lengths_decode[ccl_id] - 1
946+
):
950947
ccl_id = min(ccl_id + 1, max_ccl_id)
951-
decode_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths[ccl_id]
948+
decode_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths_decode[ccl_id]
952949

953950
generated_id_current_index[decode_batch_id] += 1
954951

@@ -975,16 +972,16 @@ def run_decode(self, decode_inputs, generation_len, streamer: Optional[transform
975972
finished_sequences = decode_inputs["input_ids"] == self.tokenizer.eos_token_id
976973
num_token = 0
977974

978-
if self.comp_ctx_lengths is not None:
975+
if self.comp_ctx_lengths_decode is not None:
979976
ccl_id, max_ccl_id = self.initialize_ccl(decode_inputs)
980-
decode_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths[ccl_id]
977+
decode_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths_decode[ccl_id]
981978

982979
cache_index = np.max(decode_inputs["position_ids"])
983980
for num_token in range(1, generation_len):
984-
if self.comp_ctx_lengths is not None:
985-
if cache_index >= self.comp_ctx_lengths[ccl_id] - 1:
981+
if self.comp_ctx_lengths_decode is not None:
982+
if cache_index >= self.comp_ctx_lengths_decode[ccl_id] - 1:
986983
ccl_id = min(ccl_id + 1, max_ccl_id)
987-
decode_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths[ccl_id]
984+
decode_inputs["comp_ctx_lengths"] = self.list_of_comp_ctx_lengths_decode[ccl_id]
988985

989986
if streamer:
990987
streamer.put(decode_inputs["input_ids"][0])
@@ -1047,8 +1044,8 @@ def __init__(
10471044
qpc_path: str,
10481045
full_batch_size: Optional[int] = None,
10491046
ctx_len: Optional[int] = None,
1050-
comp_ctx_lengths: Optional[List[int]] = None,
1051-
prefill_ccl_len: Optional[int] = 1,
1047+
comp_ctx_lengths_prefill: Optional[List[int]] = None,
1048+
comp_ctx_lengths_decode: Optional[List[int]] = None,
10521049
device_id: Optional[List[int]] = None,
10531050
enable_debug_logs: bool = False,
10541051
write_io_dir: Optional[str] = None,
@@ -1062,8 +1059,8 @@ def __init__(
10621059
qpc_path=qpc_path,
10631060
full_batch_size=full_batch_size,
10641061
ctx_len=ctx_len,
1065-
comp_ctx_lengths=comp_ctx_lengths,
1066-
prefill_ccl_len=prefill_ccl_len,
1062+
comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
1063+
comp_ctx_lengths_decode=comp_ctx_lengths_decode,
10671064
device_id=device_id,
10681065
enable_debug_logs=enable_debug_logs,
10691066
write_io_dir=write_io_dir,
@@ -1075,8 +1072,8 @@ def __init__(
10751072
self._full_batch_size = self._qaic_model.full_batch_size
10761073
self._tokenizer = self._qaic_model.tokenizer
10771074
self._ctx_len = ctx_len
1078-
self.comp_ctx_lengths = comp_ctx_lengths
1079-
self.prefill_ccl_len = prefill_ccl_len
1075+
self.comp_ctx_lengths_prefill = comp_ctx_lengths_prefill
1076+
self.comp_ctx_lengths_decode = comp_ctx_lengths_decode
10801077
self._perf_metrics = None
10811078
self._prompt_queue = None
10821079
self._text_streamer = None

QEfficient/transformers/models/gemma3/modeling_gemma3.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -672,8 +672,8 @@ def get_specializations(
672672
prefill_seq_len: int,
673673
ctx_len: int,
674674
img_size: int,
675-
comp_ctx_lengths: List[int] = None,
676-
prefill_ccl_len: int = None,
675+
comp_ctx_lengths_prefill: List[int] = None,
676+
comp_ctx_lengths_decode: List[int] = None,
677677
kv_offload: bool = False,
678678
**compiler_options,
679679
):
@@ -694,31 +694,29 @@ def get_specializations(
694694
"ctx_len": ctx_len,
695695
}
696696
]
697-
if comp_ctx_lengths is not None:
697+
if comp_ctx_lengths_prefill is not None:
698698
lang = []
699699

700-
# prefill_ccl_len elements of comp_ctx_lengths will be used for prefilling
701-
for i in range(0, prefill_ccl_len):
700+
for i in range(0, len(comp_ctx_lengths_prefill)):
702701
lang.append(
703702
{
704703
"batch_size": batch_size,
705704
"seq_len": prefill_seq_len,
706705
"ctx_len": ctx_len,
707-
"comp_ctx_lengths": comp_ctx_lengths[i],
706+
"comp_ctx_lengths": comp_ctx_lengths_prefill[i],
708707
"sliding_window": self.language_model.config.sliding_window,
709708
"img_size": img_size,
710709
"mm_tokens_per_image": mm_tokens_per_image,
711710
}
712711
)
713712

714-
# Remaining elements use comp_ctx_lengths[1:] in a loop
715-
for i in range(prefill_ccl_len, len(comp_ctx_lengths)):
713+
for i in range(0, len(comp_ctx_lengths_decode)):
716714
lang.append(
717715
{
718716
"batch_size": batch_size,
719717
"seq_len": "1",
720718
"ctx_len": ctx_len,
721-
"comp_ctx_lengths": comp_ctx_lengths[i],
719+
"comp_ctx_lengths": comp_ctx_lengths_decode[i],
722720
"sliding_window": self.language_model.config.sliding_window,
723721
"img_size": img_size,
724722
"mm_tokens_per_image": mm_tokens_per_image,

QEfficient/transformers/models/internvl/modeling_internvl.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,8 @@ def get_specializations(
6969
prefill_seq_len: int,
7070
ctx_len: int,
7171
img_size: int,
72-
comp_ctx_lengths: List[int],
73-
prefill_ccl_len: int = None,
72+
comp_ctx_lengths_prefill: List[int] = None,
73+
comp_ctx_lengths_decode: List[int] = None,
7474
kv_offload: bool = False,
7575
**compiler_options,
7676
):
@@ -100,31 +100,29 @@ def get_specializations(
100100
"img_size": img_size,
101101
}
102102
]
103-
if comp_ctx_lengths is not None:
103+
if comp_ctx_lengths_prefill is not None:
104104
lang = []
105105

106-
# prefill_ccl_len elements of comp_ctx_lengths will be used for prefilling
107-
for i in range(0, prefill_ccl_len):
106+
for i in range(0, len(comp_ctx_lengths_prefill)):
108107
lang.append(
109108
{
110109
"batch_size": batch_size,
111110
"seq_len": prefill_seq_len,
112111
"ctx_len": ctx_len,
113-
"comp_ctx_lengths": comp_ctx_lengths[i],
112+
"comp_ctx_lengths": comp_ctx_lengths_prefill[i],
114113
"num_patches": num_patches,
115114
"img_size": img_size,
116115
"vision_size": vision_size,
117116
}
118117
)
119118

120-
# Remaining elements use comp_ctx_lengths[1:] in a loop
121-
for i in range(prefill_ccl_len, len(comp_ctx_lengths)):
119+
for i in range(0, len(comp_ctx_lengths_decode)):
122120
lang.append(
123121
{
124122
"batch_size": batch_size,
125123
"seq_len": "1",
126124
"ctx_len": ctx_len,
127-
"comp_ctx_lengths": comp_ctx_lengths[i],
125+
"comp_ctx_lengths": comp_ctx_lengths_decode[i],
128126
"num_patches": num_patches,
129127
"img_size": img_size,
130128
"vision_size": vision_size,

QEfficient/transformers/models/llama4/modeling_llama4.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -908,8 +908,8 @@ def get_specializations(
908908
prefill_seq_len: int,
909909
ctx_len: int,
910910
img_size: int,
911-
comp_ctx_lengths: List[int] = None,
912-
prefill_ccl_len: int = None,
911+
comp_ctx_lengths_prefill: List[int] = None,
912+
comp_ctx_lengths_decode: List[int] = None,
913913
kv_offload: bool = False,
914914
**compiler_options,
915915
):
@@ -959,17 +959,16 @@ def get_specializations(
959959
"img_size": img_size,
960960
}
961961
]
962-
if comp_ctx_lengths is not None:
962+
if comp_ctx_lengths_prefill is not None:
963963
lang = []
964964

965-
# prefill_ccl_len elements of comp_ctx_lengths will be used for prefilling
966-
for i in range(0, prefill_ccl_len):
965+
for i in range(0, len(comp_ctx_lengths_prefill)):
967966
lang.append(
968967
{
969968
"batch_size": batch_size,
970969
"seq_len": prefill_seq_len,
971970
"ctx_len": ctx_len,
972-
"comp_ctx_lengths": comp_ctx_lengths[i],
971+
"comp_ctx_lengths": comp_ctx_lengths_prefill[i],
973972
"max_num_tiles": max_num_tiles,
974973
"img_size": img_size,
975974
"vision_size": vision_size,
@@ -978,14 +977,13 @@ def get_specializations(
978977
}
979978
)
980979

981-
# Remaining elements use comp_ctx_lengths[1:] in a loop
982-
for i in range(prefill_ccl_len, len(comp_ctx_lengths)):
980+
for i in range(0, len(comp_ctx_lengths_decode)):
983981
lang.append(
984982
{
985983
"batch_size": batch_size,
986984
"seq_len": "1",
987985
"ctx_len": ctx_len,
988-
"comp_ctx_lengths": comp_ctx_lengths[i],
986+
"comp_ctx_lengths": comp_ctx_lengths_decode[i],
989987
"max_num_tiles": max_num_tiles,
990988
"img_size": img_size,
991989
"vision_size": vision_size,

QEfficient/transformers/models/llava/modeling_llava.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,8 @@ def get_specializations(
162162
prefill_seq_len: int,
163163
ctx_len: int,
164164
img_size: int,
165-
comp_ctx_lengths: List[int] = None,
166-
prefill_ccl_len: int = None,
165+
comp_ctx_lengths_prefill: List[int] = None,
166+
comp_ctx_lengths_decode: List[int] = None,
167167
kv_offload: bool = False,
168168
**compiler_options,
169169
):
@@ -186,31 +186,29 @@ def get_specializations(
186186
}
187187
]
188188

189-
if comp_ctx_lengths is not None:
189+
if comp_ctx_lengths_prefill is not None:
190190
lang = []
191191

192-
# prefill_ccl_len elements of comp_ctx_lengths will be used for prefilling
193-
for i in range(0, prefill_ccl_len):
192+
for i in range(0, len(comp_ctx_lengths_prefill)):
194193
lang.append(
195194
{
196195
"batch_size": batch_size,
197196
"seq_len": prefill_seq_len,
198197
"ctx_len": ctx_len,
199-
"comp_ctx_lengths": comp_ctx_lengths[i],
198+
"comp_ctx_lengths": comp_ctx_lengths_prefill[i],
200199
"max_num_images": max_num_images,
201200
"img_size": img_size,
202201
"vision_size": vision_size,
203202
}
204203
)
205204

206-
# Remaining elements use comp_ctx_lengths[1:] in a loop
207-
for i in range(prefill_ccl_len, len(comp_ctx_lengths)):
205+
for i in range(0, len(comp_ctx_lengths_decode)):
208206
lang.append(
209207
{
210208
"batch_size": batch_size,
211209
"seq_len": "1",
212210
"ctx_len": ctx_len,
213-
"comp_ctx_lengths": comp_ctx_lengths[i],
211+
"comp_ctx_lengths": comp_ctx_lengths_decode[i],
214212
"max_num_images": max_num_images,
215213
"img_size": img_size,
216214
"vision_size": vision_size,

0 commit comments

Comments
 (0)