Skip to content

Commit fa3c2f6

Browse files
vjanfaza and quic-rishinr
authored and committed
Improve handling of CCL lists
Signed-off-by: Vahid Janfaza <vjanfaza@qti.qualcomm.com>
1 parent 42b4b7f commit fa3c2f6

File tree

2 files changed

+26
-63
lines changed

2 files changed

+26
-63
lines changed

QEfficient/transformers/models/modeling_auto.py

Lines changed: 7 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -879,13 +879,7 @@ def __init__(
879879
self.model = model
880880
self.config = model.config
881881

882-
self.comp_ctx_lengths_prefill = kwargs.pop("comp_ctx_lengths_prefill", None)
883-
self.comp_ctx_lengths_decode = kwargs.pop("comp_ctx_lengths_decode", None)
884-
ctx_len = kwargs.pop("ctx_len", None)
885-
if self.comp_ctx_lengths_prefill:
886-
self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
887-
self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len
888-
)
882+
self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, _, _ = process_ccl_specializations(kwargs)
889883

890884
self.vision_model = QEffVisionEncoderForTextImageToTextModel(model, **kwargs)
891885
self.lang_model = QEffCausalLMForTextImageToTextModel(model, **kwargs)
@@ -933,14 +927,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
933927

934928
kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
935929

936-
comp_ctx_lengths_prefill = kwargs.pop("comp_ctx_lengths_prefill", None)
937-
comp_ctx_lengths_decode = kwargs.pop("comp_ctx_lengths_decode", None)
938-
ctx_len = kwargs.pop("ctx_len", None)
939-
940-
if comp_ctx_lengths_prefill:
941-
comp_ctx_lengths_prefill, comp_ctx_lengths_decode = process_ccl_specializations(
942-
comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len
943-
)
930+
comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len = process_ccl_specializations(kwargs)
944931

945932
model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
946933
return cls(
@@ -1498,14 +1485,7 @@ def __init__(
14981485
raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.")
14991486
super().__init__(model, **kwargs)
15001487

1501-
self.comp_ctx_lengths_prefill = kwargs.pop("comp_ctx_lengths_prefill", None)
1502-
self.comp_ctx_lengths_decode = kwargs.pop("comp_ctx_lengths_decode", None)
1503-
ctx_len = kwargs.pop("ctx_len", None)
1504-
1505-
if self.comp_ctx_lengths_prefill:
1506-
self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
1507-
self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len
1508-
)
1488+
self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, _, _ = process_ccl_specializations(kwargs)
15091489

15101490
# to handle internvl models
15111491
if hasattr(self.model.config, "llm_config") and hasattr(self.model.config, "vision_config"):
@@ -1554,14 +1534,7 @@ def from_pretrained(
15541534

15551535
kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
15561536

1557-
comp_ctx_lengths_prefill = kwargs.pop("comp_ctx_lengths_prefill", None)
1558-
comp_ctx_lengths_decode = kwargs.pop("comp_ctx_lengths_decode", None)
1559-
ctx_len = kwargs.pop("ctx_len", None)
1560-
1561-
if comp_ctx_lengths_prefill:
1562-
comp_ctx_lengths_prefill, comp_ctx_lengths_decode = process_ccl_specializations(
1563-
comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len
1564-
)
1537+
comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len = process_ccl_specializations(kwargs)
15651538

15661539
from transformers import AutoConfig
15671540

@@ -2115,14 +2088,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optiona
21152088

21162089
kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
21172090

2118-
comp_ctx_lengths_prefill = kwargs.pop("comp_ctx_lengths_prefill", None)
2119-
comp_ctx_lengths_decode = kwargs.pop("comp_ctx_lengths_decode", None)
2120-
ctx_len = kwargs.pop("ctx_len", None)
2121-
2122-
if comp_ctx_lengths_prefill:
2123-
comp_ctx_lengths_prefill, comp_ctx_lengths_decode = process_ccl_specializations(
2124-
comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len
2125-
)
2091+
comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len = process_ccl_specializations(kwargs)
21262092

21272093
model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
21282094
return cls(
@@ -2232,15 +2198,7 @@ def __init__(
22322198
self.model, transformed = SpDTransform.apply(self.model, qaic_config, **kwargs)
22332199
self.is_tlm = transformed
22342200

2235-
self.comp_ctx_lengths_prefill = kwargs.pop("comp_ctx_lengths_prefill", None)
2236-
self.comp_ctx_lengths_decode = kwargs.pop("comp_ctx_lengths_decode", None)
2237-
ctx_len = kwargs.pop("ctx_len", None)
2238-
prefill_seq_len = kwargs.pop("prefill_seq_len", 128)
2239-
2240-
if self.comp_ctx_lengths_prefill and prefill_seq_len > 1:
2241-
self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
2242-
self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len
2243-
)
2201+
self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, _, _ = process_ccl_specializations(kwargs)
22442202

22452203
self.hash_params["qeff_auto_class"] = self.__class__.__name__
22462204

@@ -2336,15 +2294,7 @@ def from_pretrained(
23362294

23372295
kv_offload = kwargs.pop("kv_offload", None)
23382296

2339-
comp_ctx_lengths_prefill = kwargs.pop("comp_ctx_lengths_prefill", None)
2340-
comp_ctx_lengths_decode = kwargs.pop("comp_ctx_lengths_decode", None)
2341-
ctx_len = kwargs.pop("ctx_len", None)
2342-
prefill_seq_len = kwargs.pop("prefill_seq_len", 128)
2343-
2344-
if comp_ctx_lengths_prefill and prefill_seq_len > 1:
2345-
comp_ctx_lengths_prefill, comp_ctx_lengths_decode = process_ccl_specializations(
2346-
comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len
2347-
)
2297+
comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len, prefill_seq_len = process_ccl_specializations(kwargs)
23482298

23492299
kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
23502300
model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)

QEfficient/utils/check_ccl_specializations.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,28 @@
88
from typing import List, Optional
99

1010

11+
# def process_ccl_specializations(
12+
# ccl_prefill: Optional[List[int]] = None, ccl_decode: Optional[List[int]] = None, ctx_len: Optional[int] = None
13+
# ):
1114
def process_ccl_specializations(
12-
ccl_prefill: Optional[List[int]] = None, ccl_decode: Optional[List[int]] = None, ctx_len: Optional[int] = None
15+
kwargs
1316
):
17+
ccl_prefill = kwargs.pop("comp_ctx_lengths_prefill", None)
18+
ccl_decode = kwargs.pop("comp_ctx_lengths_decode", None)
19+
ctx_len = kwargs.pop("ctx_len", None)
20+
prefill_seq_len = kwargs.pop("prefill_seq_len", 128)
21+
1422
if ctx_len is None:
1523
raise TypeError("`ctx_len` is required when loading the model.")
16-
if ccl_prefill is None:
17-
ccl_prefill = [ctx_len]
18-
if ccl_decode is None:
19-
ccl_decode = [ctx_len]
24+
25+
if ccl_prefill is None or ccl_decode is None:
26+
return None, None, ctx_len, prefill_seq_len
27+
28+
if prefill_seq_len == 1:
29+
# Both prefill and decode CCL lists can share the same specializations since prefill_seq_len=1, so a sorted union of both lists can be used for both.
30+
ccl_union_all = sorted(set(ccl_prefill + ccl_decode))
31+
ccl_union_all = [min(x, ctx_len) for x in ccl_union_all]
32+
return ccl_union_all, ccl_union_all, ctx_len, prefill_seq_len
2033

2134
# Step 1: Cap values to ctx_len
2235
ccl_prefill = [min(x, ctx_len) for x in ccl_prefill]
@@ -40,4 +53,4 @@ def process_ccl_specializations(
4053
updated_prefill.sort()
4154
ccl_decode.sort()
4255

43-
return updated_prefill, ccl_decode
56+
return updated_prefill, ccl_decode, ctx_len, prefill_seq_len

0 commit comments

Comments
 (0)