Commit b4bf5f9

Adding Compute-Context-Length (CCL)

Signed-off-by: Vahid Janfaza <vjanfaza@qti.qualcomm.com>
1 parent 13271c6 commit b4bf5f9

2 files changed: +18 -9 lines changed

QEfficient/transformers/models/modeling_auto.py

Lines changed: 9 additions & 4 deletions
@@ -2235,8 +2235,9 @@ def __init__(
         self.comp_ctx_lengths_prefill = kwargs.pop("comp_ctx_lengths_prefill", None)
         self.comp_ctx_lengths_decode = kwargs.pop("comp_ctx_lengths_decode", None)
         ctx_len = kwargs.pop("ctx_len", None)
-
-        if self.comp_ctx_lengths_prefill:
+        prefill_seq_len = kwargs.pop("prefill_seq_len", 128)
+
+        if self.comp_ctx_lengths_prefill and prefill_seq_len > 1:
             self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
                 self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len
             )
@@ -2338,7 +2339,9 @@ def from_pretrained(
         comp_ctx_lengths_prefill = kwargs.pop("comp_ctx_lengths_prefill", None)
         comp_ctx_lengths_decode = kwargs.pop("comp_ctx_lengths_decode", None)
         ctx_len = kwargs.pop("ctx_len", None)
-        if comp_ctx_lengths_prefill:
+        prefill_seq_len = kwargs.pop("prefill_seq_len", 128)
+
+        if comp_ctx_lengths_prefill and prefill_seq_len > 1:
             comp_ctx_lengths_prefill, comp_ctx_lengths_decode = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len
             )
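
Both code paths now pop prefill_seq_len (defaulting to 128) and only normalize the CCL lists when a real prefill graph exists (prefill_seq_len > 1). For orientation, here is a minimal, hypothetical sketch of what a helper with this call shape could do, assuming it caps each compute-context length at ctx_len, deduplicates, and sorts; the actual process_ccl_specializations implementation is not shown in this diff and may differ:

from typing import List, Optional, Tuple


def process_ccl_specializations(
    ccl_prefill: Optional[List[int]],
    ccl_decode: Optional[List[int]],
    ctx_len: Optional[int],
) -> Tuple[Optional[List[int]], Optional[List[int]]]:
    """Illustrative only: clamp CCL values to ctx_len, dedupe, and sort."""
    if ctx_len is None:
        return ccl_prefill, ccl_decode

    def _normalize(ccl: Optional[List[int]]) -> Optional[List[int]]:
        if not ccl:
            return ccl
        # Cap every compute-context length at the full context length.
        return sorted({min(c, ctx_len) for c in ccl})

    return _normalize(ccl_prefill), _normalize(ccl_decode)

Under these assumptions, comp_ctx_lengths_decode = [4096, 8192, 16384, 32768, 65536] with ctx_len = 65536 would pass through unchanged, while any entry above ctx_len would be capped.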
@@ -2356,6 +2359,7 @@ def from_pretrained(
             comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
             comp_ctx_lengths_decode=comp_ctx_lengths_decode,
             ctx_len=ctx_len,
+            prefill_seq_len=prefill_seq_len,
             kv_offload=kv_offload,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
             **kwargs,
@@ -2368,6 +2372,7 @@ def from_pretrained(
             comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
             comp_ctx_lengths_decode=comp_ctx_lengths_decode,
             ctx_len=ctx_len,
+            prefill_seq_len=prefill_seq_len,
             **kwargs,
         )

@@ -2643,7 +2648,7 @@ def build_decode_specialization(
             A dictionary defining the decode specialization, or None if it would be a duplicate
             of the prefill specialization (e.g., if prefill_seq_len is 1 and not continuous batching).
         """
-        if prefill_seq_len == 1 and not self.continuous_batching and comp_ctx_lengths is None:
+        if prefill_seq_len == 1 and not self.continuous_batching:  # and comp_ctx_lengths is None
             return None  # Avoid duplication with prefill
         spec = {
             "batch_size": full_batch_size if self.continuous_batching else batch_size,

examples/qwen3moe_example/ccl_qwen3moe_inference.py

Lines changed: 9 additions & 5 deletions
@@ -16,20 +16,23 @@
 # We will use prompt_len=1 for compilation for both cb and non-cb inference
 """

-ctx_len = 8192
-
-comp_ctx_lengths_prefill = [4096]
-comp_ctx_lengths_decode = [6144, 8192]
+ctx_len = 65536
+prefill_seq_len = 1
+# In MoE models, when compiling with prefill_seq_len=1 in non-continuous-batching mode, prefill and decode will share the same specializations.
+comp_ctx_lengths_prefill = [4096, 8192, 16384, 32768, ctx_len]
+comp_ctx_lengths_decode = [4096, 8192, 16384, 32768, ctx_len]

 model = QEFFAutoModelForCausalLM.from_pretrained(
     model_name,
     comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
     comp_ctx_lengths_decode=comp_ctx_lengths_decode,
     ctx_len=ctx_len,
     continuous_batching=False,
+    prefill_seq_len=prefill_seq_len,
 )
+# prefill_seq_len=prefill_seq_len,
 model.compile(
-    prefill_seq_len=1,
+    prefill_seq_len=prefill_seq_len,
     ctx_len=ctx_len,
     batch_size=1,
     num_cores=16,

@@ -38,5 +41,6 @@
     mxint8_kv_cache=True,
     mos=1,
 )
+# mos=1,
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer)
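
Putting the hunks together, the example script after this commit reads roughly as below. The import lines and the model_name assignment sit above the first hunk and so are assumptions modeled on QEfficient's other example scripts, model_name is a hypothetical placeholder, the two compile options elided by the diff context are left as a marker, and the commit's leftover commented-out lines (# prefill_seq_len=prefill_seq_len, and # mos=1,) are dropped for clarity:

# Assumed preamble -- not shown in the diff; modeled on other QEfficient examples.
from transformers import AutoTokenizer

from QEfficient import QEFFAutoModelForCausalLM
from QEfficient.utils.constants import Constants

model_name = "Qwen/Qwen3-30B-A3B"  # hypothetical placeholder; the real value sits above the hunk

ctx_len = 65536
prefill_seq_len = 1
# In MoE models, compiling with prefill_seq_len=1 in non-continuous-batching
# mode lets prefill and decode share the same specializations.
comp_ctx_lengths_prefill = [4096, 8192, 16384, 32768, ctx_len]
comp_ctx_lengths_decode = [4096, 8192, 16384, 32768, ctx_len]

model = QEFFAutoModelForCausalLM.from_pretrained(
    model_name,
    comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
    comp_ctx_lengths_decode=comp_ctx_lengths_decode,
    ctx_len=ctx_len,
    continuous_batching=False,
    prefill_seq_len=prefill_seq_len,
)
model.compile(
    prefill_seq_len=prefill_seq_len,
    ctx_len=ctx_len,
    batch_size=1,
    num_cores=16,
    # ... (compile options on the file's lines 36-37 are elided by the diff context)
    mxint8_kv_cache=True,
    mos=1,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer)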
