
Commit a4fca59

Adding Compute-Context-Length (CCL)
Signed-off-by: Vahid Janfaza <vjanfaza@qti.qualcomm.com>
1 parent 9363689 commit a4fca59
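
For context, Compute-Context-Length (CCL) compiles the model with a ladder of context-length specializations (the comp_ctx_lengths_prefill / comp_ctx_lengths_decode lists in the diffs below) so that attention only has to span the smallest bucket covering the current position instead of the full ctx_len. A minimal sketch of the bucket-selection idea, assuming a simple "smallest bucket that fits" policy (the function name and logic are illustrative, not code from this commit):

def select_ccl_bucket(position: int, ccl_buckets: list[int]) -> int:
    # Walk the ascending CCL ladder and return the first bucket that can
    # hold the current KV-cache position; fall back to the largest bucket.
    for bucket in sorted(ccl_buckets):
        if position < bucket:
            return bucket
    return max(ccl_buckets)

# e.g. with buckets [4096, 8192, 16384, 32768], a decode step at position
# 5000 only needs the 8192 specialization rather than the full 32768.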


2 files changed: +11 −11 lines changed


QEfficient/transformers/models/modeling_auto.py

Lines changed: 4 additions & 4 deletions
@@ -2236,7 +2236,7 @@ def __init__(
         self.comp_ctx_lengths_decode = kwargs.pop("comp_ctx_lengths_decode", None)
         ctx_len = kwargs.pop("ctx_len", None)
         prefill_seq_len = kwargs.pop("prefill_seq_len", 128)
-
+
         if self.comp_ctx_lengths_prefill and prefill_seq_len > 1:
             self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
                 self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len
@@ -2340,7 +2340,7 @@ def from_pretrained(
         comp_ctx_lengths_decode = kwargs.pop("comp_ctx_lengths_decode", None)
         ctx_len = kwargs.pop("ctx_len", None)
         prefill_seq_len = kwargs.pop("prefill_seq_len", 128)
-
+
         if comp_ctx_lengths_prefill and prefill_seq_len > 1:
             comp_ctx_lengths_prefill, comp_ctx_lengths_decode = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len
@@ -2649,9 +2649,9 @@ def build_decode_specialization(
             of the prefill specialization (e.g., if prefill_seq_len is 1 and not continuous batching).
         """
         if prefill_seq_len == 1:
-            if not self.continuous_batching or batch_size==1:
+            if not self.continuous_batching or batch_size == 1:
                 return None  # Avoid duplication with prefill
-
+
         spec = {
             "batch_size": full_batch_size if self.continuous_batching else batch_size,
             "seq_len": (num_speculative_tokens + 1) if self.is_tlm else 1,

examples/qwen3moe_example/ccl_qwen3moe_inference.py

Lines changed: 7 additions & 7 deletions
@@ -16,31 +16,31 @@
 # We will use prompt_len=1 for compilation for both cb and non-cb inference
 """
 
-ctx_len = 65536
+ctx_len = 32768
 prefill_seq_len = 1
 # In moe models when compiling with prefill_seq_len=1 and non-continuous-batching mode, prefill and decode will share the same specializations.
-comp_ctx_lengths_prefill = [4096,8192,16384,32768,ctx_len]
-comp_ctx_lengths_decode = [4096,8192,16384,32768,ctx_len]
+comp_ctx_lengths_prefill = [4096, 8192, 16384, ctx_len]
+comp_ctx_lengths_decode = [4096, 8192, 16384, ctx_len]
 
 model = QEFFAutoModelForCausalLM.from_pretrained(
     model_name,
     comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
     comp_ctx_lengths_decode=comp_ctx_lengths_decode,
     ctx_len=ctx_len,
-    continuous_batching=False,
+    continuous_batching=True,
     prefill_seq_len=prefill_seq_len,
 )
-# prefill_seq_len=prefill_seq_len,
+# prefill_seq_len=prefill_seq_len,
 model.compile(
     prefill_seq_len=prefill_seq_len,
     ctx_len=ctx_len,
-    batch_size=1,
+    full_batch_size=1,
     num_cores=16,
     num_devices=4,
     mxfp6_matmul=True,
     mxint8_kv_cache=True,
     mos=1,
 )
-# mos=1,
+# mos=1,
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer)
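
Assembled for reference, the updated example reads roughly as below. The hunk starts at line 16, so the file header is not shown in this commit view: the imports and model_name are assumptions (import paths follow QEfficient's usual layout, and model_name is a placeholder for whatever Qwen3-MoE checkpoint the example actually sets):

from transformers import AutoTokenizer

from QEfficient import QEFFAutoModelForCausalLM
from QEfficient.utils.constants import Constants  # assumed import path

model_name = "Qwen/Qwen3-30B-A3B"  # placeholder; the real value is set above the shown hunk

ctx_len = 32768
prefill_seq_len = 1
comp_ctx_lengths_prefill = [4096, 8192, 16384, ctx_len]
comp_ctx_lengths_decode = [4096, 8192, 16384, ctx_len]

model = QEFFAutoModelForCausalLM.from_pretrained(
    model_name,
    comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
    comp_ctx_lengths_decode=comp_ctx_lengths_decode,
    ctx_len=ctx_len,
    continuous_batching=True,
    prefill_seq_len=prefill_seq_len,
)

model.compile(
    prefill_seq_len=prefill_seq_len,
    ctx_len=ctx_len,
    full_batch_size=1,  # continuous batching is sized by full_batch_size, not batch_size
    num_cores=16,
    num_devices=4,
    mxfp6_matmul=True,
    mxint8_kv_cache=True,
    mos=1,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer)

Note the paired change in the diff: switching continuous_batching to True is what motivates replacing batch_size=1 with full_batch_size=1 in compile, since continuous batching sizes its specializations by full_batch_size.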
