
Commit a29319a

adding Context Length Specialization (CCL)

Signed-off-by: Vahid Janfaza <vjanfaza@qti.qualcomm.com>

Parent: 5e44f81

2 files changed: +15 -5 lines


examples/granite_example/ccl_granite_vision_inference.py (1 addition, 1 deletion)

@@ -95,7 +95,7 @@ def run_model(
 img_size = 384
 num_cores = 16
 num_devices = 4
-comp_ctx_lengths = [5500,6144,8192]
+comp_ctx_lengths = [5500, 6144, 8192]
 prefill_ccl_len = 1

 run_model(

examples/qwen3moe_example/ccl_qwen3moe_inference.py (14 additions, 4 deletions)

@@ -16,12 +16,22 @@
 # We will use prompt_len=1 for compilation for both cb and non-cb inference
 """

-comp_ctx_lengths = [2048,4096,8192,16384,32768,65536] # None
+comp_ctx_lengths = [192, 256, 512, 1024]  # None

 ## Prefill_ccl_len shows how many numbers in the comp_ctx_lengths list is related to prefilling and the rest would be for decoding. The default value is 1.
 prefill_ccl_len = 2

-model = QEFFAutoModelForCausalLM.from_pretrained(model_name, comp_ctx_lengths=comp_ctx_lengths, prefill_ccl_len=prefill_ccl_len, continuous_batching=True)
-model.compile(prefill_seq_len=1, ctx_len=65536, full_batch_size=2, num_cores=16, num_devices=4, mxfp6_matmul=True, mxint8_kv_cache=True)
+model = QEFFAutoModelForCausalLM.from_pretrained(
+    model_name, comp_ctx_lengths=comp_ctx_lengths, prefill_ccl_len=prefill_ccl_len, continuous_batching=True
+)
+model.compile(
+    prefill_seq_len=1,
+    ctx_len=1024,
+    full_batch_size=2,
+    num_cores=16,
+    num_devices=4,
+    mxfp6_matmul=True,
+    mxint8_kv_cache=True,
+)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer, device_id=[16,17,18,19])
+exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer)
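
For readers skimming the diff, the relationship between comp_ctx_lengths and prefill_ccl_len is worth spelling out: per the comment in the example itself, prefill_ccl_len splits the list into a prefill bucket and a decode bucket. Below is a minimal sketch of that partitioning under the stated assumption; the helper split_ccl is hypothetical for illustration and not part of the QEfficient API.

def split_ccl(comp_ctx_lengths, prefill_ccl_len=1):
    # Hypothetical illustration: the first `prefill_ccl_len` entries are the
    # context-length specializations used during prefill; the remaining
    # entries are used while decoding (per the example's own comment).
    return comp_ctx_lengths[:prefill_ccl_len], comp_ctx_lengths[prefill_ccl_len:]

# With the values from the updated example:
prefill_ccls, decode_ccls = split_ccl([192, 256, 512, 1024], prefill_ccl_len=2)
assert prefill_ccls == [192, 256] and decode_ccls == [512, 1024]

Note that the largest entry (1024) matches the ctx_len=1024 now passed to model.compile(), just as the old list topped out at 65536 to match the previous ctx_len=65536; presumably every specialization must fit within the compiled context window.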
