Skip to content

Commit 2542e3d

Browse files
committed
adding Compute Context Length (CCL) specialization
Signed-off-by: Vahid Janfaza <vjanfaza@qti.qualcomm.com>
1 parent a29319a commit 2542e3d

File tree

4 files changed

+17
-9
lines changed

4 files changed

+17
-9
lines changed

examples/ccl_image_text_to_text_inference.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,6 @@ def run_model(
9191
if __name__ == "__main__":
9292
# Model name and Input parameters
9393
model_name = "llava-hf/llava-1.5-7b-hf"
94-
# model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
9594
query = "Describe this image."
9695
image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
9796

@@ -101,10 +100,9 @@ def run_model(
101100
ctx_len = 8192
102101
generation_len = 128
103102
img_size = 336
104-
# img_size = 560
105103
num_cores = 16
106104
num_devices = 4
107-
comp_ctx_lengths = [4096,6144,8192]
105+
comp_ctx_lengths = [4096, 6144, 8192]
108106
prefill_ccl_len = 1
109107

110108
run_model(

examples/compute_context_length.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,5 +44,5 @@
4444
prompts=[
4545
"My name is ",
4646
],
47-
tokenizer=tokenizer
47+
tokenizer=tokenizer,
4848
)

examples/granite_example/ccl_granitemoe_inference.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,22 @@
1616
# We will use prompt_len=1 for compilation for both cb and non-cb inference
1717
"""
1818

19-
comp_ctx_lengths = [256,512,1024,2048] # None
19+
comp_ctx_lengths = [256, 512, 1024, 2048] # None
2020

2121
## Prefill_ccl_len shows how many values in the comp_ctx_lengths list are used for prefilling; the rest are used for decoding. The default value is 1.
2222
prefill_ccl_len = 2
2323

24-
model = QEFFAutoModelForCausalLM.from_pretrained(model_name, comp_ctx_lengths=comp_ctx_lengths, prefill_ccl_len=prefill_ccl_len, continuous_batching=False)
25-
model.compile(prefill_seq_len=1, ctx_len=2048, full_batch_size=1, num_cores=16, num_devices=4, mxfp6_matmul=False, mxint8_kv_cache=False)
24+
model = QEFFAutoModelForCausalLM.from_pretrained(
25+
model_name, comp_ctx_lengths=comp_ctx_lengths, prefill_ccl_len=prefill_ccl_len, continuous_batching=False
26+
)
27+
model.compile(
28+
prefill_seq_len=1,
29+
ctx_len=2048,
30+
full_batch_size=1,
31+
num_cores=16,
32+
num_devices=4,
33+
mxfp6_matmul=False,
34+
mxint8_kv_cache=False,
35+
)
2636
tokenizer = AutoTokenizer.from_pretrained(model_name)
27-
exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer, device_id=[16,17,18,19])
37+
exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer, device_id=[16, 17, 18, 19])

tests/transformers/test_comp_ctx_length.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,4 +173,4 @@ def test_causal_lm_compile(config, cb, tmp_cache):
173173
end = perf_counter()
174174
compile_time = end - start
175175
assert compile_time < 2.0
176-
assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
176+
assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))

0 commit comments

Comments (0)