
Commit a29319a

adding Context Length Specialization (CCL)

Signed-off-by: Vahid Janfaza <vjanfaza@qti.qualcomm.com>

Parent: 5e44f81

2 files changed: +15 -5 lines


examples/granite_example/ccl_granite_vision_inference.py (1 addition, 1 deletion)

@@ -95,7 +95,7 @@ def run_model(
 img_size = 384
 num_cores = 16
 num_devices = 4
-comp_ctx_lengths = [5500,6144,8192]
+comp_ctx_lengths = [5500, 6144, 8192]
 prefill_ccl_len = 1

 run_model(

examples/qwen3moe_example/ccl_qwen3moe_inference.py (14 additions, 4 deletions)

@@ -16,12 +16,22 @@
 # We will use prompt_len=1 for compilation for both cb and non-cb inference
 """

-comp_ctx_lengths = [2048,4096,8192,16384,32768,65536] # None
+comp_ctx_lengths = [192, 256, 512, 1024]  # None

 ## Prefill_ccl_len shows how many numbers in the comp_ctx_lengths list is related to prefilling and the rest would be for decoding. The default value is 1.
 prefill_ccl_len = 2

-model = QEFFAutoModelForCausalLM.from_pretrained(model_name, comp_ctx_lengths=comp_ctx_lengths, prefill_ccl_len=prefill_ccl_len, continuous_batching=True)
-model.compile(prefill_seq_len=1, ctx_len=65536, full_batch_size=2, num_cores=16, num_devices=4, mxfp6_matmul=True, mxint8_kv_cache=True)
+model = QEFFAutoModelForCausalLM.from_pretrained(
+    model_name, comp_ctx_lengths=comp_ctx_lengths, prefill_ccl_len=prefill_ccl_len, continuous_batching=True
+)
+model.compile(
+    prefill_seq_len=1,
+    ctx_len=1024,
+    full_batch_size=2,
+    num_cores=16,
+    num_devices=4,
+    mxfp6_matmul=True,
+    mxint8_kv_cache=True,
+)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer, device_id=[16,17,18,19])
+exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer)
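
For readers skimming the diff, the relationship between comp_ctx_lengths and prefill_ccl_len is worth spelling out: per the comment in the example itself, prefill_ccl_len splits the list into a prefill bucket and a decode bucket. Below is a minimal sketch of that partitioning under the stated assumption; the helper split_ccl is hypothetical for illustration and not part of the QEfficient API.

def split_ccl(comp_ctx_lengths, prefill_ccl_len=1):
    # Hypothetical illustration: the first `prefill_ccl_len` entries are the
    # context-length specializations used during prefill; the remaining
    # entries are used while decoding (per the example's own comment).
    return comp_ctx_lengths[:prefill_ccl_len], comp_ctx_lengths[prefill_ccl_len:]

# With the values from the updated example:
prefill_ccls, decode_ccls = split_ccl([192, 256, 512, 1024], prefill_ccl_len=2)
assert prefill_ccls == [192, 256] and decode_ccls == [512, 1024]

Note that the largest entry (1024) matches the ctx_len=1024 now passed to model.compile(), just as the old list topped out at 65536 to match the previous ctx_len=65536; presumably every specialization must fit within the compiled context window.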
