
Commit a4fca59

Adding Compute-Context-Length (CCL)
Signed-off-by: Vahid Janfaza <vjanfaza@qti.qualcomm.com>
1 parent 9363689 commit a4fca59
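
For context, Compute-Context-Length (CCL) compiles the model with a ladder of context-length specializations (the comp_ctx_lengths_prefill / comp_ctx_lengths_decode lists in the diffs below) so that attention only has to span the smallest bucket covering the current position instead of the full ctx_len. A minimal sketch of the bucket-selection idea, assuming a simple "smallest bucket that fits" policy (the function name and logic are illustrative, not code from this commit):

def select_ccl_bucket(position: int, ccl_buckets: list[int]) -> int:
    # Walk the ascending CCL ladder and return the first bucket that can
    # hold the current KV-cache position; fall back to the largest bucket.
    for bucket in sorted(ccl_buckets):
        if position < bucket:
            return bucket
    return max(ccl_buckets)

# e.g. with buckets [4096, 8192, 16384, 32768], a decode step at position
# 5000 only needs the 8192 specialization rather than the full 32768.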


2 files changed: +11 −11 lines changed


QEfficient/transformers/models/modeling_auto.py

Lines changed: 4 additions & 4 deletions
@@ -2236,7 +2236,7 @@ def __init__(
         self.comp_ctx_lengths_decode = kwargs.pop("comp_ctx_lengths_decode", None)
         ctx_len = kwargs.pop("ctx_len", None)
         prefill_seq_len = kwargs.pop("prefill_seq_len", 128)
-
+
         if self.comp_ctx_lengths_prefill and prefill_seq_len > 1:
             self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode = process_ccl_specializations(
                 self.comp_ctx_lengths_prefill, self.comp_ctx_lengths_decode, ctx_len
@@ -2340,7 +2340,7 @@ def from_pretrained(
         comp_ctx_lengths_decode = kwargs.pop("comp_ctx_lengths_decode", None)
         ctx_len = kwargs.pop("ctx_len", None)
         prefill_seq_len = kwargs.pop("prefill_seq_len", 128)
-
+
         if comp_ctx_lengths_prefill and prefill_seq_len > 1:
             comp_ctx_lengths_prefill, comp_ctx_lengths_decode = process_ccl_specializations(
                 comp_ctx_lengths_prefill, comp_ctx_lengths_decode, ctx_len
@@ -2649,9 +2649,9 @@ def build_decode_specialization(
             of the prefill specialization (e.g., if prefill_seq_len is 1 and not continuous batching).
         """
         if prefill_seq_len == 1:
-            if not self.continuous_batching or batch_size==1:
+            if not self.continuous_batching or batch_size == 1:
                 return None  # Avoid duplication with prefill
-
+
         spec = {
             "batch_size": full_batch_size if self.continuous_batching else batch_size,
             "seq_len": (num_speculative_tokens + 1) if self.is_tlm else 1,

examples/qwen3moe_example/ccl_qwen3moe_inference.py

Lines changed: 7 additions & 7 deletions
@@ -16,31 +16,31 @@
 # We will use prompt_len=1 for compilation for both cb and non-cb inference
 """
 
-ctx_len = 65536
+ctx_len = 32768
 prefill_seq_len = 1
 # In moe models when compiling with prefill_seq_len=1 and non-continuous-batching mode, prefill and decode will share the same specializations.
-comp_ctx_lengths_prefill = [4096,8192,16384,32768,ctx_len]
-comp_ctx_lengths_decode = [4096,8192,16384,32768,ctx_len]
+comp_ctx_lengths_prefill = [4096, 8192, 16384, ctx_len]
+comp_ctx_lengths_decode = [4096, 8192, 16384, ctx_len]
 
 model = QEFFAutoModelForCausalLM.from_pretrained(
     model_name,
     comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
     comp_ctx_lengths_decode=comp_ctx_lengths_decode,
     ctx_len=ctx_len,
-    continuous_batching=False,
+    continuous_batching=True,
     prefill_seq_len=prefill_seq_len,
 )
-# prefill_seq_len=prefill_seq_len,
+# prefill_seq_len=prefill_seq_len,
 model.compile(
     prefill_seq_len=prefill_seq_len,
     ctx_len=ctx_len,
-    batch_size=1,
+    full_batch_size=1,
     num_cores=16,
     num_devices=4,
     mxfp6_matmul=True,
     mxint8_kv_cache=True,
     mos=1,
 )
-# mos=1,
+# mos=1,
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer)
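
Assembled for reference, the updated example reads roughly as below. The hunk starts at line 16, so the file header is not shown in this commit view: the imports and model_name are assumptions (import paths follow QEfficient's usual layout, and model_name is a placeholder for whatever Qwen3-MoE checkpoint the example actually sets):

from transformers import AutoTokenizer

from QEfficient import QEFFAutoModelForCausalLM
from QEfficient.utils.constants import Constants  # assumed import path

model_name = "Qwen/Qwen3-30B-A3B"  # placeholder; the real value is set above the shown hunk

ctx_len = 32768
prefill_seq_len = 1
comp_ctx_lengths_prefill = [4096, 8192, 16384, ctx_len]
comp_ctx_lengths_decode = [4096, 8192, 16384, ctx_len]

model = QEFFAutoModelForCausalLM.from_pretrained(
    model_name,
    comp_ctx_lengths_prefill=comp_ctx_lengths_prefill,
    comp_ctx_lengths_decode=comp_ctx_lengths_decode,
    ctx_len=ctx_len,
    continuous_batching=True,
    prefill_seq_len=prefill_seq_len,
)

model.compile(
    prefill_seq_len=prefill_seq_len,
    ctx_len=ctx_len,
    full_batch_size=1,  # continuous batching is sized by full_batch_size, not batch_size
    num_cores=16,
    num_devices=4,
    mxfp6_matmul=True,
    mxint8_kv_cache=True,
    mos=1,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
exec_info = model.generate(prompts=Constants.INPUT_STR, tokenizer=tokenizer)

Note the paired change in the diff: switching continuous_batching to True is what motivates replacing batch_size=1 with full_batch_size=1 in compile, since continuous batching sizes its specializations by full_batch_size.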
