From 41dbbf50617d5be8bf1c8262c1521a4707b3b809 Mon Sep 17 00:00:00 2001 From: Vinayak Baddi Date: Wed, 6 Aug 2025 19:00:00 +0000 Subject: [PATCH 01/25] [QEff]: Add gpt_oss Signed-off-by: vbaddi Signed-off-by: Onkar Chougule --- QEfficient/transformers/models/modeling_auto.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 60f60c768..57f3b430b 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -28,7 +28,7 @@ import QEfficient from QEfficient.base.modeling_qeff import QEFFBaseModel from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform -from QEfficient.base.pytorch_transforms import SplitGateUpWeightsTransform +from QEfficient.base.pytorch_transforms import SplitGateUpWeightsTransform, SplitGateUpWeightsTransformGPTOSS from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.generation.text_generation_inference import ( CloudAI100ExecInfoNew, @@ -2109,6 +2109,7 @@ class QEFFAutoModelForCausalLM(QEFFBaseModel): CustomOpsTransform, KVCacheTransform, SplitGateUpWeightsTransform, + SplitGateUpWeightsTransformGPTOSS, KVCacheExternalModuleMapperTransform, ] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] From eb31daa79703f8cebbefa49bedadcb674c489516 Mon Sep 17 00:00:00 2001 From: Vinayak Baddi Date: Thu, 7 Aug 2025 14:34:08 +0000 Subject: [PATCH 02/25] nit: update modeling and make transform uniform Signed-off-by: vbaddi Signed-off-by: Onkar Chougule --- QEfficient/transformers/models/modeling_auto.py | 3 +-- pyproject.toml | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 57f3b430b..60f60c768 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -28,7 +28,7 @@ import QEfficient from QEfficient.base.modeling_qeff import QEFFBaseModel from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform -from QEfficient.base.pytorch_transforms import SplitGateUpWeightsTransform, SplitGateUpWeightsTransformGPTOSS +from QEfficient.base.pytorch_transforms import SplitGateUpWeightsTransform from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.generation.text_generation_inference import ( CloudAI100ExecInfoNew, @@ -2109,7 +2109,6 @@ class QEFFAutoModelForCausalLM(QEFFBaseModel): CustomOpsTransform, KVCacheTransform, SplitGateUpWeightsTransform, - SplitGateUpWeightsTransformGPTOSS, KVCacheExternalModuleMapperTransform, ] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] diff --git a/pyproject.toml b/pyproject.toml index ea3c3405d..dbff208a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,8 +22,8 @@ dependencies = [ "transformers==4.55.0", "huggingface-hub==0.34.0", "hf_transfer==0.1.9", - "peft==0.13.2", - "datasets==2.20.0", + "peft", + "datasets", "fsspec==2023.6.0", "multidict==6.0.4", "urllib3<2", From 8dcc3adcb33691c56350a4671996776650e7e781 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Thu, 7 Aug 2025 15:23:21 +0530 Subject: [PATCH 03/25] apirunner change Signed-off-by: Onkar Chougule --- QEfficient/transformers/models/modeling_auto.py | 5 +++++ QEfficient/utils/generate_inputs.py | 1 + 2 files changed, 6 insertions(+) diff --git a/QEfficient/transformers/models/modeling_auto.py 
b/QEfficient/transformers/models/modeling_auto.py index 60f60c768..18ffd2d33 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -2741,6 +2741,11 @@ def compile( for kv in ["key", "value"]: custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype + # HACK for now + if self.model.config.model_type == "gpt_oss": + for spec in specializations: + spec.update({"sliding_window": 128}) + qpc_path = self._compile( onnx_path=onnx_path, compile_dir=compile_dir, diff --git a/QEfficient/utils/generate_inputs.py b/QEfficient/utils/generate_inputs.py index 7d07db530..96b5be932 100644 --- a/QEfficient/utils/generate_inputs.py +++ b/QEfficient/utils/generate_inputs.py @@ -91,6 +91,7 @@ def prepare_pytorch_inputs(self): inputs["batch_index"] = torch.arange(self.full_batch_size).view(-1, 1) past_key_values = [] + sliding_padding_shape = self.padding_shape[:2] + [self.config.sliding_window] + self.padding_shape[-1] for i in range(self.n_layer): if ( all(hasattr(self.config, attr) for attr in ["sliding_window", "layer_types"]) From ce7e71903b675fbe1701564b4482cc831a2d1c60 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Thu, 7 Aug 2025 19:24:24 +0530 Subject: [PATCH 04/25] added test along with simplified Hybridcache Signed-off-by: Onkar Chougule --- QEfficient/utils/generate_inputs.py | 2 +- tests/test_gpt.py | 61 +++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 tests/test_gpt.py diff --git a/QEfficient/utils/generate_inputs.py b/QEfficient/utils/generate_inputs.py index 96b5be932..409ddd6e0 100644 --- a/QEfficient/utils/generate_inputs.py +++ b/QEfficient/utils/generate_inputs.py @@ -91,7 +91,7 @@ def prepare_pytorch_inputs(self): inputs["batch_index"] = torch.arange(self.full_batch_size).view(-1, 1) past_key_values = [] - sliding_padding_shape = self.padding_shape[:2] + [self.config.sliding_window] + self.padding_shape[-1] + sliding_padding_shape = self.padding_shape[:2] + [self.config.sliding_window] + [self.padding_shape[-1]] for i in range(self.n_layer): if ( all(hasattr(self.config, attr) for attr in ["sliding_window", "layer_types"]) diff --git a/tests/test_gpt.py b/tests/test_gpt.py new file mode 100644 index 000000000..27b423b63 --- /dev/null +++ b/tests/test_gpt.py @@ -0,0 +1,61 @@ +import torch +from transformers import AutoConfig, AutoModelForCausalLM, GptOssForCausalLM, TextStreamer + +from QEfficient import QEFFAutoModelForCausalLM +from QEfficient.utils._utils import load_hf_tokenizer +from QEfficient.utils.constants import Constants +from QEfficient.utils.run_utils import ApiRunner + +Constants.INPUT_STR=["Make sure tokens don't repeat\n\nTo make a simple cup of coffee, start by boiling water. Add one to two teaspoons of instant coffee powder to a mug. Pour the hot water over the coffee and stir well. Add sugar and milk to taste, if desired. For brewed coffee, use a French press or drip filter. Add coarsely ground coffee to the device, pour hot water over it, and let it steep for four minutes. 
Press or filter the coffee, then serve"] + +torch.manual_seed(42) +model_id = "openai/gpt-oss-20b" +config = AutoConfig.from_pretrained(model_id) +config.num_hidden_layers=2 + +# Remove the quantization_config attribute if it exists, to avoid MXFP4 Issues +if hasattr(config, "quantization_config"): + delattr(config, "quantization_config") + +model = GptOssForCausalLM.from_pretrained( + "/home/vbaddi/transformers/src/transformers/models/gpt_oss/new_weights", torch_dtype=torch.float32, attn_implementation="eager", config=config +) +model.eval() +model.generation_config.sample=False +tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_id) +config = model.config +batch_size = len(Constants.INPUT_STR) + +api_runner = ApiRunner(batch_size, tokenizer, config, Constants.INPUT_STR, 97, 256) +pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model) + + +qeff_model = QEFFAutoModelForCausalLM(model, continuous_batching=False) +# pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model) + +onnx_model_path = qeff_model.export() + + +qpc_path = qeff_model.compile( + prefill_seq_len=128, + ctx_len=256, + num_cores=16, + mxfp6_matmul=False, + mxint8_kv_cache=False, + num_devices=1, + mos=1, + aic_enable_depth_first=True, + num_speculative_tokens=None, +) +print(f"qpc path is {qpc_path}") +streamer = TextStreamer(tokenizer) +exec_info = qeff_model.generate( + tokenizer, + streamer=streamer, + prompts=Constants.INPUT_STR[0], + device_ids=[0], +) + +import ipdb; ipdb.set_trace() +print(pytorch_hf_tokens) +print(exec_info) From dedf20a4d6cd94f1137194ac41b952e479ac2dd6 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Thu, 7 Aug 2025 19:26:57 +0530 Subject: [PATCH 05/25] added test assert Signed-off-by: Onkar Chougule --- tests/test_gpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_gpt.py b/tests/test_gpt.py index 27b423b63..92c17c353 100644 --- a/tests/test_gpt.py +++ b/tests/test_gpt.py @@ -56,6 +56,6 @@ device_ids=[0], ) -import ipdb; ipdb.set_trace() print(pytorch_hf_tokens) print(exec_info) +assert (exec_info.generated_ids[0][0,:159] == pytorch_hf_tokens).all() From e35bfde066f268c9467fee3a51d1a6fa9f66c770 Mon Sep 17 00:00:00 2001 From: Vinayak Baddi Date: Fri, 8 Aug 2025 02:44:05 +0000 Subject: [PATCH 06/25] nit: update test gpt file Signed-off-by: vbaddi Signed-off-by: Onkar Chougule --- tests/test_gpt.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/test_gpt.py b/tests/test_gpt.py index 92c17c353..8e44f2f82 100644 --- a/tests/test_gpt.py +++ b/tests/test_gpt.py @@ -1,27 +1,39 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + import torch -from transformers import AutoConfig, AutoModelForCausalLM, GptOssForCausalLM, TextStreamer +from transformers import AutoConfig, GptOssForCausalLM, TextStreamer from QEfficient import QEFFAutoModelForCausalLM from QEfficient.utils._utils import load_hf_tokenizer from QEfficient.utils.constants import Constants from QEfficient.utils.run_utils import ApiRunner -Constants.INPUT_STR=["Make sure tokens don't repeat\n\nTo make a simple cup of coffee, start by boiling water. Add one to two teaspoons of instant coffee powder to a mug. Pour the hot water over the coffee and stir well. Add sugar and milk to taste, if desired. 
For brewed coffee, use a French press or drip filter. Add coarsely ground coffee to the device, pour hot water over it, and let it steep for four minutes. Press or filter the coffee, then serve"] +Constants.INPUT_STR = [ + "Make sure tokens don't repeat\n\nTo make a simple cup of coffee, start by boiling water. Add one to two teaspoons of instant coffee powder to a mug. Pour the hot water over the coffee and stir well. Add sugar and milk to taste, if desired. For brewed coffee, use a French press or drip filter. Add coarsely ground coffee to the device, pour hot water over it, and let it steep for four minutes. Press or filter the coffee, then serve" +] torch.manual_seed(42) model_id = "openai/gpt-oss-20b" config = AutoConfig.from_pretrained(model_id) -config.num_hidden_layers=2 +config.num_hidden_layers = 2 # Remove the quantization_config attribute if it exists, to avoid MXFP4 Issues if hasattr(config, "quantization_config"): delattr(config, "quantization_config") model = GptOssForCausalLM.from_pretrained( - "/home/vbaddi/transformers/src/transformers/models/gpt_oss/new_weights", torch_dtype=torch.float32, attn_implementation="eager", config=config + "/home/vbaddi/transformers/src/transformers/models/gpt_oss/new_weights", + torch_dtype=torch.float32, + attn_implementation="eager", + config=config, ) model.eval() -model.generation_config.sample=False +model.generation_config.sample = False tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_id) config = model.config batch_size = len(Constants.INPUT_STR) @@ -58,4 +70,4 @@ print(pytorch_hf_tokens) print(exec_info) -assert (exec_info.generated_ids[0][0,:159] == pytorch_hf_tokens).all() +assert (exec_info.generated_ids[0][0, :159] == pytorch_hf_tokens).all() From 7f6c4f6bc12ee18b15e207a21843f1e778331d8e Mon Sep 17 00:00:00 2001 From: Vinayak Baddi Date: Mon, 11 Aug 2025 07:00:22 +0000 Subject: [PATCH 07/25] nit: update modeling with new decode moe forward Signed-off-by: vbaddi Signed-off-by: Onkar Chougule --- QEfficient/transformers/models/modeling_auto.py | 2 +- QEfficient/transformers/models/pytorch_transforms.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 18ffd2d33..d0e8db83b 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -2108,7 +2108,7 @@ class QEFFAutoModelForCausalLM(QEFFBaseModel): Mxfp4GptOssExpertDequantizeTransform, CustomOpsTransform, KVCacheTransform, - SplitGateUpWeightsTransform, + # SplitGateUpWeightsTransform, KVCacheExternalModuleMapperTransform, ] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 773ce178c..1fa072637 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -54,7 +54,6 @@ from transformers.models.gpt_oss.modeling_gpt_oss import ( GptOssAttention, GptOssDecoderLayer, - GptOssExperts, GptOssForCausalLM, GptOssMLP, GptOssModel, @@ -255,7 +254,6 @@ from QEfficient.transformers.models.gpt_oss.modeling_gpt_oss import ( QEffGptOssAttention, QEffGptOssDecoderLayer, - QEffGptOssExperts, QEffGptOssForCausalLM, QEffGptOssMLP, QEffGptOssModel, @@ -526,7 +524,7 @@ class KVCacheTransform(ModuleMappingTransform): GptOssModel: QEffGptOssModel, GptOssForCausalLM: QEffGptOssForCausalLM, GptOssMLP: 
QEffGptOssMLP, - GptOssExperts: QEffGptOssExperts, + # GptOssExperts: QEffGptOssExperts, # Granite GraniteModel: QEffGraniteModel, GraniteForCausalLM: QEffGraniteForCausalLM, From 908d649e3b9b3110291e86d3116dbcb43f5b1f33 Mon Sep 17 00:00:00 2001 From: Vinayak Baddi Date: Wed, 20 Aug 2025 08:50:05 +0000 Subject: [PATCH 08/25] nit: seperate gate, up projections for MoE Signed-off-by: vbaddi Signed-off-by: Onkar Chougule --- QEfficient/transformers/models/modeling_auto.py | 2 +- QEfficient/transformers/models/pytorch_transforms.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index d0e8db83b..18ffd2d33 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -2108,7 +2108,7 @@ class QEFFAutoModelForCausalLM(QEFFBaseModel): Mxfp4GptOssExpertDequantizeTransform, CustomOpsTransform, KVCacheTransform, - # SplitGateUpWeightsTransform, + SplitGateUpWeightsTransform, KVCacheExternalModuleMapperTransform, ] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 1fa072637..773ce178c 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -54,6 +54,7 @@ from transformers.models.gpt_oss.modeling_gpt_oss import ( GptOssAttention, GptOssDecoderLayer, + GptOssExperts, GptOssForCausalLM, GptOssMLP, GptOssModel, @@ -254,6 +255,7 @@ from QEfficient.transformers.models.gpt_oss.modeling_gpt_oss import ( QEffGptOssAttention, QEffGptOssDecoderLayer, + QEffGptOssExperts, QEffGptOssForCausalLM, QEffGptOssMLP, QEffGptOssModel, @@ -524,7 +526,7 @@ class KVCacheTransform(ModuleMappingTransform): GptOssModel: QEffGptOssModel, GptOssForCausalLM: QEffGptOssForCausalLM, GptOssMLP: QEffGptOssMLP, - # GptOssExperts: QEffGptOssExperts, + GptOssExperts: QEffGptOssExperts, # Granite GraniteModel: QEffGraniteModel, GraniteForCausalLM: QEffGraniteForCausalLM, From e427267c7ef24c2db4f763c71bcd72b73782854c Mon Sep 17 00:00:00 2001 From: Vinayak Baddi Date: Wed, 15 Oct 2025 08:57:42 +0000 Subject: [PATCH 09/25] nit: remove test file and add sample test in config Signed-off-by: vbaddi Signed-off-by: Onkar Chougule --- tests/test_gpt.py | 73 ----------------------------------------------- 1 file changed, 73 deletions(-) delete mode 100644 tests/test_gpt.py diff --git a/tests/test_gpt.py b/tests/test_gpt.py deleted file mode 100644 index 8e44f2f82..000000000 --- a/tests/test_gpt.py +++ /dev/null @@ -1,73 +0,0 @@ -# ----------------------------------------------------------------------------- -# -# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. -# SPDX-License-Identifier: BSD-3-Clause -# -# ----------------------------------------------------------------------------- - -import torch -from transformers import AutoConfig, GptOssForCausalLM, TextStreamer - -from QEfficient import QEFFAutoModelForCausalLM -from QEfficient.utils._utils import load_hf_tokenizer -from QEfficient.utils.constants import Constants -from QEfficient.utils.run_utils import ApiRunner - -Constants.INPUT_STR = [ - "Make sure tokens don't repeat\n\nTo make a simple cup of coffee, start by boiling water. Add one to two teaspoons of instant coffee powder to a mug. Pour the hot water over the coffee and stir well. Add sugar and milk to taste, if desired. 
For brewed coffee, use a French press or drip filter. Add coarsely ground coffee to the device, pour hot water over it, and let it steep for four minutes. Press or filter the coffee, then serve" -] - -torch.manual_seed(42) -model_id = "openai/gpt-oss-20b" -config = AutoConfig.from_pretrained(model_id) -config.num_hidden_layers = 2 - -# Remove the quantization_config attribute if it exists, to avoid MXFP4 Issues -if hasattr(config, "quantization_config"): - delattr(config, "quantization_config") - -model = GptOssForCausalLM.from_pretrained( - "/home/vbaddi/transformers/src/transformers/models/gpt_oss/new_weights", - torch_dtype=torch.float32, - attn_implementation="eager", - config=config, -) -model.eval() -model.generation_config.sample = False -tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_id) -config = model.config -batch_size = len(Constants.INPUT_STR) - -api_runner = ApiRunner(batch_size, tokenizer, config, Constants.INPUT_STR, 97, 256) -pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model) - - -qeff_model = QEFFAutoModelForCausalLM(model, continuous_batching=False) -# pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model) - -onnx_model_path = qeff_model.export() - - -qpc_path = qeff_model.compile( - prefill_seq_len=128, - ctx_len=256, - num_cores=16, - mxfp6_matmul=False, - mxint8_kv_cache=False, - num_devices=1, - mos=1, - aic_enable_depth_first=True, - num_speculative_tokens=None, -) -print(f"qpc path is {qpc_path}") -streamer = TextStreamer(tokenizer) -exec_info = qeff_model.generate( - tokenizer, - streamer=streamer, - prompts=Constants.INPUT_STR[0], - device_ids=[0], -) - -print(pytorch_hf_tokens) -print(exec_info) -assert (exec_info.generated_ids[0][0, :159] == pytorch_hf_tokens).all() From 54323382534a42dc3861dc2342fed1d7b6620038 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Mon, 3 Nov 2025 11:52:29 +0000 Subject: [PATCH 10/25] Enable CB for GptOssModel Signed-off-by: Mamta Singh Signed-off-by: Onkar Chougule --- QEfficient/utils/generate_inputs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/QEfficient/utils/generate_inputs.py b/QEfficient/utils/generate_inputs.py index 409ddd6e0..9a6ba2e3d 100644 --- a/QEfficient/utils/generate_inputs.py +++ b/QEfficient/utils/generate_inputs.py @@ -173,6 +173,7 @@ def prepare_ort_inputs(self): inputs["past_key." + str(i)] = np.zeros((cache_shape), dtype=np.float32) inputs["past_value." 
+ str(i)] = np.zeros((cache_shape), dtype=np.float32) else: + sliding_padding_shape = self.padding_shape[:2] + [self.config.sliding_window] + [self.padding_shape[-1]] for i in range(self.n_layer): if ( all(hasattr(self.config, attr) for attr in ["sliding_window", "layer_types"]) From cb8145f3449a185cae7a665f0017da95cc43bb5d Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Tue, 4 Nov 2025 06:33:47 +0000 Subject: [PATCH 11/25] Fix tests Signed-off-by: Mamta Singh --- QEfficient/utils/generate_inputs.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/QEfficient/utils/generate_inputs.py b/QEfficient/utils/generate_inputs.py index 9a6ba2e3d..c5a17a76f 100644 --- a/QEfficient/utils/generate_inputs.py +++ b/QEfficient/utils/generate_inputs.py @@ -91,12 +91,15 @@ def prepare_pytorch_inputs(self): inputs["batch_index"] = torch.arange(self.full_batch_size).view(-1, 1) past_key_values = [] - sliding_padding_shape = self.padding_shape[:2] + [self.config.sliding_window] + [self.padding_shape[-1]] for i in range(self.n_layer): +<<<<<<< HEAD if ( all(hasattr(self.config, attr) for attr in ["sliding_window", "layer_types"]) and self.config.layer_types[i] == "sliding_attention" ): +======= + if hasattr(self.config, "sliding_window") and self.config.layer_types[i] == "sliding_attention": +>>>>>>> b1ed627 (Fix tests) pad_shape = self.padding_shape[:2] + [self.config.sliding_window] + [self.padding_shape[-1]] else: pad_shape = self.padding_shape @@ -173,7 +176,6 @@ def prepare_ort_inputs(self): inputs["past_key." + str(i)] = np.zeros((cache_shape), dtype=np.float32) inputs["past_value." + str(i)] = np.zeros((cache_shape), dtype=np.float32) else: - sliding_padding_shape = self.padding_shape[:2] + [self.config.sliding_window] + [self.padding_shape[-1]] for i in range(self.n_layer): if ( all(hasattr(self.config, attr) for attr in ["sliding_window", "layer_types"]) From 58c1740bf400818e5a372221b1af7f9cf3601642 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Tue, 4 Nov 2025 09:41:57 +0000 Subject: [PATCH 12/25] Address review comments Signed-off-by: Mamta Singh --- QEfficient/transformers/models/modeling_auto.py | 5 ----- QEfficient/utils/generate_inputs.py | 4 ---- pyproject.toml | 4 ++-- 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 18ffd2d33..60f60c768 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -2741,11 +2741,6 @@ def compile( for kv in ["key", "value"]: custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype - # HACK for now - if self.model.config.model_type == "gpt_oss": - for spec in specializations: - spec.update({"sliding_window": 128}) - qpc_path = self._compile( onnx_path=onnx_path, compile_dir=compile_dir, diff --git a/QEfficient/utils/generate_inputs.py b/QEfficient/utils/generate_inputs.py index c5a17a76f..7d07db530 100644 --- a/QEfficient/utils/generate_inputs.py +++ b/QEfficient/utils/generate_inputs.py @@ -92,14 +92,10 @@ def prepare_pytorch_inputs(self): past_key_values = [] for i in range(self.n_layer): -<<<<<<< HEAD if ( all(hasattr(self.config, attr) for attr in ["sliding_window", "layer_types"]) and self.config.layer_types[i] == "sliding_attention" ): -======= - if hasattr(self.config, "sliding_window") and self.config.layer_types[i] == "sliding_attention": ->>>>>>> b1ed627 (Fix tests) pad_shape = self.padding_shape[:2] + [self.config.sliding_window] + [self.padding_shape[-1]] 
else: pad_shape = self.padding_shape diff --git a/pyproject.toml b/pyproject.toml index dbff208a9..ea3c3405d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,8 +22,8 @@ dependencies = [ "transformers==4.55.0", "huggingface-hub==0.34.0", "hf_transfer==0.1.9", - "peft", - "datasets", + "peft==0.13.2", + "datasets==2.20.0", "fsspec==2023.6.0", "multidict==6.0.4", "urllib3<2", From dec061655055238a003da631e2f6dfaad2039bc1 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Tue, 4 Nov 2025 19:28:03 +0000 Subject: [PATCH 13/25] prefill only changes for gpt-oss Signed-off-by: Onkar Chougule --- QEfficient/__init__.py | 14 +- QEfficient/base/modeling_qeff.py | 27 +- QEfficient/transformers/cache_utils.py | 31 +++ QEfficient/transformers/modeling_utils.py | 3 + .../models/gpt_oss/modeling_gpt_oss.py | 238 +++++++++++++++++- .../transformers/models/modeling_auto.py | 57 ++++- .../transformers/models/pytorch_transforms.py | 11 + QEfficient/utils/_utils.py | 1 + QEfficient/utils/hash_utils.py | 1 + examples/gpt_oss_disagg_mode.py | 47 ++++ 10 files changed, 407 insertions(+), 23 deletions(-) create mode 100644 examples/gpt_oss_disagg_mode.py diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py index 33c6f5588..0efccd41b 100644 --- a/QEfficient/__init__.py +++ b/QEfficient/__init__.py @@ -6,17 +6,21 @@ # ----------------------------------------------------------------------------- import os -import warnings - -import QEfficient.utils.model_registery # noqa: F401 -from QEfficient.utils import custom_format_warning -from QEfficient.utils.logging_utils import logger +# ----------------------------------------------------------------------------- # # For faster downloads via hf_transfer # This code is put above import statements as this needs to be executed before # hf_transfer is imported (will happen on line 15 via leading imports) os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" +# DO NOT ADD ANY CODE ABOVE THIS LINE +# Please contact maintainers if you must edit this file above this line. 
+# ----------------------------------------------------------------------------- # # Placeholder for all non-transformer models registered in QEfficient +import warnings # noqa: I001 + +import QEfficient.utils.model_registery # noqa: F401 +from QEfficient.utils import custom_format_warning +from QEfficient.utils.logging_utils import logger # custom warning for the better logging experience diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 6ecbf0fc0..3a8b4d041 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -57,6 +57,7 @@ def __init__(self, model: torch.nn.Module, **kwargs) -> None: super().__init__() self.model = model self.hash_params = create_model_params(self, **kwargs) + self.prefill_onnx_path: Optional[str] = None self.onnx_path: Optional[str] = None self.qpc_path: Optional[str] = None self.qpc_session: Optional[QAICInferenceSession] = None @@ -179,6 +180,7 @@ def _export( onnx_transform_kwargs: Optional[Dict[str, any]] = None, export_dir: Optional[str] = None, offload_pt_weights: bool = True, + prefill_only: Optional[bool] = False, ) -> str: """ Export the PyTorch model to ONNX and apply ONNX transforms @@ -207,7 +209,10 @@ def _export( # Return early if ONNX already exists if onnx_path.is_file(): - self.onnx_path = onnx_path + if prefill_only: + self.prefill_onnx_path = onnx_path + else: + self.onnx_path = onnx_path return onnx_path # check if the model is in meta state or weights are offloaded @@ -283,8 +288,11 @@ def _export( finally: shutil.rmtree(tmp_onnx_dir, ignore_errors=True) - - self.onnx_path = onnx_path + print(onnx_path) + if prefill_only: + self.prefill_onnx_path = onnx_path + else: + self.onnx_path = onnx_path return onnx_path @dump_qconfig @@ -300,6 +308,8 @@ def _compile( num_speculative_tokens: Optional[int] = None, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + prefill_only: Optional[str] = None, + offload_pt_weights: Optional[bool] = True, **compiler_options, ) -> str: """ @@ -325,10 +335,16 @@ def _compile( For QNN Compilation path, when enable_qnn is set to True, any parameter passed in compiler_options will be ignored. 
""" + kwargs = {"offload_pt_weights": offload_pt_weights} + if prefill_only and self.prefill_onnx_path is None: + kwargs.update({"prefill_only": prefill_only, "prefill_seq_len": specializations[0].get("seq_len")}) + self.export(**kwargs) + onnx_path = Path(onnx_path or self.prefill_onnx_path) + if onnx_path is None and self.onnx_path is None: - self.export() + self.export(**kwargs) + onnx_path = Path(onnx_path or self.onnx_path) - onnx_path = Path(onnx_path or self.onnx_path) compile_dir = Path(compile_dir or onnx_path.parent) qpc_path = compile_dir / "qpc" if not onnx_path.is_file(): @@ -390,6 +406,7 @@ def _compile( "mdp_ts_num_devices": mdp_ts_num_devices, "mdp_ts_json": mdp_ts_json, "num_speculative_tokens": num_speculative_tokens, + "prefill_only": prefill_only, } compile_hash = hash_dict_params(compile_hash_params) diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py index 853567be9..18a15e480 100644 --- a/QEfficient/transformers/cache_utils.py +++ b/QEfficient/transformers/cache_utils.py @@ -594,6 +594,37 @@ def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]: legacy_cache += ((self.key_cache[layer_idx], self.value_cache[layer_idx]),) return legacy_cache + def write_only( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + if len(self.key_cache) <= layer_idx: + self.key_cache.append(key_states) + self.value_cache.append(value_states) + k_out, v_out = key_states, value_states + else: + position_ids = cache_kwargs.get("position_ids") + is_sliding_layer = cache_kwargs.get("is_sliding") + _, _, ctx_len, _ = self.key_cache[layer_idx].shape + if is_sliding_layer: + kv_position_ids = torch.arange(ctx_len, dtype=torch.int64).reshape(1, -1) + self.key_cache[layer_idx] = CtxScatterFunc.apply(self.key_cache[layer_idx], kv_position_ids, key_states) + self.value_cache[layer_idx] = CtxScatterFunc.apply( + self.value_cache[layer_idx], kv_position_ids, value_states + ) + else: + kv_position_ids = position_ids + + self.key_cache[layer_idx] = CtxScatterFunc.apply(self.key_cache[layer_idx], kv_position_ids, key_states) + self.value_cache[layer_idx] = CtxScatterFunc.apply( + self.value_cache[layer_idx], kv_position_ids, value_states + ) + k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx] + return k_out, v_out + def update( self, key_states: torch.Tensor, diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index 5337b44f5..47059d8dc 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -188,6 +188,9 @@ # This is for supporting different seq_len for different layers for Sliding window attn, chunked attn etc. DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH = {"gemma3", "llama4", "gemma3_text", "llama4_text"} +# This is for supporting different modelling classes specially written for prefill-only model +SPECIALIZED_PREFILL_ONLY_MODEL_ARCH = {"gpt_oss"} + # Define a transformers layers to QEff layers dictionary # While onboarding new models make sure to add the new layer maps to this dictionary. 
TransformersToQEffModulesDict: Dict[Type[nn.Module], Type[nn.Module]] = { diff --git a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py index 62bc849b7..69294fc66 100644 --- a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py +++ b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- +import os from typing import Callable, Optional, Union import torch @@ -32,6 +33,7 @@ from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask from QEfficient.utils import constants from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE +from QEfficient.utils.logging_utils import logger class QEffGptOssExperts(GptOssExperts): @@ -42,8 +44,8 @@ def __qeff_init__(self): self.up_proj_bias = nn.Parameter(torch.empty(self.num_experts, self.expert_dim)) -class QEffGptOssMLP(GptOssMLP): - def alt_forward(self, hidden: torch.Tensor): +class QEffPrefillOnlyGptOssMLP(GptOssMLP): + def forward(self, hidden: torch.Tensor): B, S, H = hidden.shape T = B * S hidden = hidden.view(T, H) @@ -95,6 +97,8 @@ def alt_forward(self, hidden: torch.Tensor): # original shape [B, S, H] return expert_out.view(B, S, H), router_logits + +class QEffGptOssMLP(GptOssMLP): # ------------------- Gather based, weights as activation approach --------------- def forward_weights_as_activation(self, hidden_states): bs, seq_len, _ = hidden_states.shape @@ -404,6 +408,137 @@ def eager_attention_forward( return attn_output, attn_weights +def eager_attention_forward_blocked( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + **kwargs, +): + softmax_count = 0 + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + BS, NH, CL, DH = query.shape + target_blocks = int(os.environ.get("NUM_BLOCKS")) + block_positions = [] + for j in range(target_blocks): + block_positions.append(j * (CL // target_blocks)) + + print(f"CL={CL}, target_blocks={target_blocks}") + + block_count = 0 + outs = [] + for block_idx in range(target_blocks): + block_count += 1 + qi = block_positions[block_idx] + + # Calculate block size (last block should be handled with remainder) + if block_idx == target_blocks - 1: + real_q_len = CL - qi + else: + real_q_len = block_positions[block_idx + 1] - qi + + q_block = query[:, :, qi : qi + real_q_len, :] + scores = torch.matmul(q_block, key_states.transpose(2, 3)) * scaling + attn_mask_block = attention_mask[:, :, qi : qi + real_q_len, :] + curr_attn_weights = torch.where( + attn_mask_block, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=torch.float32), scores + ) + sinks = module.sinks.reshape(1, -1, 1, 1).expand( + curr_attn_weights.shape[0], -1, curr_attn_weights.shape[-2], -1 + ) + combined_logits = torch.cat([curr_attn_weights, sinks], dim=-1) + combined_logits = combined_logits - combined_logits.max(dim=-1, keepdim=True).values + curr_attn_weights = nn.functional.softmax(combined_logits, dim=-1, dtype=torch.float32) + curr_attn_weights = curr_attn_weights[..., :-1] + out_block = torch.matmul(curr_attn_weights, value_states) + outs.append(out_block) + output = torch.cat(outs, dim=2) + + print(f"Completed {block_count} blocks, {softmax_count} softmax operations") + output = output.view(BS, NH, CL, DH).transpose(1, 
2).contiguous() + return output, output + + +class QEffPrefillOnlyGptOssAttention(GptOssAttention): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __qeff_init__(self): + self.rotary_emb = QEffGptOssRotaryEmbedding(config=self.config) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + batch_index: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + sliding_mask=None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor, torch.Tensor]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + hidden_shape = (*input_shape, -1, self.head_dim) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + cos, sin = self.rotary_emb(value_states, seq_len=32 * 1024) + query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = { + "sin": sin, + "cos": cos, + "batch_index": batch_index, + "position_ids": position_ids, + "config": self.config, + "is_sliding": self.sliding_window is not None, + "sliding_window": past_key_value.sliding_window_len, + } + if self.sliding_window is not None: + sliding_window_len = past_key_value.sliding_window_len + short_read_idx = torch.arange(sliding_window_len) + read_idx = short_read_idx + torch.where( + position_ids.max() > sliding_window_len - 1, position_ids.max() - sliding_window_len + 1, 0 + ) + # This is a trick to export with NUM_BLOCKS position_ids.max(), 0, read_idx) + k_cache = key_states[:, :, read_idx, :] + v_cache = value_states[:, :, read_idx, :] + else: + k_cache, v_cache = key_states, value_states + _, _ = past_key_value.write_only(k_cache, v_cache, self.layer_idx, cache_kwargs) + + if self.sliding_window is not None: + attention_mask = sliding_mask + else: + attention_mask = attention_mask + + attention_interface: Callable = eager_attention_forward_blocked + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + sliding_window=self.sliding_window, + s_aux=self.sinks, # diff with Llama + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights, past_key_value + + class QEffGptOssAttention(GptOssAttention): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -505,7 +640,6 @@ def forward( residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states, _ = self.mlp(hidden_states) # diff with llama: router scores - # alth, _ = self.mlp.alt_forward(hidden_states) hidden_states = hidden_states.reshape(residual.shape) hidden_states = residual + hidden_states outputs = (hidden_states,) @@ -519,6 +653,98 @@ def forward( return outputs +class QEffPrefillOnlyGptOssModel(GptOssModel): + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: 
Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + batch_index: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[TransformersKwargs], + ) -> MoeModelOutputWithPast: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + past_key_values = QEffHybridCacheForGPTOSS.from_legacy_cache(self.config, past_key_values) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + # target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else past_seen_tokens + causal_mask = _create_causal_mask(position_ids=position_ids, target_length=past_key_values.max_cache_len) + sliding_mask = _create_causal_mask( + position_ids=position_ids, + target_length=past_key_values.max_cache_len, + sliding_window=past_key_values.sliding_window_len, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + batch_index=batch_index, + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + sliding_mask=sliding_mask, + **kwargs, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if return_legacy_cache: + past_key_values = past_key_values.to_legacy_cache() + + return MoeModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values if use_cache else None, + ) + + class QEffGptOssModel(GptOssModel): def forward( self, @@ -714,9 +940,15 @@ def get_specializations( batch_size: int, prefill_seq_len: int, ctx_len: int, + **kwargs, ): batch_size = batch_size if batch_size else 1 prefill_seq_len = prefill_seq_len if prefill_seq_len else constants.PROMPT_LEN + if kwargs.get("prefill_only") and ctx_len != prefill_seq_len: + ctx_len = prefill_seq_len + logger.warning( + f"overriding 
ctx_len={prefill_seq_len}, currently we don't support ctx_len different than prefill_seq_len for prefill_only model" + ) ctx_len = ctx_len if ctx_len else constants.CTX_LEN specializations = [ diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 60f60c768..3de527f16 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -5,6 +5,7 @@ # # ---------------------------------------------------------------------------- +import os import warnings from pathlib import Path from time import perf_counter @@ -37,12 +38,16 @@ get_compilation_dims, ) from QEfficient.generation.vlm_generation import VisionLanguageGeneration -from QEfficient.transformers.modeling_utils import DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH +from QEfficient.transformers.modeling_utils import ( + DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH, + SPECIALIZED_PREFILL_ONLY_MODEL_ARCH, +) from QEfficient.transformers.models.pytorch_transforms import ( CustomOpsTransform, KVCacheExternalModuleMapperTransform, KVCacheTransform, PoolingTransform, + PrefillOnlyTransform, SamplerTransform, SpDTransform, VlmKVOffloadTransform, @@ -314,7 +319,7 @@ def get_model_config(self) -> dict: """ return self.model.config.__dict__ - def export(self, export_dir: Optional[str] = None) -> str: + def export(self, export_dir: Optional[str] = None, **kwargs) -> str: """ Export the model to ONNX format using ``torch.onnx.export``. @@ -594,7 +599,7 @@ def __init__(self, model: nn.modules, **kwargs): self.model = model.get_qeff_vision_encoder() self.hash_params["qeff_auto_class"] = self.__class__.__name__ - def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): + def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True, **kwargs): """ Exports the vision encoder component to ONNX format. @@ -736,7 +741,7 @@ def __init__(self, model, **kwargs): self.model = model.get_qeff_language_decoder() self.hash_params["qeff_auto_class"] = self.__class__.__name__ - def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True): + def export(self, inputs, output_names, dynamic_axes, export_dir=None, offload_pt_weights=True, **kwargs): """ Exports the language decoder component to ONNX format. @@ -2295,7 +2300,14 @@ def get_model_config(self) -> dict: """ return self.model.config.__dict__ - def export(self, export_dir: Optional[str] = None) -> str: + def export( + self, + export_dir: Optional[str] = None, + prefill_only: Optional[bool] = False, + prefill_seq_len: Optional[int] = None, + offload_pt_weights: Optional[bool] = True, + **kwargs, + ) -> str: """ Export the model to ONNX format using ``torch.onnx.export``. @@ -2314,8 +2326,25 @@ def export(self, export_dir: Optional[str] = None) -> str: str Path to the generated ONNX graph file. """ + if prefill_only: + block_size = os.environ.get("BLOCK_SIZE", None) + if block_size is None: + block_size = 128 + logger.warning( + "Setting BLOCK_SIZE=128 for prefill_only model, please set ENV variable `BLOCK_SIZE` to override" + ) + if prefill_seq_len is None or prefill_seq_len % block_size != 0: + raise ValueError( + f"When prefill_only=True, 'prefill_seq_len' must be explicitly set and divisible by block_size={block_size}. 
" + f"Received: prefill_seq_len={prefill_seq_len}" + ) + + os.environ["NUM_BLOCKS"] = str(prefill_seq_len // block_size) + if self.model.config.model_type in SPECIALIZED_PREFILL_ONLY_MODEL_ARCH: + self.model, tf = PrefillOnlyTransform.apply(self.model) + bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE - seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN if not prefill_only else prefill_seq_len // block_size fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS kv_cache_shape = get_padding_shape_from_config( self.model.config, fbs if self.continuous_batching else bs, seq_len @@ -2394,12 +2423,13 @@ def export(self, export_dir: Optional[str] = None) -> str: output_names=output_names, dynamic_axes=dynamic_axes, ) - return self._export( example_inputs, output_names, dynamic_axes, export_dir=export_dir, + offload_pt_weights=offload_pt_weights, + prefill_only=prefill_only, ) def get_sampling_inputs_and_outputs( @@ -2488,6 +2518,7 @@ def build_prefill_specialization( batch_size: int = 1, kv_cache_batch_size: Optional[int] = None, full_batch_size: Optional[int] = None, + **kwargs, ): """ Builds a dictionary representing a compilation specialization for the prefill phase. @@ -2515,6 +2546,7 @@ def build_prefill_specialization( batch_size=1 if self.continuous_batching else batch_size, prefill_seq_len=prefill_seq_len, ctx_len=ctx_len, + **kwargs, )[0] else: spec = { @@ -2603,6 +2635,7 @@ def compile( mxint8_kv_cache: bool = False, num_speculative_tokens: Optional[int] = None, prefill_only: Optional[bool] = None, + offload_pt_weights: Optional[bool] = True, **compiler_options, ) -> str: """ @@ -2705,6 +2738,9 @@ def compile( ): raise ValueError("Currently, sampler does not support `num_speculative_tokens` > 0.") + if kv_cache_batch_size and prefill_only is not None and prefill_only: + logger.warning("kv_cache_batch_size will be ignored as prefill_only is set to True") + # Infer kv_cache_batch_size if not provided kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size @@ -2740,7 +2776,6 @@ def compile( for i in range(self.num_layers): for kv in ["key", "value"]: custom_io[f"past_{kv}.{i}{suffix}"] = kv_cache_dtype - qpc_path = self._compile( onnx_path=onnx_path, compile_dir=compile_dir, @@ -2754,6 +2789,8 @@ def compile( num_speculative_tokens=num_speculative_tokens, aic_num_cores=num_cores, mxint8_kv_cache=mxint8_kv_cache, + prefill_only=prefill_only, + offload_pt_weights=offload_pt_weights, **compiler_options, ) @@ -2943,7 +2980,7 @@ def get_model_config(self) -> dict: """ return self.model.config.__dict__ - def export(self, export_dir: Optional[str] = None) -> str: + def export(self, export_dir: Optional[str] = None, **kwargs) -> str: """ Export the model to ONNX format using ``torch.onnx.export``. @@ -3307,7 +3344,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, pooling=None, *args, **k def get_model_config(self) -> dict: return self.model.config.__dict__ - def export(self, export_dir: Optional[str] = None) -> str: + def export(self, export_dir: Optional[str] = None, **kwargs) -> str: """ Exports the model to ``ONNX`` format using ``torch.onnx.export``. 
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 773ce178c..4d482fa98 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -259,6 +259,9 @@ QEffGptOssForCausalLM, QEffGptOssMLP, QEffGptOssModel, + QEffPrefillOnlyGptOssAttention, + QEffPrefillOnlyGptOssMLP, + QEffPrefillOnlyGptOssModel, ) from QEfficient.transformers.models.gptj.modeling_gptj import ( QEffGPTJAttention, @@ -630,6 +633,14 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]: return model, transformed +class PrefillOnlyTransform(ModuleMappingTransform): + _module_mapping = { + QEffGptOssModel: QEffPrefillOnlyGptOssModel, + QEffGptOssAttention: QEffPrefillOnlyGptOssAttention, + QEffGptOssExperts: QEffPrefillOnlyGptOssMLP, + } + + class SpDTransform: """ Apply generic QEffForCausalLM forward pass to extract `num_speculative_tokens+1` hidden states before computing logits during decode phase and extract last predicted token during prefill. diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index d58f54952..09b6bd830 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -566,6 +566,7 @@ def wrapper(self, *args, **kwargs): dynamic_axes=all_args.get("dynamic_axes"), export_kwargs=all_args.get("export_kwargs", None), onnx_transform_kwargs=all_args.get("onnx_transform_kwargs", None), + prefill_only=all_args.get("prefill_only", False), ) export_dir = export_dir.with_name(export_dir.name + "-" + export_hash) kwargs["export_dir"] = export_dir diff --git a/QEfficient/utils/hash_utils.py b/QEfficient/utils/hash_utils.py index b6b38b8b4..6b48cc244 100644 --- a/QEfficient/utils/hash_utils.py +++ b/QEfficient/utils/hash_utils.py @@ -67,5 +67,6 @@ def create_export_hash(**kwargs): export_hash_params.update(onnx_transform_kwargs) if export_hash_params.get("peft_config") is not None and not isinstance(export_hash_params["peft_config"], dict): export_hash_params["peft_config"] = export_hash_params["peft_config"].to_dict() + export_hash_params["prefill_only"] = kwargs.get("prefill_only") return hash_dict_params(export_hash_params), export_hash_params diff --git a/examples/gpt_oss_disagg_mode.py b/examples/gpt_oss_disagg_mode.py new file mode 100644 index 000000000..22238de13 --- /dev/null +++ b/examples/gpt_oss_disagg_mode.py @@ -0,0 +1,47 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +from transformers import AutoTokenizer, TextStreamer + +from QEfficient import QEFFAutoModelForCausalLM + +model_id = "openai/gpt-oss-20b" # weights are not required to convert to fp32 + +qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) +tokenizer = AutoTokenizer.from_pretrained(model_id) + +decode_qpc_path = qeff_model.compile( + prefill_seq_len=1, # Currently we can get best perf using PL=1 i.e. decode-only model, prefill optimizations are being worked on. + ctx_len=256, + num_cores=16, + mxfp6_matmul=True, + mxint8_kv_cache=True, + num_devices=1, + mos=1, + aic_enable_depth_first=True, + num_speculative_tokens=None, + offload_pt_weights=False, +) +prefill_qpc_path = qeff_model.compile( + prefill_seq_len=256, # Currently we can get best perf using PL=1 i.e. 
decode-only model, prefill optimizations are being worked on. + ctx_len=256, + num_cores=16, + mxfp6_matmul=True, + mxint8_kv_cache=True, + num_devices=1, + mos=1, + aic_enable_depth_first=True, + num_speculative_tokens=None, + prefill_only=True, +) +# print(f"qpc path is {qpc_path}") +# streamer = TextStreamer(tokenizer) +# exec_info = qeff_model.generate( +# tokenizer, +# prompts="Who is your creator? and What all you are allowed to do?", +# device_id=[0, 1, 2, 3], +# ) From c57e20824be3881e9e2eff89dc688f95f32e1864 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Wed, 5 Nov 2025 06:24:20 +0000 Subject: [PATCH 14/25] fixed mapping Signed-off-by: Onkar Chougule --- QEfficient/transformers/models/pytorch_transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 4d482fa98..14f0987c1 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -637,7 +637,7 @@ class PrefillOnlyTransform(ModuleMappingTransform): _module_mapping = { QEffGptOssModel: QEffPrefillOnlyGptOssModel, QEffGptOssAttention: QEffPrefillOnlyGptOssAttention, - QEffGptOssExperts: QEffPrefillOnlyGptOssMLP, + QEffGptOssMLP: QEffPrefillOnlyGptOssMLP, } From 7f8416f58cc6b0971b32b4065fa8b96deae1e5a0 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Thu, 6 Nov 2025 07:58:55 +0000 Subject: [PATCH 15/25] added test Signed-off-by: Onkar Chougule --- QEfficient/base/modeling_qeff.py | 2 +- .../models/gpt_oss/modeling_gpt_oss.py | 2 +- .../transformers/models/modeling_auto.py | 14 +- .../transformers/models/pytorch_transforms.py | 4 + .../transformers/quantizers/__init__.py | 4 +- examples/gpt_oss_disagg_mode.py | 133 ++++++++++++-- tests/transformers/models/test_disagg_mode.py | 171 ++++++++++++++++++ 7 files changed, 311 insertions(+), 19 deletions(-) create mode 100644 tests/transformers/models/test_disagg_mode.py diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 3a8b4d041..1f0796e96 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -57,6 +57,7 @@ def __init__(self, model: torch.nn.Module, **kwargs) -> None: super().__init__() self.model = model self.hash_params = create_model_params(self, **kwargs) + self.prefill_enabled = False self.prefill_onnx_path: Optional[str] = None self.onnx_path: Optional[str] = None self.qpc_path: Optional[str] = None @@ -288,7 +289,6 @@ def _export( finally: shutil.rmtree(tmp_onnx_dir, ignore_errors=True) - print(onnx_path) if prefill_only: self.prefill_onnx_path = onnx_path else: diff --git a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py index 69294fc66..7a7cd3cbf 100644 --- a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py +++ b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py @@ -422,7 +422,7 @@ def eager_attention_forward_blocked( value_states = repeat_kv(value, module.num_key_value_groups) BS, NH, CL, DH = query.shape - target_blocks = int(os.environ.get("NUM_BLOCKS")) + target_blocks = int(os.environ.get("NUM_BLOCKS", 1)) block_positions = [] for j in range(target_blocks): block_positions.append(j * (CL // target_blocks)) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 3de527f16..8c634a933 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ 
b/QEfficient/transformers/models/modeling_auto.py @@ -48,6 +48,7 @@ KVCacheTransform, PoolingTransform, PrefillOnlyTransform, + RevertPrefillOnlyTransform, SamplerTransform, SpDTransform, VlmKVOffloadTransform, @@ -2118,6 +2119,14 @@ class QEFFAutoModelForCausalLM(QEFFBaseModel): ] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] + def prefill(self, enable: Optional[bool] = True): + if enable: + self.model, tf = PrefillOnlyTransform.apply(self.model) + self.prefill_enabled = True + else: + self.model, tf = RevertPrefillOnlyTransform.apply(self.model) + self.prefill_enabled = False + def __init__( self, model: nn.Module, @@ -2341,8 +2350,9 @@ def export( os.environ["NUM_BLOCKS"] = str(prefill_seq_len // block_size) if self.model.config.model_type in SPECIALIZED_PREFILL_ONLY_MODEL_ARCH: - self.model, tf = PrefillOnlyTransform.apply(self.model) - + self.prefill(True) + else: + self.prefill(False) bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN if not prefill_only else prefill_seq_len // block_size fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 14f0987c1..b3e60a8b9 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -641,6 +641,10 @@ class PrefillOnlyTransform(ModuleMappingTransform): } +class RevertPrefillOnlyTransform(ModuleMappingTransform): + _module_mapping = {v: k for k, v in PrefillOnlyTransform._module_mapping.items()} + + class SpDTransform: """ Apply generic QEffForCausalLM forward pass to extract `num_speculative_tokens+1` hidden states before computing logits during decode phase and extract last predicted token during prefill. diff --git a/QEfficient/transformers/quantizers/__init__.py b/QEfficient/transformers/quantizers/__init__.py index dfadc00ef..dc2308e99 100644 --- a/QEfficient/transformers/quantizers/__init__.py +++ b/QEfficient/transformers/quantizers/__init__.py @@ -5,6 +5,6 @@ # # ----------------------------------------------------------------------------- -from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers +from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers, undo_transformers_quantizers -__all__ = ["replace_transformers_quantizers"] +__all__ = ["replace_transformers_quantizers", "undo_transformers_quantizers"] diff --git a/examples/gpt_oss_disagg_mode.py b/examples/gpt_oss_disagg_mode.py index 22238de13..9f196e002 100644 --- a/examples/gpt_oss_disagg_mode.py +++ b/examples/gpt_oss_disagg_mode.py @@ -5,18 +5,76 @@ # # ----------------------------------------------------------------------------- -from transformers import AutoTokenizer, TextStreamer +import time + +import numpy as np +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, HybridCache from QEfficient import QEFFAutoModelForCausalLM +from QEfficient.generation.cloud_infer import QAICInferenceSession model_id = "openai/gpt-oss-20b" # weights are not required to convert to fp32 +# prompt = """ +# Billions of years ago, in the vast emptiness of the early universe, tiny fluctuations in the density of matter began to grow under the influence of gravity. Clouds of gas—mostly hydrogen and helium—started to collapse, forming the first stars. These stars grouped together, bound by gravity, creating the earliest galaxies. 
+# Over time, these galaxies merged, collided, and evolved, shaping their spiral arms, elliptical forms, or irregular structures. Within their swirling depths, stars were born and died, enriching the galactic gas with heavier elements. These elements became the building blocks for planets, moons, and eventually life. +# Life is a very interesting phenomenon that occured in this universe +# """ +# prompt = "Once upon a time" +prompt = """ +Once upon a time, in a small town, there lived a young boy named Alex. Alex was a curious and adventurous child, always eager to explore the world around him. One day, while playing in the park, Alex stumbled upon a mysterious old book hidden beneath a pile of leaves. The book was filled with stories of distant lands, magical creatures, and extraordinary adventures. + +As Alex flipped through the pages, he discovered a map that led to a hidden treasure. Excited by the prospect of a real-life treasure hunt, Alex decided to embark on a thrilling journey. He packed his backpack with snacks, a flashlight, and a compass, and set off into the unknown. -qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) +The path to the treasure was not an easy one. Alex had to navigate through dense forests, cross rickety bridges, and solve riddles that guarded the treasure's location. +""" +all_outputs = [] +# Run prefill tokenizer = AutoTokenizer.from_pretrained(model_id) +PREFILL_SEQ_LEN = 256 +CTX_LEN = 256 +inputs = tokenizer(prompt, return_tensors="np", padding=True) +position_ids = inputs["attention_mask"].sum(1, keepdims=True) +padded_len = inputs["input_ids"].shape[1] +num_chunks = -(padded_len // -PREFILL_SEQ_LEN) # ceil divide without float +padded_len = num_chunks * PREFILL_SEQ_LEN # Convert to a multiple of prompt_len + +# Initialize variables specific to request +# Calculate the max generation length. +max_gen_len = CTX_LEN - position_ids.max() +generation_len = max_gen_len + +# model = AutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) +# config = model.config +# inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) +# inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) +# inputs.pop("token_type_ids", None) +# inputs = {k: torch.from_numpy(v).to(model.device) for k, v in inputs.items()} +# cache = HybridCache(config=config, batch_size=1, max_cache_len=8192) +# out = model(**tokenizer(prompt, return_tensors="pt"), past_key_values=cache) + + +qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id) +config = qeff_model.model.config +inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) +inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) +inputs.pop("token_type_ids", None) +inputs = {k: torch.from_numpy(v) for k, v in inputs.items()} +past_key_values = [] +for i in range(config.num_hidden_layers): + cache_len = config.sliding_window if i % 2 == 0 else PREFILL_SEQ_LEN + pad_shape = (1, 8, cache_len, 64) + past_key = torch.zeros((pad_shape), dtype=torch.float32) + past_value = torch.zeros((pad_shape), dtype=torch.float32) + pkv = (past_key, past_value) + past_key_values.append(pkv) +inputs["past_key_values"] = past_key_values + +# qeff_out = qeff_model.model(**inputs) decode_qpc_path = qeff_model.compile( - prefill_seq_len=1, # Currently we can get best perf using PL=1 i.e. decode-only model, prefill optimizations are being worked on. 
- ctx_len=256, + prefill_seq_len=1, + ctx_len=CTX_LEN, num_cores=16, mxfp6_matmul=True, mxint8_kv_cache=True, @@ -27,8 +85,8 @@ offload_pt_weights=False, ) prefill_qpc_path = qeff_model.compile( - prefill_seq_len=256, # Currently we can get best perf using PL=1 i.e. decode-only model, prefill optimizations are being worked on. - ctx_len=256, + prefill_seq_len=PREFILL_SEQ_LEN, + ctx_len=CTX_LEN, num_cores=16, mxfp6_matmul=True, mxint8_kv_cache=True, @@ -38,10 +96,59 @@ num_speculative_tokens=None, prefill_only=True, ) -# print(f"qpc path is {qpc_path}") -# streamer = TextStreamer(tokenizer) -# exec_info = qeff_model.generate( -# tokenizer, -# prompts="Who is your creator? and What all you are allowed to do?", -# device_id=[0, 1, 2, 3], -# ) + +prefill_session = QAICInferenceSession(prefill_qpc_path) + +logits_out_placeholder = np.zeros((1, 1, 201088), dtype=np.float32) +prefill_session.set_buffers({"logits": logits_out_placeholder}) +inputs.pop("past_key_values") +inputs = {k: v.detach().numpy() for k, v in inputs.items()} +st = time.time() +qpc_out = prefill_session.run(inputs) +print(f"time for prefill_run={time.time() - st} sec\n") + +decode_session = QAICInferenceSession(decode_qpc_path) +decode_session.set_buffers({"logits": logits_out_placeholder}) + +decode_inputs = { + "input_ids": np.argmax(qpc_out["logits"]).reshape(1, 1), + "position_ids": np.max(inputs["position_ids"]).reshape(1, 1) + 1, +} +print("pos_id for decodee", decode_inputs["position_ids"]) + +all_outputs.append(decode_inputs["input_ids"][0][0]) +for i in range(config.num_hidden_layers): + if i % 2 == 0 and decode_inputs["position_ids"] >= config.sliding_window: + k = qpc_out[f"past_key.{i}_RetainedState"] + v = qpc_out[f"past_value.{i}_RetainedState"] + mod_pos_id = config.sliding_window - decode_inputs["position_ids"][0][0] % config.sliding_window + decode_inputs[f"past_key.{i}"] = np.concatenate((k[:, :, mod_pos_id:, :], k[:, :, :mod_pos_id, :]), axis=-2) + decode_inputs[f"past_value.{i}"] = np.concatenate((v[:, :, mod_pos_id:, :], v[:, :, :mod_pos_id, :]), axis=-2) + else: + decode_inputs[f"past_key.{i}"] = qpc_out[f"past_key.{i}_RetainedState"] + decode_inputs[f"past_value.{i}"] = qpc_out[f"past_value.{i}_RetainedState"] + +st = time.time() +decode_out = decode_session.run(decode_inputs) +print(f"time for first run of decode with KV as input = {time.time() - st} sec\n") +decode_session.skip_buffers( + [x for x in decode_session.input_names + decode_session.output_names if x.startswith("past_")] +) +pos_id = np.max(decode_inputs["position_ids"]).reshape(1, 1) + 1 +st = time.time() +for i in range(generation_len - 2): + loop_decode_inputs = { + "input_ids": np.argmax(decode_out["logits"]).reshape(1, 1), + "position_ids": pos_id, + } + # for i in range(config.num_hidden_layers): + # loop_decode_inputs[f"past_key.{i}"] = decode_out[f"past_key.{i}_RetainedState"] + # loop_decode_inputs[f"past_value.{i}"] = decode_out[f"past_value.{i}_RetainedState"] + all_outputs.append(loop_decode_inputs["input_ids"][0][0]) + decode_out = decode_session.run(loop_decode_inputs) + pos_id += 1 + + +print(f"time for decode generation = {(time.time() - st) / (generation_len - 2)}") +print(all_outputs) +print(tokenizer.decode(all_outputs)) diff --git a/tests/transformers/models/test_disagg_mode.py b/tests/transformers/models/test_disagg_mode.py new file mode 100644 index 000000000..07106bddc --- /dev/null +++ b/tests/transformers/models/test_disagg_mode.py @@ -0,0 +1,171 @@ +# 
----------------------------------------------------------------------------- +# +# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. +# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + +import time + +import numpy as np +import pytest +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, HybridCache + +from QEfficient import QEFFAutoModelForCausalLM +from QEfficient.generation.cloud_infer import QAICInferenceSession +from QEfficient.transformers.quantizers import replace_transformers_quantizers, undo_transformers_quantizers + +replace_transformers_quantizers() + +model_id = "openai/gpt-oss-20b" # weights are not required to convert to fp32 +# prompt = """ +# Billions of years ago, in the vast emptiness of the early universe, tiny fluctuations in the density of matter began to grow under the influence of gravity. Clouds of gas—mostly hydrogen and helium—started to collapse, forming the first stars. These stars grouped together, bound by gravity, creating the earliest galaxies. +# Over time, these galaxies merged, collided, and evolved, shaping their spiral arms, elliptical forms, or irregular structures. Within their swirling depths, stars were born and died, enriching the galactic gas with heavier elements. These elements became the building blocks for planets, moons, and eventually life. +# Thus, from the quiet whispers of cosmic dust, a galaxy emerged—an island of stars, nebulae, and mysteries, drifting through the infinite sea of space. +# As the galaxy matured, its stars danced in intricate orbits, weaving patterns shaped by gravity and time. Supernovae exploded like cosmic fireworks, scattering elements across space and triggering new waves of star formation. Black holes formed at the hearts of galaxies, anchoring their structure and influencing their evolution. Over billions of years, the galaxy became a dynamic ecosystem—where stars are born, live, and die—each cycle adding to the richness of the cosmic tapestry. +# """ +prompt = """ +Once upon a time, in a small town, there lived a young boy named Alex. Alex was a curious and adventurous child, always eager to explore the world around him. One day, while playing in the park, Alex stumbled upon a mysterious old book hidden beneath a pile of leaves. The book was filled with stories of distant lands, magical creatures, and extraordinary adventures. + +As Alex flipped through the pages, he discovered a map that led to a hidden treasure. Excited by the prospect of a real-life treasure hunt, Alex decided to embark on a thrilling journey. He packed his backpack with snacks, a flashlight, and a compass, and set off into the unknown. + +The path to the treasure was not an easy one. Alex had to navigate through dense forests, cross rickety bridges, and solve riddles that guarded the treasure's location. 
+""" + + +@pytest.mark.parametrize("model_id", [model_id]) +def test_disagg_mode(model_id): + all_outputs = [] + # Run prefill + tokenizer = AutoTokenizer.from_pretrained(model_id) + PREFILL_SEQ_LEN = 256 + CTX_LEN = 256 + inputs = tokenizer(prompt, return_tensors="np", padding=True) + position_ids = inputs["attention_mask"].sum(1, keepdims=True) + padded_len = inputs["input_ids"].shape[1] + num_chunks = -(padded_len // -PREFILL_SEQ_LEN) # ceil divide without float + padded_len = num_chunks * PREFILL_SEQ_LEN # Convert to a multiple of prompt_len + + # Initialize variables specific to request + # Calculate the max generation length. + max_gen_len = CTX_LEN - position_ids.max() + generation_len = 50 + + # model = AutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) + model = AutoModelForCausalLM.from_pretrained(model_id) + config = model.config + inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) + inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) + inputs.pop("token_type_ids", None) + inputs = {k: torch.from_numpy(v).to(model.device) for k, v in inputs.items()} + cache = HybridCache(config=config, batch_size=1, max_cache_len=CTX_LEN) + ins = tokenizer(prompt, return_tensors="pt") + out = model(**ins, past_key_values=cache) + puts = { + "input_ids": out.logits[:, -1, :].argmax().reshape(1, -1), + "position_ids": ins["input_ids"].shape[-1].reshape(1, -1), + } + import ipdb + + ipdb.set_trace() + new_out = model(**puts, past_key_values=cache) + model.generation_config.do_sample = False + orig_all_out = model.generate( + **tokenizer(prompt, return_tensors="pt"), past_key_values=cache, max_new_tokens=max_gen_len + ) + undo_transformers_quantizers() + + qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id) + # qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) + qeff_model.prefill(True) + config = qeff_model.model.config + inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) + inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) + inputs.pop("token_type_ids", None) + inputs = {k: torch.from_numpy(v) for k, v in inputs.items()} + past_key_values = [] + for i in range(config.num_hidden_layers): + cache_len = 128 if i % 2 == 0 else PREFILL_SEQ_LEN + pad_shape = (1, 8, cache_len, 64) + past_key = torch.zeros((pad_shape), dtype=torch.float32) + past_value = torch.zeros((pad_shape), dtype=torch.float32) + pkv = (past_key, past_value) + past_key_values.append(pkv) + inputs["past_key_values"] = past_key_values + + qeff_out = qeff_model.model(**inputs) + + import ipdb + + ipdb.set_trace() + + decode_qpc_path = qeff_model.compile( + prefill_seq_len=1, + ctx_len=CTX_LEN, + num_cores=16, + mxfp6_matmul=True, + mxint8_kv_cache=True, + num_devices=1, + mos=1, + aic_enable_depth_first=True, + num_speculative_tokens=None, + offload_pt_weights=False, + ) + prefill_qpc_path = qeff_model.compile( + prefill_seq_len=PREFILL_SEQ_LEN, + ctx_len=CTX_LEN, + num_cores=16, + mxfp6_matmul=True, + mxint8_kv_cache=True, + num_devices=1, + mos=1, + aic_enable_depth_first=True, + num_speculative_tokens=None, + prefill_only=True, + ) + + prefill_session = QAICInferenceSession(prefill_qpc_path) + + logits_out_placeholder = np.zeros((1, 1, 201088), dtype=np.float32) + prefill_session.set_buffers({"logits": logits_out_placeholder}) + inputs.pop("past_key_values") + inputs = {k: v.detach().numpy() for k, v in 
inputs.items()} + st = time.time() + qpc_out = prefill_session.run(inputs) + print(f"time for prefill_run={time.time() - st} sec\n") + import ipdb + + ipdb.set_trace() + decode_session = QAICInferenceSession(decode_qpc_path) + decode_session.set_buffers({"logits": logits_out_placeholder}) + decode_session.skip_buffers( + [x for x in decode_session.input_names + decode_session.output_names if x.startswith("past_")] + ) + + decode_inputs = { + "input_ids": np.argmax(qpc_out["logits"]).reshape(1, 1), + "position_ids": np.max(inputs["position_ids"]).reshape(1, 1) + 1, + } + + all_outputs.append(decode_inputs["input_ids"][0][0]) + for i in range(config.num_hidden_layers): + decode_inputs[f"past_key.{i}"] = qpc_out[f"past_key.{i}_RetainedState"] + decode_inputs[f"past_value.{i}"] = qpc_out[f"past_value.{i}_RetainedState"] + + st = time.time() + decode_out = decode_session.run(decode_inputs) + print(f"time for first run of decode with KV as input = {time.time() - st} sec\n") + + st = time.time() + for i in range(generation_len - 2): + loop_decode_inputs = { + "input_ids": np.argmax(decode_out["logits"]).reshape(1, 1), + "position_ids": np.max(inputs["position_ids"]).reshape(1, 1) + 1, + } + all_outputs.append(loop_decode_inputs["input_ids"][0][0]) + decode_out = decode_session.run(loop_decode_inputs) + + print(f"time for decode generation = {(time.time() - st) / (generation_len - 2)}") + print(all_outputs) + print(tokenizer.decode(all_outputs)) From 08ccd20bfa70f563f3e40978b0217111bcee550e Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Thu, 6 Nov 2025 10:44:31 +0000 Subject: [PATCH 16/25] added test Signed-off-by: Onkar Chougule --- .../models/gpt_oss/modeling_gpt_oss.py | 6 +- tests/transformers/models/test_disagg_mode.py | 104 ++++-------------- 2 files changed, 23 insertions(+), 87 deletions(-) diff --git a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py index 7a7cd3cbf..2f7b4017a 100644 --- a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py +++ b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py @@ -487,7 +487,8 @@ def forward( hidden_shape = (*input_shape, -1, self.head_dim) key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, seq_len=32 * 1024) + + cos, sin = self.rotary_emb(value_states, seq_len=getattr(self.config, "max_position_embeddings", 32 * 1024)) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: @@ -564,7 +565,7 @@ def forward( key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, seq_len=32 * 1024) + cos, sin = self.rotary_emb(value_states, seq_len=getattr(self.config, "max_position_embeddings", 32 * 1024)) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: @@ -797,7 +798,6 @@ def forward( ) hidden_states = inputs_embeds - # position_embeddings = self.rotary_emb(hidden_states, position_ids) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/tests/transformers/models/test_disagg_mode.py b/tests/transformers/models/test_disagg_mode.py index 07106bddc..fdbd374ff 100644 --- 
a/tests/transformers/models/test_disagg_mode.py +++ b/tests/transformers/models/test_disagg_mode.py @@ -19,41 +19,33 @@ replace_transformers_quantizers() model_id = "openai/gpt-oss-20b" # weights are not required to convert to fp32 -# prompt = """ -# Billions of years ago, in the vast emptiness of the early universe, tiny fluctuations in the density of matter began to grow under the influence of gravity. Clouds of gas—mostly hydrogen and helium—started to collapse, forming the first stars. These stars grouped together, bound by gravity, creating the earliest galaxies. -# Over time, these galaxies merged, collided, and evolved, shaping their spiral arms, elliptical forms, or irregular structures. Within their swirling depths, stars were born and died, enriching the galactic gas with heavier elements. These elements became the building blocks for planets, moons, and eventually life. -# Thus, from the quiet whispers of cosmic dust, a galaxy emerged—an island of stars, nebulae, and mysteries, drifting through the infinite sea of space. -# As the galaxy matured, its stars danced in intricate orbits, weaving patterns shaped by gravity and time. Supernovae exploded like cosmic fireworks, scattering elements across space and triggering new waves of star formation. Black holes formed at the hearts of galaxies, anchoring their structure and influencing their evolution. Over billions of years, the galaxy became a dynamic ecosystem—where stars are born, live, and die—each cycle adding to the richness of the cosmic tapestry. -# """ -prompt = """ + +prompt2 = """ Once upon a time, in a small town, there lived a young boy named Alex. Alex was a curious and adventurous child, always eager to explore the world around him. One day, while playing in the park, Alex stumbled upon a mysterious old book hidden beneath a pile of leaves. The book was filled with stories of distant lands, magical creatures, and extraordinary adventures. As Alex flipped through the pages, he discovered a map that led to a hidden treasure. Excited by the prospect of a real-life treasure hunt, Alex decided to embark on a thrilling journey. He packed his backpack with snacks, a flashlight, and a compass, and set off into the unknown. The path to the treasure was not an easy one. Alex had to navigate through dense forests, cross rickety bridges, and solve riddles that guarded the treasure's location. """ +prompt1 = "Once upon a time" + +prompts = [prompt1, prompt2] +@pytest.mark.on_qaic @pytest.mark.parametrize("model_id", [model_id]) -def test_disagg_mode(model_id): - all_outputs = [] +@pytest.mark.parametrize("prompt", prompts) +def test_disagg_mode_prefill(model_id, prompt): # Run prefill tokenizer = AutoTokenizer.from_pretrained(model_id) PREFILL_SEQ_LEN = 256 CTX_LEN = 256 inputs = tokenizer(prompt, return_tensors="np", padding=True) - position_ids = inputs["attention_mask"].sum(1, keepdims=True) padded_len = inputs["input_ids"].shape[1] num_chunks = -(padded_len // -PREFILL_SEQ_LEN) # ceil divide without float padded_len = num_chunks * PREFILL_SEQ_LEN # Convert to a multiple of prompt_len - # Initialize variables specific to request - # Calculate the max generation length. 
- max_gen_len = CTX_LEN - position_ids.max() - generation_len = 50 - - # model = AutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) - model = AutoModelForCausalLM.from_pretrained(model_id) + model = AutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) config = model.config inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) @@ -62,22 +54,12 @@ def test_disagg_mode(model_id): cache = HybridCache(config=config, batch_size=1, max_cache_len=CTX_LEN) ins = tokenizer(prompt, return_tensors="pt") out = model(**ins, past_key_values=cache) - puts = { - "input_ids": out.logits[:, -1, :].argmax().reshape(1, -1), - "position_ids": ins["input_ids"].shape[-1].reshape(1, -1), - } - import ipdb - - ipdb.set_trace() - new_out = model(**puts, past_key_values=cache) - model.generation_config.do_sample = False - orig_all_out = model.generate( - **tokenizer(prompt, return_tensors="pt"), past_key_values=cache, max_new_tokens=max_gen_len - ) + undo_transformers_quantizers() - qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id) - # qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_id, num_hidden_layers=2, max_position_embeddings=64 * 1024 + ) qeff_model.prefill(True) config = qeff_model.model.config inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) @@ -96,28 +78,15 @@ def test_disagg_mode(model_id): qeff_out = qeff_model.model(**inputs) - import ipdb + # Check our pytorch implementation + assert (qeff_out.logits - out.logits[:, -1, :]).abs().max() < 1e-4 - ipdb.set_trace() - - decode_qpc_path = qeff_model.compile( - prefill_seq_len=1, - ctx_len=CTX_LEN, - num_cores=16, - mxfp6_matmul=True, - mxint8_kv_cache=True, - num_devices=1, - mos=1, - aic_enable_depth_first=True, - num_speculative_tokens=None, - offload_pt_weights=False, - ) prefill_qpc_path = qeff_model.compile( prefill_seq_len=PREFILL_SEQ_LEN, ctx_len=CTX_LEN, num_cores=16, - mxfp6_matmul=True, - mxint8_kv_cache=True, + mxfp6_matmul=False, + mxint8_kv_cache=False, num_devices=1, mos=1, aic_enable_depth_first=True, @@ -126,7 +95,6 @@ def test_disagg_mode(model_id): ) prefill_session = QAICInferenceSession(prefill_qpc_path) - logits_out_placeholder = np.zeros((1, 1, 201088), dtype=np.float32) prefill_session.set_buffers({"logits": logits_out_placeholder}) inputs.pop("past_key_values") @@ -134,38 +102,6 @@ def test_disagg_mode(model_id): st = time.time() qpc_out = prefill_session.run(inputs) print(f"time for prefill_run={time.time() - st} sec\n") - import ipdb - - ipdb.set_trace() - decode_session = QAICInferenceSession(decode_qpc_path) - decode_session.set_buffers({"logits": logits_out_placeholder}) - decode_session.skip_buffers( - [x for x in decode_session.input_names + decode_session.output_names if x.startswith("past_")] - ) - - decode_inputs = { - "input_ids": np.argmax(qpc_out["logits"]).reshape(1, 1), - "position_ids": np.max(inputs["position_ids"]).reshape(1, 1) + 1, - } - - all_outputs.append(decode_inputs["input_ids"][0][0]) - for i in range(config.num_hidden_layers): - decode_inputs[f"past_key.{i}"] = qpc_out[f"past_key.{i}_RetainedState"] - decode_inputs[f"past_value.{i}"] = qpc_out[f"past_value.{i}_RetainedState"] - - st = time.time() - decode_out = decode_session.run(decode_inputs) - print(f"time for first run of decode with KV as 
input = {time.time() - st} sec\n") - - st = time.time() - for i in range(generation_len - 2): - loop_decode_inputs = { - "input_ids": np.argmax(decode_out["logits"]).reshape(1, 1), - "position_ids": np.max(inputs["position_ids"]).reshape(1, 1) + 1, - } - all_outputs.append(loop_decode_inputs["input_ids"][0][0]) - decode_out = decode_session.run(loop_decode_inputs) - - print(f"time for decode generation = {(time.time() - st) / (generation_len - 2)}") - print(all_outputs) - print(tokenizer.decode(all_outputs)) + del prefill_session + # Check QAIC output isclose with QEFF pytorch output + assert (torch.from_numpy(qpc_out["logits"]) - qeff_out.logits).abs().max() < 5e-2 From 9c8dcae003c51784256753dafea0367c37e5e5fd Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Thu, 6 Nov 2025 10:46:23 +0000 Subject: [PATCH 17/25] made example not ugly Signed-off-by: Onkar Chougule --- examples/gpt_oss_disagg_mode.py | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/examples/gpt_oss_disagg_mode.py b/examples/gpt_oss_disagg_mode.py index 9f196e002..ee03f573a 100644 --- a/examples/gpt_oss_disagg_mode.py +++ b/examples/gpt_oss_disagg_mode.py @@ -9,18 +9,13 @@ import numpy as np import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, HybridCache +from transformers import AutoTokenizer from QEfficient import QEFFAutoModelForCausalLM from QEfficient.generation.cloud_infer import QAICInferenceSession model_id = "openai/gpt-oss-20b" # weights are not required to convert to fp32 -# prompt = """ -# Billions of years ago, in the vast emptiness of the early universe, tiny fluctuations in the density of matter began to grow under the influence of gravity. Clouds of gas—mostly hydrogen and helium—started to collapse, forming the first stars. These stars grouped together, bound by gravity, creating the earliest galaxies. -# Over time, these galaxies merged, collided, and evolved, shaping their spiral arms, elliptical forms, or irregular structures. Within their swirling depths, stars were born and died, enriching the galactic gas with heavier elements. These elements became the building blocks for planets, moons, and eventually life. -# Life is a very interesting phenomenon that occured in this universe -# """ -# prompt = "Once upon a time" + prompt = """ Once upon a time, in a small town, there lived a young boy named Alex. Alex was a curious and adventurous child, always eager to explore the world around him. One day, while playing in the park, Alex stumbled upon a mysterious old book hidden beneath a pile of leaves. The book was filled with stories of distant lands, magical creatures, and extraordinary adventures. 
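One step in this example that is easy to miss is the retained-state handoff from the prefill QPC to the decode QPC: layers that use a sliding-window cache hand their KV back as a ring buffer, so once the prompt is at least config.sliding_window tokens long those buffers must be rotated back into chronological order before the decode session consumes them (in this example that applies to the even-indexed layers only; full-attention layers are passed through unchanged). The helper below is a minimal sketch of that rotation, written against the buffer layout assumed by examples/gpt_oss_disagg_mode.py; the function name and signature are illustrative and not part of the patch.

import numpy as np

def unroll_sliding_window_kv(retained: np.ndarray, next_position: int, window: int) -> np.ndarray:
    # retained: [batch, kv_heads, window, head_dim], as returned in past_key.{i}_RetainedState.
    # The prefill QPC fills this cache as a ring buffer, so once next_position >= window the
    # oldest token is no longer at sequence index 0; rotate along the sequence axis so the
    # decode QPC reads the entries in chronological order.
    if next_position < window:
        return retained
    split = window - (next_position % window)
    return np.concatenate((retained[:, :, split:, :], retained[:, :, :split, :]), axis=-2)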
@@ -44,15 +39,6 @@ max_gen_len = CTX_LEN - position_ids.max() generation_len = max_gen_len -# model = AutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) -# config = model.config -# inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) -# inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1) -# inputs.pop("token_type_ids", None) -# inputs = {k: torch.from_numpy(v).to(model.device) for k, v in inputs.items()} -# cache = HybridCache(config=config, batch_size=1, max_cache_len=8192) -# out = model(**tokenizer(prompt, return_tensors="pt"), past_key_values=cache) - qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id) config = qeff_model.model.config @@ -70,7 +56,6 @@ past_key_values.append(pkv) inputs["past_key_values"] = past_key_values -# qeff_out = qeff_model.model(**inputs) decode_qpc_path = qeff_model.compile( prefill_seq_len=1, @@ -141,9 +126,6 @@ "input_ids": np.argmax(decode_out["logits"]).reshape(1, 1), "position_ids": pos_id, } - # for i in range(config.num_hidden_layers): - # loop_decode_inputs[f"past_key.{i}"] = decode_out[f"past_key.{i}_RetainedState"] - # loop_decode_inputs[f"past_value.{i}"] = decode_out[f"past_value.{i}_RetainedState"] all_outputs.append(loop_decode_inputs["input_ids"][0][0]) decode_out = decode_session.run(loop_decode_inputs) pos_id += 1 From 50db73f2a2c7d0d7a586ee054cce34a672e6e42f Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Thu, 6 Nov 2025 15:21:26 +0000 Subject: [PATCH 18/25] fixed tests Signed-off-by: Onkar Chougule --- QEfficient/base/modeling_qeff.py | 11 ++++++----- .../transformers/models/gpt_oss/modeling_gpt_oss.py | 4 ++-- QEfficient/transformers/models/modeling_auto.py | 5 +++++ tests/transformers/test_causal_lm.py | 2 ++ 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 1f0796e96..43d1e3bfb 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -336,14 +336,15 @@ def _compile( For QNN Compilation path, when enable_qnn is set to True, any parameter passed in compiler_options will be ignored. 
""" kwargs = {"offload_pt_weights": offload_pt_weights} - if prefill_only and self.prefill_onnx_path is None: + if onnx_path is None and prefill_only: kwargs.update({"prefill_only": prefill_only, "prefill_seq_len": specializations[0].get("seq_len")}) self.export(**kwargs) - onnx_path = Path(onnx_path or self.prefill_onnx_path) - - if onnx_path is None and self.onnx_path is None: + onnx_path = Path(self.prefill_onnx_path) + elif onnx_path is None: self.export(**kwargs) - onnx_path = Path(onnx_path or self.onnx_path) + onnx_path = Path(self.onnx_path) + else: + onnx_path = Path(onnx_path) compile_dir = Path(compile_dir or onnx_path.parent) qpc_path = compile_dir / "qpc" diff --git a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py index 2f7b4017a..ff3459753 100644 --- a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py +++ b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py @@ -488,7 +488,7 @@ def forward( key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, seq_len=getattr(self.config, "max_position_embeddings", 32 * 1024)) + cos, sin = self.rotary_emb(value_states, seq_len=getattr(self.config, "max_seq_len_cached", 32 * 1024)) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: @@ -565,7 +565,7 @@ def forward( key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, seq_len=getattr(self.config, "max_position_embeddings", 32 * 1024)) + cos, sin = self.rotary_emb(value_states, seq_len=getattr(self.config, "max_seq_len_cached", 32 * 1024)) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 8c634a933..bbfc5d2e8 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -2132,6 +2132,7 @@ def __init__( model: nn.Module, continuous_batching: bool = False, qaic_config: Optional[dict] = None, + max_seq_len_cached: Optional[int] = None, **kwargs, ): """ @@ -2177,6 +2178,7 @@ def __init__( ) # Set use_cache=True to get KV values as output during ONNX export model.config.use_cache = True + setattr(model.config, "max_seq_len_cached", max_seq_len_cached) super().__init__(model, qaic_config=qaic_config, **kwargs) self.num_layers = model.config.num_hidden_layers self.continuous_batching = continuous_batching @@ -2185,6 +2187,7 @@ def __init__( self.is_tlm = transformed self.hash_params["qeff_auto_class"] = self.__class__.__name__ + self.hash_params["max_seq_len_cached"] = max_seq_len_cached # ---Sampling--- # Note: SamplerTransform should be applied after all other transforms @@ -2221,6 +2224,7 @@ def from_pretrained( pretrained_model_name_or_path, continuous_batching: bool = False, qaic_config: Optional[dict] = None, + max_seq_len_cached: Optional[int] = None, *args, **kwargs, ): @@ -2294,6 +2298,7 @@ def from_pretrained( continuous_batching=continuous_batching, qaic_config=qaic_config, pretrained_model_name_or_path=pretrained_model_name_or_path, + max_seq_len_cached=max_seq_len_cached, **kwargs, ) diff 
--git a/tests/transformers/test_causal_lm.py b/tests/transformers/test_causal_lm.py index 0810ac6ba..5e5ad4b5d 100644 --- a/tests/transformers/test_causal_lm.py +++ b/tests/transformers/test_causal_lm.py @@ -154,6 +154,7 @@ def test_causal_lm_hash_creation(config, cb, tmp_path): hash_params["peft_config"] = None hash_params["applied_transform_names"] = qeff_model._transform_names() hash_params["qeff_auto_class"] = qeff_model.__class__.__name__ + hash_params["max_seq_len_cached"] = None hash_params["qaic_config"] = None # Create parameters separately for hash creation @@ -204,6 +205,7 @@ def test_causal_lm_hash_creation(config, cb, tmp_path): export_params["output_names"] = output_names export_params["dynamic_axes"] = dynamic_axes hash_params["export_params"] = export_params + hash_params["prefill_only"] = False manual_hash = hash_dict_params(hash_params) assert manual_hash == qeff_model.export_hash From 00eab9801149c3e265cbbe66c4b9253c9170d769 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Thu, 6 Nov 2025 15:44:39 +0000 Subject: [PATCH 19/25] fixed tests Signed-off-by: Onkar Chougule --- tests/transformers/models/test_disagg_mode.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/transformers/models/test_disagg_mode.py b/tests/transformers/models/test_disagg_mode.py index fdbd374ff..0e303d389 100644 --- a/tests/transformers/models/test_disagg_mode.py +++ b/tests/transformers/models/test_disagg_mode.py @@ -16,8 +16,6 @@ from QEfficient.generation.cloud_infer import QAICInferenceSession from QEfficient.transformers.quantizers import replace_transformers_quantizers, undo_transformers_quantizers -replace_transformers_quantizers() - model_id = "openai/gpt-oss-20b" # weights are not required to convert to fp32 prompt2 = """ @@ -45,6 +43,7 @@ def test_disagg_mode_prefill(model_id, prompt): num_chunks = -(padded_len // -PREFILL_SEQ_LEN) # ceil divide without float padded_len = num_chunks * PREFILL_SEQ_LEN # Convert to a multiple of prompt_len + replace_transformers_quantizers() model = AutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) config = model.config inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) From 4a43f0b8d7914b8d1576f644f73bdde2e0c4907d Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Fri, 7 Nov 2025 09:31:55 +0000 Subject: [PATCH 20/25] added new test and fixed failing tests Signed-off-by: Onkar Chougule --- QEfficient/peft/auto.py | 3 +- QEfficient/peft/lora/auto.py | 3 +- .../models/gpt_oss/modeling_gpt_oss.py | 178 +++++++++++++++++- .../transformers/models/modeling_auto.py | 44 +++-- QEfficient/utils/_utils.py | 1 - QEfficient/utils/hash_utils.py | 1 - scripts/Jenkinsfile | 2 +- tests/peft/test_peft_model.py | 6 +- tests/transformers/test_causal_lm.py | 77 +++++--- 9 files changed, 255 insertions(+), 60 deletions(-) diff --git a/QEfficient/peft/auto.py b/QEfficient/peft/auto.py index 592c0c1d3..e5b8bd185 100644 --- a/QEfficient/peft/auto.py +++ b/QEfficient/peft/auto.py @@ -245,7 +245,7 @@ def from_pretrained(cls, pretrained_name_or_path: str, *args, **kwargs): obj = cls._from_pretrained(pretrained_name_or_path, *args, **kwargs) return obj - def export(self, export_dir: Optional[str] = None) -> str: + def export(self, export_dir: Optional[str] = None, **kwargs) -> str: """ Export the model with the active adapter to ONNX format. 
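Besides threading extra keyword arguments through the PEFT export wrappers, this patch adds blocked variants of the GPT-OSS MoE feed-forward (blocked_ffn_forward and blocked_ffn_forward_block_weights). The idea is to split the flattened token dimension, and optionally the expert weight columns, into a fixed number of chunks so that every matmul presented to the compiler has a bounded shape; the chunk count is read from an environment variable (NUM_BLOCKS at this point in the series, renamed NUM_FFN_BLOCKS in a later patch). The snippet below is a simplified sketch of the token-blocking idea only, assuming a single dense expert: it leaves out the router, top-k masking, biases and the clamp-and-GLU activation used by the real experts, and every name in it is a placeholder.

import torch

def token_blocked_ffn(x: torch.Tensor, w_gate: torch.Tensor, w_up: torch.Tensor,
                      w_down: torch.Tensor, num_blocks: int) -> torch.Tensor:
    # x: [T, H]; w_gate, w_up: [H, I]; w_down: [I, H].
    # Walk the token dimension in num_blocks chunks; the last chunk absorbs any
    # remainder, mirroring how the patch computes its block boundaries.
    T = x.shape[0]
    step = T // num_blocks
    outs = []
    for j in range(num_blocks):
        start = j * step
        end = T if j == num_blocks - 1 else start + step
        blk = x[start:end]                                # [t, H]
        gate = blk @ w_gate                               # [t, I]
        up = blk @ w_up                                   # [t, I]
        outs.append((up * torch.sigmoid(gate)) @ w_down)  # [t, H]
    return torch.cat(outs, dim=0)                         # [T, H]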
@@ -286,6 +286,7 @@ def export(self, export_dir: Optional[str] = None) -> str: export_kwargs={"do_constant_folding": False}, # To avoid merging adapter weights with base weights onnx_transform_kwargs={"adapter_name": self.model.active_adapter}, export_dir=export_dir, + **kwargs, ) def compile( diff --git a/QEfficient/peft/lora/auto.py b/QEfficient/peft/lora/auto.py index 8196cd769..8ff8335f5 100644 --- a/QEfficient/peft/lora/auto.py +++ b/QEfficient/peft/lora/auto.py @@ -327,7 +327,7 @@ def _init_adapter_model(self): # load_weight to model self._load_adapter_weights_to_model() - def export(self, export_dir: Optional[str] = None) -> str: + def export(self, export_dir: Optional[str] = None, **kwargs) -> str: """ Export the model with all loaded adapters to ONNX format using ``torch.onnx.export``. @@ -387,6 +387,7 @@ def export(self, export_dir: Optional[str] = None) -> str: output_names, dynamic_axes, export_dir=export_dir, + **kwargs, ) def generate( diff --git a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py index ff3459753..2f9e33113 100644 --- a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py +++ b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- +import math import os from typing import Callable, Optional, Union @@ -97,6 +98,167 @@ def forward(self, hidden: torch.Tensor): # original shape [B, S, H] return expert_out.view(B, S, H), router_logits + def blocked_ffn_forward(self, hidden: torch.Tensor): + B, S, H = hidden.shape + T = B * S + hidden = hidden.view(T, H) + + # Router computation + router_logits = F.linear(hidden, self.router.weight, self.router.bias) + + # Top-k selection + top_w, top_i = torch.topk(router_logits, self.router.top_k, dim=-1) # both [T, K] + top_w = torch.nn.functional.softmax(top_w, dim=1, dtype=top_w.dtype) + + masked_logits = torch.zeros_like(router_logits) + masked_logits.scatter_(1, top_i, top_w) + + # Routing weights for each expert [T, E] + routing_weights = masked_logits + + # ────────────────── allocate the output tensor ───── + expert_out = hidden.new_zeros((T, H)) # accumulation buffer + target_blocks = int(os.environ.get("NUM_BLOCKS", 1)) + block_positions = [] + for j in range(target_blocks): + block_positions.append(j * (T // target_blocks)) + # ───────────────────────── Expert computation loop ───────────────────────────── + for e in range(self.experts.num_experts): + routing_weight = routing_weights[:, e].unsqueeze(-1) # [T, 1] + + W_g, W_u = self.experts.gate_proj[e], self.experts.up_proj[e] # [H, I], [H, I] + b_g, b_u = self.experts.gate_proj_bias[e], self.experts.up_proj_bias[e] # [I], [I] + W_d = self.experts.down_proj[e] # [I, H] + b_d = self.experts.down_proj_bias[e] # [H] + + block_count = 0 + outs = [] + for block_idx in range(target_blocks): + block_count += 1 + qi = block_positions[block_idx] + + # Calculate block size (last block should be handled with remainder) + if block_idx == target_blocks - 1: + real_q_len = T - qi + else: + real_q_len = block_positions[block_idx + 1] - qi + + tgb = hidden[qi : qi + real_q_len, :] + # Gate and Up projections + # Gate and Up projections + gate = (tgb @ W_g) + b_g # [T, I] + up = (tgb @ W_u) + b_u # [T, I] + + # Apply GptOss activation with clamping + gate = gate.clamp(min=None, max=self.experts.limit) + up = up.clamp(min=-self.experts.limit, max=self.experts.limit) + + # 
GLU activation + glu = gate * torch.sigmoid(gate * self.experts.alpha) + intermediate = (up + 1) * glu # [T, I] + + # Down projection + down_out_block = (intermediate @ W_d) + b_d # [T, H] + + outs.append(down_out_block) + + down_out = torch.cat(outs, dim=0) + + # Apply routing weights and accumulate + masked_down = torch.where(routing_weight > 0, down_out * routing_weight, torch.zeros_like(expert_out)) + expert_out += masked_down + + # original shape [B, S, H] + return expert_out.view(B, S, H), router_logits + + def blocked_ffn_forward_block_weights(self, hidden: torch.Tensor): + B, S, H = hidden.shape + T = B * S + hidden = hidden.view(T, H) + + # Router computation + router_logits = F.linear(hidden, self.router.weight, self.router.bias) + + # Top-k selection + top_w, top_i = torch.topk(router_logits, self.router.top_k, dim=-1) # both [T, K] + top_w = torch.nn.functional.softmax(top_w, dim=1, dtype=top_w.dtype) + + masked_logits = torch.zeros_like(router_logits) + masked_logits.scatter_(1, top_i, top_w) + + # Routing weights for each expert [T, E] + routing_weights = masked_logits + + # ────────────────── allocate the output tensor ───── + expert_out = hidden.new_zeros((T, H)) # accumulation buffer + target_blocks = int(os.environ.get("NUM_BLOCKS", 1)) + block_positions = [] + for j in range(target_blocks): + block_positions.append(j * (T // target_blocks)) + # ───────────────────────── Expert computation loop ───────────────────────────── + for e in range(self.experts.num_experts): + routing_weight = routing_weights[:, e].unsqueeze(-1) # [T, 1] + + W_g, W_u = self.experts.gate_proj[e], self.experts.up_proj[e] # [H, I], [H, I] + b_g, b_u = self.experts.gate_proj_bias[e], self.experts.up_proj_bias[e] # [I], [I] + W_d = self.experts.down_proj[e] # [I, H] + b_d = self.experts.down_proj_bias[e] # [H] + + block_count = 0 + outs = [] + for block_idx in range(target_blocks): + block_count += 1 + qi = block_positions[block_idx] + + # Calculate block size (last block should be handled with remainder) + if block_idx == target_blocks - 1: + real_q_len = T - qi + else: + real_q_len = block_positions[block_idx + 1] - qi + + tgb = hidden[qi : qi + real_q_len, :] + # Gate and Up projections + + wg_col_shape = W_g.shape[1] + wg_num_blocks = math.ceil(wg_col_shape / 128) + last_block_size = wg_col_shape % 128 if wg_col_shape % 128 != 0 else 128 + + intermediates = [] + for i in range(wg_num_blocks): + if i == wg_num_blocks - 1: + cur_gate = (tgb @ W_g[:, -last_block_size:]) + b_g[-last_block_size:] + cur_up = (tgb @ W_u[:, -last_block_size:]) + b_u[-last_block_size:] + else: + cur_gate = (tgb @ W_g[:, i * 128 : (i + 1) * 128]) + b_g[i * 128 : (i + 1) * 128] + cur_up = (tgb @ W_u[:, i * 128 : (i + 1) * 128]) + b_u[i * 128 : (i + 1) * 128] + + cur_gate = cur_gate.clamp(min=None, max=self.experts.limit) + cur_up = cur_up.clamp(min=-self.experts.limit, max=self.experts.limit) + cur_glu = cur_gate * torch.sigmoid(cur_gate * self.experts.alpha) + cur_intermediate = (cur_up + 1) * cur_glu + intermediates.append(cur_intermediate) + + intermediate = torch.cat(intermediates, dim=-1) + + downs = [] + for i in range(wg_num_blocks): + if i == wg_num_blocks - 1: + downs.append((intermediate @ W_d[:, -last_block_size:]) + b_d[-last_block_size:]) + else: + downs.append((intermediate @ W_d[:, i * 128 : (i + 1) * 128]) + b_d[i * 128 : (i + 1) * 128]) + + down_out_block = torch.cat(downs, dim=1) + outs.append(down_out_block) + + down_out = torch.cat(outs, dim=0) + + # Apply routing weights and accumulate + masked_down = 
torch.where(routing_weight > 0, down_out * routing_weight, torch.zeros_like(expert_out)) + expert_out += masked_down + + # original shape [B, S, H] + return expert_out.view(B, S, H), router_logits + class QEffGptOssMLP(GptOssMLP): # ------------------- Gather based, weights as activation approach --------------- @@ -146,7 +308,6 @@ def forward_weights_as_activation(self, hidden_states): # ------------------- Gather based, weights as activation approach, With Seperate Gate, up Projections --------------- def forward(self, hidden_states): - # print("Seperate Split, Up, Gate Projections") bs, seq_len, _ = hidden_states.shape hidden_states = hidden_states.view(bs * seq_len, self.experts.hidden_size) @@ -417,7 +578,6 @@ def eager_attention_forward_blocked( scaling: float, **kwargs, ): - softmax_count = 0 key_states = repeat_kv(key, module.num_key_value_groups) value_states = repeat_kv(value, module.num_key_value_groups) @@ -426,9 +586,6 @@ def eager_attention_forward_blocked( block_positions = [] for j in range(target_blocks): block_positions.append(j * (CL // target_blocks)) - - print(f"CL={CL}, target_blocks={target_blocks}") - block_count = 0 outs = [] for block_idx in range(target_blocks): @@ -458,7 +615,6 @@ def eager_attention_forward_blocked( outs.append(out_block) output = torch.cat(outs, dim=2) - print(f"Completed {block_count} blocks, {softmax_count} softmax operations") output = output.view(BS, NH, CL, DH).transpose(1, 2).contiguous() return output, output @@ -487,8 +643,9 @@ def forward( hidden_shape = (*input_shape, -1, self.head_dim) key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, seq_len=getattr(self.config, "max_seq_len_cached", 32 * 1024)) + if not (max_seq_len_cached := getattr(self.config, "max_seq_len_cached")): + max_seq_len_cached = 32 * 1024 + cos, sin = self.rotary_emb(value_states, seq_len=max_seq_len_cached) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: @@ -564,8 +721,9 @@ def forward( query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, seq_len=getattr(self.config, "max_seq_len_cached", 32 * 1024)) + if not (max_seq_len_cached := getattr(self.config, "max_seq_len_cached")): + max_seq_len_cached = 32 * 1024 + cos, sin = self.rotary_emb(value_states, seq_len=max_seq_len_cached) query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) if past_key_value is not None: diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index bbfc5d2e8..11a223a2f 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -2340,30 +2340,38 @@ def export( str Path to the generated ONNX graph file. 
""" + bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE + seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS + kv_cache_shape = get_padding_shape_from_config( + self.model.config, fbs if self.continuous_batching else bs, seq_len + ) if prefill_only: - block_size = os.environ.get("BLOCK_SIZE", None) - if block_size is None: - block_size = 128 - logger.warning( - "Setting BLOCK_SIZE=128 for prefill_only model, please set ENV variable `BLOCK_SIZE` to override" - ) - if prefill_seq_len is None or prefill_seq_len % block_size != 0: - raise ValueError( - f"When prefill_only=True, 'prefill_seq_len' must be explicitly set and divisible by block_size={block_size}. " - f"Received: prefill_seq_len={prefill_seq_len}" - ) + assert not self.continuous_batching, "prefill_only=True is not supported with continuous_batching=True" - os.environ["NUM_BLOCKS"] = str(prefill_seq_len // block_size) if self.model.config.model_type in SPECIALIZED_PREFILL_ONLY_MODEL_ARCH: + block_size = os.environ.get("BLOCK_SIZE", None) + if block_size is None: + block_size = 128 + logger.warning( + "Setting BLOCK_SIZE=128 for prefill_only model, please set ENV variable `BLOCK_SIZE` to override" + ) + if prefill_seq_len is None or prefill_seq_len % block_size != 0: + raise ValueError( + f"When prefill_only=True, 'prefill_seq_len' must be explicitly set and divisible by block_size={block_size}. " + f"Received: prefill_seq_len={prefill_seq_len}" + ) + os.environ["NUM_BLOCKS"] = str(prefill_seq_len // block_size) + self.prefill(True) + self.hash_params["prefill_only"] = True + self.hash_params["num_blocks"] = os.environ["NUM_BLOCKS"] + seq_len = prefill_seq_len // block_size if (prefill_seq_len // block_size) > seq_len else seq_len else: self.prefill(False) - bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE - seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN if not prefill_only else prefill_seq_len // block_size - fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS - kv_cache_shape = get_padding_shape_from_config( - self.model.config, fbs if self.continuous_batching else bs, seq_len - ) + self.hash_params.pop("prefill_only", None) + self.hash_params.pop("num_blocks", None) + example_inputs = { "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), "position_ids": torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1), diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 09b6bd830..d58f54952 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -566,7 +566,6 @@ def wrapper(self, *args, **kwargs): dynamic_axes=all_args.get("dynamic_axes"), export_kwargs=all_args.get("export_kwargs", None), onnx_transform_kwargs=all_args.get("onnx_transform_kwargs", None), - prefill_only=all_args.get("prefill_only", False), ) export_dir = export_dir.with_name(export_dir.name + "-" + export_hash) kwargs["export_dir"] = export_dir diff --git a/QEfficient/utils/hash_utils.py b/QEfficient/utils/hash_utils.py index 6b48cc244..b6b38b8b4 100644 --- a/QEfficient/utils/hash_utils.py +++ b/QEfficient/utils/hash_utils.py @@ -67,6 +67,5 @@ def create_export_hash(**kwargs): export_hash_params.update(onnx_transform_kwargs) if export_hash_params.get("peft_config") is not None and not isinstance(export_hash_params["peft_config"], dict): export_hash_params["peft_config"] = export_hash_params["peft_config"].to_dict() - export_hash_params["prefill_only"] = kwargs.get("prefill_only") return hash_dict_params(export_hash_params), export_hash_params diff --git 
a/scripts/Jenkinsfile b/scripts/Jenkinsfile index d9d391d47..bf6f82ce3 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -42,7 +42,7 @@ pipeline { mkdir -p $PWD/Non_cli_qaic && export TOKENIZERS_PARALLELISM=false && export QEFF_HOME=$PWD/Non_cli_qaic && - pytest tests -m '(not cli) and (not on_qaic) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log1.xml && + pytest tests -m '(not cli) and (not on_qaic) and (not finetune)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log1.xml && junitparser merge tests/tests_log1.xml tests/tests_log.xml && deactivate" ''' diff --git a/tests/peft/test_peft_model.py b/tests/peft/test_peft_model.py index cc94467db..c3bb2f140 100644 --- a/tests/peft/test_peft_model.py +++ b/tests/peft/test_peft_model.py @@ -178,9 +178,9 @@ def test_auto_peft_model_for_causal_lm_activate_invalid(base_config, adapter_con def test_auto_peft_model_for_causal_lm_compile_generate(base_config, adapter_config, batch_size, tmp_path): _, lora_model = create_peft_model(base_config, adapter_config) qeff_model = QEffAutoPeftModelForCausalLM(lora_model) - qeff_model.export(tmp_path) + onnx_path = qeff_model.export(tmp_path) start = perf_counter() - qeff_model.compile(batch_size=batch_size, prefill_seq_len=32, ctx_len=128) + qeff_model.compile(onnx_path=onnx_path, batch_size=batch_size, prefill_seq_len=32, ctx_len=128) end = perf_counter() compile_time_0 = end - start @@ -197,7 +197,7 @@ def test_auto_peft_model_for_causal_lm_compile_generate(base_config, adapter_con ) start = perf_counter() - qeff_model.compile(batch_size=batch_size, prefill_seq_len=32, ctx_len=128) + qeff_model.compile(onnx_path=onnx_path, batch_size=batch_size, prefill_seq_len=32, ctx_len=128) end = perf_counter() compile_time_1 = end - start assert compile_time_1 < 0.01 * compile_time_0 diff --git a/tests/transformers/test_causal_lm.py b/tests/transformers/test_causal_lm.py index 5e5ad4b5d..925af8b3a 100644 --- a/tests/transformers/test_causal_lm.py +++ b/tests/transformers/test_causal_lm.py @@ -17,7 +17,7 @@ from QEfficient.utils import constants, get_padding_shape_from_config from QEfficient.utils.hash_utils import hash_dict_params -configs = [ +test_configs = [ # name, max_position_embeddings, num_hidden_layers, num_attention_heads, hidden_size, intermediate_size, vocab_size, additional_params ("gpt2", 256, 2, 4, 128, 512, 127, {}), ("codegen", 256, 2, 4, 128, 512, 127, {"rotary_dim": 16}), @@ -36,30 +36,43 @@ ("gpt_oss", 256, 3, 4, 128, 512, 127, {"num_key_value_heads": 2}), ] -configs = [ - AutoConfig.for_model( - model_name, - max_position_embeddings=max_position_embeddings, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - vocab_size=vocab_size, - **additional_params, - ) - for ( - model_name, - max_position_embeddings, - num_hidden_layers, - num_attention_heads, - hidden_size, - intermediate_size, - vocab_size, - additional_params, - ) in configs +test_prefill_only_specialized_models_configs = [ + ("gpt_oss", 256, 2, 2, 32, 32, 127, {"num_key_value_heads": 2}), ] + + +def get_auto_config_from_test_config(configs): + auto_configs = [ + AutoConfig.for_model( + model_name, + max_position_embeddings=max_position_embeddings, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + vocab_size=vocab_size, + **additional_params, + ) + for ( + model_name, + max_position_embeddings, + num_hidden_layers, 
+ num_attention_heads, + hidden_size, + intermediate_size, + vocab_size, + additional_params, + ) in configs + ] + return auto_configs + + +configs = get_auto_config_from_test_config(test_configs) config_ids = [x.model_type for x in configs] +prefill_only_configs = get_auto_config_from_test_config(test_prefill_only_specialized_models_configs) +prefill_only_config_ids = [x.model_type for x in prefill_only_configs] + model_kwargs = {"attn_implementation": "eager"} @@ -158,7 +171,6 @@ def test_causal_lm_hash_creation(config, cb, tmp_path): hash_params["qaic_config"] = None # Create parameters separately for hash creation - bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS @@ -205,12 +217,29 @@ def test_causal_lm_hash_creation(config, cb, tmp_path): export_params["output_names"] = output_names export_params["dynamic_axes"] = dynamic_axes hash_params["export_params"] = export_params - hash_params["prefill_only"] = False manual_hash = hash_dict_params(hash_params) assert manual_hash == qeff_model.export_hash +@pytest.mark.parametrize("cb", [False, True], ids=["nocb", "cb"]) +@pytest.mark.parametrize("config", prefill_only_configs, ids=prefill_only_config_ids) +def test_prefill_only_specialized_models(config, cb, tmp_path): + model = AutoModelForCausalLM.from_config(config, **model_kwargs) + qeff_model = QEFFAutoModelForCausalLM(model, cb) + if cb: + with pytest.raises(AssertionError): + qeff_model.export(tmp_path, prefill_only=True, offload_pt_weights=False) + else: + with pytest.raises(ValueError): + qeff_model.export(tmp_path, prefill_only=True, offload_pt_weights=False) + qeff_model.export(tmp_path, prefill_only=True, prefill_seq_len=256, offload_pt_weights=False) + first_export_hash = qeff_model.export_hash + qeff_model.export(tmp_path, prefill_only=False, offload_pt_weights=False) + second_export_hash = qeff_model.export_hash + assert first_export_hash != second_export_hash + + @pytest.fixture def tmp_cache(tmp_path, monkeypatch): monkeypatch.setattr("QEfficient.utils._utils.QEFF_HOME", tmp_path) From 128b2c94fdab426c2deebd73a0861a0df448e25c Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Mon, 10 Nov 2025 07:30:56 +0000 Subject: [PATCH 21/25] fixed tests Signed-off-by: Onkar Chougule --- tests/peft/lora/test_lora_model.py | 4 ++-- tests/transformers/models/test_disagg_mode.py | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/peft/lora/test_lora_model.py b/tests/peft/lora/test_lora_model.py index 00a4216b7..46b33c60b 100644 --- a/tests/peft/lora/test_lora_model.py +++ b/tests/peft/lora/test_lora_model.py @@ -222,7 +222,7 @@ def test_auto_lora_model_for_causal_lm_noncb_export_compile_generate( # export start = perf_counter() - qeff_model.export(export_dir=tmp_path) + onnx_path = qeff_model.export(export_dir=tmp_path) end = perf_counter() export_time_0 = end - start model_path = tmp_path.with_name(tmp_path.name + "-" + qeff_model.export_hash) @@ -237,7 +237,7 @@ def test_auto_lora_model_for_causal_lm_noncb_export_compile_generate( assert export_time_1 < export_time_0 # test compile - qeff_model.compile(prefill_seq_len=32, ctx_len=64) + qeff_model.compile(onnx_path=onnx_path, prefill_seq_len=32, ctx_len=64) assert Path(qeff_model.qpc_path).is_dir() assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) diff --git a/tests/transformers/models/test_disagg_mode.py b/tests/transformers/models/test_disagg_mode.py index 
0e303d389..67ee48944 100644 --- a/tests/transformers/models/test_disagg_mode.py +++ b/tests/transformers/models/test_disagg_mode.py @@ -56,9 +56,7 @@ def test_disagg_mode_prefill(model_id, prompt): undo_transformers_quantizers() - qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_id, num_hidden_layers=2, max_position_embeddings=64 * 1024 - ) + qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id, num_hidden_layers=2) qeff_model.prefill(True) config = qeff_model.model.config inputs = tokenizer(prompt, return_tensors="np", padding="max_length", max_length=padded_len) From ea320ed24446a21fc9ccbe79c4cbb42672067972 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Mon, 10 Nov 2025 11:08:13 +0000 Subject: [PATCH 22/25] fixed kv cache shape Signed-off-by: Onkar Chougule --- QEfficient/transformers/models/modeling_auto.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 11a223a2f..552f78e86 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -2343,9 +2343,6 @@ def export( bs: int = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE seq_len: int = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS - kv_cache_shape = get_padding_shape_from_config( - self.model.config, fbs if self.continuous_batching else bs, seq_len - ) if prefill_only: assert not self.continuous_batching, "prefill_only=True is not supported with continuous_batching=True" @@ -2372,6 +2369,9 @@ def export( self.hash_params.pop("prefill_only", None) self.hash_params.pop("num_blocks", None) + kv_cache_shape = get_padding_shape_from_config( + self.model.config, fbs if self.continuous_batching else bs, seq_len + ) example_inputs = { "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), "position_ids": torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1), From fbb85c088d0762ff9b560d9e6e8a67cac7705cd0 Mon Sep 17 00:00:00 2001 From: Onkar Chougule <168134249+ochougul@users.noreply.github.com> Date: Tue, 11 Nov 2025 15:09:28 +0530 Subject: [PATCH 23/25] fixed self.onnx_path issue in modeling_qeff Signed-off-by: Onkar Chougule <168134249+ochougul@users.noreply.github.com> --- QEfficient/base/modeling_qeff.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 43d1e3bfb..4479479a5 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -295,6 +295,20 @@ def _export( self.onnx_path = onnx_path return onnx_path + def get_onnx_path(self, prefill_only: Optional[bool] = False, + specializations: Optional[List[Dict[str, int]]] = None, + offload_pt_weights: Optional[bool] = True): + kwargs = {"offload_pt_weights": offload_pt_weights} + if prefill_only: + if self.prefill_onnx_path is None: + kwargs.update({"prefill_only": prefill_only, "prefill_seq_len": specializations[0].get("seq_len")}) + self.export(**kwargs) + return self.prefill_onnx_path + else: + if self.onnx_path is None: + self.export(**kwargs) + return self.onnx_path + @dump_qconfig def _compile( self, @@ -335,17 +349,7 @@ def _compile( For QNN Compilation path, when enable_qnn is set to True, any parameter passed in compiler_options will be ignored. 
""" - kwargs = {"offload_pt_weights": offload_pt_weights} - if onnx_path is None and prefill_only: - kwargs.update({"prefill_only": prefill_only, "prefill_seq_len": specializations[0].get("seq_len")}) - self.export(**kwargs) - onnx_path = Path(self.prefill_onnx_path) - elif onnx_path is None: - self.export(**kwargs) - onnx_path = Path(self.onnx_path) - else: - onnx_path = Path(onnx_path) - + onnx_path = Path(onnx_path if onnx_path else self.get_onnx_path(prefill_only, specializations, offload_pt_weights)) compile_dir = Path(compile_dir or onnx_path.parent) qpc_path = compile_dir / "qpc" if not onnx_path.is_file(): From 2c2abf2bf3aa05e79ca19712f380844e62af1007 Mon Sep 17 00:00:00 2001 From: Mamta Singh Date: Tue, 11 Nov 2025 10:21:45 +0000 Subject: [PATCH 24/25] fix formatting error Signed-off-by: Mamta Singh --- QEfficient/base/modeling_qeff.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 4479479a5..c2c5a7212 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -295,9 +295,12 @@ def _export( self.onnx_path = onnx_path return onnx_path - def get_onnx_path(self, prefill_only: Optional[bool] = False, - specializations: Optional[List[Dict[str, int]]] = None, - offload_pt_weights: Optional[bool] = True): + def get_onnx_path( + self, + prefill_only: Optional[bool] = False, + specializations: Optional[List[Dict[str, int]]] = None, + offload_pt_weights: Optional[bool] = True, + ): kwargs = {"offload_pt_weights": offload_pt_weights} if prefill_only: if self.prefill_onnx_path is None: @@ -349,7 +352,9 @@ def _compile( For QNN Compilation path, when enable_qnn is set to True, any parameter passed in compiler_options will be ignored. 
""" - onnx_path = Path(onnx_path if onnx_path else self.get_onnx_path(prefill_only, specializations, offload_pt_weights)) + onnx_path = Path( + onnx_path if onnx_path else self.get_onnx_path(prefill_only, specializations, offload_pt_weights) + ) compile_dir = Path(compile_dir or onnx_path.parent) qpc_path = compile_dir / "qpc" if not onnx_path.is_file(): From fba1ac055fa15187b56079db188a722a915e80c0 Mon Sep 17 00:00:00 2001 From: Onkar Chougule Date: Thu, 13 Nov 2025 10:29:09 +0000 Subject: [PATCH 25/25] added ffn blocking and num blocks env variables Signed-off-by: Onkar Chougule --- .../models/gpt_oss/modeling_gpt_oss.py | 4 +- .../transformers/models/modeling_auto.py | 53 ++++++++++++------- 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py index 2f9e33113..946a2851c 100644 --- a/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py +++ b/QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py @@ -47,6 +47,8 @@ def __qeff_init__(self): class QEffPrefillOnlyGptOssMLP(GptOssMLP): def forward(self, hidden: torch.Tensor): + if os.environ.get("NUM_FFN_BLOCKS", None) is not None: + return self.blocked_ffn_forward(hidden) B, S, H = hidden.shape T = B * S hidden = hidden.view(T, H) @@ -118,7 +120,7 @@ def blocked_ffn_forward(self, hidden: torch.Tensor): # ────────────────── allocate the output tensor ───── expert_out = hidden.new_zeros((T, H)) # accumulation buffer - target_blocks = int(os.environ.get("NUM_BLOCKS", 1)) + target_blocks = int(os.environ.get("NUM_FFN_BLOCKS", 1)) block_positions = [] for j in range(target_blocks): block_positions.append(j * (T // target_blocks)) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 552f78e86..7e661ff10 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -2314,6 +2314,35 @@ def get_model_config(self) -> dict: """ return self.model.config.__dict__ + def get_seq_len_and_handle_specialized_prefill_model(self, prefill_seq_len: Optional[int] = None) -> int: + num_q_blocks = os.environ.get("NUM_Q_BLOCKS", None) + if num_q_blocks is None: + block_size = 128 + if prefill_seq_len is None or prefill_seq_len % block_size != 0 or prefill_seq_len < 128: + raise ValueError( + f"When prefill_only=True, 'prefill_seq_len' must be explicitly set and divisible by block_size={block_size}. 
" + f"Or set `NUM_BLOCKS` ENV variable" + f"Received: prefill_seq_len={prefill_seq_len}" + ) + + num_q_blocks = prefill_seq_len // block_size + logger.warning( + f"Setting NUM_BLOCKS={num_q_blocks} used in attention Q-blocking for prefill_only model, please set ENV variable `NUM_BLOCKS` to override" + ) + os.environ["NUM_Q_BLOCKS"] = num_q_blocks + + num_ffn_blocks = os.environ.get("NUM_FFN_BLOCKS", None) + min_seq_len = int(max(num_q_blocks, num_ffn_blocks)) if num_ffn_blocks else num_q_blocks + + self.prefill(True) + self.hash_params["prefill_only"] = True + self.hash_params["num_blocks"] = os.environ["NUM_BLOCKS"] + return ( + min_seq_len + if min_seq_len > constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + else constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN + ) + def export( self, export_dir: Optional[str] = None, @@ -2345,25 +2374,11 @@ def export( fbs: int = constants.ONNX_EXPORT_EXAMPLE_FBS if prefill_only: assert not self.continuous_batching, "prefill_only=True is not supported with continuous_batching=True" - - if self.model.config.model_type in SPECIALIZED_PREFILL_ONLY_MODEL_ARCH: - block_size = os.environ.get("BLOCK_SIZE", None) - if block_size is None: - block_size = 128 - logger.warning( - "Setting BLOCK_SIZE=128 for prefill_only model, please set ENV variable `BLOCK_SIZE` to override" - ) - if prefill_seq_len is None or prefill_seq_len % block_size != 0: - raise ValueError( - f"When prefill_only=True, 'prefill_seq_len' must be explicitly set and divisible by block_size={block_size}. " - f"Received: prefill_seq_len={prefill_seq_len}" - ) - os.environ["NUM_BLOCKS"] = str(prefill_seq_len // block_size) - - self.prefill(True) - self.hash_params["prefill_only"] = True - self.hash_params["num_blocks"] = os.environ["NUM_BLOCKS"] - seq_len = prefill_seq_len // block_size if (prefill_seq_len // block_size) > seq_len else seq_len + seq_len = ( + self.get_seq_len_and_handle_specialized_prefill_model(prefill_seq_len) + if self.model.config.model_type in SPECIALIZED_PREFILL_ONLY_MODEL_ARCH + else seq_len + ) else: self.prefill(False) self.hash_params.pop("prefill_only", None)