
Commit bdb2dee

Merge branch 'quic:main' into CCL-main
2 parents 6305b99 + e592774 · commit bdb2dee

File tree: 9 files changed, +295 -42 lines


QEfficient/cloud/export.py

Lines changed: 16 additions & 3 deletions
@@ -11,18 +11,20 @@

 from QEfficient.base.common import QEFFCommonLoader
 from QEfficient.utils import check_and_assign_cache_dir
+from QEfficient.utils.custom_yaml import generate_custom_io
 from QEfficient.utils.logging_utils import logger

 # Specifically for Docker images.
 ROOT_DIR = os.path.dirname(os.path.abspath(""))


-def get_onnx_model_path(
+def get_onnx_path_and_setup_customIO(
     model_name: str,
     cache_dir: Optional[str] = None,
     hf_token: Optional[str] = None,
     full_batch_size: Optional[int] = None,
     local_model_dir: Optional[str] = None,
+    mxint8_kv_cache: Optional[int] = False,
 ):
     """
     Exports the PyTorch model to ONNX format if a pre-exported file is not found,

@@ -63,6 +65,9 @@ def get_onnx_model_path(
     )
     onnx_model_path = qeff_model.export()
     logger.info(f"Generated onnx_path: {onnx_model_path}")
+
+    # Generating Custom IO for the compile.
+    generate_custom_io(qeff_model, mxint8_kv_cache=mxint8_kv_cache)
     return onnx_model_path


@@ -72,13 +77,14 @@ def main(
     hf_token: Optional[str] = None,
     local_model_dir: Optional[str] = None,
     full_batch_size: Optional[int] = None,
+    mxint8_kv_cache: Optional[bool] = False,
 ) -> None:
     """
     Main function for the QEfficient ONNX export CLI application.

     This function serves as the entry point for exporting a PyTorch model, loaded
     via QEFFCommonLoader, to the ONNX format. It prepares the necessary
-    paths and calls `get_onnx_model_path`.
+    paths and calls `get_onnx_path_and_setup_customIO`.

     Parameters
     ----------

@@ -106,12 +112,13 @@ def main(

     """
     cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir)
-    get_onnx_model_path(
+    get_onnx_path_and_setup_customIO(
         model_name=model_name,
         cache_dir=cache_dir,
         hf_token=hf_token,
         full_batch_size=full_batch_size,
         local_model_dir=local_model_dir,
+        mxint8_kv_cache=mxint8_kv_cache,
     )


@@ -137,5 +144,11 @@ def main(
         default=None,
         help="Set full batch size to enable continuous batching mode, default is None",
     )
+    parser.add_argument(
+        "--mxint8_kv_cache",
+        "--mxint8-kv-cache",
+        required=False,
+        help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False",
+    )
     args = parser.parse_args()
     main(**args.__dict__)
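
A minimal usage sketch, not part of the commit: the renamed helper now both exports the ONNX graph and writes a Custom IO YAML for the later compile step (via `generate_custom_io`). The model id below is a placeholder; any model supported by `QEFFCommonLoader` should work.

from QEfficient.cloud.export import get_onnx_path_and_setup_customIO

# Placeholder model id, for illustration only.
onnx_path = get_onnx_path_and_setup_customIO(
    model_name="gpt2",
    mxint8_kv_cache=True,  # also emit a Custom IO config that compresses the KV cache to MXINT8
)
print(onnx_path)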

QEfficient/cloud/infer.py

Lines changed: 4 additions & 0 deletions
@@ -248,6 +248,8 @@ def main(

     image_path = kwargs.pop("image_path", None)
     image_url = kwargs.pop("image_url", None)
+    iteration = kwargs.pop("iteration", 1)
+    automation = kwargs.pop("automation", False)

     config = qeff_model.model.config
     architecture = config.architectures[0] if config.architectures else None

@@ -310,6 +312,8 @@ def main(
         device_id=device_group,
         prompts_txt_file_path=prompts_txt_file_path,
         generation_len=generation_len,
+        iteration=iteration,
+        automation=automation,
     )


QEfficient/compile/compile_helper.py

Lines changed: 13 additions & 3 deletions
@@ -270,6 +270,7 @@ def compile(
     This method will be removed soon; use `QEFFAutoModelForCausalLM.compile` instead.

     """
+
     if full_batch_size and batch_size != 1:
         raise ValueError("Only either batch_size or full_batch_size should be greater than one")

@@ -284,11 +285,20 @@ def compile(
         full_batch_size=full_batch_size,
     )

-    # Select the customIO config based on the mx flag.
-    custom_io_file_name = "custom_io_int8.yaml" if mxint8 else "custom_io_fp16.yaml"
+    dtype_suffix = "int8" if mxint8 else "fp16"
+    source_path = f"./custom_io_{dtype_suffix}.yaml"
+    destination_path = os.path.join(os.path.dirname(qpc_path), f"custom_io_{dtype_suffix}.yaml")
+
+    # Move the custom YAML file to the cache/qeff_model directory
+    try:
+        shutil.move(source_path, destination_path)
+        print(f"Successfully moved '{source_path}' to '{destination_path}'.")
+    except Exception as e:
+        print(f"Error while moving file '{source_path}': {e}")

+    custom_io_file_name = f"custom_io_{dtype_suffix}.yaml"
     if custom_io_file_path is None:
-        custom_io_file_path = os.path.join(os.path.dirname(onnx_path), custom_io_file_name)
+        custom_io_file_path = os.path.join(os.path.dirname(qpc_path), custom_io_file_name)

     if not os.path.isfile(custom_io_file_path):
         raise FileNotFoundError(
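
For clarity, a standalone sketch (illustration only, not code from this commit) of where the Custom IO config is now expected: the YAML generated at export time is moved next to the QPC output, and the default `custom_io_file_path` is resolved relative to `qpc_path` rather than `onnx_path`. The paths below are hypothetical.

import os

# Hypothetical QPC output directory.
qpc_path = "/root/.cache/qeff_models/my_model/qpc_16cores_1bs/qpcs"
mxint8 = True

dtype_suffix = "int8" if mxint8 else "fp16"
# The default Custom IO config is now looked up alongside the QPC directory.
custom_io_file_path = os.path.join(os.path.dirname(qpc_path), f"custom_io_{dtype_suffix}.yaml")
print(custom_io_file_path)  # /root/.cache/qeff_models/my_model/qpc_16cores_1bs/custom_io_int8.yaml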

QEfficient/generation/text_generation_inference.py

Lines changed: 41 additions & 30 deletions
@@ -324,6 +324,7 @@ def cloud_ai_100_exec_kv(
     stream: bool = True,
     write_io_dir: Optional[str] = None,
     automation=False,
+    iteration: int = 1,
     prompt_to_lora_id_mapping: Optional[List[int]] = None,
     is_tlm: bool = False,
     include_sampler: bool = False,

@@ -348,6 +349,7 @@ def cloud_ai_100_exec_kv(
     :stream (bool): If True, enable streamer, which returns tokens one by one as the model generates them. ``Defaults to True``.
     :Write_io_dir (str): Path to write the input and output files. ``Defaults to None``.
     :automation (bool): If true, it prints input, output, and performance stats. ``Defaults to False``.
+    :iteration (int): Number of iterations to run the inference. ``Defaults to 1``.
     :prompt_to_lora_id_mapping (List[int]): Mapping to associate prompts with their respective LoRA adapter.
     :include_sampler (bool, default=False): Enable/Disable sampling of next tokens.
     :return_pdfs (bool, default=False): Return probability distributions along with sampled

@@ -394,30 +396,34 @@ def cloud_ai_100_exec_kv(
         return_pdfs=return_pdfs,
         sampling_params=sampling_params,
     )
-    if full_batch_size is None:
-        exec_info = [
-            generate_text.generate(prompt[i : i + batch_size], generation_len, stream, prompt_to_lora_id_mapping)
-            for i in range(0, len(prompt), batch_size)
-        ]
-        prefill_time = np.average([info.perf_metrics.prefill_time for info in exec_info])
-        decode_perf = np.average([info.perf_metrics.decode_perf for info in exec_info])
-        total_perf = np.average([info.perf_metrics.total_perf for info in exec_info])
-        total_time = np.average([info.perf_metrics.total_time for info in exec_info])
-        generated_texts = [info.generated_texts for info in exec_info]
-        generated_ids = [info.generated_ids for info in exec_info]
-
-        exec_info = CloudAI100ExecInfo(
-            batch_size=batch_size,
-            generated_texts=generated_texts,
-            generated_ids=generated_ids,
-            perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time),
-        )
-    else:
-        exec_info = generate_text.generate(
-            prompt=prompt, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping
-        )

-    print_latency_stats_kv(prompt, exec_info=exec_info, automation=automation)
+    for _ in range(0, int(iteration)):
+        if full_batch_size is None:
+            exec_info = [
+                generate_text.generate(prompt[i : i + batch_size], generation_len, stream, prompt_to_lora_id_mapping)
+                for i in range(0, len(prompt), batch_size)
+            ]
+            prefill_time = np.average([info.perf_metrics.prefill_time for info in exec_info])
+            decode_perf = np.average([info.perf_metrics.decode_perf for info in exec_info])
+            total_perf = np.average([info.perf_metrics.total_perf for info in exec_info])
+            total_time = np.average([info.perf_metrics.total_time for info in exec_info])
+            generated_texts = [info.generated_texts for info in exec_info]
+            generated_ids = [info.generated_ids for info in exec_info]
+
+            exec_info = CloudAI100ExecInfo(
+                batch_size=batch_size,
+                generated_texts=generated_texts,
+                generated_ids=generated_ids,
+                perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time),
+            )
+        else:
+            exec_info = generate_text.generate(
+                prompt=prompt, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping
+            )
+
+        print_latency_stats_kv(prompt, exec_info=exec_info, automation=automation)
+
+    # TODO: Need to handle the case where exec_info if given for n iterations
     return exec_info

@@ -951,7 +957,9 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len):

         return decode_pause_time

-    def run_decode(self, decode_inputs, generation_len, streamer: Optional[transformers.TextStreamer] = None):
+    def run_decode(
+        self, decode_inputs, generation_len, automation, streamer: Optional[transformers.TextStreamer] = None
+    ):
         """
         Default method for running decode. Executes the decoding process for a given set of inputs and a specified generation length.

@@ -1000,11 +1008,11 @@ def run_decode(self, decode_inputs, generation_len, streamer: Optional[transform
             if self.include_sampler:
                 decode_inputs["last_accepted_output_tokens"] = decode_inputs["input_ids"]

-            if finished_sequences.all():
+            if finished_sequences.all() and not automation:
                 break
         return num_token

-    def generate_decode_stream(self, decode_inputs, generation_len):
+    def generate_decode_stream(self, decode_inputs, generation_len, automation):
        """
        Generator method for yielding decode tokens. Executes the decoding process for a given set of inputs and a specified generation length.

@@ -1032,7 +1040,7 @@ def generate_decode_stream(self, decode_inputs, generation_len):
             self.generated_ids[:, num_token] = decode_inputs["input_ids"].squeeze(1)
             finished_sequences |= decode_inputs["input_ids"] == self.tokenizer.eos_token_id

-            if finished_sequences.all():
+            if finished_sequences.all() and not automation:
                 break
         yield decode_inputs["input_ids"]  # yield the last token

@@ -1115,6 +1123,7 @@ def _regular_model_execution(
         prompt: List[str],
         generation_len: Optional[int] = None,
         stream: Optional[bool] = True,
+        automation: Optional[bool] = False,
         prompt_to_lora_id_mapping: Optional[List[int]] = None,
     ):
         """

@@ -1142,7 +1151,7 @@ def _regular_model_execution(
         decode_inputs = self._qaic_model.prepare_decode_inputs()

         loop_start = perf_counter()  # Start decode loop timer
-        num_token = self._qaic_model.run_decode(decode_inputs, generation_len, self._text_streamer)
+        num_token = self._qaic_model.run_decode(decode_inputs, generation_len, automation, self._text_streamer)
         end = perf_counter()
         generated_texts = self._tokenizer.batch_decode(self._qaic_model.generated_ids, skip_special_tokens=True)

@@ -1196,6 +1205,7 @@ def generate_stream_tokens(
         self,
         prompt: List[str],
         generation_len: Optional[int] = None,
+        automation: Optional[bool] = False,
         prompt_to_lora_id_mapping: Optional[List[int]] = None,
     ):
         """

@@ -1225,7 +1235,7 @@ def generate_stream_tokens(

         loop_start = perf_counter()  # Start decode loop timer
         num_token = 0
-        for token_id in self._qaic_model.generate_decode_stream(decode_inputs, generation_len):
+        for token_id in self._qaic_model.generate_decode_stream(decode_inputs, generation_len, automation):
             decoded_tokens = []
             for idx in range(self._qaic_model.batch_size):
                 decoded_tokens.append(self._tokenizer.decode(token_id[idx], skip_special_tokens=True))

@@ -1244,6 +1254,7 @@ def generate(
         prompt: List[str],
         generation_len: Optional[int] = None,
         stream: bool = True,
+        automation: Optional[bool] = False,
         prompt_to_lora_id_mapping: Optional[List[int]] = None,
     ):
         """

@@ -1267,7 +1278,7 @@ def generate(
         if stream:
             print("\nPrompt : " + prompt[0] + "\nCompletion :", flush=True, end="")
         perf_metrics, generated_texts = self._regular_model_execution(
-            prompt, generation_len, stream, prompt_to_lora_id_mapping
+            prompt, generation_len, stream, automation, prompt_to_lora_id_mapping
         )

         if stream:
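
A hedged usage sketch of the two parameters, not taken from the commit: `iteration=N` repeats the whole prefill-plus-decode loop N times and prints latency stats on each pass, while `automation=True` keeps the decode loops from breaking early at EOS. Parameter names other than `iteration` and `automation` are assumed from the existing `cloud_ai_100_exec_kv` signature, and the tokenizer id and QPC path are placeholders. Per the in-code TODO, the returned `exec_info` currently reflects only the last iteration.

from transformers import AutoTokenizer

from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer id
exec_info = cloud_ai_100_exec_kv(
    tokenizer=tokenizer,
    qpc_path="/path/to/qpcs",  # placeholder compiled QPC directory
    prompt=["My name is"],
    generation_len=32,
    iteration=3,      # new: run the generation loop three times
    automation=True,  # decode runs the full generation_len and stats are printed
)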

QEfficient/transformers/models/llama/modeling_llama.py

Lines changed: 1 addition & 1 deletion
@@ -183,7 +183,7 @@ def forward(
         )

         attn_output = attn_output.reshape(*input_shape, -1).contiguous()
-        attn_output = self.o_proj(attn_output)
+        attn_output = self.o_proj(attn_output, **kwargs)
         return attn_output, attn_weights, past_key_value


QEfficient/transformers/models/modeling_auto.py

Lines changed: 2 additions & 0 deletions
@@ -2914,6 +2914,8 @@ def generate(
             comp_ctx_lengths_decode=self.comp_ctx_lengths_decode,
             device_id=device_id,
             generation_len=generation_len,
+            automation=kwargs.pop("automation", False),
+            iteration=kwargs.pop("iteration", 1),
             is_tlm=self.is_tlm,
             **kwargs,
         )
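
At the high-level API the two options are simply popped from `**kwargs` and forwarded to `cloud_ai_100_exec_kv`. A hedged end-to-end sketch follows; the model id, compile options, and the `tokenizer`/`prompts` argument names are assumptions based on the library's usual examples, not values from this commit.

from transformers import AutoTokenizer

from QEfficient import QEFFAutoModelForCausalLM

model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")  # placeholder model id
model.compile(num_cores=14)  # assumed compile options
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model.generate(
    tokenizer=tokenizer,
    prompts=["My name is"],
    iteration=2,      # forwarded via kwargs.pop("iteration", 1)
    automation=True,  # forwarded via kwargs.pop("automation", False)
)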
