
Commit 19e0ccd

Author: Amit Raj

Parallel compilation and ONNX subfunction support added

Signed-off-by: Amit Raj <amitraj@qti.qualcomm.com>

1 parent 03f9ded commit 19e0ccd

File tree

5 files changed: +127 -24 lines changed

QEfficient/diffusers/pipelines/flux/pipeline_flux.py

Lines changed: 22 additions & 16 deletions

```diff
@@ -24,6 +24,8 @@
 from QEfficient.diffusers.pipelines.pipeline_utils import (
     ModulePerf,
     QEffPipelineOutput,
+    compile_modules_parallel,
+    compile_modules_sequential,
     config_manager,
     set_module_device_ids,
 )
@@ -192,7 +194,7 @@ def get_default_config_path() -> str:
         """
         return os.path.join(os.path.dirname(__file__), "flux_config.json")
 
-    def compile(self, compile_config: Optional[str] = None) -> None:
+    def compile(self, compile_config: Optional[str] = None, parallel: bool = False) -> None:
         """
         Compile ONNX models for deployment on Qualcomm AI hardware.
 
@@ -202,6 +204,8 @@ def compile(self, compile_config: Optional[str] = None) -> None:
         Args:
             compile_config (str, optional): Path to JSON configuration file.
                 If None, uses default configuration.
+            parallel (bool): If True, compile modules in parallel using ThreadPoolExecutor.
+                If False, compile sequentially (default: False).
         """
         # Ensure all modules are exported to ONNX before compilation
         if any(
@@ -219,21 +223,20 @@ def compile(self, compile_config: Optional[str] = None) -> None:
         if self.custom_config is None:
             config_manager(self, config_source=compile_config)
 
-        # Compile each module with its specific configuration
-        for module_name, module_obj in tqdm(self.modules.items(), desc="Compiling modules", unit="module"):
-            module_config = self.custom_config["modules"]
-            specializations = module_config[module_name]["specializations"]
-            compile_kwargs = module_config[module_name]["compilation"]
-
-            # Set dynamic specialization values based on image dimensions
-            if module_name == "transformer":
-                specializations["cl"] = self.cl
-            elif module_name == "vae_decoder":
-                specializations["latent_height"] = self.latent_height
-                specializations["latent_width"] = self.latent_width
+        # Prepare dynamic specialization updates based on image dimensions
+        specialization_updates = {
+            "transformer": {"cl": self.cl},
+            "vae_decoder": {
+                "latent_height": self.latent_height,
+                "latent_width": self.latent_width,
+            },
+        }
 
-            # Compile the module to QPC format
-            module_obj.compile(specializations=[specializations], **compile_kwargs)
+        # Use generic utility functions for compilation
+        if parallel:
+            compile_modules_parallel(self.modules, self.custom_config, specialization_updates)
+        else:
+            compile_modules_sequential(self.modules, self.custom_config, specialization_updates)
 
     def _get_t5_prompt_embeds(
         self,
@@ -467,6 +470,7 @@ def __call__(
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 512,
         custom_config_path: Optional[str] = None,
+        parallel_compile: bool = False,
     ):
         """
         Generate images from text prompts using the Flux pipeline.
@@ -501,6 +505,8 @@ def __call__(
             callback_on_step_end_tensor_inputs (List[str]): Tensors to pass to callback
             max_sequence_length (int): Maximum sequence length for T5 (default: 512)
            custom_config_path (str, optional): Path to custom compilation config
+            parallel_compile (bool): If True, compile modules in parallel for faster compilation.
+                If False, compile sequentially (default: False).
 
         Returns:
             QEffPipelineOutput or tuple: Generated images and performance metrics
@@ -512,7 +518,7 @@ def __call__(
         config_manager(self, custom_config_path)
         set_module_device_ids(self)
 
-        self.compile(compile_config=custom_config_path)
+        self.compile(compile_config=custom_config_path, parallel=parallel_compile)
 
         # Validate all inputs
         self.check_inputs(
```
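Taken together, the new `parallel` plumbing is driven from `__call__`. A minimal usage sketch (the model ID and call pattern come from the example script updated in this commit; the import path and prompt are assumptions):

```python
import torch

from QEfficient import QEFFFluxPipeline  # import path assumed

pipeline = QEFFFluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-schnell", height=256, width=256
)

# parallel_compile=True is forwarded as compile(parallel=True), which
# dispatches to compile_modules_parallel() instead of the sequential loop.
output = pipeline(
    "A cat holding a sign that says hello world",  # illustrative prompt
    num_inference_steps=4,
    generator=torch.manual_seed(42),
    parallel_compile=True,
)
```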

QEfficient/diffusers/pipelines/pipeline_module.py

Lines changed: 7 additions & 3 deletions

```diff
@@ -374,7 +374,7 @@ class QEffFluxTransformerModel(QEFFBaseModel):
         _onnx_transforms (List): ONNX transformations applied after export
     """
 
-    _pytorch_transforms = [AttentionTransform, CustomOpsTransform, NormalizationTransform]
+    _pytorch_transforms = [AttentionTransform, NormalizationTransform, CustomOpsTransform]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
     def __init__(self, model: nn.Module, use_onnx_function: bool) -> None:
@@ -386,13 +386,17 @@ def __init__(self, model: nn.Module, use_onnx_function: bool) -> None:
             use_onnx_function (bool): Whether to export transformer blocks as ONNX functions
                 for better modularity and potential optimization
         """
-        super().__init__(model)
 
         # Optionally apply ONNX function transform for modular export
+
         if use_onnx_function:
-            self._pytorch_transforms.append(OnnxFunctionTransform)
             model, _ = OnnxFunctionTransform.apply(model)
 
+        super().__init__(model)
+
+        if use_onnx_function:
+            self._pytorch_transforms.append(OnnxFunctionTransform)
+
         # Ensure model is on CPU to avoid meta device issues
         self.model = model.to("cpu")
```

QEfficient/diffusers/pipelines/pipeline_utils.py

Lines changed: 91 additions & 1 deletion

```diff
@@ -6,13 +6,16 @@
 # ----------------------------------------------------------------------------
 
 import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
-from typing import List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
 import PIL.Image
+from tqdm import tqdm
 
 from QEfficient.utils._utils import load_json
+from QEfficient.utils.logging_utils import logger
 
 
 def config_manager(cls, config_source: Optional[str] = None):
@@ -51,14 +54,101 @@ def set_module_device_ids(cls):
         module_obj.device_ids = config_modules[module_name]["execute"]["device_ids"]
 
 
+def compile_modules_parallel(
+    modules: Dict[str, Any],
+    config: Dict[str, Any],
+    specialization_updates: Optional[Dict[str, Dict[str, Any]]] = None,
+) -> None:
+    """
+    Compile multiple pipeline modules in parallel using ThreadPoolExecutor.
+
+    Args:
+        modules: Dictionary of module_name -> module_object pairs to compile
+        config: Configuration dictionary containing module-specific compilation settings
+        specialization_updates: Optional dictionary of module_name -> specialization updates
+            to apply dynamic values (e.g., image dimensions)
+    """
+
+    def _prepare_and_compile(module_name: str, module_obj: Any) -> None:
+        """Prepare specializations and compile a single module."""
+        specializations = config["modules"][module_name]["specializations"].copy()
+        compile_kwargs = config["modules"][module_name]["compilation"]
+
+        if specialization_updates and module_name in specialization_updates:
+            specializations.update(specialization_updates[module_name])
+
+        module_obj.compile(specializations=[specializations], **compile_kwargs)
+
+    # Execute compilations in parallel
+    with ThreadPoolExecutor(max_workers=len(modules)) as executor:
+        futures = {executor.submit(_prepare_and_compile, name, obj): name for name, obj in modules.items()}
+
+        with tqdm(total=len(futures), desc="Compiling modules", unit="module") as pbar:
+            for future in as_completed(futures):
+                try:
+                    future.result()
+                except Exception as e:
+                    logger.error(f"Compilation failed for {futures[future]}: {e}")
+                    raise
+                pbar.update(1)
+
+
+def compile_modules_sequential(
+    modules: Dict[str, Any],
+    config: Dict[str, Any],
+    specialization_updates: Optional[Dict[str, Dict[str, Any]]] = None,
+) -> None:
+    """
+    Compile multiple pipeline modules sequentially.
+
+    This function provides a generic way to compile diffusion pipeline modules
+    sequentially, which is the default behavior for backward compatibility.
+
+    Args:
+        modules: Dictionary of module_name -> module_object pairs to compile
+        config: Configuration dictionary containing module-specific compilation settings
+        specialization_updates: Optional dictionary of module_name -> specialization updates
+            to apply dynamic values (e.g., image dimensions)
+    """
+    for module_name, module_obj in tqdm(modules.items(), desc="Compiling modules", unit="module"):
+        module_config = config["modules"]
+        specializations = module_config[module_name]["specializations"].copy()
+        compile_kwargs = module_config[module_name]["compilation"]
+
+        # Apply dynamic specialization updates if provided
+        if specialization_updates and module_name in specialization_updates:
+            specializations.update(specialization_updates[module_name])
+
+        # Compile the module to QPC format
+        module_obj.compile(specializations=[specializations], **compile_kwargs)
+
+
 @dataclass(frozen=True)
 class ModulePerf:
+    """
+    Data class to store performance metrics for a pipeline module.
+
+    Attributes:
+        module_name: Name of the pipeline module (e.g., 'text_encoder', 'transformer', 'vae_decoder')
+        perf: Performance metric in seconds. Can be a single float for modules that run once,
+            or a list of floats for modules that run multiple times (e.g., transformer steps)
+    """
+
     module_name: str
     perf: int
 
 
 @dataclass(frozen=True)
 class QEffPipelineOutput:
+    """
+    Data class to store the output of a QEfficient diffusion pipeline.
+
+    Attributes:
+        pipeline_module: List of ModulePerf objects containing performance metrics for each module
+        images: Generated images as either a list of PIL Images or a numpy array
+    """
+
     pipeline_module: list[ModulePerf]
     images: Union[List[PIL.Image.Image], np.ndarray]
```
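A hypothetical call sketch for the two new helpers (the config values are placeholders; in the pipeline they come from `config_manager`, and `pipeline.modules` maps names to objects exposing `.compile()`). Note that `ThreadPoolExecutor` only buys real concurrency if each `.compile()` call spends most of its time outside the Python GIL, e.g. waiting on an external compiler process:

```python
from QEfficient.diffusers.pipelines.pipeline_utils import (
    compile_modules_parallel,
    compile_modules_sequential,
)

# Placeholder config mirroring the layout the helpers index into.
config = {
    "modules": {
        "transformer": {
            "specializations": {"batch_size": 1},
            "compilation": {"num_cores": 16},  # illustrative compile kwargs
        },
        "vae_decoder": {
            "specializations": {"batch_size": 1},
            "compilation": {"num_cores": 16},
        },
    },
}

# Dynamic values merged into a module's specializations before compiling.
specialization_updates = {"transformer": {"cl": 4096}}

compile_modules_parallel(pipeline.modules, config, specialization_updates)
# or, for the default behavior:
# compile_modules_sequential(pipeline.modules, config, specialization_updates)
```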

QEfficient/utils/_utils.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -568,7 +568,8 @@ def wrapper(self, *args, **kwargs):
             model_params=self.hash_params,
             output_names=all_args.get("output_names"),
             dynamic_axes=all_args.get("dynamic_axes"),
-            export_kwargs=all_args.get("export_kwargs", None),
+            # TODO: Re-enable export_kwargs hashing before merging this PR
+            # export_kwargs=all_args.get("export_kwargs", None),
             onnx_transform_kwargs=all_args.get("onnx_transform_kwargs", None),
         )
         export_dir = export_dir.with_name(export_dir.name + "-" + export_hash)
```
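The TODO above affects export caching: the export directory name embeds a hash of the export arguments, so a field dropped from the hashed dict no longer differentiates cache directories. A self-contained sketch with a hypothetical hash helper (not the library's actual implementation):

```python
import hashlib
import json


def export_hash(params: dict) -> str:
    """Hypothetical stand-in for the library's export-argument hashing."""
    blob = json.dumps(params, sort_keys=True, default=str).encode()
    return hashlib.sha256(blob).hexdigest()[:16]


base = {"output_names": ["latents"], "dynamic_axes": {"x": {0: "batch"}}}
a = {**base, "export_kwargs": {"opset_version": 17}}
b = {**base, "export_kwargs": {"opset_version": 13}}

# Hashing export_kwargs keeps the two exports in separate directories:
print(export_hash(a) != export_hash(b))  # True

# Dropping the field (as the TODO does) collapses them into one:
del a["export_kwargs"]
del b["export_kwargs"]
print(export_hash(a) == export_hash(b))  # True
```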

examples/diffusers/flux/flux_1_shnell_custom.py

Lines changed: 5 additions & 3 deletions

```diff
@@ -35,10 +35,11 @@
 # Note: Smaller dimensions = faster generation but lower resolution
 
 # Option 1: Basic initialization with custom image dimensions
+# NOTE: use_onnx_function=True enables modular ONNX export optimizations (experimental, so not currently recommended).
+# This feature improves export performance by breaking down the model into smaller,
+# more manageable ONNX functions, which can lead to better compilation and runtime efficiency.
 pipeline = QEFFFluxPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-schnell",
-    height=512,
-    width=512,
+    "black-forest-labs/FLUX.1-schnell", height=256, width=256, use_onnx_function=False
 )
 
 # Option 2: Advanced initialization with custom modules
@@ -109,6 +110,7 @@
     num_inference_steps=4,
     max_sequence_length=256,
     generator=torch.manual_seed(42),
+    parallel_compile=True,
 )
 
 images = output.images[0]
```
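Continuing the example, one plausible way to consume the output, given the `QEffPipelineOutput` fields documented above (the save path is arbitrary, and `.save()` assumes the PIL branch of the `images` type):

```python
# Save the first generated image (PIL.Image branch of the output type).
images.save("flux_schnell_output.png")

# Per-module performance metrics recorded by the pipeline.
for module_perf in output.pipeline_module:
    print(module_perf.module_name, module_perf.perf)
```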
