
Commit f03ab2c

Added cpu memory budget to the frontend
1 parent 57b04d7 commit f03ab2c

5 files changed: 59 additions, 42 deletions

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 10 additions & 4 deletions

@@ -105,6 +105,7 @@ def cross_compile_for_windows(
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
     use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
+    cpu_memory_budget: int = _defaults.CPU_MEMORY_BUDGET,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows
@@ -179,6 +180,7 @@ def cross_compile_for_windows(
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
+        cpu_memory_budget (int): The maximum amount of CPU memory to use for the compilation. If the compilation requires more memory than this budget, the compilation will fail. If set to -1, the compilation will use all available CPU memory.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -334,6 +336,7 @@ def cross_compile_for_windows(
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "use_distributed_mode_trace": use_distributed_mode_trace,
+        "cpu_memory_budget": cpu_memory_budget,
     }
 
     # disable the following settings is not supported for cross compilation for windows feature
@@ -435,6 +438,7 @@ def compile(
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
     use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
+    cpu_memory_budget: int = _defaults.CPU_MEMORY_BUDGET,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -681,6 +685,7 @@ def compile(
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
         "use_distributed_mode_trace": use_distributed_mode_trace,
+        "cpu_memory_budget": cpu_memory_budget,
     }
     logger.debug(f"CPU memory usage before lowering: {get_cpu_memory_usage()} MB")
     settings = CompilationSettings(**compilation_options)
@@ -833,6 +838,7 @@ def preserve_module_specs(
             torch_executed_ops=settings.torch_executed_ops,
             require_full_compilation=settings.require_full_compilation,
             skip_fusion=(num_supported_ops == total_ops),
+            cpu_memory_budget=settings.cpu_memory_budget,
         )
 
     except torch.fx.passes.splitter_base.FxNetSplitterInternalError:
@@ -878,11 +884,10 @@ def preserve_module_specs(
         if attr.startswith("_frozen_param"):
             delattr(gm, attr)
 
-
-
     from torch_tensorrt.dynamo.conversion._ConverterRegistry import DYNAMO_CONVERTERS
+
     DYNAMO_CONVERTERS.disallowed_targets = set()
-
+
     for name, _ in partitioned_module.named_children():
         submodule = getattr(partitioned_module, name)
         # filter on the GraphModule
@@ -1071,6 +1076,7 @@ def convert_exported_program_to_serialized_trt_engine(
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
     use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
+    cpu_memory_budget: int = _defaults.CPU_MEMORY_BUDGET,
     **kwargs: Any,
 ) -> bytes:
     """Convert an ExportedProgram to a serialized TensorRT engine
@@ -1345,7 +1351,7 @@ def convert_exported_program_to_serialized_trt_engine(
     )
 
     flattened_input_list = get_flat_args_with_check(
-        exported_program, list(trt_arg_inputs), trt_kwarg_inputs  # type: ignore
+        exported_program, list(trt_arg_inputs), trt_kwarg_inputs
     )[0]
 
     try:
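
For reference, a minimal usage sketch of the new keyword, assuming the usual torch_tensorrt.dynamo.compile entry point. The model, inputs, and the 8 GiB figure are placeholders; only the cpu_memory_budget parameter itself comes from this commit.

import torch
import torch_tensorrt

# Placeholder model and inputs, purely illustrative.
model = torch.nn.Linear(128, 64).eval().cuda()
example_inputs = (torch.randn(8, 128).cuda(),)
exported = torch.export.export(model, example_inputs)

# The budget is compared against psutil byte counts in the partitioner, so it is
# expressed in bytes; -1 (the default) means "use all available CPU memory".
trt_module = torch_tensorrt.dynamo.compile(
    exported,
    example_inputs,
    cpu_memory_budget=8 * 1024**3,  # illustrative 8 GiB cap on CPU RAM during compilation
)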

py/torch_tensorrt/dynamo/_defaults.py

Lines changed: 1 addition & 0 deletions

@@ -57,6 +57,7 @@
 L2_LIMIT_FOR_TILING = -1
 USE_DISTRIBUTED_MODE_TRACE = False
 OFFLOAD_MODULE_TO_CPU = False
+CPU_MEMORY_BUDGET = -1
 
 if platform.system() == "Linux":
     import pwd

py/torch_tensorrt/dynamo/_settings.py

Lines changed: 2 additions & 0 deletions

@@ -7,6 +7,7 @@
 from torch_tensorrt.dynamo._defaults import (
     ASSUME_DYNAMIC_SHAPE_SUPPORT,
     CACHE_BUILT_ENGINES,
+    CPU_MEMORY_BUDGET,
     DISABLE_TF32,
     DLA_GLOBAL_DRAM_SIZE,
     DLA_LOCAL_DRAM_SIZE,
@@ -140,6 +141,7 @@ class CompilationSettings:
     l2_limit_for_tiling: int = L2_LIMIT_FOR_TILING
     use_distributed_mode_trace: bool = USE_DISTRIBUTED_MODE_TRACE
     offload_module_to_cpu: bool = OFFLOAD_MODULE_TO_CPU
+    cpu_memory_budget: int = CPU_MEMORY_BUDGET
 
     def __getstate__(self) -> dict[str, Any]:
         from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
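
A quick sketch of the plumbing above: the dataclass field can also be set directly on CompilationSettings and otherwise defaults to CPU_MEMORY_BUDGET (-1). The 4 GiB value is arbitrary.

from torch_tensorrt.dynamo._settings import CompilationSettings

# All other fields keep their defaults; only the new knob is set here.
settings = CompilationSettings(cpu_memory_budget=4 * 1024**3)  # 4 GiB, arbitrary
assert settings.cpu_memory_budget == 4 * 1024**3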

py/torch_tensorrt/dynamo/partitioning/_adjacency_partitioner.py

Lines changed: 46 additions & 38 deletions

@@ -118,6 +118,7 @@ def __init__(
         require_full_compilation: bool = REQUIRE_FULL_COMPILATION,
         return_tuple: bool = False,
         skip_fusion: bool = False,
+        cpu_memory_budget: int = -1,
     ):
         """
         Preprocesses graph before splitting:
@@ -137,6 +138,7 @@ def __init__(
             skip_fusion=skip_fusion,
         )
         self.operator_support = operator_support
+        self.cpu_memory_budget = cpu_memory_budget
 
         # Get all accelerated nodes based on operator support conditions
         self.acc_nodes = FxNetAccNodesFinder(
@@ -231,19 +233,15 @@ def partition_graph(self) -> torch.fx.GraphModule:
         subgraphs = self.remove_small_acc_subgraphs(subgraphs)
 
         subgraphs = self.break_subgraphs(
-            subgraphs, size_budget=self.calculate_size_budget()
+            subgraphs, subgraph_size_budget=self.calculate_size_budget()
         )
 
         # Set the number of TRT engines to be generated
         self.num_trt_accelerated_subgraphs = len([s for s in subgraphs if s.is_acc])
 
         # Tag the accelerated nodes and split the graph accordingly
-        print([len(s.nodes) for s in subgraphs])
         self.tag(subgraphs)
 
-        for s in subgraphs:
-            print(s.nodes)
-
         gm = self.split()
 
         return gm
@@ -255,8 +253,11 @@ def calculate_size_budget(
         This function calculates the size budget based on the available RSS. We assume that TRT compilation
         needs at most 4x the memory of the model.
         """
-
-        available_rss: int = psutil.virtual_memory().available
+        if self.cpu_memory_budget == -1:
+            available_rss: int = psutil.virtual_memory().available
+        else:
+            used_rss: int = psutil.virtual_memory().used
+            available_rss = self.cpu_memory_budget - used_rss
         return available_rss // engine_compilation_memory_usage_multiplier
 
     def break_subgraphs_by_node(
@@ -303,24 +304,25 @@ def break_subgraphs_by_node(
         return new_subgraphs
 
     def break_subgraphs(
-        self, subgraphs: List[Subgraph], size_budget: int
+        self, subgraphs: List[Subgraph], subgraph_size_budget: int
     ) -> List[Subgraph]:
         """
         This function breaks the subgraphs into smaller subgraphs to save CPU memory.
         """
         new_subgraphs = []
         # We throw an error if the remaining memory is almost empty compared to the model size.
         # i.e. if the remaining memory is 4G (budget is 1G) the model size is greater than 40G, we stop the compilation.
-        sizes = [(subgraph, self.size_of_subgraph(subgraph)) for subgraph in subgraphs]
-        if sum([size for _, size in sizes]) > size_budget * 40:
+        sizes = self.size_of_subgraphs(subgraphs)
+        if sum(sizes) > subgraph_size_budget * 40:
             raise ValueError(
-                f"Subgraph size {sum([size for _, size in sizes])} is too large to break. Size budget: {size_budget}"
+                f"CPU memory budget or available memory is too small to compile the model. CPU memory budget: {self.cpu_memory_budget // (1024 * 1024) if self.cpu_memory_budget != -1 else "All available memory"} MB, Model size: {sum(sizes) // (1024 * 1024)} MB. "
+                + "Consider setting cpu_memory_budget to a larger value or disable offload_module_to_cpu to save more CPU memory."
             )
-        for subgraph, size in sizes:
+        for subgraph, size in zip(subgraphs, sizes):
 
-            while size > size_budget:
+            while size > subgraph_size_budget:
                 broken_subgraphs, size_0, size_1 = self.break_subgraph_by_size(
-                    subgraph, size_budget
+                    subgraph, subgraph_size_budget
                 )
                 size = size_1
                 new_subgraphs.append(broken_subgraphs[0])
@@ -351,10 +353,11 @@ def break_subgraph_by_size(
         ]
 
         while True:
-            new_subgraphs = self.step_and_validate(new_subgraphs)
-            size_0, size_1 = self.size_of_subgraph(
-                new_subgraphs[0]
-            ), self.size_of_subgraph(new_subgraphs[1])
+            step_size = (
+                1 if not new_subgraphs[0].nodes else max(1, len(all_nodes) // 50)
+            )  # Set a step size proportional to the size of the subgraph to make the algorithm more efficient
+            new_subgraphs = self.step_and_validate(new_subgraphs, step_size)
+            size_0, size_1 = self.size_of_subgraphs(new_subgraphs)
             if size_0 > size_to_break:
                 break
 
@@ -451,31 +454,34 @@ def get_leaf_node(
                 break
         return leaf_node
 
-    def size_of_subgraph(self, subgraph: Subgraph) -> int:
+    def size_of_subgraphs(self, subgraphs: List[Subgraph]) -> List[int]:
         """
         This function calculates the size of the subgraph.
         """
-        nodes_in_subgraph = set(subgraph.nodes)
+        state_dict = self.module.state_dict(keep_vars=True)
+        sizes = []
         weight_visited_nodes = set()
-        stack = subgraph.nodes.copy()
-        size = 0
-        while stack:
-            node = stack.pop()
-            if node in weight_visited_nodes:
-                continue
-            if node.op == "get_attr":
-                weight = self.module.state_dict()[node.target]
-                size += weight.numel() * weight.element_size()
+        for subgraph in subgraphs:
+            nodes_in_subgraph = set(subgraph.nodes)
+            stack = subgraph.nodes.copy()
+            size = 0
+            while stack:
+                node = stack.pop()
+                if node in weight_visited_nodes:
+                    continue
                weight_visited_nodes.add(node)
-                continue
-            if node not in nodes_in_subgraph:
-                # Trace to other subgraphs
-                continue
-            for input_node in node._input_nodes:
-                if input_node not in weight_visited_nodes:
-                    stack.append(input_node)
-
-        return size
+                if node.op == "get_attr":
+                    weight = state_dict[node.target]
+                    size += weight.numel() * weight.element_size()
+                    continue
+                if node not in nodes_in_subgraph:
+                    # Trace to other subgraphs
+                    continue
+                for input_node in node._input_nodes:
+                    if input_node not in weight_visited_nodes:
+                        stack.append(input_node)
+            sizes.append(size)
+        return sizes
 
     def validate_and_correct_subgraphs(
         self, subgraphs: List[Subgraph]
@@ -541,6 +547,7 @@ def partition(
     torch_executed_ops: Collection[Target] = set(),
     require_full_compilation: bool = REQUIRE_FULL_COMPILATION,
     skip_fusion: bool = False,
+    cpu_memory_budget: int = -1,
 ) -> Tuple[torch.fx.GraphModule, OpSupportTester]:
     """Partition an FX GraphModule with aten ops into TRT engines
     Partitioning is based on converter operator support
@@ -567,6 +574,7 @@
         min_block_size=min_block_size,
         require_full_compilation=require_full_compilation,
         skip_fusion=skip_fusion,
+        cpu_memory_budget=cpu_memory_budget,
     )
 
     partitioned_graph = partitioner.partition_graph()
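
To make the budget arithmetic above concrete, here is a distilled, standalone sketch of what calculate_size_budget now does (not the class method itself; the multiplier value of 4 is assumed from the docstring's "at most 4x the memory of the model" note):

import psutil

ENGINE_COMPILATION_MEMORY_USAGE_MULTIPLIER = 4  # assumed 4x, per the docstring

def per_subgraph_size_budget(cpu_memory_budget: int = -1) -> int:
    """Return the per-subgraph size budget in bytes."""
    if cpu_memory_budget == -1:
        # No explicit budget: use whatever CPU memory is currently available.
        available_rss = psutil.virtual_memory().available
    else:
        # Explicit budget: subtract the memory that is already in use.
        available_rss = cpu_memory_budget - psutil.virtual_memory().used
    return available_rss // ENGINE_COMPILATION_MEMORY_USAGE_MULTIPLIER

# Example: with a 16 GiB budget and 6 GiB already in use, each subgraph must fit
# within roughly (16 - 6) / 4 = 2.5 GiB; break_subgraphs splits larger subgraphs,
# and compilation aborts if the total model size exceeds 40x this budget, as the
# new ValueError message explains.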

py/torch_tensorrt/dynamo/partitioning/fusion_subgraphs.py

Whitespace-only changes.
