Implemented the prototype

cehongwang · cehongwang · commit 57b04d715992 · 2025-11-04T20:04:26.000Z
diff --git a/py/torch_tensorrt/dynamo/partitioning/_adjacency_partitioner.py b/py/torch_tensorrt/dynamo/partitioning/_adjacency_partitioner.py
@@ -1,4 +1,3 @@
-from ast import Assert
 import logging
 from typing import Collection, Dict, List, Optional, Tuple
 
@@ -26,6 +25,10 @@
 )
 
 logger = logging.getLogger(__name__)
+NON_BREAKABLE_OP_LISTS = [
+    ["addmm", "addmm"],
+    ["conv2d", "batch_norm2d", "relu"],
+]
 
 
 class OpSupportTester(ops.OperatorSupportBase):  # type: ignore
@@ -227,8 +230,9 @@ def partition_graph(self) -> torch.fx.GraphModule:
         # Remove segments smaller than the block size (with exceptions)
         subgraphs = self.remove_small_acc_subgraphs(subgraphs)
 
-        # num_of_break = self.calculate_num_of_break(subgraphs)
-        subgraphs = self.break_subgraphs_by_node(subgraphs, num_of_break=5)
+        subgraphs = self.break_subgraphs(
+            subgraphs, size_budget=self.calculate_size_budget()
+        )
 
         # Set the number of TRT engines to be generated
         self.num_trt_accelerated_subgraphs = len([s for s in subgraphs if s.is_acc])
@@ -241,44 +245,27 @@ def partition_graph(self) -> torch.fx.GraphModule:
             print(s.nodes)
 
         gm = self.split()
-        self.weight_visited_nodes = set()
-        [self.size_of_subgraph(s) for s in subgraphs]
-        
 
         return gm
-    
-    def calculate_num_of_break(self, subgraphs: List[Subgraph]) -> int:
+
+    def calculate_size_budget(
+        self, engine_compilation_memory_usage_multiplier: int = 4
+    ) -> int:
         """
-        This function calculates the break period based on the number of subgraphs.
+        This function calculates the size budget based on the available RSS. We assume that TRT compilation
+        needs at most 4x the memory of the model.
         """
-        rss = psutil.Process().memory_info().rss
-        available_rss = psutil.virtual_memory().available
-        num_of_graphs = len(subgraphs)
-        if rss < available_rss * 0.3:
-            num_of_graphs = 1
-        elif rss < available_rss * 0.5:
-            num_of_graphs = 2
-        elif rss < available_rss:
-            num_of_graphs = 4
-        elif rss < available_rss * 1.5:
-            num_of_graphs = 8
-        elif rss < available_rss * 2:
-            num_of_graphs = 16
-        else:
-            num_of_graphs = 32
-
-        return max(
-            1, num_of_graphs // ((len(subgraphs) + 1) // 2)
-        )  # If there are already graph breaks, for each TRT subgraph, we break for a few times.
 
+        available_rss: int = psutil.virtual_memory().available
+        return available_rss // engine_compilation_memory_usage_multiplier
 
     def break_subgraphs_by_node(
         self, subgraphs: List[Subgraph], num_of_break: int = 1
     ) -> List[Subgraph]:
         """
         This function breaks the subgraphs into smaller subgraphs at the specified frequency to save CPU memory.
         """
-        op_to_break = "add."
+        op_to_break = "addmm."
         num_of_sdpa_node = len(
             [node for node in self.acc_nodes if op_to_break in str(node.target)]
         )
@@ -312,80 +299,200 @@ def break_subgraphs_by_node(
                 new_subgraphs.append(subgraph)
 
         new_subgraphs = self.validate_and_correct_subgraphs(new_subgraphs)
-        
+
         return new_subgraphs
 
     def break_subgraphs(
-        self, subgraphs: List[Subgraph], num_of_break: int = 1
+        self, subgraphs: List[Subgraph], size_budget: int
     ) -> List[Subgraph]:
         """
-        This function breaks the subgraphs into smaller subgraphs at the specified frequency to save CPU memory.
+        This function breaks the subgraphs into smaller subgraphs to save CPU memory.
         """
-        break_pos = [0, 100, 200, 300, 400]
-        current_break_idx = 0
         new_subgraphs = []
-        for subgraph in subgraphs:
-            if subgraph.is_acc:
-                for i, node in enumerate(subgraph.nodes):
-                    if i in break_pos:
+        # We throw an error if the remaining memory is almost empty compared to the model size.
+        # e.g. if the remaining memory is 4G (so the budget is 1G) and the model size is greater than 40G, we stop the compilation.
+        sizes = [(subgraph, self.size_of_subgraph(subgraph)) for subgraph in subgraphs]
+        if sum([size for _, size in sizes]) > size_budget * 40:
+            raise ValueError(
+                f"Subgraph size {sum([size for _, size in sizes])} is too large to break. Size budget: {size_budget}"
+            )
+        for subgraph, size in sizes:
 
-                        new_subgraphs.append(
-                            Subgraph(
-                                is_acc=True,
-                                nodes=subgraph.nodes[current_break_idx : i + 1],
-                                device_ordinal=subgraph.device_ordinal,
-                            )
-                        )
-                        current_break_idx = i + 1
-                new_subgraphs.append(
-                    Subgraph(
-                        is_acc=True,
-                        nodes=subgraph.nodes[current_break_idx:],
-                        device_ordinal=subgraph.device_ordinal,
-                    )
+            while size > size_budget:
+                broken_subgraphs, size_0, size_1 = self.break_subgraph_by_size(
+                    subgraph, size_budget
                 )
-            else:
-                new_subgraphs.append(subgraph)
+                size = size_1
+                new_subgraphs.append(broken_subgraphs[0])
+                subgraph = broken_subgraphs[1]
+            new_subgraphs.append(subgraph)
 
-        new_subgraphs = self.validate_and_correct_subgraphs(new_subgraphs)
         return new_subgraphs
 
+    def break_subgraph_by_size(
+        self, subgraph: Subgraph, size_to_break: int
+    ) -> Tuple[List[Subgraph], int, int]:
+        """
+        This function splits one subgraph into two, moving nodes into the first one until its size exceeds size_to_break.
+        """
+        all_nodes = subgraph.nodes
+        device_ordinal = subgraph.device_ordinal
+        new_subgraphs = [
+            Subgraph(
+                is_acc=True,
+                nodes=[],
+                device_ordinal=device_ordinal,
+            ),
+            Subgraph(
+                is_acc=True,
+                nodes=all_nodes,
+                device_ordinal=device_ordinal,
+            ),
+        ]
+
+        while True:
+            new_subgraphs = self.step_and_validate(new_subgraphs)
+            size_0, size_1 = self.size_of_subgraph(
+                new_subgraphs[0]
+            ), self.size_of_subgraph(new_subgraphs[1])
+            if size_0 > size_to_break:
+                break
+
+        if len(new_subgraphs[1].nodes) == 0:
+            new_subgraphs.pop(1)
+        return new_subgraphs, size_0, size_1
+
+    def step_and_validate(
+        self, new_subgraphs: List[Subgraph], step_size: int = 1
+    ) -> List[Subgraph]:
+
+        # TODO: We can change it to binary search to find the optimal break point
+        for _ in range(step_size):
+            new_subgraphs[0].nodes.append(new_subgraphs[1].nodes.pop(0))
+
+        while True:
+            new_subgraphs = self.validate_and_correct_subgraphs(new_subgraphs)
+            nodes_in_first_subgraph = set(new_subgraphs[0].nodes)
+            leaf_node = self.get_leaf_node(nodes_in_first_subgraph)
+            broken_fusion = self.step_if_break_fusion(
+                new_subgraphs, leaf_node, nodes_in_first_subgraph
+            )
+            if not broken_fusion or len(new_subgraphs[1].nodes) == 0:
+                break
+
+        return new_subgraphs
+
+    def step_if_break_fusion(
+        self,
+        subgraphs: List[Subgraph],
+        leaf_nodes: set[torch.fx.Node],
+        nodes_in_first_subgraph: set[torch.fx.Node],
+    ) -> bool:
+
+        def add_nodes(node: torch.fx.Node) -> None:
+            """
+            This function adds a node and all its previous nodes to the first subgraph and removes it from the second subgraph in post order.
+            """
+            if node.op in CALLABLE_NODE_OPS and node not in nodes_in_first_subgraph:
+                nodes_in_first_subgraph.add(node)
+                for input_node in node._input_nodes:
+                    add_nodes(input_node)
+                subgraphs[0].nodes.append(node)
+                subgraphs[1].nodes.remove(node)
+
+        def match_subgraph_and_step(node: torch.fx.Node) -> bool:
+            added_nodes = False
+            for op_list in NON_BREAKABLE_OP_LISTS:
+                for i, op in enumerate(op_list):
+                    if i != len(op_list) - 1 and op in str(node.target):
+                        # Search the following ops forward using BFS. We skip searching the previous ops because
+                        # even if it's just a subset of the fusion graph, we still want it to be fused.
+
+                        users = node.users.keys()
+                        matching_nodes: set[torch.fx.Node] = set()
+                        for following_op_idx in range(i + 1, len(op_list)):
+                            matching_nodes = set()
+                            for user in users:
+                                if op_list[following_op_idx] in str(user.target):
+                                    matching_nodes.add(user)
+                            if not matching_nodes:
+                                break
+                            users = set()
+                            for matching_node in matching_nodes:
+                                for next_user in matching_node.users:
+                                    users.add(next_user)
+
+                        for matching_node in matching_nodes:
+                            added_nodes = True
+                            add_nodes(matching_node)
+
+                        if added_nodes:
+                            # Terminate the search early once we have found a match, because preceding matches can cover following matches
+                            break
+
+            return True if added_nodes else False
+
+        found_match = False
+        for leaf in leaf_nodes:
+            if match_subgraph_and_step(leaf):
+                found_match = True
+
+        return found_match
+
+    def get_leaf_node(
+        self, nodes_in_first_subgraph: set[torch.fx.Node]
+    ) -> set[torch.fx.Node]:
+        leaf_node = set()
+
+        for node in nodes_in_first_subgraph:
+            for user in node.users:
+                if user not in nodes_in_first_subgraph:
+                    leaf_node.add(node)
+                    break
+        return leaf_node
+
     def size_of_subgraph(self, subgraph: Subgraph) -> int:
         """
         This function calculates the size of the subgraph.
         """
+        nodes_in_subgraph = set(subgraph.nodes)
+        weight_visited_nodes = set()
         stack = subgraph.nodes.copy()
         size = 0
         while stack:
             node = stack.pop()
-            if node in self.weight_visited_nodes:
+            if node in weight_visited_nodes:
                 continue
-            self.weight_visited_nodes.add(node)
             if node.op == "get_attr":
                 weight = self.module.state_dict()[node.target]
                 size += weight.numel() * weight.element_size()
-                self.weight_visited_nodes.add(node)
+                weight_visited_nodes.add(node)
+                continue
+            if node not in nodes_in_subgraph:
+                # Trace to other subgraphs
                 continue
-            for input_node in node._input_nodes: 
-                if input_node not in self.weight_visited_nodes:
+            for input_node in node._input_nodes:
+                if input_node not in weight_visited_nodes:
                     stack.append(input_node)
-        print(size)
+
         return size
 
-    def validate_and_correct_subgraphs(self, subgraphs: List[Subgraph]) -> List[Subgraph]:
+    def validate_and_correct_subgraphs(
+        self, subgraphs: List[Subgraph]
+    ) -> List[Subgraph]:
         """
         This function validates the subgraphs by checking if the subgraphs are valid, and corrects the subgraphs if they are not valid.
         """
-        visited_nodes = {}
-        print([len(s.nodes) for s in subgraphs])
+        visited_nodes = (
+            {}
+        )  # a map from a node to the index of the subgraph its users should belong to
         for i, subgraph in enumerate(subgraphs):
             if i == 0:
                 for node in subgraph.nodes:
                     visited_nodes[node] = i
                 visited_nodes[subgraph.nodes[-1]] = i + 1
                 continue
 
-
             elif not subgraph.is_acc:
                 for node in subgraph.nodes:
                     visited_nodes[subgraph.nodes[-1]] = i + 1
@@ -401,18 +508,15 @@ def validate_and_correct_subgraphs(self, subgraphs: List[Subgraph]) -> List[Subg
                     for dep in self.deps[node]:
                         if dep in visited_nodes:
                             subgraph_idx = max(subgraph_idx, visited_nodes[dep])
-                        else:
-                            raise ValueError(f"Node {node} have a dependency that is not covered in the previous subgraphs. This is caused by a invalid subgraph segmentation.")
+
                     if subgraph_idx != i:
                         subgraphs[subgraph_idx].nodes.append(node)
                         to_remove_nodes.append(node)
                     visited_nodes[node] = subgraph_idx
                 for node in to_remove_nodes:
                     subgraph.nodes.remove(node)
-        
+
         return subgraphs
-                    
-                    
 
     def starter_nodes(self) -> Tuple[NodeSet, NodeSet]:
         """Generates starter nodes for partitioning + segmentation"""