Commit 3124116

made strategy mandatory, fixed missing param
Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com>
1 parent 599791d commit 3124116

File tree

4 files changed, +47 -14 lines changed


tensorrt_llm/_torch/auto_deploy/custom_ops/dist.py

Lines changed: 8 additions & 0 deletions
@@ -37,6 +37,14 @@ def all_reduce(t: torch.Tensor, strategy: str = "AUTO") -> torch.Tensor:
     efficient all_reduce ops one should write/replace it with a fused op.
     """
     if trtllm_dist.is_trtllm_op_available():
+        # Debug logging to see what strategy is actually passed
+        if not hasattr(all_reduce, "_logged_strategies"):
+            all_reduce._logged_strategies = set()
+        if strategy not in all_reduce._logged_strategies:
+            from tensorrt_llm.logger import logger
+
+            logger.info(f"[DEBUG] torch_dist_all_reduce called with strategy='{strategy}'")
+            all_reduce._logged_strategies.add(strategy)
         return trtllm_dist.trtllm_allreduce(t, op=dist.ReduceOp.SUM, strategy=strategy)
     t_res = t.clone()
     dist.all_reduce(t_res, op=dist.ReduceOp.SUM)
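Note: the added block logs each distinct strategy string only once per process by caching the values it has already seen on the function object. A minimal standalone sketch of the same log-once pattern, using Python's stdlib logging instead of tensorrt_llm.logger (the helper name below is illustrative, not part of TensorRT-LLM):

import logging

logger = logging.getLogger(__name__)


def log_once_per_value(tag: str, value: str) -> None:
    """Log `value` the first time it is seen for `tag`, then stay silent."""
    # Cache seen values on the function object, mirroring all_reduce._logged_strategies.
    if not hasattr(log_once_per_value, "_seen"):
        log_once_per_value._seen = set()
    if (tag, value) not in log_once_per_value._seen:
        logger.info("[DEBUG] %s called with value='%s'", tag, value)
        log_once_per_value._seen.add((tag, value))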

tensorrt_llm/_torch/auto_deploy/transform/library/collectives.py

Lines changed: 21 additions & 8 deletions
@@ -17,7 +17,11 @@


 def _allreduce_residual_rmsnorm_pattern(
-    x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float = 0.1253
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 0.1253,
+    strategy: str = "AUTO",
 ):
     """
     Reference PyTorch composition of:
@@ -28,7 +32,7 @@ def _allreduce_residual_rmsnorm_pattern(
     """

     input_dtype = x.dtype
-    hidden_states = torch.ops.auto_deploy.torch_dist_all_reduce(x)
+    hidden_states = torch.ops.auto_deploy.torch_dist_all_reduce(x, strategy)
     add = residual + hidden_states

     hidden_states = add.to(torch.float32)
@@ -41,7 +45,11 @@ def _allreduce_residual_rmsnorm_pattern(


 def _allreduce_residual_rmsnorm_pattern2(
-    x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float = 0.1253
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 0.1253,
+    strategy: str = "AUTO",
 ):
     """
     Reference PyTorch composition of:
@@ -52,7 +60,7 @@ def _allreduce_residual_rmsnorm_pattern2(
     """

     input_dtype = x.dtype
-    hidden_states = torch.ops.auto_deploy.torch_dist_all_reduce(x)
+    hidden_states = torch.ops.auto_deploy.torch_dist_all_reduce(x, strategy)
     add = hidden_states + residual

     hidden_states = add.to(torch.float32)
@@ -65,9 +73,13 @@ def _allreduce_residual_rmsnorm_pattern2(


 def _allreduce_residual_rmsnorm_repl(
-    x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float,
+    strategy: str = "AUTO",
 ):
-    return torch.ops.dist.fused_allreduce_residual_rmsnorm(x, residual, weight, eps)
+    return torch.ops.dist.fused_allreduce_residual_rmsnorm(x, residual, weight, eps, strategy)


 @TransformRegistry.register("fuse_allreduce_residual_rmsnorm")
@@ -90,6 +102,7 @@ def _apply(
             torch.randn(bsz, hidden, device="meta", dtype=torch.bfloat16),  # residual
             torch.randn(hidden, device="meta", dtype=torch.bfloat16),  # weight
             0.1253,  # eps
+            "AUTO",  # strategy
         ]

         register_ad_pattern(
@@ -98,15 +111,15 @@ def _apply(
             patterns=patterns,
             dummy_args=dummy_args,
             op_ignore_types={torch.ops.aten.to.dtype: (torch.dtype,)},
-            scalar_workaround={"eps": 0.1253},
+            scalar_workaround={"eps": 0.1253, "strategy": "AUTO"},
         )
         register_ad_pattern(
             search_fn=_allreduce_residual_rmsnorm_pattern2,
             replace_fn=_allreduce_residual_rmsnorm_repl,
             patterns=patterns,
             dummy_args=dummy_args,
             op_ignore_types={torch.ops.aten.to.dtype: (torch.dtype,)},
-            scalar_workaround={"eps": 0.1253},
+            scalar_workaround={"eps": 0.1253, "strategy": "AUTO"},
         )

         num_matches = patterns.apply(gm.graph)
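Note: the pattern and replacement functions keep matching signatures so the matcher can forward every captured input, including the new strategy scalar, from the matched subgraph to the fused op; dummy_args and scalar_workaround are updated to carry the extra argument. A minimal, self-contained illustration of pattern-based graph rewriting with PyTorch's stock torch.fx subgraph rewriter (the toy ops below are placeholders, not the TensorRT-LLM pattern or its register_ad_pattern helper):

import torch
import torch.fx as fx


def pattern(x: torch.Tensor, y: torch.Tensor):
    # Subgraph to search for: add followed by relu.
    return torch.relu(torch.add(x, y))


def replacement(x: torch.Tensor, y: torch.Tensor):
    # Same signature as `pattern`; stands in for a fused op.
    return torch.clamp(torch.add(x, y), min=0.0)


class Toy(torch.nn.Module):
    def forward(self, a, b):
        return torch.relu(torch.add(a, b))


gm = fx.symbolic_trace(Toy())
matches = fx.replace_pattern(gm, pattern, replacement)
print(len(matches))  # 1: the add+relu subgraph was rewritten

a, b = torch.randn(4), torch.randn(4)
print(torch.allclose(gm(a, b), torch.clamp(a + b, min=0.0)))  # True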

tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py

Lines changed: 10 additions & 0 deletions
@@ -133,6 +133,7 @@ def _process_simple_shard(
                world_size=world_size,
                dist_op="all_gather",
                min_local_shape=1,
+               allreduce_strategy=sharding_config.allreduce_strategy,
            )
        )
    )
@@ -360,6 +361,7 @@ def _process_ssm_sharding(
                dist_op=None,
                min_local_shape=min_local_shape,
                fused_weight_dims=fused_weight_dims["in_proj"],
+               allreduce_strategy=sharding_config.allreduce_strategy,
            )
        )

@@ -398,6 +400,7 @@ def _process_ssm_sharding(
                dist_op=None,
                min_local_shape=min_local_shape,
                fused_weight_dims=fused_dims,
+               allreduce_strategy=sharding_config.allreduce_strategy,
            )
        )

@@ -461,6 +464,7 @@ def _process_column_sharding(
                world_size=world_size,
                dist_op=None,  # for column sharding, no dist op is performed
                min_local_shape=min_local_shape,
+               allreduce_strategy=sharding_config.allreduce_strategy,
            )
        )

@@ -594,6 +598,7 @@ def detect_sharding_from_factory_config(
                world_size=world_size,
                dist_op=None,
                min_local_shape=min_local_shape,
+               allreduce_strategy=sharding_config.allreduce_strategy,
            )
        )
        num_row_col_shards += 1
@@ -620,6 +625,7 @@ def detect_sharding_from_factory_config(
                dist_op=None,
                min_local_shape=min_local_shape,
                layer_type=LayerType.MAMBA,
+               allreduce_strategy=sharding_config.allreduce_strategy,
            )
        )
        num_row_col_shards += 1
@@ -640,6 +646,7 @@ def detect_sharding_from_factory_config(
                world_size=world_size,
                dist_op=None,
                min_local_shape=min_local_shape,
+               allreduce_strategy=sharding_config.allreduce_strategy,
            )
        )
    elif col_row_action == "rowwise":
@@ -671,6 +678,7 @@ def detect_sharding_from_factory_config(
                world_size=world_size,
                dist_op="all_gather",
                min_local_shape=1,
+               allreduce_strategy=sharding_config.allreduce_strategy,
            )
        )
        num_simple_shards += 1
@@ -686,6 +694,7 @@ def detect_sharding_from_factory_config(
                world_size=world_size,
                dist_op="all_gather",
                min_local_shape=1,
+               allreduce_strategy=sharding_config.allreduce_strategy,
            )
        )
        # after successful match, break the loop
@@ -1085,6 +1094,7 @@ def detect_ep_shard(gm: GraphModule, sharding_config: ShardingConfig) -> Transfo
                node,
                rank=rank,
                world_size=world_size,
+               allreduce_strategy=sharding_config.allreduce_strategy,
            )
        )
        num_moe_patterns += 1
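Note: every site that constructs a sharding transform now forwards sharding_config.allreduce_strategy explicitly, since the field no longer has a default (see sharding_utils.py below). A stripped-down sketch of that threading, with class and field names simplified from the real ShardingConfig / ShardingTransformInfo:

from dataclasses import dataclass
from enum import Enum


class AllReduceStrategy(Enum):
    # Illustrative members; the real enum comes from TensorRT-LLM.
    AUTO = "AUTO"
    NCCL = "NCCL"


@dataclass
class ShardingConfigSketch:
    allreduce_strategy: AllReduceStrategy


@dataclass
class ShardInfoSketch:
    target_node: str
    rank: int
    world_size: int
    allreduce_strategy: AllReduceStrategy  # required: no default


def make_shard_info(cfg: ShardingConfigSketch, node: str, rank: int, world_size: int) -> ShardInfoSketch:
    # The strategy always flows from the single place it is configured.
    return ShardInfoSketch(
        target_node=node,
        rank=rank,
        world_size=world_size,
        allreduce_strategy=cfg.allreduce_strategy,
    )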

tensorrt_llm/_torch/auto_deploy/utils/sharding_utils.py

Lines changed: 8 additions & 6 deletions
@@ -566,7 +566,7 @@ class ShardingTransformInfo(BaseModel, ABC):
     target_node: str
     rank: int
     world_size: int
-    allreduce_strategy: AllReduceStrategy = AllReduceStrategy.AUTO
+    allreduce_strategy: AllReduceStrategy  # REQUIRED: must be explicitly passed

     @field_validator("allreduce_strategy", mode="before")
     @classmethod
@@ -696,6 +696,8 @@ class ParameterUpdateInfo(ShardingTransformInfo):
     rank: int
     world_size: int
     args: tuple
+    # ParameterUpdateInfo doesn't insert distributed ops, so strategy doesn't matter
+    allreduce_strategy: AllReduceStrategy = AllReduceStrategy.AUTO

     def validate(self, gm: GraphModule = None, node: Node = None) -> bool:
         """Validate the transformation configuration."""
@@ -984,8 +986,8 @@ def _insert_sharded_moe(
     node: Node,
     rank: int,
     world_size: int,
+    allreduce_strategy: AllReduceStrategy,  # REQUIRED: must be explicitly passed
     scale_names: Sequence[str] = (),
-    allreduce_strategy: AllReduceStrategy = AllReduceStrategy.AUTO,
 ):
     """Update the torch_moe node with sharded weight lists,
     sharded `selected_experts` and `final_scales(router_logics)`.
@@ -1091,7 +1093,7 @@ def _insert_sharded_mxfp4_mlp_ep(
     node: Node,
     rank: int,
     world_size: int,
-    allreduce_strategy: AllReduceStrategy = AllReduceStrategy.AUTO,
+    allreduce_strategy: AllReduceStrategy,  # REQUIRED: must be explicitly passed
 ):
     """
     Transform a call to auto_deploy::triton_mxfp4_moe into:
@@ -1165,7 +1167,7 @@ def validate(self, gm: GraphModule = None, node: Node = None) -> bool:

     def apply(self, gm: GraphModule, node: Node) -> None:
         """Apply EP sharding transformation to the graph module."""
-        _insert_sharded_moe(gm, node, self.rank, self.world_size, [], self.allreduce_strategy)
+        _insert_sharded_moe(gm, node, self.rank, self.world_size, self.allreduce_strategy, [])


 class MXFP4EPShardingInfo(EPShardingInfo):
@@ -1196,7 +1198,7 @@ def scale_names(self) -> List[str]:

     def apply(self, gm: GraphModule, node: Node) -> None:
         _insert_sharded_moe(
-            gm, node, self.rank, self.world_size, self.scale_names(), self.allreduce_strategy
+            gm, node, self.rank, self.world_size, self.allreduce_strategy, self.scale_names()
         )


@@ -1214,7 +1216,7 @@ def scale_names(self) -> List[str]:

     def apply(self, gm: GraphModule, node: Node) -> None:
         _insert_sharded_moe(
-            gm, node, self.rank, self.world_size, self.scale_names(), self.allreduce_strategy
+            gm, node, self.rank, self.world_size, self.allreduce_strategy, self.scale_names()
         )