Commit a5a6c33

Commit message: moved to using a global config variable; the transform now sets it

Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com>

1 parent: 94aa93d

4 files changed: +69 additions, -17 deletions


tensorrt_llm/_torch/auto_deploy/config/default.yaml

Lines changed: 1 addition & 0 deletions
@@ -80,6 +80,7 @@ transforms:
     sharding_source: ['heuristic']
     support_partial_config: true
     sharding_dims: ['tp', 'ep', 'bmm']
+    allreduce_strategy: 'AUTO'
     requires_shape_prop: true
   # TODO: (hg) need to ensure run_shape_prop after sharding.
   sharding_transform_executor:

tensorrt_llm/_torch/auto_deploy/distributed/trtllm.py

Lines changed: 23 additions & 5 deletions
@@ -13,21 +13,36 @@
     # warmup causes hangs due to workspace allocation with CPU synchronization
     _allreduce_cache = {}

-    # Global allreduce strategy configuration
-    # Can be set via set_allreduce_strategy() to override the default AUTO strategy
+    # Global AllReduce Strategy Configuration
+    # =========================================
+    # This global variable controls which allreduce implementation is used across
+    # all distributed operations in AutoDeploy. It's set once at initialization
+    # time via set_allreduce_strategy() and remains constant during execution.
     _global_allreduce_strategy = AllReduceStrategy.AUTO

     def set_allreduce_strategy(strategy: AllReduceStrategy):
-        """Set the global allreduce strategy for distributed operations.
+        """Set the global allreduce strategy for all distributed operations.

-        Args:
-            strategy: AllReduceStrategy enum value (AUTO, NCCL, ONESHOT, TWOSHOT, etc.)
+        This should be called once during initialization, before any distributed
+        operations are executed. All subsequent allreduce calls will use this strategy.
+
+        Note:
+            This clears the allreduce cache to ensure new operations use the updated strategy.
+            Call this before any model compilation or CUDA graph capture.
         """
         global _global_allreduce_strategy
         _global_allreduce_strategy = strategy
         # Clear cache when strategy changes to force recreation with new strategy
         _allreduce_cache.clear()

+    def get_allreduce_strategy() -> AllReduceStrategy:
+        """Get the current global allreduce strategy.
+
+        Returns:
+            The currently configured AllReduceStrategy enum value.
+        """
+        return _global_allreduce_strategy
+
     def trtllm_allgather(tensor, dim, sizes=None):
         rank, world_size = get_rank_world_size()
         p_config = Mapping(world_size=world_size, tp_size=world_size, rank=rank)

@@ -77,6 +92,9 @@ def fused_allreduce_residual_rmsnorm_fake(
     def set_allreduce_strategy(strategy):
         raise ImportError("TRT-LLM is not available.")

+    def get_allreduce_strategy():
+        raise ImportError("TRT-LLM is not available.")
+
     def trtllm_allgather(tensor, dim, sizes=None):
         raise ImportError("TRT-LLM is not available.")
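
With this change the strategy is read through one module-level global instead of being passed around. A minimal usage sketch of the new setter/getter pair, not part of the commit, assuming TRT-LLM is installed so the real implementations are imported rather than the ImportError stubs (the absolute import path simply mirrors the file location):

    from tensorrt_llm.functional import AllReduceStrategy
    from tensorrt_llm._torch.auto_deploy.distributed.trtllm import (
        get_allreduce_strategy,
        set_allreduce_strategy,
    )

    # Module default before anything is configured.
    assert get_allreduce_strategy() == AllReduceStrategy.AUTO

    # Switching strategies also clears _allreduce_cache, so any allreduce op
    # created afterwards is rebuilt with the new strategy. Per the docstring,
    # do this before model compilation / CUDA graph capture.
    set_allreduce_strategy(AllReduceStrategy.NCCL)
    assert get_allreduce_strategy() == AllReduceStrategy.NCCL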

tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 0 additions & 11 deletions
@@ -325,17 +325,6 @@ def create_autodeploy_executor(ad_config: LlmArgs, tokenizer: Optional[Tokenizer
     port = mpi_dist.broadcast(dist.get_free_port())  # use MPI broadcast to pick a free port
     dist.initialize_or_skip(rank, world_size, port)

-    # Configure allreduce strategy if specified
-    if hasattr(ad_config, "allreduce_strategy") and ad_config.allreduce_strategy != "AUTO":
-        from tensorrt_llm.functional import AllReduceStrategy
-
-        from ..distributed.trtllm import TRTLLM_OP_AVAILABLE, set_allreduce_strategy
-
-        if TRTLLM_OP_AVAILABLE:
-            strategy = getattr(AllReduceStrategy, ad_config.allreduce_strategy)
-            set_allreduce_strategy(strategy)
-            ad_logger.info(f"Using allreduce strategy: {ad_config.allreduce_strategy}")
-
     # some config
     assert ad_config.max_beam_width <= 1, "_autodeploy + beam_search is not supported"

tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py

Lines changed: 45 additions & 1 deletion
@@ -22,9 +22,10 @@
 from typing import DefaultDict, Dict, List, Set, Tuple, Type

 import torch
-from pydantic import Field
+from pydantic import Field, field_validator
 from torch.fx import GraphModule, Node

+from .....functional import AllReduceStrategy
 from ...models.factory import ModelFactory, ShardingConfigSource
 from ...shim.interface import CachedSequenceInterface
 from ...utils.logger import ad_logger
@@ -149,6 +150,32 @@ class ShardingTransformConfig(TransformConfig):
     sharding_dims: List[ShardingDim] = Field(
         default_factory=lambda: [ShardingDim.SSM, ShardingDim.TP, ShardingDim.EP, ShardingDim.BMM]
     )
+    allreduce_strategy: AllReduceStrategy = Field(
+        default=AllReduceStrategy.AUTO,
+        description="AllReduce strategy for distributed operations. Options: AUTO (automatic selection), "
+        "NCCL (NCCL-based), ONESHOT (single-phase fusion kernel), TWOSHOT (two-phase fusion kernel), "
+        "MIN_LATENCY (minimum latency heuristic), LOWPRECISION (low precision allreduce), "
+        "UB (unified buffer), MNNVL (multi-node NVLINK), NCCL_SYMMETRIC (NCCL symmetric). "
+        "This is set as a global variable during transform application.",
+    )
+
+    @field_validator("allreduce_strategy", mode="before")
+    @classmethod
+    def _validate_allreduce_strategy(cls, v):
+        """Convert string names like 'AUTO' or 'ONESHOT' to AllReduceStrategy enum."""
+        if isinstance(v, AllReduceStrategy):
+            return v
+        if isinstance(v, str):
+            try:
+                return AllReduceStrategy[v]
+            except KeyError:
+                raise ValueError(
+                    f"Invalid allreduce strategy: {v}. "
+                    f"Valid options: {', '.join(s.name for s in AllReduceStrategy)}"
+                )
+        if isinstance(v, int):
+            return AllReduceStrategy(v)
+        return v


 @TransformRegistry.register("detect_sharding")
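
The mode="before" validator normalizes whatever the config layer passes in. A rough, illustrative sketch of the expected behavior, not part of the commit; it assumes the remaining ShardingTransformConfig / TransformConfig fields all have defaults and that the import path mirrors the file location:

    from pydantic import ValidationError

    from tensorrt_llm.functional import AllReduceStrategy
    from tensorrt_llm._torch.auto_deploy.transform.library.sharding import ShardingTransformConfig

    # YAML-style strings are resolved by enum name; enum members pass through unchanged.
    cfg = ShardingTransformConfig(allreduce_strategy="ONESHOT")
    assert cfg.allreduce_strategy == AllReduceStrategy.ONESHOT

    cfg = ShardingTransformConfig(allreduce_strategy=AllReduceStrategy.NCCL)
    assert cfg.allreduce_strategy == AllReduceStrategy.NCCL

    # Unknown names are rejected with a message listing the valid options.
    try:
        ShardingTransformConfig(allreduce_strategy="FASTEST")
    except ValidationError as err:
        print(err)
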
@@ -186,6 +213,23 @@ def _apply(
         local_rank, world_size = shared_config.local_rank, shared_config.world_size
         # world_size = 2

+        # Configure global allreduce strategy from transform config
+        # This is set once during sharding transform and used by all distributed operations
+        if hasattr(self.config, "allreduce_strategy"):
+            try:
+                from ...distributed.trtllm import TRTLLM_OP_AVAILABLE, set_allreduce_strategy
+
+                if TRTLLM_OP_AVAILABLE:
+                    # config.allreduce_strategy is already an AllReduceStrategy enum
+                    set_allreduce_strategy(self.config.allreduce_strategy)
+                    if self.config.allreduce_strategy != AllReduceStrategy.AUTO:
+                        ad_logger.info(
+                            f"Global allreduce strategy configured from transform: "
+                            f"{self.config.allreduce_strategy.name}"
+                        )
+            except (ImportError, AttributeError) as e:
+                ad_logger.warning(f"Failed to set allreduce strategy: {e}")
+
         if world_size < 2:
             ad_logger.info("Skipping sharding for single device")
             return gm, TransformInfo(
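
End to end, the value now flows from the YAML string through the validated transform config into the process-wide global that the distributed ops read. A condensed sketch of what _apply does above, reduced to the essentials; the helper name and strategy value here are only illustrative:

    from tensorrt_llm.functional import AllReduceStrategy
    from tensorrt_llm._torch.auto_deploy.distributed.trtllm import (
        TRTLLM_OP_AVAILABLE,
        get_allreduce_strategy,
        set_allreduce_strategy,
    )

    def configure_allreduce_strategy(strategy_name: str) -> None:
        """Condensed transform-side logic: resolve the name, publish the global."""
        if not TRTLLM_OP_AVAILABLE:
            return  # no TRT-LLM ops available; the transform skips configuration in this case
        set_allreduce_strategy(AllReduceStrategy[strategy_name])

    configure_allreduce_strategy("TWOSHOT")  # e.g. the value read from default.yaml
    assert get_allreduce_strategy() == AllReduceStrategy.TWOSHOT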
