Commit 3175f33

Add custom communicator for trtllm_mnnvl_ar
1 parent f25929f commit 3175f33

File tree

3 files changed (+360 −8 lines changed)

flashinfer/comm/mnnvl.py

Lines changed: 16 additions & 5 deletions
@@ -547,14 +547,15 @@ def supports_mnnvl() -> bool:
 
 class McastDeviceMemory:
     """Python port of McastDeviceMemory from TensorRT-LLM"""
-
+    # config: Optional[MnnvlConfig] = None
     def __init__(
         self,
         buf_size: int,
         group_size: int,
         group_rank: int,
         device_idx: int,
         is_multi_node: bool = True,
+        comm: Optional[CommBackend] = None,
     ):
         cu_device = checkCudaErrors(cuda.cuDeviceGet(device_idx))
 
@@ -631,7 +632,7 @@ def __init__(
                 "[McastDeviceMemory] Device does not support fabric handle."
             )
 
-            self._alloc_mn_mcast_mem(buf_size)
+            self._alloc_mn_mcast_mem(buf_size, comm)
         else:
             # For single-node NVLS, would need to implement _alloc_nvls_mcast_mem
             raise NotImplementedError("Single-node NVLS allocation not implemented yet")
@@ -649,6 +650,7 @@ def __init__(
         self.signal_pads_dev = alloc_and_copy_to_cuda(self.signal_pads)
         self.uc_ptrs_dev = alloc_and_copy_to_cuda(self.uc_ptrs)
 
+
     def __del__(self):
         """Destructor - cleanup allocated memory"""
 
@@ -753,7 +755,7 @@ def get_world_size(self) -> int:
         """Get the total number of devices in the group"""
         return self.group_size
 
-    def _alloc_mn_mcast_mem(self, buf_size: int):
+    def _alloc_mn_mcast_mem(self, buf_size: int, comm: Any = MpiComm()):
         """Allocate multi-node multicast memory using MNNVL"""
 
         # Verify CUDA context
@@ -768,7 +770,12 @@ def _alloc_mn_mcast_mem(self, buf_size: int):
             print(f"Error checking CUDA context: {e}")
 
         # Get MPI communicator
-        comm = MpiComm()
+        # comm = MpiComm()
+        # comm = McastDeviceMemory.get_comm()
+        # if config:
+        #     comm = config.comm_backend
+        # else:
+        #     comm = MpiComm()
 
         # Set up allocation properties
         handle_type = cuda.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_FABRIC
@@ -831,6 +838,9 @@ def _alloc_mn_mcast_mem(self, buf_size: int):
         )
 
         # All-gather fabric handles
+        print(my_fabric_handle.data)
+        print(type(my_fabric_handle.data))
+        # all_fabric_handles = [my_fabric_handle.data] * 4
         all_fabric_handles = comm.allgather(my_fabric_handle.data)
         cuda.cuCtxSynchronize()
 
@@ -969,6 +979,7 @@ def __init__(
         group_rank: int,
         device: torch.device,
         mn_nvlink: bool = True,
+        comm: Optional[CommBackend] = None,
     ):
         """
         Constructor for McastGpuBuffer.
@@ -981,7 +992,7 @@ def __init__(
             mn_nvlink: Flag indicating if multi-node NVLink is used
         """
         self.mcast_device_memory = McastDeviceMemory(
-            buf_size, group_size, group_rank, device.index, mn_nvlink
+            buf_size, group_size, group_rank, device.index, mn_nvlink, comm
         )
         self.buf_size = buf_size
         self.local_device = device
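
The hunks above only exercise two methods on the new comm object: allgather, used to exchange the CU_MEM_HANDLE_TYPE_FABRIC handles across ranks, and (in trtllm_mnnvl_ar.py below) barrier. The CommBackend type referenced in the annotations is not defined in this diff, so the following is only a sketch of a duck-typed backend that would satisfy those two calls; the class name and the torch.distributed-based implementation are assumptions, not part of the commit.

# Illustrative sketch only: a minimal communicator exposing the calls this
# commit makes on `comm`. The real CommBackend interface in
# flashinfer/comm/mnnvl.py may require more than this.
from typing import Any, List, Optional

import torch.distributed as dist


class TorchDistCommBackend:
    """Hypothetical backend built on an already-initialized torch.distributed group."""

    def __init__(self, group: Optional[dist.ProcessGroup] = None):
        self.group = group  # None -> default world group

    def allgather(self, obj: Any) -> List[Any]:
        # Collect one picklable object (e.g. a fabric handle) from every rank.
        gathered: List[Any] = [None] * dist.get_world_size(self.group)
        dist.all_gather_object(gathered, obj, group=self.group)
        return gathered

    def barrier(self) -> None:
        # CPU-side barrier across all ranks.
        dist.barrier(group=self.group)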

flashinfer/comm/trtllm_mnnvl_ar.py

Lines changed: 8 additions & 3 deletions
@@ -15,7 +15,7 @@
 
 from ..jit import gen_trtllm_mnnvl_comm_module
 from ..utils import register_custom_op
-from .mnnvl import McastGPUBuffer
+from .mnnvl import (McastGPUBuffer, CommBackend)
 
 
 def mpi_barrier():
@@ -122,7 +122,8 @@ def trtllm_mnnvl_rmsnorm(
 
 
 def get_allreduce_mnnvl_workspace(
-    mapping: Mapping, dtype: torch.dtype
+    mapping: Mapping, dtype: torch.dtype,
+    comm: Optional[CommBackend] = None,
 ) -> Tuple[McastGPUBuffer, torch.Tensor, int]:
     """Get workspace buffers needed for multi-node NVLink all-reduce operation.
 
@@ -164,14 +165,18 @@ def get_allreduce_mnnvl_workspace(
         mapping.tp_rank,
         torch.device("cuda", mapping.local_rank),
         mapping.is_multi_node() or force_mn,
+        comm=comm,
     )
 
     # Initialize the unicast buffer with -0.0
     mcast_buffer.lamport_initialize(mapping.tp_rank, dtype)
 
     # CPU barrier since we assume this should not be called in cuda graph
     torch.cuda.synchronize()
-    mpi_barrier()
+    if comm:
+        comm.barrier()
+    else:
+        mpi_barrier()
 
     # This is a buffer to maintain the state of this allreduce Op
     # [Buffer_ptr, Clear_ptr, Buffer_size, num_tokens_prev, atomic access counter]
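
End to end, the new keyword lets get_allreduce_mnnvl_workspace run without MPI. A hypothetical call site is sketched below; the Mapping constructor arguments and the flashinfer.comm.mapping import path are assumptions (check the actual Mapping definition), and TorchDistCommBackend is the sketch from the previous section.

# Hypothetical wiring of a custom communicator into the workspace setup.
import torch
import torch.distributed as dist

from flashinfer.comm.mapping import Mapping  # import path assumed
from flashinfer.comm.trtllm_mnnvl_ar import get_allreduce_mnnvl_workspace

dist.init_process_group("nccl")  # one process per GPU
comm = TorchDistCommBackend()    # sketch class from above

# Mapping arguments are illustrative; consult the real Mapping signature.
mapping = Mapping(
    world_size=dist.get_world_size(),
    rank=dist.get_rank(),
    gpus_per_node=torch.cuda.device_count(),
    tp_size=dist.get_world_size(),
)

# comm=None keeps the original MPI path (MpiComm + mpi_barrier); passing a
# communicator routes the fabric-handle allgather and the final CPU barrier
# through comm.allgather / comm.barrier instead.
mcast_buffer, buffer_flags, max_num_elements = get_allreduce_mnnvl_workspace(
    mapping, torch.bfloat16, comm=comm
)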
