
Commit 815aaf3

Rounding up workspace size according to allocation (page size).
1 parent 45a5b82 commit 815aaf3
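
In short: the workspace request is rounded up to the driver's allocation granularity, and the sizes used afterwards are derived from what was actually allocated rather than from the request. Below is a minimal sketch of the round-up arithmetic only, assuming a hypothetical 2 MiB granularity; in the actual code the value comes from cuMemGetAllocationGranularity.

def round_up_to_granularity(requested_bytes: int, granularity: int) -> int:
    # Round a requested size up to a whole number of allocation units.
    return ((requested_bytes + granularity - 1) // granularity) * granularity

# Illustration with an assumed 2 MiB granularity: an exact multiple is kept,
# anything else is bumped to the next multiple.
assert round_up_to_granularity(500 * 1024**2, 2 * 1024**2) == 500 * 1024**2
assert round_up_to_granularity(500 * 1024**2 + 1, 2 * 1024**2) == 502 * 1024**2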

File tree

2 files changed: +37 -12 lines changed


flashinfer/comm/mnnvl.py

Lines changed: 14 additions & 5 deletions
@@ -803,6 +803,14 @@ def get_world_size(self) -> int:
         """Get the total number of devices in the group"""
         return self.group_size
 
+    def get_allocation_size(self) -> int:
+        """Get the total allocation size (including signal pad)"""
+        return self.allocation_size
+
+    def get_usable_buffer_size(self) -> int:
+        """Get the usable buffer size (excluding signal pad)"""
+        return self.allocation_size - self.SIGNAL_PAD_SIZE
+
     def _init_ipc_socket(self):
         if self.group_rank == 0:
             # Gnerate the opId
@@ -838,7 +846,7 @@ def _alloc_mn_mcast_mem(self, buf_size: int):
         alloc_granularity = checkCudaErrors(
             cuda.cuMemGetAllocationGranularity(
                 allocation_prop,
-                cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_MINIMUM,
+                cuda.CUmemAllocationGranularity_flags.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED,
             )
         )
 
@@ -1015,8 +1023,8 @@ def lamport_initialize(self, rank: int, dtype: torch.dtype):
         else:
             raise ValueError(f"Unsupported dtype: {dtype}")
 
-        # Calculate number of elements that fit in allocation_size
-        num_elements = self.allocation_size // dsize
+        # Calculate number of elements that fit in allocation_size; We don't want to include the signal pad.
+        num_elements = (self.allocation_size - self.SIGNAL_PAD_SIZE) // dsize
 
         checkCudaErrors(memset_func(int(self.uc_ptrs[self.group_rank]), neg_zero, num_elements))
 
@@ -1042,7 +1050,7 @@ def __init__(
         Constructor for McastGpuBuffer.
 
         Args:
-            buf_size: The total size of the buffer in bytes
+            buf_size: The requested size of the buffer in bytes. The actual usable size may differ due to alignment requirements.
             group_size: The number of ranks in the communication group
             group_rank: The rank of the local process within the group
             device: The CUDA device for buffer allocation
@@ -1061,7 +1069,8 @@ def __init__(
             mn_nvlink,
             comm_backend_for_handle_transfer,
         )
-        self.buf_size = buf_size
+        # Update buf_size to reflect the actual usable buffer size after allocation
+        self.buf_size = self.mcast_device_memory.get_usable_buffer_size()
         self.local_device = device
 
     def lamport_initialize(self, rank: int, dtype: torch.dtype):
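
A minimal sketch of the size bookkeeping behind the two new accessors, under the assumption that the signal pad shares the allocation with the buffer and that the total is rounded up to the granularity. SIGNAL_PAD_SIZE and GRANULARITY below are placeholder values, not the ones used by McastDeviceMemory.

SIGNAL_PAD_SIZE = 16 * 1024      # placeholder signal-pad size
GRANULARITY = 2 * 1024**2        # placeholder recommended granularity

def plan_allocation(requested_buf_size: int) -> tuple[int, int]:
    """Return (allocation_size, usable_buffer_size) for a requested buffer."""
    # Assumed layout: the signal pad lives inside the same allocation as the
    # buffer, and the total is rounded up to the allocation granularity.
    total = requested_buf_size + SIGNAL_PAD_SIZE
    allocation_size = ((total + GRANULARITY - 1) // GRANULARITY) * GRANULARITY
    # Mirrors get_usable_buffer_size(): everything except the signal pad.
    usable_buffer_size = allocation_size - SIGNAL_PAD_SIZE
    return allocation_size, usable_buffer_size

alloc, usable = plan_allocation(500 * 1024**2)
assert usable == alloc - SIGNAL_PAD_SIZE

Per the diff above, McastGpuBuffer.buf_size is now set from get_usable_buffer_size(), so callers see the post-rounding, pad-excluded capacity rather than the original request.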

flashinfer/comm/trtllm_mnnvl_ar.py

Lines changed: 23 additions & 7 deletions
@@ -65,11 +65,11 @@ def __init__(
 
         Args:
             mapping: Mapping configuration containing rank info
-            buffer_size_in_bytes: The size in bytes for each lamport buffer. The actual allocation size will be NUM_LAMPORT_BUFFERS * buffer_size_in_bytes.
+            buffer_size_in_bytes: The requested size in bytes for each lamport buffer. The actual allocation size may be larger due to alignment requirements. The actual usable size will be NUM_LAMPORT_BUFFERS * actual_buffer_size_per_lamport_buffer.
         """
         if buffer_size_in_bytes is None:
-            # Default to 512MB workspace size if not provided
-            buffer_size_in_bytes = 512 * (1024**2)
+            # Default to 16MB workspace size if not provided
+            buffer_size_in_bytes = 16 * (1024**2)
         else:
             # Round up to the nearest multiple of 8MB
             buffer_size_in_bytes = math.ceil(buffer_size_in_bytes / (8 * (1024**2))) * (8 * (1024**2))
@@ -80,22 +80,38 @@ def __init__(
                 f"The buffer size in bytes {buffer_size_in_bytes} is greater than the maximum supported size (UINT32_MAX)."
             )
 
-        self.buffer_size_bytes = buffer_size_in_bytes
-        self.workspace_size_bytes = buffer_size_in_bytes * self.NUM_LAMPORT_BUFFERS
+        # Calculate total requested workspace size
+        requested_workspace_size = buffer_size_in_bytes * self.NUM_LAMPORT_BUFFERS
+
         self.rank = mapping.tp_rank
         self.tp_size = mapping.tp_size
         logging.debug(
-            f"[MNNVL Allreduce] TP size: {mapping.tp_size}, rank: {mapping.tp_rank}, Allocating workspace with size {buffer_size_in_bytes} bytes."
+            f"[MNNVL Allreduce] TP size: {mapping.tp_size}, rank: {mapping.tp_rank}, Allocating workspace with requested size {buffer_size_in_bytes} bytes per buffer."
         )
+
+        # Allocate the workspace
         self.mcast_buffer_handle = McastGPUBuffer(
-            self.workspace_size_bytes,
+            requested_workspace_size,
             mapping.tp_size,
             mapping.tp_rank,
            torch.device("cuda", mapping.local_rank),
             mapping.is_multi_node(),
             comm_backend,
         )
 
+        # Get the actual usable buffer size after allocation (buf_size is updated by McastGPUBuffer)
+        allocated_size = self.mcast_buffer_handle.buf_size
+        # We want the buffer size to be aligned to 16B which is the granularity for buffer management.
+        self.buffer_size_bytes = (
+            math.floor(allocated_size / self.NUM_LAMPORT_BUFFERS) // 16 * 16
+        )
+        # This workspace size is used for checking the buffer. We need to set it to the actual size in use. The buffer free logic does not rely on this size.
+        self.workspace_size_bytes = self.buffer_size_bytes * self.NUM_LAMPORT_BUFFERS
+
+        logging.debug(
+            f"[MNNVL Allreduce] Actual allocated size: {allocated_size} bytes, Actual buffer size per lamport buffer: {self.buffer_size_bytes} bytes, total workspace: {self.workspace_size_bytes} bytes."
+        )
+
         # We use FP32 for sentinel value regardless of the real dtype
         self.mcast_buffer_handle.lamport_initialize(mapping.tp_rank, torch.float32)
         # Wait until the initialization is done
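
The new sizing logic, restated: the allocation can be larger than the request, so the per-buffer size is recomputed from the actual allocation (McastGPUBuffer.buf_size) and rounded down to a 16-byte boundary before the workspace size is rebuilt from it. A standalone sketch of that arithmetic; NUM_LAMPORT_BUFFERS = 3 is assumed here for illustration only.

import math

NUM_LAMPORT_BUFFERS = 3  # assumed value for illustration

def split_workspace(allocated_size: int) -> tuple[int, int]:
    """Derive (buffer_size_bytes, workspace_size_bytes) from the actual allocation."""
    # Split the allocation evenly across the lamport buffers, then round each
    # buffer down to 16 bytes, the granularity used for buffer management.
    buffer_size_bytes = math.floor(allocated_size / NUM_LAMPORT_BUFFERS) // 16 * 16
    # The tracked workspace size is the part actually in use; it can be
    # slightly smaller than the allocation because of the round-down.
    workspace_size_bytes = buffer_size_bytes * NUM_LAMPORT_BUFFERS
    return buffer_size_bytes, workspace_size_bytes

# Example: a 48 MiB + 4 KiB allocation splits into three 16-byte-aligned
# buffers whose combined size never exceeds the allocation.
buf, ws = split_workspace(48 * 1024**2 + 4 * 1024)
assert ws <= 48 * 1024**2 + 4 * 1024 and buf % 16 == 0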
