
Commit 775918d

Address review comments.
1 parent: 01564e9

4 files changed: +36, -10 lines


csrc/trtllm_mnnvl_allreduce.cu

Lines changed: 9 additions & 2 deletions
@@ -53,11 +53,18 @@ void trtllm_mnnvl_allreduce_fusion(TensorView input, int64_t multicast_buffer_pt
       << "nranks must be between 2 and 64, got " << nranks;
   TVM_FFI_ICHECK(rank >= 0 && rank < nranks)
       << "rank must be between 0 and nranks-1, got " << rank;
-  TVM_FFI_ICHECK((residual_out.has_value() && gamma.has_value() && epsilon.has_value()) ||
+  TVM_FFI_ICHECK((residual_in.has_value() && residual_out.has_value() && gamma.has_value() &&
+                  epsilon.has_value()) ||
                  !rmsnorm_fusion)
-      << "residual_out, gamma, and epsilon must be provided if rmsnorm_fusion is true";
+      << "residual_in, residual_out, gamma, and epsilon must be provided if rmsnorm_fusion is "
+         "true";

   if (rmsnorm_fusion) {
+    TVM_FFI_ICHECK(residual_in.value().size(0) == num_tokens &&
+                   residual_in.value().size(1) == token_dim)
+        << "residual_in shape mismatch: expected (" << input.size(0) << ", " << input.size(1)
+        << ") but got (" << residual_in.value().size(0) << ", " << residual_in.value().size(1)
+        << ")";
     TVM_FFI_ICHECK(residual_out.value().size(0) == num_tokens &&
                    residual_out.value().size(1) == token_dim)
         << "residual_out shape mismatch: expected (" << input.size(0) << ", " << input.size(1)

flashinfer/comm/mnnvl.py

Lines changed: 16 additions & 2 deletions
@@ -716,6 +716,9 @@ def __del__(self):
         if not hasattr(self, "is_multi_node"):
             return

+        if hasattr(self, "_ipc_socket"):
+            self._ipc_socket.close()
+
         # Skip cleanup during Python finalization to avoid segfaults
         # Especially cause the CUDA context could be destroyed at this point.
         if sys.is_finalizing():
@@ -864,7 +867,7 @@ def _alloc_mn_mcast_mem(self, buf_size: int):
         # Allocate local GPU memory
         self.uc_handles[self.group_rank] = checkCudaErrors(cuda.cuMemCreate(self.allocation_size, allocation_prop, 0))

-        # Export local handle to fabric handle
+        # Export local handle to fabric handle or FD
         local_shareable_uc_handle = checkCudaErrors(
             cuda.cuMemExportToShareableHandle(
                 self.uc_handles[self.group_rank],
@@ -898,6 +901,12 @@ def _alloc_mn_mcast_mem(self, buf_size: int):
                         self._shareable_handle_type,
                     )
                 )
+                if (
+                    self._shareable_handle_type
+                    == cuda.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
+                ):
+                    # Close FD after import
+                    os.close(all_shareable_uc_handles[p])

         # Initialize multicasting
         if self.group_rank == 0:
@@ -943,7 +952,12 @@ def _alloc_mn_mcast_mem(self, buf_size: int):
                     self._shareable_handle_type,
                 )
             )
-
+            if (
+                self._shareable_handle_type
+                == cuda.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
+            ):
+                # Close FD after import
+                os.close(shareable_mc_handle)
             # Add device to multicast
             checkCudaErrors(cuda.cuMulticastAddDevice(self.mc_handle, self.device_idx))

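
When the shareable handle type is a POSIX file descriptor, the FD delivered over the IPC socket stays open after the import call, so each exchange would leak one descriptor per peer without the explicit os.close calls above. A rough sketch of the same lifecycle using plain Python FD passing (Unix only, Python 3.9+); os.dup stands in for cuMemImportFromShareableHandle and is not what mnnvl.py actually calls:

import os
import socket

# Two ends of a Unix-domain socket; in mnnvl.py this role is played by the
# IPC socket used to ship shareable handles between ranks.
sender, receiver = socket.socketpair(socket.AF_UNIX, socket.SOCK_STREAM)

# "Export" side: share an FD with the peer, then drop the local copy.
fd = os.open("/dev/null", os.O_WRONLY)
socket.send_fds(sender, [b"uc_handle"], [fd])
os.close(fd)

# "Import" side: the kernel delivers a fresh descriptor. Importing it (os.dup
# here, as a stand-in for the CUDA import) leaves the received FD open, so it
# must be closed explicitly or every exchange leaks one descriptor.
msg, fds, _flags, _addr = socket.recv_fds(receiver, 1024, 1)
imported = os.dup(fds[0])  # the "imported handle" owns its own descriptor
os.close(fds[0])           # close the received FD after import

os.close(imported)
sender.close()
receiver.close()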

include/flashinfer/utils.cuh

Lines changed: 9 additions & 4 deletions
@@ -21,6 +21,7 @@
 #include <cuda_fp8.h>
 #include <cuda_runtime.h>

+#include <atomic>
 #include <cstdint>
 #include <iostream>
 #include <type_traits>
@@ -335,16 +336,20 @@ inline std::pair<int, int> GetCudaComputeCapability() {
   return std::make_pair(major, minor);
 }

+// This function is thread-safe and caches the sm_count.
+// It only checks the current CUDA device, thus assuming each process handles a single GPU.
 inline int GetCudaMultiProcessorCount() {
-  static int sm_count = 0;
-  if (sm_count == 0) {
+  static std::atomic<int> sm_count{0};
+  int cached = sm_count.load(std::memory_order_relaxed);
+  if (cached == 0) {
     int device_id;
     cudaGetDevice(&device_id);
     cudaDeviceProp device_prop;
     cudaGetDeviceProperties(&device_prop, device_id);
-    sm_count = device_prop.multiProcessorCount;
+    cached = device_prop.multiProcessorCount;
+    sm_count.store(cached, std::memory_order_relaxed);
   }
-  return sm_count;
+  return cached;
 }

 template <typename T>
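
The utils.cuh change replaces a plain static int, which concurrent first calls could race on, with a relaxed std::atomic<int>; the device query may still run a few times, but every thread sees a well-defined value. A rough Python analogue of the cached per-process lookup, assuming PyTorch with CUDA available (the attribute names are PyTorch's, not flashinfer's):

import functools

import torch


@functools.lru_cache(maxsize=None)
def cuda_multi_processor_count() -> int:
    # Queried once per process and cached, mirroring GetCudaMultiProcessorCount();
    # like the C++ helper, this assumes the process only touches the current GPU.
    device_id = torch.cuda.current_device()
    return torch.cuda.get_device_properties(device_id).multi_processor_count


print(cuda_multi_processor_count())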

tests/comm/test_trtllm_mnnvl_allreduce.py

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 # Check torch version:
 import traceback
-from typing import Tuple
+from typing import Tuple, Optional

 import pytest
 import torch
@@ -286,7 +286,7 @@ def run_mnnvl_ar_full(
     fusion: bool,
     dtype: torch.dtype,
     hidden_size: int,
-    legacy_explicit_workspace_bytes: int = None,
+    legacy_explicit_workspace_bytes: Optional[int] = None,
     legacy_api: bool = False,
 ):
     """Core test logic for MNNVL AllReduce operations.
