
Commit 73a225d

tianfengfrank authored and meta-codesync[bot] committed
enable all_gather_v support
Summary: tp_overlapping needs to work with the uneven_split support introduced in D84788079. To support that, we need all_gather_v in torchcomm:
- enable all_gather_v to support varying tensor sizes across the output tensor_list
- add both C++ and Python integration UTs

Reviewed By: d4l3k

Differential Revision: D85292529

fbshipit-source-id: 27b281de41121b7887e55248591881503979680a
1 parent f9a45be commit 73a225d

20 files changed (+452, -0 lines)

comms/torchcomms/TorchComm.cpp

Lines changed: 8 additions & 0 deletions
@@ -89,6 +89,14 @@ std::shared_ptr<TorchWork> TorchComm::all_gather(
   return impl_->all_gather(tensor_list, tensor, async_op, options);
 }
 
+std::shared_ptr<TorchWork> TorchComm::all_gather_v(
+    const std::vector<at::Tensor>& tensor_list,
+    const at::Tensor& tensor,
+    bool async_op,
+    const AllGatherOptions& options) {
+  return impl_->all_gather_v(tensor_list, tensor, async_op, options);
+}
+
 std::shared_ptr<TorchWork> TorchComm::all_gather_single(
     at::Tensor& output,
     const at::Tensor& input,

comms/torchcomms/TorchComm.hpp

Lines changed: 5 additions & 0 deletions
@@ -65,6 +65,11 @@ class TorchComm {
       const at::Tensor& tensor,
       bool async_op,
       const AllGatherOptions& options = {});
+  std::shared_ptr<TorchWork> all_gather_v(
+      const std::vector<at::Tensor>& tensor_list,
+      const at::Tensor& tensor,
+      bool async_op,
+      const AllGatherOptions& options = {});
   std::shared_ptr<TorchWork> all_gather_single(
       at::Tensor& output,
       const at::Tensor& input,

comms/torchcomms/TorchCommBackend.hpp

Lines changed: 5 additions & 0 deletions
@@ -78,6 +78,11 @@ class TorchCommBackend {
       const at::Tensor& tensor,
       bool async_op,
       const AllGatherOptions& options = {}) = 0;
+  virtual std::shared_ptr<TorchWork> all_gather_v(
+      const std::vector<at::Tensor>& tensor_list,
+      const at::Tensor& tensor,
+      bool async_op,
+      const AllGatherOptions& options = {}) = 0;
   virtual std::shared_ptr<TorchWork> all_gather_single(
       at::Tensor& output,
       const at::Tensor& input,

comms/torchcomms/TorchCommPy.cpp

Lines changed: 35 additions & 0 deletions
@@ -640,6 +640,41 @@ Output will be available on all ranks.
     tensor: the input tensor to share
     async_op: whether to perform the operation asynchronously
     hints: dictionary of string hints for backend-specific options
+    timeout: timeout for the operation
+)",
+          py::arg("tensor_list"),
+          py::arg("tensor"),
+          py::arg("async_op"),
+          py::arg("hints") = std::nullopt,
+          py::arg("timeout") = std::nullopt,
+          py::call_guard<py::gil_scoped_release>())
+      .def(
+          "all_gather_v",
+          [](TorchComm& self,
+             const std::vector<at::Tensor>& tensor_list,
+             const at::Tensor& tensor,
+             bool async_op,
+             std::optional<std::unordered_map<std::string, std::string>> hints,
+             std::optional<std::chrono::milliseconds> timeout) {
+            AllGatherOptions opts;
+            if (hints) {
+              opts.hints = *hints;
+            }
+            if (timeout) {
+              opts.timeout = *timeout;
+            }
+            return self.all_gather_v(tensor_list, tensor, async_op, opts);
+          },
+          R"(
+Gather a tensor from all ranks in the communicator, supporting variable tensor sizes per rank.
+
+Output will be available on all ranks.
+
+Args:
+    tensor_list: the list of tensors to gather into; the list is the same on all ranks, but tensor sizes may differ between indices.
+    tensor: the input tensor to share; size may differ per rank.
+    async_op: whether to perform the operation asynchronously
+    hints: dictionary of string hints for backend-specific options
     timeout: timeout for the operation
 )",
           py::arg("tensor_list"),

comms/torchcomms/_comms.pyi

Lines changed: 8 additions & 0 deletions
@@ -236,6 +236,14 @@ class TorchComm:
         hints: Dict[str, str] | None = None,
         timeout: timedelta | None = None,
     ) -> TorchWork: ...
+    def all_gather_v(
+        self,
+        tensor_list: List[Any],
+        tensor: Any,
+        async_op: bool,
+        hints: Dict[str, str] | None = None,
+        timeout: timedelta | None = None,
+    ) -> TorchWork: ...
     def all_gather_single(
         self,
         output: Any,
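The stub above also shows the optional keyword arguments. A variant of the same call passing them explicitly (the hint key is hypothetical; per the stub, timeout is a datetime.timedelta):

from datetime import timedelta

work = comm.all_gather_v(
    tensor_list,
    inp,
    async_op=False,
    hints={"example_backend_hint": "value"},  # hypothetical key; hints are backend-specific strings
    timeout=timedelta(seconds=30),
)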

comms/torchcomms/gloo/TorchCommGloo.cpp

Lines changed: 8 additions & 0 deletions
@@ -719,6 +719,14 @@ std::shared_ptr<TorchWork> TorchCommGloo::all_gather(
       async_op);
 }
 
+std::shared_ptr<TorchWork> TorchCommGloo::all_gather_v(
+    const std::vector<at::Tensor>& tensor_list,
+    const at::Tensor& tensor,
+    bool async_op,
+    const AllGatherOptions& options) {
+  throw std::runtime_error("all_gather_v is not supported in GLOO backend yet");
+}
+
 std::shared_ptr<TorchWork> TorchCommGloo::all_gather_single(
     at::Tensor& output,
     const at::Tensor& input,

comms/torchcomms/gloo/TorchCommGloo.hpp

Lines changed: 5 additions & 0 deletions
@@ -90,6 +90,11 @@ class TorchCommGloo : public TorchCommBackend,
       const at::Tensor& tensor,
       bool async_op,
       const AllGatherOptions& options = {}) override;
+  std::shared_ptr<TorchWork> all_gather_v(
+      const std::vector<at::Tensor>& tensor_list,
+      const at::Tensor& tensor,
+      bool async_op,
+      const AllGatherOptions& options = {}) override;
   std::shared_ptr<TorchWork> all_gather_single(
       at::Tensor& output,
       const at::Tensor& input,

comms/torchcomms/nccl/TorchCommNCCL.cpp

Lines changed: 8 additions & 0 deletions
@@ -691,6 +691,14 @@ std::shared_ptr<TorchWork> TorchCommNCCL::all_gather(
   return work;
 }
 
+std::shared_ptr<TorchWork> TorchCommNCCL::all_gather_v(
+    const std::vector<at::Tensor>& tensor_list,
+    const at::Tensor& tensor,
+    bool async_op,
+    const AllGatherOptions& options) {
+  throw std::runtime_error("all_gather_v is not supported in NCCL backend");
+}
+
 std::shared_ptr<TorchWork> TorchCommNCCL::all_gather_single(
     at::Tensor& output,
     const at::Tensor& input,

comms/torchcomms/nccl/TorchCommNCCL.hpp

Lines changed: 5 additions & 0 deletions
@@ -107,6 +107,11 @@ class TorchCommNCCL : public TorchCommBackend,
       const at::Tensor& tensor,
       bool async_op,
       const AllGatherOptions& options = {}) override;
+  std::shared_ptr<TorchWork> all_gather_v(
+      const std::vector<at::Tensor>& tensor_list,
+      const at::Tensor& tensor,
+      bool async_op,
+      const AllGatherOptions& options = {}) override;
   std::shared_ptr<TorchWork> all_gather_single(
       at::Tensor& output,
       const at::Tensor& input,

comms/torchcomms/ncclx/TorchCommNCCLX.cpp

Lines changed: 59 additions & 0 deletions
@@ -710,6 +710,65 @@ std::shared_ptr<TorchWork> TorchCommNCCLX::all_gather(
   return work;
 }
 
+std::shared_ptr<TorchWork> TorchCommNCCLX::all_gather_v(
+    const std::vector<at::Tensor>& tensor_list,
+    const at::Tensor& tensor,
+    bool async_op,
+    const AllGatherOptions& options) {
+  checkInitialized();
+  checkAndAbortIfTimedOutOrError();
+  if (tensor_list.size() != static_cast<size_t>(comm_size_)) {
+    throw std::runtime_error(
+        "tensor_list size must equal comm_size for all_gather");
+  }
+
+  // Ensure input tensor is contiguous
+  ensureTensorContiguous(tensor);
+
+  for (const auto& t : tensor_list) {
+    ensureTensorContiguous(t);
+  }
+  TorchCommTracingGuard tracingGuard(
+      name_, comm_size_, "all_gather_v", rank_, tensor_list, {tensor});
+
+  cudaStream_t stream = getOperationStream(async_op);
+  auto work = createWork(
+      stream, getOperationTimeout(options.timeout, options_.timeout), {tensor});
+
+  work->recordStart();
+
+  // Use multiple broadcast operations for all_gather_v
+  nccl_api_->groupStart();
+
+  for (int i = 0; i < comm_size_; ++i) {
+    // assign input/output tensors to support vector all_gather (all_gather_v)
+    // where unevenly sized inputs are gathered among participating ranks
+    auto& output = tensor_list[i];
+    auto& input = (i == rank_) ? tensor : output;
+    if (input.numel() != output.numel()) {
+      throw std::runtime_error(
+          "Output tensor size must equal input tensor size for all_gather_v");
+    }
+    nccl_api_->broadcast(
+        input.data_ptr(),
+        output.data_ptr(),
+        input.numel(),
+        getNcclDataType(output),
+        i,
+        nccl_comm_,
+        stream);
+  }
+
+  nccl_api_->groupEnd();
+
+  work->recordEnd();
+
+  // Enqueue the work after events have been recorded
+  enqueueWork(work, stream);
+
+  return work;
+}
+
 std::shared_ptr<TorchWork> TorchCommNCCLX::all_gather_single(
     at::Tensor& output,
     const at::Tensor& input,
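The NCCLX implementation above realizes all_gather_v as one broadcast per rank inside a group call: rank i's input is broadcast into tensor_list[i] on every rank, which is what lets each rank's contribution have a different size. For intuition only, here is the same pattern sketched with torch.distributed (not torchcomms code; it assumes an already-initialized process group and correctly sized tensors):

import torch
import torch.distributed as dist

def all_gather_v_sketch(tensor_list, tensor):
    """Broadcast-per-rank sketch: after the call, tensor_list[i] holds rank i's
    `tensor` on every rank; tensor_list[i] must already match rank i's size."""
    rank = dist.get_rank()
    # Seed our own slot, then let each rank broadcast its slot to all others.
    tensor_list[rank].copy_(tensor)
    for i in range(dist.get_world_size()):
        dist.broadcast(tensor_list[i], src=i)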
