Skip to content

Commit d47dc93

Browse files
Saif Hasan and meta-codesync[bot]
authored and committed
Fix NCCLX UnitTest
Summary: The mocks don't return a valid CUDA device count. The code was relying on an uninitialized value, at the mercy of the compiler. Recently the code started to crash with SIGFPE (integer divide by zero) when it evaluates `rank_ % device_count` with device_count being 0. Changes - Explicitly initialize device_count to 0 - Extend the mock to always return `1` by default for `getDeviceCount` - Fix the UT to set up mock behaviors by default. Test Failure - https://www.internalfb.com/intern/test/562950190647332 ``` [ RUN ] TorchCommNCCLXTest.InitializationFailsWithInvalidDeviceId I1026 04:38:23.001531 1483368 TorchCommNCCLXBootstrap.cpp:43] [TC] TORCHCOMM_NCCLX_BOOTSTRAP_UNIQUEID_EXCHANGE_METHOD not set, defaulting to auto *** Aborted at 1761478703 (Unix time, try 'date -d 1761478703') *** *** Signal 8 (SIGFPE) (0x3170a51f) received by PID 1483368 (pthread TID 0x7fd11c3e3000) (linux TID 1483368) (code: integer divide by zero), stack trace: *** @ 000000000ba032ff folly::symbolizer::(anonymous namespace)::signalHandler(int, siginfo_t*, void*) ./fbcode/folly/debugging/symbolizer/SignalHandler.cpp:528 @ 000000000004455f (unknown) /home/engshare/third-party2/glibc/2.34/src/glibc-2.34/signal/../sysdeps/unix/sysv/linux/libc_sigaction.c:8 -> /home/engshare/third-party2/glibc/2.34/src/glibc-2.34/signal/../sysdeps/unix/sysv/linux/x86_64/libc_sigaction.c @ 000000003170a51f torch::comms::TorchCommNCCLXBootstrap::TorchCommNCCLXBootstrap(c10::intrusive_ptr<c10d::Store, c10::detail::intrusive_target_default_null_type<c10d::Store> >, c10::Device, std::shared_ptr<torch::comms::NcclxApi>, std::shared_ptr<torch::comms::CudaApi>, std::chrono::duration<long, std::ratio<1l, 1000l> >) ./fbcode/comms/torchcomms/ncclx/TorchCommNCCLXBootstrap.cpp:63 @ 00000000316fd281 torch::comms::TorchCommNCCLX::init(c10::Device, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, torch::comms::CommOptions const&) ./fbcode/comms/torchcomms/ncclx/TorchCommNCCLX.cpp:74 @ 000000000b94fb02 
torch::comms::test::TorchCommNCCLXTest_InitializationFailsWithInvalidDeviceId_Test::TestBody() ./fbcode/comms/torchcomms/ncclx/tests/unit/cpp/TorchCommNCCLXTest.cpp:155 @ 000000000b9a586f testing::Test::Run() fbsource/src/gtest.cc:2751 @ 000000000b9a6cde testing::TestInfo::Run() fbsource/src/gtest.cc:2897 @ 000000000b9a80f4 testing::TestSuite::Run() fbsource/src/gtest.cc:3075 @ 000000000b9ba72f testing::internal::UnitTestImpl::RunAllTests() fbsource/src/gtest.cc:6066 @ 000000000b9b980f testing::UnitTest::Run() fbsource/src/gtest.cc:5606 @ 000000000b9c977d main fbsource/gtest/gtest.h:2337 @ 000000000002c656 __libc_start_call_main /home/engshare/third-party2/glibc/2.34/src/glibc-2.34/csu/../sysdeps/nptl/libc_start_call_main.h:58 -> /home/engshare/third-party2/glibc/2.34/src/glibc-2.34/csu/../sysdeps/x86/libc-start.c @ 000000000002c717 __libc_start_main /home/engshare/third-party2/glibc/2.34/src/glibc-2.34/csu/../csu/libc-start.c:409 -> /home/engshare/third-party2/glibc/2.34/src/glibc-2.34/csu/../sysdeps/x86/libc-start.c @ 000000000b944220 _start /home/engshare/third-party2/glibc/2.34/src/glibc-2.34/csu/../sysdeps/x86_64/start.S:116 ``` Reviewed By: pavanbalaji Differential Revision: D85489376 fbshipit-source-id: 993377d73b05c91cb0df4219e4f8f4e9b78de36b
1 parent ea81c2e commit d47dc93

File tree

3 files changed

+16
-4
lines changed

3 files changed

+16
-4
lines changed

comms/torchcomms/ncclx/TorchCommNCCLXBootstrap.cpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
// Copyright (c) Meta Platforms, Inc. and affiliates.
22

3-
#include "comms/torchcomms/ncclx/TorchCommNCCLXBootstrap.hpp"
3+
#include <stdexcept>
4+
45
#include <ATen/cuda/CUDAContext.h>
56
#include <dlfcn.h>
67
#include <torch/csrc/distributed/c10d/TCPStore.hpp> // @manual
78
#include "comms/torchcomms/StoreManager.hpp"
89
#include "comms/torchcomms/TorchCommLogging.hpp"
910
#include "comms/torchcomms/TorchCommUtils.hpp"
1011
#include "comms/torchcomms/ncclx/TorchCommNCCLX.hpp"
12+
#include "comms/torchcomms/ncclx/TorchCommNCCLXBootstrap.hpp"
1113
#include "nccl.h" // @manual
1214

1315
namespace torch {
@@ -54,11 +56,16 @@ TorchCommNCCLXBootstrap::TorchCommNCCLXBootstrap(
5456
[](unsigned char c) { return std::tolower(c); });
5557

5658
if (device_.index() == -1) {
57-
int device_count;
59+
int device_count{0};
5860
CUDA_CHECK(
5961
cuda_api_,
6062
cuda_api_->getDeviceCount(&device_count),
6163
"Failed to get CUDA device count");
64+
if (device_count <= 0) {
65+
throw std::invalid_argument(
66+
"No CUDA devices found; please check your CUDA installation");
67+
}
68+
TC_LOG(INFO, nullptr) << "Found " << device_count << " CUDA devices";
6269

6370
device_ = c10::Device(c10::kCUDA, rank_ % device_count);
6471
TC_LOG(INFO, nullptr)

comms/torchcomms/ncclx/tests/unit/cpp/TorchCommNCCLXTest.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,10 @@ TEST_F(TorchCommNCCLXTest, InitializationRank0GetUniqueId) {
5151
// Clear - 1 (destructor)
5252
setupCCAExpectations(1, 2, 1);
5353

54-
auto comm = createMockedTorchComm();
55-
5654
cuda_mock_->setupDefaultBehaviors();
5755

56+
auto comm = createMockedTorchComm();
57+
5858
// Expect rank 0 to get unique ID and store it
5959
ncclUniqueId expected_id{};
6060
memset(&expected_id, 0x42, sizeof(expected_id)); // Fill with test pattern
@@ -134,6 +134,8 @@ TEST_F(TorchCommNCCLXTest, InitializationFailsWithInvalidDeviceId) {
134134
// Setup CCA expectations - first init (device -1) should succeed
135135
setupCCAExpectations(1, 2, 0);
136136

137+
cuda_mock_->setupDefaultBehaviors();
138+
137139
// Test with negative device ID
138140
{
139141
at::Device invalid_device(at::DeviceType::CUDA, -1);

comms/torchcomms/ncclx/tests/unit/cpp/mocks/CudaMock.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ void CudaMock::setupDefaultBehaviors() {
1616
// Device management - return success by default
1717
ON_CALL(*this, setDevice(_)).WillByDefault(Return(cudaSuccess));
1818

19+
ON_CALL(*this, getDeviceCount(_))
20+
.WillByDefault(DoAll(SetArgPointee<0>(1), Return(cudaSuccess)));
21+
1922
ON_CALL(*this, getDeviceProperties(_, _))
2023
.WillByDefault(
2124
DoAll(SetArgPointee<0>(cudaDeviceProp{}), Return(cudaSuccess)));

0 commit comments

Comments
 (0)