diff --git a/comms/ncclx/v2_27/meta/colltrace/tests/CollTraceDistTest.cc b/comms/ncclx/v2_27/meta/colltrace/tests/CollTraceDistTest.cc index 7feab380..013296b4 100644 --- a/comms/ncclx/v2_27/meta/colltrace/tests/CollTraceDistTest.cc +++ b/comms/ncclx/v2_27/meta/colltrace/tests/CollTraceDistTest.cc @@ -1201,7 +1201,7 @@ TEST_F(CollTraceTest, winPutWait) { int prevPeer = (this->globalRank + this->numRanks - 1) % this->numRanks; for (auto iter = 0; iter < kNumIters; iter++) { - NCCLCHECK_TEST(ncclPutSignal( + NCCLCHECK_TEST(ncclPutSignal_old( localbuf + kNumElements * statex->rank(), kNumElements, ncclInt32, @@ -1209,7 +1209,7 @@ TEST_F(CollTraceTest, winPutWait) { kNumElements * statex->rank(), win, put_stream)); - NCCLCHECK_TEST(ncclWaitSignal(prevPeer, win, wait_stream)); + NCCLCHECK_TEST(ncclWaitSignal_old(prevPeer, win, wait_stream)); if (iter == 0) { CUDACHECK_TEST(cudaEventRecord(start_event, put_stream)); } diff --git a/comms/ncclx/v2_27/meta/colltrace/tests/CollTraceWatchdogTest.cc b/comms/ncclx/v2_27/meta/colltrace/tests/CollTraceWatchdogTest.cc index 5e69f8dd..d0172d42 100644 --- a/comms/ncclx/v2_27/meta/colltrace/tests/CollTraceWatchdogTest.cc +++ b/comms/ncclx/v2_27/meta/colltrace/tests/CollTraceWatchdogTest.cc @@ -229,9 +229,9 @@ TEST_F(CollTraceWatchdogTest, TestAsyncErrorFromGPE) { auto srcRank = (rank - 1 + worldSize) % worldSize; auto dstRank = (rank + 1) % worldSize; - NCCLCHECK_FATAL( - ncclPutSignal(sendBuff, 32, ncclFloat, dstRank, 0, win, stream.raw())); - NCCLCHECK_FATAL(ncclWaitSignal(srcRank, win, stream.raw())); + NCCLCHECK_FATAL(ncclPutSignal_old( + sendBuff, 32, ncclFloat, dstRank, 0, win, stream.raw())); + NCCLCHECK_FATAL(ncclWaitSignal_old(srcRank, win, stream.raw())); waitStreamWithTimeout(stream.raw(), std::chrono::seconds{20}); } diff --git a/comms/ncclx/v2_27/meta/colltrace/tests/NewCollTraceDistTestNoLocal.cc b/comms/ncclx/v2_27/meta/colltrace/tests/NewCollTraceDistTestNoLocal.cc index f0b5f8ec..dad44b81 100644 --- a/comms/ncclx/v2_27/meta/colltrace/tests/NewCollTraceDistTestNoLocal.cc +++ b/comms/ncclx/v2_27/meta/colltrace/tests/NewCollTraceDistTestNoLocal.cc @@ -622,7 +622,7 @@ TEST_F(CollTraceTest, winPutWait) { int prevPeer = (this->globalRank + this->numRanks - 1) % this->numRanks; for (auto iter = 0; iter < kNumIters; iter++) { - NCCLCHECK_TEST(ncclPutSignal( + NCCLCHECK_TEST(ncclPutSignal_old( localbuf + kNumElements * statex->rank(), kNumElements, ncclInt32, @@ -630,7 +630,7 @@ TEST_F(CollTraceTest, winPutWait) { kNumElements * statex->rank(), win, put_stream)); - NCCLCHECK_TEST(ncclWaitSignal(prevPeer, win, wait_stream)); + NCCLCHECK_TEST(ncclWaitSignal_old(prevPeer, win, wait_stream)); } int errs = 0; diff --git a/comms/ncclx/v2_27/meta/rma/rma.cc b/comms/ncclx/v2_27/meta/rma/rma.cc index 6e4f1ac4..8fa8bd8f 100644 --- a/comms/ncclx/v2_27/meta/rma/rma.cc +++ b/comms/ncclx/v2_27/meta/rma/rma.cc @@ -11,7 +11,7 @@ NCCL_API( ncclResult_t, - ncclPutSignal, + ncclPutSignal_old, const void* origin_buff, size_t count, ncclDataType_t datatype, @@ -19,7 +19,7 @@ NCCL_API( size_t target_disp, ncclWin_t win, cudaStream_t stream); -ncclResult_t ncclPutSignal( +ncclResult_t ncclPutSignal_old( const void* origin_buff, size_t count, ncclDataType_t datatype, @@ -29,7 +29,8 @@ ncclResult_t ncclPutSignal( cudaStream_t stream) { auto comm = win->comm->ctranComm_.get(); if (!ctranInitialized(comm)) { - FB_ERRORRETURN(ncclInternalError, "ncclPutSignal requires Ctran support"); + FB_ERRORRETURN( + ncclInternalError, "ncclPutSignal_old requires Ctran support"); } return metaCommToNccl(ctranPutSignal( origin_buff, @@ -111,21 +112,22 @@ ncclResult_t ncclGet( NCCL_API( ncclResult_t, - ncclWaitSignal, + ncclWaitSignal_old, int peer, ncclWin_t win, cudaStream_t stream); -ncclResult_t ncclWaitSignal(int peer, ncclWin_t win, cudaStream_t stream) { +ncclResult_t ncclWaitSignal_old(int peer, ncclWin_t win, cudaStream_t stream) { auto comm = win->comm->ctranComm_.get(); if (!ctranInitialized(comm)) { - FB_ERRORRETURN(ncclInternalError, "ncclWaitSignal requires Ctran support"); + FB_ERRORRETURN( + ncclInternalError, "ncclWaitSignal_old requires Ctran support"); } return metaCommToNccl(ctranWaitSignal(peer, win->ctranWindow, comm, stream)); } NCCL_API( ncclResult_t, - ncclPutSignal_v2, + ncclPutSignal, const void* origin_buff, size_t target_disp, size_t count, @@ -135,7 +137,7 @@ NCCL_API( int peer, ncclWin_t win, cudaStream_t stream); -ncclResult_t ncclPutSignal_v2( +ncclResult_t ncclPutSignal( const void* origin_buff, size_t target_disp, size_t count, @@ -164,13 +166,13 @@ ncclResult_t ncclPutSignal_v2( NCCL_API( ncclResult_t, - ncclWaitSignal_v2, + ncclWaitSignal, size_t signal_disp, uint64_t cmp_val, ncclCmpOp_t cmp_op, ncclWin_t win, cudaStream_t stream); -ncclResult_t ncclWaitSignal_v2( +ncclResult_t ncclWaitSignal( size_t signal_disp, uint64_t cmp_val, ncclCmpOp_t cmp_op, diff --git a/comms/ncclx/v2_27/meta/rma/tests/RMATest.cc b/comms/ncclx/v2_27/meta/rma/tests/RMATest.cc index d8777720..f35e1b49 100644 --- a/comms/ncclx/v2_27/meta/rma/tests/RMATest.cc +++ b/comms/ncclx/v2_27/meta/rma/tests/RMATest.cc @@ -133,7 +133,7 @@ TEST_P(MultiWindowTestParam, multiWindow) { int prevPeer = (this->globalRank + this->numRanks - 1) % this->numRanks; for (auto iter = 0; iter < kNumIters; iter++) { - NCCLCHECK_TEST(ncclPutSignal( + NCCLCHECK_TEST(ncclPutSignal_old( localbuf + numElements * statex->rank(), numElements, ncclInt32, @@ -141,7 +141,7 @@ TEST_P(MultiWindowTestParam, multiWindow) { numElements * statex->rank(), win, put_stream)); - NCCLCHECK_TEST(ncclWaitSignal(prevPeer, win, wait_stream)); + NCCLCHECK_TEST(ncclWaitSignal_old(prevPeer, win, wait_stream)); } // Barrier to ensure all peers have finished put this->barrier(comm, main_stream); @@ -238,7 +238,7 @@ TEST_P(RMATestParam, winPutWait) { int prevPeer = (this->globalRank + this->numRanks - 1) % this->numRanks; for (auto iter = 0; iter < kNumIters; iter++) { - NCCLCHECK_TEST(ncclPutSignal( + NCCLCHECK_TEST(ncclPutSignal_old( localBuf, kNumElements, ncclInt32, @@ -246,7 +246,7 @@ TEST_P(RMATestParam, winPutWait) { kNumElements * statex->rank(), win, put_stream)); - NCCLCHECK_TEST(ncclWaitSignal(prevPeer, win, wait_stream)); + NCCLCHECK_TEST(ncclWaitSignal_old(prevPeer, win, wait_stream)); if (iter == 0) { // Skip first iteration to avoid any warmup overhead CUDACHECK_TEST(cudaEventRecord(start_event, put_stream)); diff --git a/comms/ncclx/v2_27/src/nccl.h.in b/comms/ncclx/v2_27/src/nccl.h.in index af1e90b3..5ffe3a5f 100644 --- a/comms/ncclx/v2_27/src/nccl.h.in +++ b/comms/ncclx/v2_27/src/nccl.h.in @@ -689,7 +689,7 @@ ncclResult_t pncclWinFree(ncclComm_t comm, ncclWin_t win); * One-side put operation from a local buffer to a remote peer's pre-allocated * and registered buffer within a NCCL window. */ -ncclResult_t ncclPutSignal( +ncclResult_t ncclPutSignal_old( const void* originBuff, size_t count, ncclDataType_t datatype, @@ -697,7 +697,7 @@ ncclResult_t ncclPutSignal( size_t targetDisp, ncclWin_t win, cudaStream_t stream); -ncclResult_t pncclPutSignal( +ncclResult_t pncclPutSignal_old( const void* originBuff, size_t count, ncclDataType_t datatype, @@ -705,7 +705,7 @@ ncclResult_t pncclPutSignal( size_t targetDisp, ncclWin_t win, cudaStream_t stream); -ncclResult_t ncclPutSignal_v2( +ncclResult_t ncclPutSignal( const void* originBuff, size_t targetDisp, size_t count, @@ -745,13 +745,13 @@ ncclResult_t ncclPutSignal_v2( /* * Wait for a signal from remote peer to complete the put operation. */ -ncclResult_t ncclWaitSignal(int peer, ncclWin_t win, cudaStream_t stream); -ncclResult_t pncclWaitSignal(int peer, ncclWin_t win, cudaStream_t stream); +ncclResult_t ncclWaitSignal_old(int peer, ncclWin_t win, cudaStream_t stream); +ncclResult_t pncclWaitSignal_old(int peer, ncclWin_t win, cudaStream_t stream); /* * Wait for a signal given the local signal displacement, the signal value, and the comparison op. */ -ncclResult_t ncclWaitSignal_v2( +ncclResult_t ncclWaitSignal( size_t signalDisp, uint64_t cmpVal, ncclCmpOp_t cmpOp, diff --git a/comms/torchcomms/ncclx/NcclxApi.cpp b/comms/torchcomms/ncclx/NcclxApi.cpp index 3393cb62..b354f3ae 100644 --- a/comms/torchcomms/ncclx/NcclxApi.cpp +++ b/comms/torchcomms/ncclx/NcclxApi.cpp @@ -259,7 +259,7 @@ ncclResult_t DefaultNcclxApi::winWaitSignal( NcclxWindow win, cudaStream_t stream) { #ifdef NCCL_RMA_SUPPORTED - return ncclWaitSignal_v2(signal_disp, cmp_val, cmp_op, win, stream); + return ncclWaitSignal(signal_disp, cmp_val, cmp_op, win, stream); #else throw std::logic_error( "NCCL does not support window, NCCL_RMA_SUPPORTED is not set");