44#define CTRAN_ALL_TO_ALLV_IMPL_H_
55
66#include < folly/synchronization/CallOnce.h>
7+
78#include " comms/ctran/CtranComm.h"
89#include " comms/ctran/algos/CtranAlgo.h"
910#include " comms/ctran/mapper/CtranMapper.h"
11+ #include " comms/ctran/profiler/Profiler.h"
1012#include " comms/utils/cvars/nccl_cvars.h"
1113
1214static inline const std::string allToAllAlgoName (enum NCCL_ALLTOALL_ALGO algo) {
@@ -60,6 +62,7 @@ commResult_t ctranAllToAllvIbImpl(
6062 std::vector<size_t >& recvCounts,
6163 std::vector<size_t >& rDispls,
6264 commDataType_t datatype,
65+ uint64_t opCount,
6366 CtranComm* comm,
6467 std::unique_ptr<CtranMapperTimestamp> timestamp) {
6568 const auto & statex = comm->statex_ ;
@@ -82,6 +85,12 @@ commResult_t ctranAllToAllvIbImpl(
8285 std::vector<int > ibRecvPeers, ibSendPeers;
8386 std::unordered_set<int > ibPeers;
8487
88+ ctran::Profiler* profiler = comm->ctran_ ->profiler .get ();
89+ if (profiler) {
90+ profiler->initForEachColl (
91+ opCount, NCCL_CTRAN_ALGO_PROFILING_SAMPLING_WEIGHT);
92+ }
93+
8594 if (sendCounts.size () > 0 ) {
8695 std::vector<size_t > sendSizes (nRanks, 0 );
8796 for (int i = 0 ; i < nRanks; i++) {
@@ -93,6 +102,13 @@ commResult_t ctranAllToAllvIbImpl(
93102 }
94103 CtranMapperContext context (algoName, sendSizes, recvSizes);
95104 comm->ctran_ ->mapper ->setContext (std::move (context));
105+
106+ CTRAN_PROFILER_IF (profiler, {
107+ auto & algoContext = profiler->algoContext ;
108+ algoContext.algorithmName = algoName;
109+ algoContext.sendContext .messageSizes = folly::join (' ,' , sendSizes);
110+ algoContext.recvContext .messageSizes = folly::join (' ,' , recvSizes);
111+ });
96112 }
97113
98114 // Prepare buffers shifted with displacement, and set ctrl/put/notify
@@ -135,14 +151,23 @@ commResult_t ctranAllToAllvIbImpl(
135151 // Search for the handle only when there are RecvPeers to avoid attempting to
136152 // search/register with a buffer size of 0.
137153 if (!ibRecvPeers.empty ()) {
154+ CTRAN_PROFILER_IF (
155+ profiler, profiler->startEvent (ctran::ProfilerEvent::BUF_REG));
156+
138157 FB_COMMCHECK (searchRegHandle (
139158 comm,
140159 recvbuff,
141160 contigRecvBufSize * commTypeSize (datatype),
142161 tmpHdl,
143162 tmpRegHdls));
163+
164+ CTRAN_PROFILER_IF (
165+ profiler, profiler->endEvent (ctran::ProfilerEvent::BUF_REG));
144166 }
145167
168+ CTRAN_PROFILER_IF (
169+ profiler, profiler->startEvent (ctran::ProfilerEvent::ALGO_CTRL));
170+
146171 FB_COMMCHECK (comm->ctran_ ->mapper ->isendCtrlBatch <PerfConfig>(
147172 recvBuffs, tmpHdl, ibRecvPeers, ibSendCtrlReqs, CtranMapperBackend::IB));
148173 FB_COMMCHECK (comm->ctran_ ->mapper ->initNotifyBatchIB (ibRecvPeers, notifyVec));
@@ -151,12 +176,18 @@ commResult_t ctranAllToAllvIbImpl(
151176 // Search for the handle only when there are SendPeers to avoid attempting to
152177 // search/register with a buffer size of 0.
153178 if (!ibSendPeers.empty ()) {
179+ CTRAN_PROFILER_IF (
180+ profiler, profiler->startEvent (ctran::ProfilerEvent::BUF_REG));
181+
154182 FB_COMMCHECK (searchRegHandle (
155183 comm,
156184 sendbuff,
157185 contigSendBufSize * commTypeSize (datatype),
158186 tmpHdl,
159187 tmpRegHdls));
188+
189+ CTRAN_PROFILER_IF (
190+ profiler, profiler->endEvent (ctran::ProfilerEvent::BUF_REG));
160191 }
161192 int idx = 0 ;
162193 for (auto peer : ibSendPeers) {
@@ -178,11 +209,26 @@ commResult_t ctranAllToAllvIbImpl(
178209 idx = 0 ;
179210 for (auto & recvCtrlReq : ibRecvCtrlReqs) {
180211 FB_COMMCHECK (comm->ctran_ ->mapper ->waitRequest <PerfConfig>(&recvCtrlReq));
212+
213+ // Check whether it is the last request. We should end the algo ctrl event
214+ // after finish waiting the last request.
215+ if (&recvCtrlReq == &ibRecvCtrlReqs.back ()) {
216+ CTRAN_PROFILER_IF (
217+ profiler, profiler->endEvent (ctran::ProfilerEvent::ALGO_CTRL));
218+ }
219+
181220 const int peer = recvCtrlReq.peer ;
182221 if (useProfiler) {
183222 timestamp->recvCtrl .push_back (CtranMapperTimestampPoint (peer));
184223 }
185224 auto sendSize = sendCounts[peer] * commTypeSize (datatype);
225+
226+ // Check whether it is the first request. We should start timing the data
227+ // event after finish waiting the first request.
228+ if (&recvCtrlReq == &ibRecvCtrlReqs.front ()) {
229+ CTRAN_PROFILER_IF (
230+ profiler, profiler->startEvent (ctran::ProfilerEvent::ALGO_DATA));
231+ }
186232 // FIXME: we should compare sendSize with real maxWqeSize:
187233 // NCCL_CTRAN_IB_QP_SCALING_THRESHOLD may not be maxWqeSize if user
188234 // specified NCCL_CTRAN_IB_QP_CONFIG_ALGO to overwrite qp_scaling_threshold
@@ -212,12 +258,17 @@ commResult_t ctranAllToAllvIbImpl(
212258 // Wait for all receives (i.e., remote IB puts) to complete
213259 FB_COMMCHECK (comm->ctran_ ->mapper ->waitAllNotifies <PerfConfig>(notifyVec));
214260
261+ CTRAN_PROFILER_IF (
262+ profiler, profiler->endEvent (ctran::ProfilerEvent::ALGO_DATA));
263+
215264 // Always wait for all sendCtrlReqs to complete so that the memory can be
216265 // safely reused in next collective; otherwise, ibvc may complete the previous
217266 // request while the memory has already been assigned to a new request.
218267 FB_COMMCHECK (
219268 comm->ctran_ ->mapper ->waitAllRequests <PerfConfig>(ibSendCtrlReqs));
220269
270+ CTRAN_PROFILER_IF (profiler, { profiler->reportToScuba (); });
271+
221272 if (useProfiler) {
222273 comm->ctran_ ->mapper ->timestamps .emplace_back (std::move (timestamp));
223274 comm->ctran_ ->mapper ->reportProfiling ();
0 commit comments