Skip to content

Commit f80ce55

Browse files
YulunWmeta-codesync[bot]
authored andcommitted
Add profiler tracing for Ctran AllToAll
Summary: Due to Ctran refactoring, we removed the old algo profiler and added a new one instead. Using the new algo profiler to trace Ctran AllToAll Reviewed By: minsii Differential Revision: D85452153 fbshipit-source-id: e984694ebefbde71561f57bb199324af2342c86a
1 parent ba55b32 commit f80ce55

File tree

4 files changed

+54
-0
lines changed

4 files changed

+54
-0
lines changed

comms/ctran/algos/AllToAll/AllToAll.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
recvcounts, \
2323
rdispls, \
2424
op->alltoall.datatype, \
25+
op->opCount, \
2526
comm, \
2627
std::move(timestamp));
2728

comms/ctran/algos/AllToAll/AllToAllPImpl.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ using ctran::alltoallp::PersistArgs;
2121
counts, \
2222
displs, \
2323
pArgs->datatype, \
24+
op->opCount, \
2425
comm, \
2526
std::move(timestamp));
2627

comms/ctran/algos/AllToAll/AllToAllv.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
op->alltoallv.recvcounts, \
2525
op->alltoallv.rdispls, \
2626
op->alltoallv.datatype, \
27+
op->opCount, \
2728
comm, \
2829
std::move(timestamp));
2930

comms/ctran/algos/AllToAll/AllToAllvImpl.h

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44
#define CTRAN_ALL_TO_ALLV_IMPL_H_
55

66
#include <folly/synchronization/CallOnce.h>
7+
78
#include "comms/ctran/CtranComm.h"
89
#include "comms/ctran/algos/CtranAlgo.h"
910
#include "comms/ctran/mapper/CtranMapper.h"
11+
#include "comms/ctran/profiler/Profiler.h"
1012
#include "comms/utils/cvars/nccl_cvars.h"
1113

1214
static inline const std::string allToAllAlgoName(enum NCCL_ALLTOALL_ALGO algo) {
@@ -60,6 +62,7 @@ commResult_t ctranAllToAllvIbImpl(
6062
std::vector<size_t>& recvCounts,
6163
std::vector<size_t>& rDispls,
6264
commDataType_t datatype,
65+
uint64_t opCount,
6366
CtranComm* comm,
6467
std::unique_ptr<CtranMapperTimestamp> timestamp) {
6568
const auto& statex = comm->statex_;
@@ -82,6 +85,12 @@ commResult_t ctranAllToAllvIbImpl(
8285
std::vector<int> ibRecvPeers, ibSendPeers;
8386
std::unordered_set<int> ibPeers;
8487

88+
ctran::Profiler* profiler = comm->ctran_->profiler.get();
89+
if (profiler) {
90+
profiler->initForEachColl(
91+
opCount, NCCL_CTRAN_ALGO_PROFILING_SAMPLING_WEIGHT);
92+
}
93+
8594
if (sendCounts.size() > 0) {
8695
std::vector<size_t> sendSizes(nRanks, 0);
8796
for (int i = 0; i < nRanks; i++) {
@@ -93,6 +102,13 @@ commResult_t ctranAllToAllvIbImpl(
93102
}
94103
CtranMapperContext context(algoName, sendSizes, recvSizes);
95104
comm->ctran_->mapper->setContext(std::move(context));
105+
106+
CTRAN_PROFILER_IF(profiler, {
107+
auto& algoContext = profiler->algoContext;
108+
algoContext.algorithmName = algoName;
109+
algoContext.sendContext.messageSizes = folly::join(',', sendSizes);
110+
algoContext.recvContext.messageSizes = folly::join(',', recvSizes);
111+
});
96112
}
97113

98114
// Prepare buffers shifted with displacement, and set ctrl/put/notify
@@ -135,14 +151,23 @@ commResult_t ctranAllToAllvIbImpl(
135151
// Search for the handle only when there are RecvPeers to avoid attempting to
136152
// search/register with a buffer size of 0.
137153
if (!ibRecvPeers.empty()) {
154+
CTRAN_PROFILER_IF(
155+
profiler, profiler->startEvent(ctran::ProfilerEvent::BUF_REG));
156+
138157
FB_COMMCHECK(searchRegHandle(
139158
comm,
140159
recvbuff,
141160
contigRecvBufSize * commTypeSize(datatype),
142161
tmpHdl,
143162
tmpRegHdls));
163+
164+
CTRAN_PROFILER_IF(
165+
profiler, profiler->endEvent(ctran::ProfilerEvent::BUF_REG));
144166
}
145167

168+
CTRAN_PROFILER_IF(
169+
profiler, profiler->startEvent(ctran::ProfilerEvent::ALGO_CTRL));
170+
146171
FB_COMMCHECK(comm->ctran_->mapper->isendCtrlBatch<PerfConfig>(
147172
recvBuffs, tmpHdl, ibRecvPeers, ibSendCtrlReqs, CtranMapperBackend::IB));
148173
FB_COMMCHECK(comm->ctran_->mapper->initNotifyBatchIB(ibRecvPeers, notifyVec));
@@ -151,12 +176,18 @@ commResult_t ctranAllToAllvIbImpl(
151176
// Search for the handle only when there are SendPeers to avoid attempting to
152177
// search/register with a buffer size of 0.
153178
if (!ibSendPeers.empty()) {
179+
CTRAN_PROFILER_IF(
180+
profiler, profiler->startEvent(ctran::ProfilerEvent::BUF_REG));
181+
154182
FB_COMMCHECK(searchRegHandle(
155183
comm,
156184
sendbuff,
157185
contigSendBufSize * commTypeSize(datatype),
158186
tmpHdl,
159187
tmpRegHdls));
188+
189+
CTRAN_PROFILER_IF(
190+
profiler, profiler->endEvent(ctran::ProfilerEvent::BUF_REG));
160191
}
161192
int idx = 0;
162193
for (auto peer : ibSendPeers) {
@@ -178,11 +209,26 @@ commResult_t ctranAllToAllvIbImpl(
178209
idx = 0;
179210
for (auto& recvCtrlReq : ibRecvCtrlReqs) {
180211
FB_COMMCHECK(comm->ctran_->mapper->waitRequest<PerfConfig>(&recvCtrlReq));
212+
213+
// Check whether it is the last request. We should end the algo ctrl event
214+
// after finish waiting the last request.
215+
if (&recvCtrlReq == &ibRecvCtrlReqs.back()) {
216+
CTRAN_PROFILER_IF(
217+
profiler, profiler->endEvent(ctran::ProfilerEvent::ALGO_CTRL));
218+
}
219+
181220
const int peer = recvCtrlReq.peer;
182221
if (useProfiler) {
183222
timestamp->recvCtrl.push_back(CtranMapperTimestampPoint(peer));
184223
}
185224
auto sendSize = sendCounts[peer] * commTypeSize(datatype);
225+
226+
// Check whether it is the first request. We should start timing the data
227+
// event after finish waiting the first request.
228+
if (&recvCtrlReq == &ibRecvCtrlReqs.front()) {
229+
CTRAN_PROFILER_IF(
230+
profiler, profiler->startEvent(ctran::ProfilerEvent::ALGO_DATA));
231+
}
186232
// FIXME: we should compare sendSize with real maxWqeSize:
187233
// NCCL_CTRAN_IB_QP_SCALING_THRESHOLD may not be maxWqeSize if user
188234
// specified NCCL_CTRAN_IB_QP_CONFIG_ALGO to overwrite qp_scaling_threshold
@@ -212,12 +258,17 @@ commResult_t ctranAllToAllvIbImpl(
212258
// Wait for all receives (i.e., remote IB puts) to complete
213259
FB_COMMCHECK(comm->ctran_->mapper->waitAllNotifies<PerfConfig>(notifyVec));
214260

261+
CTRAN_PROFILER_IF(
262+
profiler, profiler->endEvent(ctran::ProfilerEvent::ALGO_DATA));
263+
215264
// Always wait for all sendCtrlReqs to complete so that the memory can be
216265
// safely reused in next collective; otherwise, ibvc may complete the previous
217266
// request while the memory has already been assigned to a new request.
218267
FB_COMMCHECK(
219268
comm->ctran_->mapper->waitAllRequests<PerfConfig>(ibSendCtrlReqs));
220269

270+
CTRAN_PROFILER_IF(profiler, { profiler->reportToScuba(); });
271+
221272
if (useProfiler) {
222273
comm->ctran_->mapper->timestamps.emplace_back(std::move(timestamp));
223274
comm->ctran_->mapper->reportProfiling();

0 commit comments

Comments
 (0)