Skip to content

Commit 6d361cb

Browse files
YulunWmeta-codesync[bot]
authored andcommitted
Create API to get last comms error message
Summary: Due to the fact that we log nccl "stack trace" (e.g. `NCCL WARN init.cc:1234 -> 2`) as warning, the ncclGetLastError will return those "stack traces" as the last error, making the message useless. Additionally, ncclGetLastError will not show errors that we log via XLOG. Implement last error in NcclLogFormatter to restore access to the last error string. Reviewed By: minsii Differential Revision: D85412934 fbshipit-source-id: bf552f81c106e1a61514753a71d742981875fb42
1 parent f36f61d commit 6d361cb

File tree

4 files changed

+122
-1
lines changed

4 files changed

+122
-1
lines changed

comms/ncclx/v2_27/meta/logger/tests/LoggingUT.cc

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,105 @@ TEST_F(NcclLoggerTest, LogDisplay) {
9292
finishLogging();
9393
}
9494

95+
TEST_F(NcclLoggerTest, GetLastCommsErrorTest) {
96+
auto debugGuard = EnvRAII(NCCL_DEBUG, std::string{"INFO"});
97+
ncclResetDebugInit();
98+
99+
initLogging();
100+
101+
// Initially, the last error should be empty
102+
auto lastError = meta::comms::logger::getLastCommsError();
103+
EXPECT_THAT(lastError, ::testing::StrEq(""));
104+
105+
// Log an info message - should not update last error
106+
std::string infoMsg = "INFO MESSAGE";
107+
INFO(NCCL_ALL, "%s", infoMsg.c_str());
108+
sleep(1);
109+
lastError = meta::comms::logger::getLastCommsError();
110+
EXPECT_THAT(lastError, ::testing::StrEq(""));
111+
112+
// Log a warning message - should not update last error
113+
std::string warnMsg = "WARN MESSAGE";
114+
WARN("%s", warnMsg.c_str());
115+
sleep(1);
116+
lastError = meta::comms::logger::getLastCommsError();
117+
EXPECT_THAT(lastError, ::testing::StrEq(""));
118+
119+
// Log an error message - should update last error
120+
std::string errorMsg = "ERROR MESSAGE";
121+
ERR("%s", errorMsg.c_str());
122+
sleep(1);
123+
lastError = meta::comms::logger::getLastCommsError();
124+
EXPECT_THAT(lastError, ::testing::StrEq(errorMsg));
125+
126+
// Log another error message - should update to the new error
127+
std::string errorMsg2 = "SECOND ERROR MESSAGE";
128+
ERR("%s", errorMsg2.c_str());
129+
sleep(1);
130+
lastError = meta::comms::logger::getLastCommsError();
131+
EXPECT_THAT(lastError, ::testing::StrEq(errorMsg2));
132+
133+
// Log info and warn - last error should remain unchanged
134+
INFO(NCCL_ALL, "Another info");
135+
WARN("Another warn");
136+
sleep(1);
137+
lastError = meta::comms::logger::getLastCommsError();
138+
EXPECT_THAT(lastError, ::testing::StrEq(errorMsg2));
139+
140+
finishLogging();
141+
}
142+
143+
TEST_F(NcclLoggerTest, GetLastCommsErrorMultilineTest) {
144+
auto debugGuard = EnvRAII(NCCL_DEBUG, std::string{"INFO"});
145+
ncclResetDebugInit();
146+
147+
initLogging();
148+
149+
// Log a multiline error message
150+
std::string multilineError = "First line\nSecond line\nThird line";
151+
ERR("%s", multilineError.c_str());
152+
sleep(1);
153+
154+
auto lastError = meta::comms::logger::getLastCommsError();
155+
EXPECT_THAT(lastError, ::testing::StrEq(multilineError));
156+
157+
finishLogging();
158+
}
159+
160+
TEST_F(NcclLoggerTest, GetLastCommsErrorLongMessageTest) {
161+
auto debugGuard = EnvRAII(NCCL_DEBUG, std::string{"INFO"});
162+
ncclResetDebugInit();
163+
164+
initLogging();
165+
166+
// Create a long error message (but within the 1024 char buffer)
167+
std::string longError(500, 'X');
168+
ERR("%s", longError.c_str());
169+
sleep(1);
170+
171+
auto lastError = meta::comms::logger::getLastCommsError();
172+
EXPECT_THAT(lastError, ::testing::StrEq(longError));
173+
174+
finishLogging();
175+
}
176+
177+
TEST_F(NcclLoggerTest, GetLastCommsErrorLongMessageTestXLOG) {
178+
auto debugGuard = EnvRAII(NCCL_DEBUG, std::string{"INFO"});
179+
ncclResetDebugInit();
180+
181+
initLogging();
182+
183+
// Create a long error message (but within the 1024 char buffer)
184+
std::string longError(500, 'X');
185+
XLOG(ERR) << longError;
186+
sleep(1);
187+
188+
auto lastError = meta::comms::logger::getLastCommsError();
189+
EXPECT_THAT(lastError, ::testing::StrEq(longError));
190+
191+
finishLogging();
192+
}
193+
95194
TEST_F(NcclLoggerTest, WarnLogTest) {
96195
auto debugGuard = EnvRAII(NCCL_DEBUG, std::string{"WARN"});
97196
ncclResetDebugInit();

comms/ncclx/v2_27/src/init.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848

4949
#include "comms/utils/cvars/nccl_cvars.h"
5050
#include "comms/utils/logger/EventsScubaUtil.h"
51+
#include "comms/utils/logger/LoggingFormat.h"
5152
#include "comms/ctran/memory/SlabAllocator.h"
5253
#include "comms/ctran/memory/Utils.h"
5354
#include "meta/wrapper/MetaFactory.h"
@@ -2845,7 +2846,7 @@ const char* ncclGetErrorString(ncclResult_t code) {
28452846
*/
28462847
NCCL_API(const char*, ncclGetLastError, const ncclComm_t comm);
28472848
const char* ncclGetLastError(ncclComm_t comm) {
2848-
return ncclLastError;
2849+
return meta::comms::logger::getLastCommsError();
28492850
}
28502851

28512852
NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);

comms/utils/logger/LoggingFormat.cc

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include <fmt/chrono.h>
99
#include <fmt/format.h>
10+
#include <folly/Synchronized.h>
1011
#include <folly/logging/LogMessage.h>
1112
#include <folly/logging/LogName.h>
1213
#include <folly/synchronization/CallOnce.h>
@@ -33,6 +34,10 @@ struct ProcMetaData {
3334
folly::once_flag procMetaDataInitFlag;
3435
static thread_local std::string myThreadName = "main";
3536

37+
// Using char array to ensure compatibility with NCCL GetLastError API
38+
static folly::Synchronized<std::array<char, 1024>> lastCommsError{
39+
std::array<char, 1024>{'\0'}};
40+
3641
} // Anonymous namespace
3742

3843
namespace meta::comms::logger {
@@ -229,6 +234,16 @@ std::string NcclLogFormatter::formatMessage(
229234
const folly::LogCategory* /* handlerCategory */) {
230235
initProcMetaData();
231236

237+
bool isErrorMessage = message.getLevel() >= folly::LogLevel::ERR;
238+
if (isErrorMessage) {
239+
auto lastErrorLocked = lastCommsError.wlock();
240+
std::snprintf(
241+
lastErrorLocked->data(),
242+
lastErrorLocked->size(),
243+
"%s",
244+
message.getMessage().c_str());
245+
}
246+
232247
auto timeSinceEpoch = message.getTimestamp().time_since_epoch();
233248
auto epochSeconds =
234249
std::chrono::duration_cast<std::chrono::seconds>(timeSinceEpoch);
@@ -310,6 +325,10 @@ std::string NcclLogFormatter::formatMessage(
310325
return buffer;
311326
}
312327

328+
const char* getLastCommsError() {
329+
return lastCommsError.rlock()->data();
330+
}
331+
313332
NcclLogFormatter::NcclLogFormatter(
314333
std::string prefix,
315334
std::function<int(void)> threadContextFn)

comms/utils/logger/LoggingFormat.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ void initThreadMetaData(std::string_view threadName);
6161

6262
fmt::memory_buffer getLogPrefix(LogLevel level);
6363

64+
const char* getLastCommsError();
65+
6466
class NcclLogFormatter : public folly::LogFormatter {
6567
public:
6668
NcclLogFormatter(

0 commit comments

Comments
 (0)