Skip to content

Commit ab13bec

Browse files
SuhitKmeta-codesync[bot]
authored and committed
Rebase colltrace for v2.28
Summary: First change to rebase colltrace onto NCCL v2.28. Currently includes both new and old colltrace. There will be a second diff that will remove the old colltrace invocations. ~~Note: I am running into issues with NCCL 2.28 where it is failing when trying to use the ncclGin plugin. For now I have disabled the plugin, but need some feedback on the best way to handle the bug in NCCL baseline.~~ Update: After adding NCCL_GIN_TYPE=0 in cvars, this issue is resolved. Reviewed By: zhiyongww Differential Revision: D86152808 fbshipit-source-id: 762bd16f0571b0d066cdf822b7b1e3fc5f0a616b
1 parent f15c18f commit ab13bec

File tree

3 files changed

+22
-2
lines changed

3 files changed

+22
-2
lines changed

comms/utils/cvars/nccl_cvars.cc

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// Copyright (c) Meta Platforms, Inc. and affiliates.
2-
// @generated SignedSource<<c2b0347281b5f4c58bca1e3b1547530d8f7507e8>>
2+
// @generated SignedSource<<12578d0c7386c85de2417d28efac7a19e1861246>>
33
// Automatically generated by ./comms/utils/cvars/extractcvars.py --- START
44
// DO NOT EDIT!!!
55

@@ -680,6 +680,8 @@ int64_t NCCL_GDRCOPY_SYNC_ENABLE;
680680
int64_t NCCL_GDRCOPY_SYNC_ENABLE_DEFAULT;
681681
int64_t NCCL_GDR_FLUSH_DISABLE;
682682
int64_t NCCL_GDR_FLUSH_DISABLE_DEFAULT;
683+
int64_t NCCL_GIN_TYPE;
684+
int64_t NCCL_GIN_TYPE_DEFAULT;
683685
std::string NCCL_GRAPH_DUMP_FILE;
684686
std::string NCCL_GRAPH_DUMP_FILE_DEFAULT;
685687
int64_t NCCL_GRAPH_DUMP_FILE_RANK;
@@ -1173,6 +1175,7 @@ std::unordered_map<std::string, int64_t*> env_int64_values = {
11731175
{"NCCL_GDRCOPY_FLUSH_ENABLE", &NCCL_GDRCOPY_FLUSH_ENABLE},
11741176
{"NCCL_GDRCOPY_SYNC_ENABLE", &NCCL_GDRCOPY_SYNC_ENABLE},
11751177
{"NCCL_GDR_FLUSH_DISABLE", &NCCL_GDR_FLUSH_DISABLE},
1178+
{"NCCL_GIN_TYPE", &NCCL_GIN_TYPE},
11761179
{"NCCL_GRAPH_DUMP_FILE_RANK", &NCCL_GRAPH_DUMP_FILE_RANK},
11771180
{"NCCL_GRAPH_HELPER_DISABLE", &NCCL_GRAPH_HELPER_DISABLE},
11781181
{"NCCL_GRAPH_MIXING_SUPPORT", &NCCL_GRAPH_MIXING_SUPPORT},
@@ -1706,6 +1709,7 @@ static void initEnvSet(std::unordered_set<std::string>& env) {
17061709
env.insert("NCCL_GDRCOPY_FLUSH_ENABLE");
17071710
env.insert("NCCL_GDRCOPY_SYNC_ENABLE");
17081711
env.insert("NCCL_GDR_FLUSH_DISABLE");
1712+
env.insert("NCCL_GIN_TYPE");
17091713
env.insert("NCCL_GRAPH_DUMP_FILE");
17101714
env.insert("NCCL_GRAPH_DUMP_FILE_RANK");
17111715
env.insert("NCCL_GRAPH_FILE");
@@ -3994,6 +3998,12 @@ static void readCvarEnv() {
39943998
CVAR_INFO(
39953999
"NCCL Config - CVAR {} has an override", "NCCL_GDR_FLUSH_DISABLE");
39964000
}
4001+
NCCL_GIN_TYPE = env2num<int64_t>("NCCL_GIN_TYPE", "0");
4002+
NCCL_GIN_TYPE_DEFAULT = env2num<int64_t>("NCCL_ENV_DO_NOT_SET", "0");
4003+
4004+
if (NCCL_GIN_TYPE_DEFAULT != NCCL_GIN_TYPE) {
4005+
CVAR_INFO("NCCL Config - CVAR {} has an override", "NCCL_GIN_TYPE");
4006+
}
39974007
NCCL_GRAPH_DUMP_FILE = env2str("NCCL_GRAPH_DUMP_FILE", "");
39984008
NCCL_GRAPH_DUMP_FILE_DEFAULT = env2str("NCCL_ENV_DO_NOT_SET", "");
39994009

comms/utils/cvars/nccl_cvars.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// Copyright (c) Meta Platforms, Inc. and affiliates.
2-
// @generated SignedSource<<c2b0347281b5f4c58bca1e3b1547530d8f7507e8>>
2+
// @generated SignedSource<<12578d0c7386c85de2417d28efac7a19e1861246>>
33
// Automatically generated by ./comms/utils/cvars/extractcvars.py --- START
44
// DO NOT EDIT!!!
55

@@ -677,6 +677,9 @@ extern int64_t NCCL_GDRCOPY_SYNC_ENABLE_DEFAULT;
677677
extern int64_t NCCL_GDR_FLUSH_DISABLE;
678678
extern int64_t NCCL_GDR_FLUSH_DISABLE_DEFAULT;
679679

680+
extern int64_t NCCL_GIN_TYPE;
681+
extern int64_t NCCL_GIN_TYPE_DEFAULT;
682+
680683
extern std::string NCCL_GRAPH_DUMP_FILE;
681684
extern std::string NCCL_GRAPH_DUMP_FILE_DEFAULT;
682685

comms/utils/cvars/nccl_cvars.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2245,6 +2245,13 @@ cvars:
22452245
We are targeting 8000 ranks at most, so leaving room for 100K events should
22462246
be sufficient.
22472247
2248+
- name : NCCL_GIN_TYPE
2249+
type : int64_t
2250+
default : 0
2251+
description : |-
2252+
GPU-Initiated Network (GIN) type configuration. This controls the type
2253+
of GIN implementation to use for network operations.
2254+
22482255
- name : NCCL_PROXYMOCK_NET_SEND_FAILURE
22492256
type : stringlist
22502257
default :

0 commit comments

Comments
 (0)