Skip to content

Commit d96897b

Browse files
Alexander Jipafacebook-github-bot
authored andcommitted
fix allgather for empty buffers (facebookincubator#179) (#356)
Summary: I used PyTorch-Ignite metrics that were all-gathered on both NCCL and Gloo backends. In one of the cases the tensor (buffer) was empty and while NCCL tolerated this (hopefully as a no-op), Gloo gave me SIGFPE(8). I think this is because of the modulo by `dataSize` present today in Gloo all-gather code paths. I believe this can also be a no-op for Gloo, especially since contextSize(rank)=1 is already a no-op. Pull Request resolved: #356 Reviewed By: albanD, atalman Differential Revision: D45153246 Pulled By: malfet fbshipit-source-id: 135165158eaad2dca591f6cc3ccec52edca38912
1 parent 91c3736 commit d96897b

File tree

5 files changed

+11
-7
lines changed

5 files changed

+11
-7
lines changed

gloo/allgather.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,8 @@ void allgather(AllgatherOptions& opts) {
5353
in->size);
5454
}
5555

56-
// Short circuit if there is only a single process.
57-
if (context->size == 1) {
56+
// Short circuit if there is only a single process or the output is empty.
57+
if (context->size == 1 || outBytes == 0) {
5858
return;
5959
}
6060

gloo/allgather_ring.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ class AllgatherRing : public Algorithm {
5555
virtual ~AllgatherRing() {}
5656

5757
void run() {
58+
// Short circuit if there is only a single process or the output is empty.
59+
if (this->contextSize_ == 1 || count_ == 0) {
60+
return;
61+
}
5862
const int rank = this->contextRank_;
5963
const int numRounds = this->contextSize_ - 1;
6064

gloo/allgatherv.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,8 @@ void allgatherv(AllgathervOptions& opts) {
111111
}
112112
}
113113

114-
// Short circuit if there is only a single process.
115-
if (context->size == 1) {
114+
// Short circuit if there is only a single process or the output is empty.
115+
if (context->size == 1 || offset == 0) {
116116
return;
117117
}
118118

gloo/test/allgather_test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ INSTANTIATE_TEST_CASE_P(
100100
::testing::Combine(
101101
::testing::ValuesIn(kTransportsForClassAlgorithms),
102102
::testing::Range(2, 10),
103-
::testing::Values(4, 100, 1000, 10000),
103+
::testing::Values(0, 4, 100, 1000, 10000),
104104
::testing::Range(1, 4)));
105105

106106
using NewParam = std::tuple<Transport, int, int, bool>;
@@ -157,7 +157,7 @@ INSTANTIATE_TEST_CASE_P(
157157
::testing::Combine(
158158
::testing::ValuesIn(kTransportsForFunctionAlgorithms),
159159
::testing::Values(1, 2, 4, 7),
160-
::testing::Values(4, 100, 1000, 10000),
160+
::testing::Values(0, 4, 100, 1000, 10000),
161161
::testing::Values(false, true)));
162162

163163
TEST_F(AllgatherNewTest, TestTimeout) {

gloo/test/allgatherv_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ INSTANTIATE_TEST_CASE_P(
8686
::testing::Combine(
8787
::testing::ValuesIn(kTransportsForFunctionAlgorithms),
8888
::testing::Values(1, 2, 4, 7),
89-
::testing::Values(1, 10, 100, 1000),
89+
::testing::Values(0, 1, 10, 100, 1000),
9090
::testing::Values(false, true)));
9191

9292
TEST_F(AllgathervTest, TestTimeout) {

0 commit comments

Comments
 (0)