Skip to content

Commit afc1517

Browse files
shuqiangzhangfacebook-github-bot
authored andcommitted
Add some visibility into which rank-to-rank connections are created by Gloo
Summary: Created a starter task for myself to get familiar with the Gloo diff workflow. In recent debugging of prod Gloo issues, we found it 'not-so-easy' to answer: 1) when and how many Gloo PGs are created; 2) for each rank in each PG, how many TCP connections are established. As a first step, this adds a simple log at the TCP context layer to expose the rank-to-rank connectivity during the creation of the full mesh. Gloo logging seems inconvenient now; we might introduce other OSS logging libs in the future. Reviewed By: XilunWu Differential Revision: D52897536 fbshipit-source-id: c4b78ee7c2a83534c704a55c39766b08d98810f9
1 parent 695ab0a commit afc1517

File tree

7 files changed

+77
-0
lines changed

7 files changed

+77
-0
lines changed

gloo/transport/ibverbs/pair.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,12 @@ void Pair::recv(
346346
"Unbound buffers not supported yet for ibverbs transport");
347347
}
348348

349+
// place holder for future use
350+
bool Pair::isConnected() {
351+
GLOO_THROW_INVALID_OPERATION_EXCEPTION(
352+
"isConnected not supported yet for ibverbs transport");
353+
}
354+
349355
// handleCompletionEvent is called by the device thread when it
350356
// received an event for this pair's completion queue on its
351357
// completion channel.

gloo/transport/ibverbs/pair.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ class Pair : public ::gloo::transport::Pair {
6767
virtual std::unique_ptr<::gloo::transport::Buffer>
6868
createRecvBuffer(int slot, void* ptr, size_t size) override;
6969

70+
virtual bool isConnected() override;
71+
7072
// Send from the specified buffer to remote side of pair.
7173
virtual void send(
7274
transport::UnboundBuffer* tbuf,

gloo/transport/pair.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ class Pair {
3636
virtual std::unique_ptr<Buffer>
3737
createRecvBuffer(int slot, void* ptr, size_t size) = 0;
3838

39+
virtual bool isConnected() = 0;
40+
3941
// Send from the specified buffer to remote side of pair.
4042
virtual void send(
4143
UnboundBuffer* buf,

gloo/transport/tcp/context.cc

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "gloo/transport/tcp/context.h"
1010

1111
#include <cstring>
12+
#include <iostream>
1213

1314
#include "gloo/common/error.h"
1415
#include "gloo/common/logging.h"
@@ -117,6 +118,8 @@ void Context::createAndConnectAllPairs(IStore &store) {
117118
const auto& pair = getPair(i);
118119
pair->setLocalRank(localRank);
119120
}
121+
122+
printConnectivityInfo();
120123
}
121124

122125
std::unique_ptr<transport::Pair>& Context::createPair(int rank) {
@@ -132,6 +135,49 @@ std::unique_ptr<transport::UnboundBuffer> Context::createUnboundBuffer(
132135
return std::unique_ptr<transport::UnboundBuffer>(buf);
133136
}
134137

138+
std::vector<int> Context::getConnectedPeerRanks() const {
139+
std::vector<int> result;
140+
GLOO_ENFORCE(size == pairs_.size());
141+
for (int i = 0; i < size; i++) {
142+
if (pairs_.at(i)->isConnected() && i != rank) {
143+
result.push_back(i);
144+
}
145+
}
146+
return result;
147+
}
148+
149+
std::vector<int> Context::getUnConnectedPeerRanks() const {
150+
std::vector<int> result;
151+
GLOO_ENFORCE(size == pairs_.size());
152+
for (int i = 0; i < size; i++) {
153+
if (!pairs_.at(i)->isConnected() && i != rank) {
154+
result.push_back(i);
155+
}
156+
}
157+
return result;
158+
}
159+
160+
void Context::printConnectivityInfo() const {
161+
int numConnectedPeers = getConnectedPeerRanks().size();
162+
std::cout << "[Gloo] Rank " << rank << " is connected to "
163+
<< numConnectedPeers << " peer ranks. "
164+
<< "Expected number of connected peer ranks is : " << size - 1
165+
<< std::endl;
166+
167+
if (numConnectedPeers != size - 1) {
168+
std::vector<int> unConnectedPeers = getUnConnectedPeerRanks();
169+
std::cout << "[Gloo] Rank " << rank << " is NOT connected to: [";
170+
for (int i = 0; i < unConnectedPeers.size(); i++) {
171+
if (i != unConnectedPeers.size() - 1) {
172+
std::cout << unConnectedPeers[i] << ", ";
173+
} else {
174+
std::cout << unConnectedPeers[i];
175+
}
176+
}
177+
std::cout << "]" << std::endl;
178+
}
179+
}
180+
135181
void Context::recvFromAny(
136182
UnboundBuffer* buf,
137183
uint64_t slot,

gloo/transport/tcp/context.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,20 @@ class Context : public ::gloo::transport::Context,
8686
// out. All pairs should be signaled and closed in that event.
8787
void signalException(const std::string& msg);
8888

89+
// Returns a sorted list of connected peer ranks, excluding self.
90+
// Normally all peer ranks should be connected at the end of
91+
// createAndConnectAllPairs
92+
// peer rank concept is introduced at the transport/tcp/pair level, so
93+
// this method is defined at the same level instead of the parent contexts
94+
std::vector<int> getConnectedPeerRanks() const;
95+
96+
// Returns a sorted list of unconnected peer ranks, excluding self.
97+
// Normally empty at the end of createAndConnectAllPairs
98+
std::vector<int> getUnConnectedPeerRanks() const;
99+
100+
// a helper function to print out rank to rank connectivity information
101+
void printConnectivityInfo() const;
102+
89103
friend class ContextMutator;
90104

91105
friend class UnboundBuffer;

gloo/transport/tcp/pair.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,11 @@ const Address& Pair::address() const {
9595
return self_;
9696
}
9797

98+
bool Pair::isConnected() {
99+
std::lock_guard<std::mutex> lock(m_);
100+
return state_ == CONNECTED;
101+
}
102+
98103
void Pair::connect(const std::vector<char>& bytes) {
99104
const auto peer = Address(bytes);
100105

gloo/transport/tcp/pair.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,8 @@ class Pair : public ::gloo::transport::Pair, public Handler {
146146

147147
void close() override;
148148

149+
bool isConnected() override;
150+
149151
protected:
150152
// Refer to parent context using raw pointer. This could be a
151153
// weak_ptr, seeing as the context class is a shared_ptr, but:

0 commit comments

Comments
 (0)