diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml index e2d575c59..6b0272eb1 100644 --- a/.github/workflows/build-linux.yml +++ b/.github/workflows/build-linux.yml @@ -23,6 +23,8 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 + with: + submodules: true - name: Setup environment run: | sudo apt-get install -y libssl-dev diff --git a/.github/workflows/build-windows.yml b/.github/workflows/build-windows.yml index 310423b82..9839aa772 100644 --- a/.github/workflows/build-windows.yml +++ b/.github/workflows/build-windows.yml @@ -24,6 +24,8 @@ jobs: uses: microsoft/setup-msbuild@v2 - name: Checkout code uses: actions/checkout@v4 + with: + submodules: true - name: Set up Python 3.10 uses: actions/setup-python@v5 with: diff --git a/.github/workflows/super-linter.yml b/.github/workflows/super-linter.yml index 19ef6a3f9..2ce018664 100644 --- a/.github/workflows/super-linter.yml +++ b/.github/workflows/super-linter.yml @@ -24,6 +24,7 @@ jobs: # Full git history is needed to get a proper list of changed files # within `super-linter` fetch-depth: 0 + submodules: true - name: Super-linter uses: super-linter/super-linter@v7.3.0 # x-release-please-version env: diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..eab6041af --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "third_party/spdlog"] + path = third_party/spdlog + url = https://github.com/gabime/spdlog.git diff --git a/CMakeLists.txt b/CMakeLists.txt index caf3bf388..487e2857f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,6 +98,9 @@ endif() set(GLOO_INSTALL ON CACHE BOOL "") mark_as_advanced(GLOO_INSTALL) +# Compile definitions for Gloo +set(GLOO_COMPILE_DEFS) + # Build shared or static libraries (override from parent project) if(BUILD_SHARED_LIBS) set(GLOO_STATIC_OR_SHARED SHARED CACHE STRING "") diff --git a/README.md b/README.md index d9d29ace7..63b6c5538 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,23 @@ Optional dependencies are: Please refer to [docs/](docs/) for detailed documentation. +## Cloning + +Gloo makes use of [Git submodules](https://git-scm.com/book/en/v2/Git-Tools-Submodules) +for third-party build-time dependencies. When cloning, ensure that you +initialize and sync submodules: + +``` shell +git clone --recurse-submodules https://github.com/pytorch/gloo.git # or use SSH remote +``` + +Or, if you already cloned without submodules, run the following in the existing +local repository: + +``` shell +git submodule update --init --recursive +``` + ## Building You can build Gloo using CMake. diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 63873fc64..7ca837a8d 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -2,6 +2,9 @@ set(gloo_DEPENDENCY_LIBS "") set(gloo_cuda_DEPENDENCY_LIBS "") set(gloo_hip_DEPENDENCY_LIBS "") +# Dependency libraries for all Gloo targets (e.g. tests, benchmarks, etc.) +set(gloo_ALL_TARGETS_DEPENDENCY_LIBS "") + # Configure path to modules (for find_package) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake/Modules/") @@ -104,7 +107,7 @@ if(USE_MPI) message(STATUS "MPI libraries: " ${MPI_CXX_LIBRARIES}) include_directories(SYSTEM ${MPI_CXX_INCLUDE_PATH}) list(APPEND gloo_DEPENDENCY_LIBS ${MPI_CXX_LIBRARIES}) - add_definitions(-DGLOO_USE_MPI=1) + list(APPEND GLOO_COMPILE_DEFS "GLOO_USE_MPI=1") else() message(WARNING "Not compiling with MPI support. Suppress this warning with -DUSE_MPI=OFF.") set(USE_MPI OFF) @@ -199,3 +202,19 @@ if(BUILD_TEST) target_link_libraries(gtest INTERFACE ${GTEST_LIBRARIES}) endif() endif() + +set(SPDLOG_BUILD_SHARED OFF CACHE BOOL "" FORCE) +set(SPDLOG_BUILD_PIC ON CACHE BOOL "" FORCE) +set(SPDLOG_FMT_EXTERNAL OFF CACHE BOOL "" FORCE) + +set(_save_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) +set(BUILD_SHARED_LIBS OFF) +add_subdirectory(third_party/spdlog EXCLUDE_FROM_ALL) +set(BUILD_SHARED_LIBS ${_save_BUILD_SHARED_LIBS}) + +if(GLOO_STATIC_OR_SHARED STREQUAL "STATIC") + list(APPEND gloo_ALL_TARGETS_DEPENDENCY_LIBS $) +else() + list(APPEND gloo_ALL_TARGETS_DEPENDENCY_LIBS spdlog::spdlog) + list(APPEND GLOO_COMPILE_DEFS SPDLOG_COMPILED_LIB) +endif() diff --git a/docs/errors.md b/docs/errors.md index 93f1279b7..94248d584 100644 --- a/docs/errors.md +++ b/docs/errors.md @@ -18,4 +18,4 @@ extend `::gloo::Exception`. ## Assertions Gloo asserts unexpected errors and logical invariants instead of expecting callers to handle them. `GLOO_ENFORCE` macros are defined in -[`logging.h`](../gloo/common/logging.h) +[`enforce.h`](../gloo/common/enforce.h) diff --git a/gloo/CMakeLists.txt b/gloo/CMakeLists.txt index 186fe1288..11dd34b79 100644 --- a/gloo/CMakeLists.txt +++ b/gloo/CMakeLists.txt @@ -147,10 +147,12 @@ if(USE_CUDA) cuda_add_library(gloo_cuda ${GLOO_CUDA_SRCS} ${GLOO_STATIC_OR_SHARED}) endif() target_link_libraries(gloo_cuda gloo ${gloo_cuda_DEPENDENCY_LIBS}) + set_property(TARGET gloo_cuda APPEND PROPERTY LINK_LIBRARIES ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) endif() if(USE_ROCM) gloo_hip_add_library(gloo_hip ${GLOO_HIP_SRCS}) target_link_libraries(gloo_hip gloo) + set_property(TARGET gloo_hip APPEND PROPERTY LINK_LIBRARIES ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) endif() if(USE_LIBUV) target_link_libraries(gloo PRIVATE uv_a) @@ -165,7 +167,8 @@ elseif(USE_TCP_OPENSSL_LOAD) target_link_libraries(gloo PRIVATE dl) endif() -target_link_libraries(gloo PRIVATE ${gloo_DEPENDENCY_LIBS}) +target_link_libraries(gloo PRIVATE ${gloo_DEPENDENCY_LIBS} ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) +target_compile_definitions(gloo PRIVATE ${GLOO_COMPILE_DEFS}) # Add Interface include directories that are relocatable. target_include_directories(gloo INTERFACE $) diff --git a/gloo/algorithm.cc b/gloo/algorithm.cc index 9488ff59d..bc2d06fe1 100644 --- a/gloo/algorithm.cc +++ b/gloo/algorithm.cc @@ -8,7 +8,7 @@ #include "gloo/algorithm.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" namespace gloo { diff --git a/gloo/allgather.cc b/gloo/allgather.cc index 8ab4e6f4f..b45a0215f 100644 --- a/gloo/allgather.cc +++ b/gloo/allgather.cc @@ -11,7 +11,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/types.h" namespace gloo { diff --git a/gloo/allgatherv.cc b/gloo/allgatherv.cc index 72fc3988e..50db2a143 100644 --- a/gloo/allgatherv.cc +++ b/gloo/allgatherv.cc @@ -11,7 +11,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/types.h" namespace gloo { diff --git a/gloo/allreduce.cc b/gloo/allreduce.cc index 080f7f302..00da4dff7 100644 --- a/gloo/allreduce.cc +++ b/gloo/allreduce.cc @@ -12,7 +12,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/math.h" #include "gloo/types.h" diff --git a/gloo/allreduce_bcube.cc b/gloo/allreduce_bcube.cc new file mode 100644 index 000000000..2aed4606f --- /dev/null +++ b/gloo/allreduce_bcube.cc @@ -0,0 +1,74 @@ +#include "gloo/allreduce_bcube.h" + +#include + +#include "gloo/common/log.h" + +namespace gloo { + +template +void AllreduceBcube::printElems(T* p, int count, int start) { + /* Early return if log level is not high enough, to prevent expensive code + * running. */ + if (!spdlog::should_log(spdlog::level::trace)) + return; + + const std::size_t alignedStart = (start / wordsPerLine) * wordsPerLine; + fmt::memory_buffer line{}; + + /* Logs/flushes the line buffer - starting a new line */ + auto printLine = [&]() { + if (!line.size()) + return; + std::string_view sv{line.data(), line.size()}; + GLOO_TRACE("{}", sv); + line.clear(); + }; + + for (std::size_t x = alignedStart; x < start + count; ++x) { + if (x % wordsPerLine == 0) { + if (x != alignedStart) + printLine(); + fmt::format_to( + std::back_inserter(line), "{} {:05}: ", fmt::ptr(&p[x]), x); + } else if (x % wordsPerSection == 0) { + fmt::format_to(std::back_inserter(line), "- "); + } + + if (x < start) + fmt::format_to(std::back_inserter(line), "..... "); + else + fmt::format_to(std::back_inserter(line), "{:05} ", p[x]); + } + printLine(); +} + +template +void AllreduceBcube::printStageBuffer(const std::string& msg) { + if (printCheck(myRank_)) { + GLOO_TRACE("rank ({}) {}:", myRank_, msg); + printElems(&ptrs_[0][0], totalNumElems_); + } +} + +template +void AllreduceBcube::printStepBuffer( + const std::string& stage, + int step, + int srcRank, + int destRank, + T* p, + int count, + int start) { + if (printCheck(myRank_)) { + GLOO_TRACE( + "{}: step ({}) srcRank ({}) -> destRank ({})", + stage, + step, + srcRank, + destRank); + printElems(p, count, start); + } +} + +} // namespace gloo diff --git a/gloo/allreduce_bcube.h b/gloo/allreduce_bcube.h index c91fa130d..ff094efa8 100644 --- a/gloo/allreduce_bcube.h +++ b/gloo/allreduce_bcube.h @@ -13,15 +13,12 @@ #include #include #include -#include -#include #include #include #include #include #include "gloo/algorithm.h" -#include "gloo/common/error.h" #include "gloo/context.h" /** @@ -537,48 +534,18 @@ class AllreduceBcube : public Algorithm { static bool printCheck(int /*rank*/) { return false; } - /** - * Prints a break given the offset of an element about to be printed - * @param p Pointer to the elements - * @param x The current offset to the pointer to words - */ - static void printBreak(T* p, int x) { - if (0 == x % wordsPerLine) { - std::cout << std::endl - << &p[x] << " " << std::setfill('0') << std::setw(5) << x - << ": "; - } else if (0 == x % wordsPerSection) { - std::cout << "- "; - } - } /** * Pretty prints a list of elements * @param p Pointer to the elements * @param count The number of elements to be printed * @param start The offset from which to print */ - static void printElems(T* p, int count, int start = 0) { - auto alignedStart = (start / wordsPerLine) * wordsPerLine; - for (int x = alignedStart; x < start + count; ++x) { - printBreak(p, x); - if (x < start) { - std::cout << "..... "; - } else { - std::cout << std::setfill('0') << std::setw(5) << p[x] << " "; - } - } - } + static void printElems(T* p, int count, int start); /** * Prints contents in the ptrs array at a particular stage * @param msg Custom message to be printed */ - void printStageBuffer(const std::string& msg) { - if (printCheck(myRank_)) { - std::cout << "rank (" << myRank_ << ") " << msg << ": "; - printElems(&ptrs_[0][0], totalNumElems_); - std::cout << std::endl; - } - } + void printStageBuffer(const std::string& msg); /** * Prints specified buffer during a step @@ -596,14 +563,7 @@ class AllreduceBcube : public Algorithm { int destRank, T* p, int count, - int start = 0) { - if (printCheck(myRank_)) { - std::cout << stage << ": step (" << step << ") " << "srcRank (" << srcRank - << ") -> " << "destRank (" << destRank << "): "; - printElems(p, count, start); - std::cout << std::endl; - } - } + int start = 0); /** * Get all the peers of node with specified rank * @param rank Rank of the node for which peers are needed diff --git a/gloo/alltoall.cc b/gloo/alltoall.cc index aa7db9a87..2ce5d5754 100644 --- a/gloo/alltoall.cc +++ b/gloo/alltoall.cc @@ -10,7 +10,7 @@ #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/types.h" namespace gloo { diff --git a/gloo/alltoall.h b/gloo/alltoall.h index c588f9b2c..448d2c209 100644 --- a/gloo/alltoall.h +++ b/gloo/alltoall.h @@ -8,7 +8,7 @@ #pragma once -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/context.h" #include "gloo/transport/unbound_buffer.h" diff --git a/gloo/alltoallv.cc b/gloo/alltoallv.cc index 7a51f9527..612ad8a06 100644 --- a/gloo/alltoallv.cc +++ b/gloo/alltoallv.cc @@ -11,7 +11,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/types.h" namespace gloo { diff --git a/gloo/alltoallv.h b/gloo/alltoallv.h index b0bb21e10..b7761d89a 100644 --- a/gloo/alltoallv.h +++ b/gloo/alltoallv.h @@ -8,7 +8,7 @@ #pragma once -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/context.h" #include "gloo/transport/unbound_buffer.h" diff --git a/gloo/benchmark/CMakeLists.txt b/gloo/benchmark/CMakeLists.txt index 74da2c04b..2c8069131 100644 --- a/gloo/benchmark/CMakeLists.txt +++ b/gloo/benchmark/CMakeLists.txt @@ -5,7 +5,7 @@ set(GLOO_BENCHMARK_SRCS ) add_executable(benchmark ${GLOO_BENCHMARK_SRCS}) -target_link_libraries(benchmark gloo) +target_link_libraries(benchmark gloo ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) if(GLOO_INSTALL) install(TARGETS benchmark DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) diff --git a/gloo/benchmark/cuda_main.cc b/gloo/benchmark/cuda_main.cc index 387338f6e..7736f2b97 100644 --- a/gloo/benchmark/cuda_main.cc +++ b/gloo/benchmark/cuda_main.cc @@ -11,7 +11,7 @@ #include "gloo/benchmark/benchmark.h" #include "gloo/benchmark/runner.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/cuda_allreduce_bcube.h" #include "gloo/cuda_allreduce_halving_doubling.h" #include "gloo/cuda_allreduce_halving_doubling_pipelined.h" diff --git a/gloo/benchmark/main.cc b/gloo/benchmark/main.cc index 2e4678006..d7e19f060 100644 --- a/gloo/benchmark/main.cc +++ b/gloo/benchmark/main.cc @@ -26,7 +26,7 @@ #include "gloo/broadcast_one_to_all.h" #include "gloo/common/aligned_allocator.h" #include "gloo/common/common.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/context.h" #include "gloo/pairwise_exchange.h" #include "gloo/reduce.h" diff --git a/gloo/benchmark/runner.cc b/gloo/benchmark/runner.cc index f73dade2b..ed3cbaab9 100644 --- a/gloo/benchmark/runner.cc +++ b/gloo/benchmark/runner.cc @@ -15,7 +15,7 @@ #include "gloo/barrier_all_to_one.h" #include "gloo/broadcast_one_to_all.h" #include "gloo/common/common.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/rendezvous/context.h" #include "gloo/rendezvous/file_store.h" #include "gloo/rendezvous/prefix_store.h" diff --git a/gloo/broadcast.cc b/gloo/broadcast.cc index 14f0fc2b4..311d16a00 100644 --- a/gloo/broadcast.cc +++ b/gloo/broadcast.cc @@ -11,7 +11,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/math.h" #include "gloo/types.h" diff --git a/gloo/broadcast_one_to_all.h b/gloo/broadcast_one_to_all.h index e599e102f..e8910bb76 100644 --- a/gloo/broadcast_one_to_all.h +++ b/gloo/broadcast_one_to_all.h @@ -13,7 +13,7 @@ #include "gloo/algorithm.h" #include "gloo/common/common.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" namespace gloo { diff --git a/gloo/common/CMakeLists.txt b/gloo/common/CMakeLists.txt index a8d474498..ebf0fb23d 100644 --- a/gloo/common/CMakeLists.txt +++ b/gloo/common/CMakeLists.txt @@ -1,13 +1,13 @@ set(GLOO_COMMON_SRCS - "${CMAKE_CURRENT_SOURCE_DIR}/logging.cc" + "${CMAKE_CURRENT_SOURCE_DIR}/enforce.cc" "${CMAKE_CURRENT_SOURCE_DIR}/utils.cc" ) set(GLOO_COMMON_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/aligned_allocator.h" "${CMAKE_CURRENT_SOURCE_DIR}/common.h" + "${CMAKE_CURRENT_SOURCE_DIR}/enforce.h" "${CMAKE_CURRENT_SOURCE_DIR}/error.h" - "${CMAKE_CURRENT_SOURCE_DIR}/logging.h" "${CMAKE_CURRENT_SOURCE_DIR}/store.h" "${CMAKE_CURRENT_SOURCE_DIR}/string.h" "${CMAKE_CURRENT_SOURCE_DIR}/utils.h" diff --git a/gloo/common/logging.cc b/gloo/common/enforce.cc similarity index 50% rename from gloo/common/logging.cc rename to gloo/common/enforce.cc index 91b02ea18..879a33c2e 100644 --- a/gloo/common/logging.cc +++ b/gloo/common/enforce.cc @@ -6,42 +6,12 @@ * LICENSE file in the root directory of this source tree. */ -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" -#include -#include #include namespace gloo { -// Initialize log level from environment variable, and return static value at -// each inquiry. -LogLevel logLevel() { - // Global log level. Initialized once. - static LogLevel log_level = LogLevel::UNSET; - if (log_level != LogLevel::UNSET) { - return log_level; - } - - const char* level = getenv("GLOO_LOG_LEVEL"); - // Defaults to WARN. - if (level == nullptr) { - log_level = LogLevel::WARN; - return log_level; - } - - if (std::strcmp(level, "DEBUG") == 0) { - log_level = LogLevel::DEBUG; - } else if (std::strcmp(level, "INFO") == 0) { - log_level = LogLevel::INFO; - } else if (std::strcmp(level, "WARN") == 0) { - log_level = LogLevel::WARN; - } else { - log_level = LogLevel::ERROR; - } - return log_level; -} - EnforceNotMet::EnforceNotMet( const char* file, const int line, diff --git a/gloo/common/logging.h b/gloo/common/enforce.h similarity index 81% rename from gloo/common/logging.h rename to gloo/common/enforce.h index c5e7c1da9..d940a3fc1 100644 --- a/gloo/common/logging.h +++ b/gloo/common/enforce.h @@ -10,58 +10,12 @@ #include #include -#include -#include -#include #include -#include "gloo/common/error.h" #include "gloo/common/string.h" namespace gloo { -enum LogLevel { - ERROR, - WARN, - INFO, - DEBUG, - UNSET, -}; - -LogLevel logLevel(); - -#define GLOO_LOG_MSG(level, ...) \ - std::cerr << ::gloo::MakeString( \ - "[", __FILE__, ":", __LINE__, "] ", level, " ", __VA_ARGS__, "\n") - -#define GLOO_ERROR(...) \ - do { \ - if (logLevel() >= LogLevel::ERROR) { \ - GLOO_LOG_MSG("ERROR", __VA_ARGS__); \ - } \ - } while (0) - -#define GLOO_WARN(...) \ - do { \ - if (logLevel() >= LogLevel::WARN) { \ - GLOO_LOG_MSG("WARN", __VA_ARGS__); \ - } \ - } while (0) - -#define GLOO_INFO(...) \ - do { \ - if (logLevel() >= LogLevel::INFO) { \ - GLOO_LOG_MSG("INFO", __VA_ARGS__); \ - } \ - } while (0) - -#define GLOO_DEBUG(...) \ - do { \ - if (logLevel() >= LogLevel::DEBUG) { \ - GLOO_LOG_MSG("DEBUG", __VA_ARGS__); \ - } \ - } while (0) - class EnforceNotMet : public std::exception { public: EnforceNotMet( diff --git a/gloo/common/linux.cc b/gloo/common/linux.cc index 9b45fc314..784e5e8f9 100644 --- a/gloo/common/linux.cc +++ b/gloo/common/linux.cc @@ -28,7 +28,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #ifndef SPEED_UNKNOWN /* SPEED_UNKOWN is sometimes undefined, c.f. diff --git a/gloo/common/log.h b/gloo/common/log.h new file mode 100644 index 000000000..cde3eacde --- /dev/null +++ b/gloo/common/log.h @@ -0,0 +1,72 @@ +#pragma once + +/** + * @file log.h + * @brief This header is intended to be internal only, so as not to make spdlog + * a dependency of gloo library consumers. For this reason, it should not be + * included by public headers. + */ + +/* The compile-time log level is set to the most sensitive (TRACE), to ensure no + * logging statements are removed at build time. This allows logging to be fully + * controllable at runtime, even to the TRACE level. */ +#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE + +#include +#include + +/* ranges.h provides formatter for containers, e.g. std::vector */ +#include + +namespace gloo::log { + +inline void init() { + /* Default to WARN log level */ + spdlog::set_level(spdlog::level::warn); + /* Override log level if the environment variable is set. */ + spdlog::cfg::load_env_levels("GLOO_LOG_LEVEL"); + + /* Set custom format. This is similiar to PyTorch's format. Equivalent to: + * [{short-log-level}{month-num}{day-of-month} {hour}:{min}:{sec}.{microsec} + * {threadid} {source_file}:{line_no}] {MESSAGE} */ + spdlog::set_pattern("[%L%m%d %H:%M:%S.%f %t %s:%#] %v"); +} + +inline std::once_flag onceFlag; + +inline void ensureInit() { + std::call_once(onceFlag, [] { init(); }); +} + +} // namespace gloo::log + +#define GLOO_TRACE(...) \ + do { \ + ::gloo::log::ensureInit(); \ + SPDLOG_TRACE(__VA_ARGS__); \ + } while (0) +#define GLOO_DEBUG(...) \ + do { \ + ::gloo::log::ensureInit(); \ + SPDLOG_DEBUG(__VA_ARGS__); \ + } while (0) +#define GLOO_INFO(...) \ + do { \ + ::gloo::log::ensureInit(); \ + SPDLOG_INFO(__VA_ARGS__); \ + } while (0) +#define GLOO_WARN(...) \ + do { \ + ::gloo::log::ensureInit(); \ + SPDLOG_WARN(__VA_ARGS__); \ + } while (0) +#define GLOO_ERROR(...) \ + do { \ + ::gloo::log::ensureInit(); \ + SPDLOG_ERROR(__VA_ARGS__); \ + } while (0) +#define GLOO_CRITICAL(...) \ + do { \ + ::gloo::log::ensureInit(); \ + SPDLOG_CRITICAL(__VA_ARGS__); \ + } while (0) diff --git a/gloo/common/win.cc b/gloo/common/win.cc index 5dfb80e28..bbce93c6c 100644 --- a/gloo/common/win.cc +++ b/gloo/common/win.cc @@ -7,7 +7,7 @@ */ #include "gloo/common/win.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include diff --git a/gloo/context.cc b/gloo/context.cc index fd9b83c7b..a919163ac 100644 --- a/gloo/context.cc +++ b/gloo/context.cc @@ -8,8 +8,8 @@ #include "gloo/context.h" +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" #include "gloo/transport/device.h" #include "gloo/transport/unbound_buffer.h" diff --git a/gloo/cuda.h b/gloo/cuda.h index ccb196a24..39049ba76 100644 --- a/gloo/cuda.h +++ b/gloo/cuda.h @@ -16,7 +16,7 @@ #include #include "gloo/algorithm.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/config.h" // Check that configuration header was properly generated diff --git a/gloo/cuda_allreduce_bcube.cc b/gloo/cuda_allreduce_bcube.cc index 065862f95..8374c8fb4 100644 --- a/gloo/cuda_allreduce_bcube.cc +++ b/gloo/cuda_allreduce_bcube.cc @@ -8,11 +8,13 @@ #include "gloo/cuda_allreduce_bcube.h" +#include "gloo/common/log.h" #include "gloo/cuda_collectives_device.h" #include "gloo/cuda_collectives_host.h" #include "gloo/cuda_private.h" #include +#include #ifndef _WIN32 #include #endif @@ -226,35 +228,48 @@ bool CudaAllreduceBcube::printCheck(int /* rank */) { return false; } -template -void CudaAllreduceBcube::printBreak(T* p, int x) { - if (0 == x % wordsPerLine) { - std::cout << std::endl - << &p[x] << " " << std::setfill('0') << std::setw(5) << x << ": "; - } else if (0 == x % wordsPerSection) { - std::cout << "- "; - } -} - template void CudaAllreduceBcube::printElems(T* p, int count, int start) { - auto alignedStart = (start / wordsPerLine) * wordsPerLine; - for (int x = alignedStart; x < start + count; ++x) { - printBreak(p, x); - if (x < start) { - std::cout << "..... "; - } else { - std::cout << std::setfill('0') << std::setw(5) << p[x] << " "; + /* Early return if log level is not high enough, to prevent expensive code + * running. */ + if (!spdlog::should_log(spdlog::level::trace)) + return; + + const std::size_t alignedStart = (start / wordsPerLine) * wordsPerLine; + fmt::memory_buffer line{}; + + /* Logs/flushes the line buffer - starting a new line */ + auto printLine = [&]() { + if (!line.size()) + return; + std::string_view sv{line.data(), line.size()}; + GLOO_TRACE("{}", sv); + line.clear(); + }; + + for (std::size_t x = alignedStart; x < start + count; ++x) { + if (x % wordsPerLine == 0) { + if (x != alignedStart) + printLine(); + fmt::format_to( + std::back_inserter(line), "{} {:05}: ", fmt::ptr(&p[x]), x); + } else if (x % wordsPerSection == 0) { + fmt::format_to(std::back_inserter(line), "- "); } + + if (x < start) + fmt::format_to(std::back_inserter(line), "..... "); + else + fmt::format_to(std::back_inserter(line), "{:05} ", p[x]); } + printLine(); } template void CudaAllreduceBcube::printStageBuffer(const std::string& msg) { if (printCheck(myRank_)) { - std::cout << "rank (" << myRank_ << ") " << msg << ": "; + GLOO_TRACE("rank ({}) {}:", myRank_, msg); printElems(&scratch_[0], totalNumElems_); - std::cout << std::endl; } } @@ -268,10 +283,13 @@ void CudaAllreduceBcube::printStepBuffer( int count, int start) { if (printCheck(myRank_)) { - std::cout << stage << ": step (" << step << ") " << "srcRank (" << srcRank - << ") -> " << "destRank (" << destRank << "): "; + GLOO_TRACE( + "{}: step ({}) srcRank ({}) -> destRank ({}):", + stage, + step, + srcRank, + destRank); printElems(p, count, start); - std::cout << std::endl; } } diff --git a/gloo/cuda_allreduce_local.cc b/gloo/cuda_allreduce_local.cc index 9c31e5e46..52c78f054 100644 --- a/gloo/cuda_allreduce_local.cc +++ b/gloo/cuda_allreduce_local.cc @@ -8,7 +8,7 @@ #include "gloo/cuda_allreduce_local.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/cuda_collectives_device.h" #include "gloo/cuda_private.h" diff --git a/gloo/cuda_collectives_device.h b/gloo/cuda_collectives_device.h index c2c6a81d1..2cace96a8 100644 --- a/gloo/cuda_collectives_device.h +++ b/gloo/cuda_collectives_device.h @@ -12,7 +12,7 @@ #include #include "gloo/common/common.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/config.h" #include "gloo/cuda.h" #include "gloo/cuda_private.h" diff --git a/gloo/cuda_collectives_host.h b/gloo/cuda_collectives_host.h index 32eb6d3ef..2f9579162 100644 --- a/gloo/cuda_collectives_host.h +++ b/gloo/cuda_collectives_host.h @@ -9,7 +9,7 @@ #pragma once #include "gloo/common/common.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/cuda.h" namespace gloo { diff --git a/gloo/cuda_collectives_native.h b/gloo/cuda_collectives_native.h index e6c45c401..07b21384d 100644 --- a/gloo/cuda_collectives_native.h +++ b/gloo/cuda_collectives_native.h @@ -13,7 +13,7 @@ #include #include "gloo/common/common.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/cuda.h" #include "gloo/cuda_private.h" diff --git a/gloo/cuda_collectives_nccl.h b/gloo/cuda_collectives_nccl.h index acc64a6b0..ffec2463c 100644 --- a/gloo/cuda_collectives_nccl.h +++ b/gloo/cuda_collectives_nccl.h @@ -9,7 +9,7 @@ #pragma once #include "gloo/common/common.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/nccl/nccl.h" namespace gloo { diff --git a/gloo/cuda_private.h b/gloo/cuda_private.h index 355fedd06..9fc353980 100644 --- a/gloo/cuda_private.h +++ b/gloo/cuda_private.h @@ -16,7 +16,7 @@ #ifdef __linux__ #include "gloo/common/linux.h" #endif -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/cuda.h" #include "gloo/transport/device.h" diff --git a/gloo/examples/CMakeLists.txt b/gloo/examples/CMakeLists.txt index b5f87e85c..a5fba49cf 100644 --- a/gloo/examples/CMakeLists.txt +++ b/gloo/examples/CMakeLists.txt @@ -1,8 +1,8 @@ if(MSVC) add_executable(example_reduce example_reduce.cc) - target_link_libraries(example_reduce gloo) + target_link_libraries(example_reduce gloo ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) add_executable(example_allreduce example_allreduce.cc) - target_link_libraries(example_allreduce gloo) + target_link_libraries(example_allreduce gloo ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) if(USE_LIBUV) add_custom_command(TARGET example_reduce POST_BUILD @@ -12,7 +12,7 @@ if(MSVC) endif() else() add_executable(example1 example1.cc) - target_link_libraries(example1 gloo) + target_link_libraries(example1 gloo ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) add_executable(looks_like_mpi looks_like_mpi.cc) - target_link_libraries(looks_like_mpi gloo) + target_link_libraries(looks_like_mpi gloo ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) endif() diff --git a/gloo/gather.cc b/gloo/gather.cc index cab5ffadc..c86ed29ad 100644 --- a/gloo/gather.cc +++ b/gloo/gather.cc @@ -10,7 +10,7 @@ #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/types.h" namespace gloo { diff --git a/gloo/gatherv.cc b/gloo/gatherv.cc index 306f6e45f..c0deb61f3 100644 --- a/gloo/gatherv.cc +++ b/gloo/gatherv.cc @@ -11,7 +11,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/types.h" namespace gloo { diff --git a/gloo/mpi/context.cc b/gloo/mpi/context.cc index e8c287960..245a1af56 100644 --- a/gloo/mpi/context.cc +++ b/gloo/mpi/context.cc @@ -12,8 +12,8 @@ #include #include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" #include "gloo/transport/address.h" namespace gloo { diff --git a/gloo/nccl/nccl.h b/gloo/nccl/nccl.h index 35746e350..0e6d5356f 100644 --- a/gloo/nccl/nccl.h +++ b/gloo/nccl/nccl.h @@ -13,7 +13,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/cuda.h" #include "gloo/types.h" diff --git a/gloo/pairwise_exchange.h b/gloo/pairwise_exchange.h index f0da7552f..03550865f 100644 --- a/gloo/pairwise_exchange.h +++ b/gloo/pairwise_exchange.h @@ -11,7 +11,7 @@ #include #include "gloo/algorithm.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/context.h" namespace gloo { diff --git a/gloo/reduce.cc b/gloo/reduce.cc index 4ace5607d..41d7db00c 100644 --- a/gloo/reduce.cc +++ b/gloo/reduce.cc @@ -12,7 +12,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/math.h" #include "gloo/types.h" diff --git a/gloo/rendezvous/context.cc b/gloo/rendezvous/context.cc index f71739d3a..7db0a17cd 100644 --- a/gloo/rendezvous/context.cc +++ b/gloo/rendezvous/context.cc @@ -10,7 +10,7 @@ #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/rendezvous/store.h" #include "gloo/transport/address.h" diff --git a/gloo/rendezvous/file_store.cc b/gloo/rendezvous/file_store.cc index 1202e80f0..12a1b34e9 100644 --- a/gloo/rendezvous/file_store.cc +++ b/gloo/rendezvous/file_store.cc @@ -26,8 +26,8 @@ #include #endif +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" namespace gloo { namespace rendezvous { diff --git a/gloo/rendezvous/hash_store.cc b/gloo/rendezvous/hash_store.cc index 34c0951f7..984c46d0d 100644 --- a/gloo/rendezvous/hash_store.cc +++ b/gloo/rendezvous/hash_store.cc @@ -8,8 +8,8 @@ #include "gloo/rendezvous/hash_store.h" +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" namespace gloo { namespace rendezvous { diff --git a/gloo/rendezvous/redis_store.cc b/gloo/rendezvous/redis_store.cc index f4d91fb2a..503162178 100644 --- a/gloo/rendezvous/redis_store.cc +++ b/gloo/rendezvous/redis_store.cc @@ -10,8 +10,8 @@ #include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" #include "gloo/common/string.h" namespace gloo { diff --git a/gloo/rendezvous/store.h b/gloo/rendezvous/store.h index e2569d0ef..b32e5f72c 100644 --- a/gloo/rendezvous/store.h +++ b/gloo/rendezvous/store.h @@ -12,8 +12,8 @@ #include #include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" #include "gloo/common/store.h" // can be used by upstream users to know whether this is available or not. diff --git a/gloo/scatter.cc b/gloo/scatter.cc index c1810ca47..e7317a62e 100644 --- a/gloo/scatter.cc +++ b/gloo/scatter.cc @@ -11,7 +11,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/types.h" namespace gloo { diff --git a/gloo/test/CMakeLists.txt b/gloo/test/CMakeLists.txt index d16af64e9..e97738575 100644 --- a/gloo/test/CMakeLists.txt +++ b/gloo/test/CMakeLists.txt @@ -17,7 +17,7 @@ set(GLOO_TEST_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/remote_key_test.cc" "${CMAKE_CURRENT_SOURCE_DIR}/send_recv_test.cc" ) -set(GLOO_TEST_LIBRARIES) +set(GLOO_TEST_LIBRARIES ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") list(APPEND GLOO_TEST_SRCS @@ -64,7 +64,7 @@ if(USE_CUDA) else() cuda_add_executable(gloo_test_cuda ${GLOO_TEST_CUDA_SRCS}) endif() - target_link_libraries(gloo_test_cuda gloo_cuda gtest OpenSSL::SSL OpenSSL::Crypto ${GLOO_CUDA_LIBRARIES}) + target_link_libraries(gloo_test_cuda gloo_cuda gtest ${GLOO_TEST_LIBRARIES} ${GLOO_CUDA_LIBRARIES}) endif() endif() @@ -79,5 +79,5 @@ if(USE_ROCM) ) gloo_hip_add_executable(gloo_test_hip ${GLOO_TEST_HIP_SRCS}) - target_link_libraries(gloo_test_hip gloo_hip gtest OpenSSL::SSL OpenSSL::Crypto) + target_link_libraries(gloo_test_hip gloo_hip gtest ${GLOO_TEST_LIBRARIES}) endif() diff --git a/gloo/test/base_test.cc b/gloo/test/base_test.cc index ccccfa3bb..b5cf44bc8 100644 --- a/gloo/test/base_test.cc +++ b/gloo/test/base_test.cc @@ -7,6 +7,7 @@ */ #include "gloo/test/base_test.h" +#include "gloo/common/log.h" #include "gloo/test/openssl_utils.h" namespace gloo { @@ -81,7 +82,7 @@ std::shared_ptr<::gloo::transport::Device> createDevice(Transport transport) { try { return ::gloo::transport::ibverbs::CreateDevice(attr); } catch (const InvalidOperationException& e) { - GLOO_INFO("IBVERBS not available: ", e.what()); + GLOO_INFO("IBVERBS not available: {}", e.what()); } } #endif diff --git a/gloo/test/multiproc_test.h b/gloo/test/multiproc_test.h index e639fc3e7..0e4193052 100644 --- a/gloo/test/multiproc_test.h +++ b/gloo/test/multiproc_test.h @@ -15,7 +15,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/rendezvous/context.h" #include "gloo/rendezvous/file_store.h" #include "gloo/test/base_test.h" diff --git a/gloo/test/openssl_utils.cc b/gloo/test/openssl_utils.cc index 485f78e25..8eb9f0ba9 100644 --- a/gloo/test/openssl_utils.cc +++ b/gloo/test/openssl_utils.cc @@ -5,7 +5,7 @@ #include "gloo/test/openssl_utils.h" #if GLOO_HAVE_TRANSPORT_TCP_TLS -#include +#include #include namespace gloo { diff --git a/gloo/transport/buffer.h b/gloo/transport/buffer.h index 382061a87..8431d37b6 100644 --- a/gloo/transport/buffer.h +++ b/gloo/transport/buffer.h @@ -16,13 +16,9 @@ namespace transport { class Buffer { public: explicit Buffer(int slot, void* ptr, size_t size) - : slot_(slot), ptr_(ptr), size_(size), debug_(false) {} + : slot_(slot), ptr_(ptr), size_(size) {} virtual ~Buffer() = 0; - virtual void setDebug(bool debug) { - debug_ = debug; - } - virtual void send(size_t offset, size_t length, size_t roffset = 0) = 0; // Send entire buffer by default @@ -37,7 +33,6 @@ class Buffer { int slot_; void* ptr_; size_t size_; - bool debug_; }; } // namespace transport diff --git a/gloo/transport/ibverbs/address.cc b/gloo/transport/ibverbs/address.cc index 7f1b9d0c5..050a813e7 100644 --- a/gloo/transport/ibverbs/address.cc +++ b/gloo/transport/ibverbs/address.cc @@ -12,7 +12,7 @@ #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" namespace gloo { namespace transport { diff --git a/gloo/transport/ibverbs/buffer.cc b/gloo/transport/ibverbs/buffer.cc index bb39e8b04..64836641b 100644 --- a/gloo/transport/ibverbs/buffer.cc +++ b/gloo/transport/ibverbs/buffer.cc @@ -6,16 +6,15 @@ * LICENSE file in the root directory of this source tree. */ -#include "gloo/transport/ibverbs/buffer.h" - #include #include #include - #include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" +#include "gloo/common/log.h" +#include "gloo/transport/ibverbs/buffer.h" namespace gloo { namespace transport { @@ -63,7 +62,8 @@ Buffer::~Buffer() { std::lock_guard lock(m_); if (sendPending_ > 0) { GLOO_WARN( - "Destructing buffer with pending sends, sendPending_=", sendPending_); + "Destructing buffer with pending sends, sendPending_={}", + sendPending_.load()); } ibv_dereg_mr(mr_); @@ -189,11 +189,7 @@ void Buffer::send(size_t offset, size_t length, size_t roffset) { checkErrorState(); - if (debug_) { - std::cout << "[" << getpid() << "] "; - std::cout << "send " << length << " bytes"; - std::cout << std::endl; - } + GLOO_TRACE("send {} bytes", length); // Increment number of sends in flight sendPending_++; @@ -208,19 +204,11 @@ void Buffer::handleCompletion(int rank, struct ibv_wc* wc) { std::unique_lock lock(m_); if (wc->opcode & IBV_WC_RECV) { - if (debug_) { - std::cout << "[" << getpid() << "] "; - std::cout << "recv " << wc->byte_len << " bytes"; - std::cout << std::endl; - } + GLOO_TRACE("recv {} bytes", wc->byte_len); recvCompletions_++; recvCv_.notify_one(); } else if (wc->opcode == IBV_WC_RDMA_WRITE) { - if (debug_) { - std::cout << "[" << getpid() << "] "; - std::cout << "send complete"; - std::cout << std::endl; - } + GLOO_TRACE("send complete"); sendCompletions_++; sendPending_--; sendCv_.notify_one(); diff --git a/gloo/transport/ibverbs/device.cc b/gloo/transport/ibverbs/device.cc index ef0d5a123..6146e7f86 100644 --- a/gloo/transport/ibverbs/device.cc +++ b/gloo/transport/ibverbs/device.cc @@ -6,18 +6,17 @@ * LICENSE file in the root directory of this source tree. */ -#include "gloo/transport/ibverbs/device.h" - #include #include -#include - #include +#include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" #include "gloo/common/linux.h" -#include "gloo/common/logging.h" +#include "gloo/common/log.h" #include "gloo/transport/ibverbs/context.h" +#include "gloo/transport/ibverbs/device.h" #include "gloo/transport/ibverbs/pair.h" namespace gloo { @@ -85,9 +84,8 @@ std::shared_ptr<::gloo::transport::Device> CreateDevice( std::vector names; for (auto i = 0; i < devices.size(); i++) { GLOO_DEBUG( - "found candidate device ", + "found candidate device {} dev={}", devices[i]->name, - " dev=", devices[i]->dev_name); names.push_back(devices[i]->name); } @@ -96,11 +94,9 @@ std::shared_ptr<::gloo::transport::Device> CreateDevice( } GLOO_INFO( - "Using ibverbs device=", + "Using ibverbs device={} port={} index={}", attr.name, - " port=", attr.port, - " index=", attr.index); // Look for specified device name @@ -227,7 +223,7 @@ void Device::loop() { Pair* pair = static_cast(cqContext); pair->handleCompletionEvent(); } catch (const std::exception& ex) { - GLOO_ERROR("Exception while handling completion event: ", ex.what()); + GLOO_ERROR("Exception while handling completion event: {}", ex.what()); throw; } } diff --git a/gloo/transport/ibverbs/memory_region.cc b/gloo/transport/ibverbs/memory_region.cc index 83320dd2d..702bcb9f8 100644 --- a/gloo/transport/ibverbs/memory_region.cc +++ b/gloo/transport/ibverbs/memory_region.cc @@ -10,7 +10,7 @@ #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" namespace gloo { namespace transport { diff --git a/gloo/transport/ibverbs/pair.cc b/gloo/transport/ibverbs/pair.cc index d3a0410bb..0ede8ae96 100644 --- a/gloo/transport/ibverbs/pair.cc +++ b/gloo/transport/ibverbs/pair.cc @@ -6,16 +6,16 @@ * LICENSE file in the root directory of this source tree. */ -#include "gloo/transport/ibverbs/pair.h" -#include "gloo/transport/ibverbs/buffer.h" -#include "gloo/transport/ibverbs/unbound_buffer.h" - -#include -#include +#include +#include #include "gloo/common/common.h" +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" +#include "gloo/common/log.h" +#include "gloo/transport/ibverbs/buffer.h" +#include "gloo/transport/ibverbs/pair.h" +#include "gloo/transport/ibverbs/unbound_buffer.h" namespace gloo { namespace transport { @@ -231,7 +231,7 @@ void Pair::sendMemoryRegion(struct ibv_mr* src, int slot) { wr.imm_data = slot; GLOO_DEBUG( - "sendMemoryRegion slot=", slot, " addr=", src->addr, " lkey=", src->lkey); + "sendMemoryRegion slot={} addr={} lkey={}", slot, src->addr, src->lkey); // The work request is serialized and sent to the driver so it // doesn't need to be valid after the ibv_post_send call. @@ -331,7 +331,7 @@ void Pair::send( size_t offset, size_t nbytes) { std::unique_lock lock(m_); - GLOO_DEBUG("send tag=", slot, " offset=", offset, " nbytes=", nbytes); + GLOO_DEBUG("send tag={} offset={} nbytes={}", slot, offset, nbytes); GLOO_ENFORCE(!sync_, "Cannot send in sync mode"); @@ -364,15 +364,11 @@ void Pair::send( wr.wr.rdma.rkey = peer.rkey; GLOO_DEBUG( - "send UnboundBuffer async slot=", + "send UnboundBuffer async slot={} peer_addr={} remote_addr={:#x} " + "rkey={}", wr.wr_id, - " peer_addr=", peer.addr, - " remote_addr=", - std::hex, wr.wr.rdma.remote_addr, - std::dec, - " rkey=", wr.wr.rdma.rkey); struct ibv_send_wr* bad_wr; @@ -391,7 +387,7 @@ void Pair::recv( size_t nbytes) { std::unique_lock lock(m_); - GLOO_DEBUG("recv tag=", tag, " offset=", offset, " nbytes=", nbytes); + GLOO_DEBUG("recv tag={} offset={} nbytes={}", tag, offset, nbytes); GLOO_ENFORCE(!sync_, "Cannot recv in sync mode"); @@ -445,19 +441,13 @@ void Pair::put( wr.wr.rdma.rkey = key->rkey_; GLOO_DEBUG( + "{}->{}: put UnboundBuffer async slot={} peer_addr={} " + "remote_addr={:#x} rkey={}", self_.str(), - "->", peer_.str(), - ": ", - "put UnboundBuffer async slot=", wr.wr_id, - " peer_addr=", key->addr_, - " remote_addr=", - std::hex, wr.wr.rdma.remote_addr, - std::dec, - " rkey=", wr.wr.rdma.rkey); struct ibv_send_wr* bad_wr; @@ -507,19 +497,13 @@ void Pair::get( wr.wr.rdma.rkey = key->rkey_; GLOO_DEBUG( + "{}->{}: get UnboundBuffer async slot={} peer_addr={} " + "remote_addr={:#x} rkey={}", self_.str(), - "->", peer_.str(), - ": ", - "get UnboundBuffer async slot=", wr.wr_id, - " peer_addr=", key->addr_, - " remote_addr=", - std::hex, wr.wr.rdma.remote_addr, - std::dec, - " rkey=", wr.wr.rdma.rkey); struct ibv_send_wr* bad_wr; @@ -585,11 +569,9 @@ void Pair::pollCompletions() { handleCompletion(&wc[i]); } catch (const std::exception& ex) { GLOO_ERROR( + "{}->{}: Exception in handleCompletion: {}", self_.str(), - "->", peer_.str(), - ": ", - "Exception in handleCompletion: ", ex.what()); throw; } @@ -604,13 +586,11 @@ void Pair::pollCompletions() { void Pair::handleCompletion(struct ibv_wc* wc) { GLOO_DEBUG( + "{}->{}: handleCompletion id={} opcode={}", self_.str(), - "->", peer_.str(), - ": handleCompletion id=", wc->wr_id, - " opcode=", - wc->opcode); + fmt::underlying(wc->opcode)); if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { // Incoming RDMA write completed. @@ -657,12 +637,10 @@ void Pair::handleCompletion(struct ibv_wc* wc) { } } else if (wc->opcode == IBV_WC_RECV) { GLOO_DEBUG( + "{}->{}: handleCompletion id={} opcode=IBV_WC_RECV slot={}", self_.str(), - "->", peer_.str(), - ": handleCompletion id=", wc->wr_id, - " opcode=IBV_WC_RECV slot=", wc->imm_data); // Memory region recv completed. // @@ -706,12 +684,10 @@ void Pair::handleCompletion(struct ibv_wc* wc) { // Memory region send completed. GLOO_DEBUG( + "{}->{}: handleCompletion id={} opcode=IBV_WC_SEND", self_.str(), - "->", peer_.str(), - ": handleCompletion id=", - wc->wr_id, - " opcode=IBV_WC_SEND"); + wc->wr_id); auto slot = wc->wr_id; GLOO_ENFORCE_EQ( @@ -750,15 +726,10 @@ void Pair::send(Buffer* buffer, size_t offset, size_t length, size_t roffset) { wr.wr.rdma.rkey = peer.rkey; GLOO_DEBUG( - "send Buffer async slot=", + "send Buffer async slot={} peer_addr={} remote_addr={:#x} rkey={}", wr.wr_id, - " peer_addr=", peer.addr, - " remote_addr=", - std::hex, wr.wr.rdma.remote_addr, - std::dec, - " rkey=", wr.wr.rdma.rkey); struct ibv_send_wr* bad_wr; @@ -781,7 +752,7 @@ void Pair::send(Buffer* buffer, size_t offset, size_t length, size_t roffset) { void Pair::signalIoFailure(const std::string& msg) { std::lock_guard lock(m_); - GLOO_ERROR(msg); + GLOO_ERROR("{}", msg); auto ex = ::gloo::IoException(msg); if (ex_ == nullptr) { // If we haven't seen an error yet, store the exception to throw on future diff --git a/gloo/transport/ibverbs/unbound_buffer.cc b/gloo/transport/ibverbs/unbound_buffer.cc index 971535d7c..e3a4fd022 100644 --- a/gloo/transport/ibverbs/unbound_buffer.cc +++ b/gloo/transport/ibverbs/unbound_buffer.cc @@ -6,13 +6,13 @@ * LICENSE file in the root directory of this source tree. */ -#include "gloo/transport/ibverbs/unbound_buffer.h" - #include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" +#include "gloo/common/log.h" #include "gloo/transport/ibverbs/context.h" +#include "gloo/transport/ibverbs/unbound_buffer.h" namespace gloo { namespace transport { @@ -106,9 +106,8 @@ void UnboundBuffer::handleCompletion(int rank, struct ibv_wc* wc) { sendCv_.notify_one(); GLOO_DEBUG( - "send complete sendPending=", + "send complete sendPending={} sendCompletions={}", sendPending_, - " sendCompletions=", sendCompletions_); } else { GLOO_ENFORCE(false, "Unexpected completion (opcode: ", wc->opcode, ")"); diff --git a/gloo/transport/pair.h b/gloo/transport/pair.h index 42ef7cd69..93eabf783 100644 --- a/gloo/transport/pair.h +++ b/gloo/transport/pair.h @@ -10,7 +10,7 @@ #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/transport/address.h" #include "gloo/transport/buffer.h" #include "gloo/transport/unbound_buffer.h" diff --git a/gloo/transport/tcp/CMakeLists.txt b/gloo/transport/tcp/CMakeLists.txt index acaf906b7..62691dd55 100644 --- a/gloo/transport/tcp/CMakeLists.txt +++ b/gloo/transport/tcp/CMakeLists.txt @@ -5,7 +5,6 @@ else() "${CMAKE_CURRENT_SOURCE_DIR}/address.cc" "${CMAKE_CURRENT_SOURCE_DIR}/buffer.cc" "${CMAKE_CURRENT_SOURCE_DIR}/context.cc" - "${CMAKE_CURRENT_SOURCE_DIR}/debug_logger.cc" "${CMAKE_CURRENT_SOURCE_DIR}/device.cc" "${CMAKE_CURRENT_SOURCE_DIR}/error.cc" "${CMAKE_CURRENT_SOURCE_DIR}/helpers.cc" @@ -21,7 +20,6 @@ else() "${CMAKE_CURRENT_SOURCE_DIR}/buffer.h" "${CMAKE_CURRENT_SOURCE_DIR}/context.h" "${CMAKE_CURRENT_SOURCE_DIR}/debug_data.h" - "${CMAKE_CURRENT_SOURCE_DIR}/debug_logger.h" "${CMAKE_CURRENT_SOURCE_DIR}/device.h" "${CMAKE_CURRENT_SOURCE_DIR}/error.h" "${CMAKE_CURRENT_SOURCE_DIR}/helpers.h" diff --git a/gloo/transport/tcp/address.cc b/gloo/transport/tcp/address.cc index e37f7b493..93c540d11 100644 --- a/gloo/transport/tcp/address.cc +++ b/gloo/transport/tcp/address.cc @@ -13,7 +13,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" namespace gloo { namespace transport { diff --git a/gloo/transport/tcp/buffer.cc b/gloo/transport/tcp/buffer.cc index 54a5e1119..1bee06e30 100644 --- a/gloo/transport/tcp/buffer.cc +++ b/gloo/transport/tcp/buffer.cc @@ -6,16 +6,15 @@ * LICENSE file in the root directory of this source tree. */ -#include "gloo/transport/tcp/buffer.h" - #include #include #include - #include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" +#include "gloo/common/log.h" +#include "gloo/transport/tcp/buffer.h" namespace gloo { namespace transport { @@ -126,12 +125,7 @@ void Buffer::send(size_t offset, size_t length, size_t roffset) { // to support this. GLOO_ENFORCE_LE(offset + length, size_); - if (debug_) { - std::cout << "[" << getpid() << ": " << syscall(SYS_gettid) << "] "; - std::cout << "send " << length << " bytes"; - std::cout << " to " << pair_->peer().str(); - std::cout << std::endl; - } + GLOO_TRACE("send {} bytes to {}", length, pair_->peer().str()); op.preamble.nbytes = sizeof(op.preamble) + length; op.preamble.opcode = Op::SEND_BUFFER; diff --git a/gloo/transport/tcp/context.cc b/gloo/transport/tcp/context.cc index 023178741..28e527cd5 100644 --- a/gloo/transport/tcp/context.cc +++ b/gloo/transport/tcp/context.cc @@ -6,16 +6,15 @@ * LICENSE file in the root directory of this source tree. */ -#include "gloo/transport/tcp/context.h" - #include #include #include -#include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" +#include "gloo/common/log.h" #include "gloo/common/utils.h" +#include "gloo/transport/tcp/context.h" #include "gloo/transport/tcp/device.h" #include "gloo/transport/tcp/pair.h" #include "gloo/transport/tcp/unbound_buffer.h" @@ -241,17 +240,14 @@ std::vector Context::getUnConnectedPeerRanks() const { void Context::printConnectivityInfo() const { int numConnectedPeers = getConnectedPeerRanks().size(); GLOO_INFO( - "Rank ", + "Rank {} is connected to {}. Expected number of connected peers is: {}", rank, - " is connected to ", numConnectedPeers, - ". Expected number of connected peers is: ", size - 1); if (numConnectedPeers != size - 1) { std::vector unConnectedPeers = getUnConnectedPeerRanks(); - auto peerStrCat = ::gloo::MakeString(unConnectedPeers, /*delim=*/", "); - GLOO_INFO("Rank ", rank, " is NOT connected to: [", peerStrCat, "]"); + GLOO_INFO("Rank {} is NOT connected to: {}", rank, unConnectedPeers); } } diff --git a/gloo/transport/tcp/debug_data.h b/gloo/transport/tcp/debug_data.h index 71863067e..699836b98 100644 --- a/gloo/transport/tcp/debug_data.h +++ b/gloo/transport/tcp/debug_data.h @@ -1,11 +1,10 @@ // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +#pragma once +#include #include -#pragma once -namespace gloo { -namespace transport { -namespace tcp { +namespace gloo::transport::tcp { struct ConnectDebugData { const int retryCount; @@ -18,6 +17,23 @@ struct ConnectDebugData { const std::string local; }; -} // namespace tcp -} // namespace transport -} // namespace gloo +} // namespace gloo::transport::tcp + +template <> +struct fmt::formatter + : fmt::formatter { + auto format(gloo::transport::tcp::ConnectDebugData& data, format_context& ctx) + const -> decltype(ctx.out()) { + return fmt::format_to( + ctx.out(), + "willRetry={}, retry={}, retryLimit={}, rank={}, size={}, local={}, remote={}, error={}", + data.willRetry, + data.retryCount, + data.retryLimit, + data.glooRank, + data.glooSize, + data.local, + data.remote, + data.error); + } +}; diff --git a/gloo/transport/tcp/debug_logger.cc b/gloo/transport/tcp/debug_logger.cc deleted file mode 100644 index b55209616..000000000 --- a/gloo/transport/tcp/debug_logger.cc +++ /dev/null @@ -1,30 +0,0 @@ -#include -#include - -namespace gloo { -namespace transport { -namespace tcp { - -void DebugLogger::log(const ConnectDebugData& data) { - GLOO_ERROR( - "failed to connect, willRetry=", - data.willRetry, - ", retry=", - data.retryCount, - ", retryLimit=", - data.retryLimit, - ", rank=", - data.glooRank, - ", size=", - data.glooSize, - ", local=", - data.local, - ", remote=", - data.remote, - ", error=", - data.error); -} - -} // namespace tcp -} // namespace transport -} // namespace gloo diff --git a/gloo/transport/tcp/debug_logger.h b/gloo/transport/tcp/debug_logger.h deleted file mode 100644 index e58f9c6a1..000000000 --- a/gloo/transport/tcp/debug_logger.h +++ /dev/null @@ -1,20 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include - -namespace gloo { -namespace transport { -namespace tcp { - -class DebugLogger { - public: - static void log(const ConnectDebugData& data); - - private: -}; - -} // namespace tcp -} // namespace transport -} // namespace gloo diff --git a/gloo/transport/tcp/device.cc b/gloo/transport/tcp/device.cc index daf0c2fb1..be9b19edf 100644 --- a/gloo/transport/tcp/device.cc +++ b/gloo/transport/tcp/device.cc @@ -15,9 +15,9 @@ #include #include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" #include "gloo/common/linux.h" -#include "gloo/common/logging.h" #include "gloo/common/utils.h" #include "gloo/transport/tcp/context.h" #include "gloo/transport/tcp/helpers.h" diff --git a/gloo/transport/tcp/helpers.cc b/gloo/transport/tcp/helpers.cc index c403996b4..e57ccadc0 100644 --- a/gloo/transport/tcp/helpers.cc +++ b/gloo/transport/tcp/helpers.cc @@ -1,4 +1,7 @@ -#include +#include "gloo/transport/tcp/helpers.h" +#include "gloo/common/log.h" +#include "gloo/transport/tcp/debug_data.h" +#include "gloo/transport/tcp/loop.h" namespace gloo { namespace transport { @@ -16,6 +19,49 @@ void connectLoop( x->run(loop); } +void ConnectOperation::handleEvents(Loop& loop, int /*events*/) { + // Hold a reference to this object to keep it alive until the + // callback is called. + auto leak = shared_from_this(); + loop.unregisterDescriptor(socket_->fd(), this); + + int result; + socklen_t result_len = sizeof(result); + if (getsockopt(socket_->fd(), SOL_SOCKET, SO_ERROR, &result, &result_len) < + 0) { + fn_(loop, socket_, SystemError("getsockopt", errno, remote_)); + return; + } + if (result != 0) { + SystemError e("SO_ERROR", result, remote_); + bool willRetry = + std::chrono::steady_clock::now() < deadline_ && retry_++ < maxRetries_; + + auto debugData = ConnectDebugData{ + retry_, + maxRetries_, + willRetry, + rank_, + size_, + e.what(), + remote_.str(), + socket_->sockName().str(), + }; + GLOO_WARN("{}", debugData); + + // check deadline + if (willRetry) { + run(loop); + } else { + fn_(loop, socket_, TimeoutError("timed out connecting: " + e.what())); + } + + return; + } + + fn_(loop, socket_, Error::kSuccess); +} + } // namespace tcp } // namespace transport } // namespace gloo diff --git a/gloo/transport/tcp/helpers.h b/gloo/transport/tcp/helpers.h index c75993590..14329a4e9 100644 --- a/gloo/transport/tcp/helpers.h +++ b/gloo/transport/tcp/helpers.h @@ -11,16 +11,11 @@ #include #include -#include -#include -#include -#include -#include -#include "gloo/transport/tcp/debug_logger.h" // @manual=//gloo:debug_logger +#include "gloo/transport/tcp/error.h" +#include "gloo/transport/tcp/loop.h" +#include "gloo/transport/tcp/socket.h" -namespace gloo { -namespace transport { -namespace tcp { +namespace gloo::transport::tcp { // ReadValueOperation asynchronously reads a value of type T from the // socket specified at construction. Upon completion or error, the @@ -171,48 +166,7 @@ class ConnectOperation final this->shared_from_this()); } - void handleEvents(Loop& loop, int /*events*/) override { - // Hold a reference to this object to keep it alive until the - // callback is called. - auto leak = shared_from_this(); - loop.unregisterDescriptor(socket_->fd(), this); - - int result; - socklen_t result_len = sizeof(result); - if (getsockopt(socket_->fd(), SOL_SOCKET, SO_ERROR, &result, &result_len) < - 0) { - fn_(loop, socket_, SystemError("getsockopt", errno, remote_)); - return; - } - if (result != 0) { - SystemError e("SO_ERROR", result, remote_); - bool willRetry = std::chrono::steady_clock::now() < deadline_ && - retry_++ < maxRetries_; - - auto debugData = ConnectDebugData{ - retry_, - maxRetries_, - willRetry, - rank_, - size_, - e.what(), - remote_.str(), - socket_->sockName().str(), - }; - DebugLogger::log(debugData); - - // check deadline - if (willRetry) { - run(loop); - } else { - fn_(loop, socket_, TimeoutError("timed out connecting: " + e.what())); - } - - return; - } - - fn_(loop, socket_, Error::kSuccess); - } + void handleEvents(Loop& loop, int /*events*/) override; private: const Address remote_; @@ -235,6 +189,4 @@ void connectLoop( std::chrono::milliseconds timeout, typename ConnectOperation::callback_t fn); -} // namespace tcp -} // namespace transport -} // namespace gloo +} // namespace gloo::transport::tcp diff --git a/gloo/transport/tcp/listener.cc b/gloo/transport/tcp/listener.cc index 325fe0ffc..409a9a3f6 100644 --- a/gloo/transport/tcp/listener.cc +++ b/gloo/transport/tcp/listener.cc @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include # diff --git a/gloo/transport/tcp/loop.cc b/gloo/transport/tcp/loop.cc index dca13e33f..613b92a48 100644 --- a/gloo/transport/tcp/loop.cc +++ b/gloo/transport/tcp/loop.cc @@ -14,8 +14,8 @@ #include +#include #include -#include #if defined(__SANITIZE_THREAD__) #define TSAN_ENABLED diff --git a/gloo/transport/tcp/pair.cc b/gloo/transport/tcp/pair.cc index 030778c0b..95705b83f 100644 --- a/gloo/transport/tcp/pair.cc +++ b/gloo/transport/tcp/pair.cc @@ -22,8 +22,8 @@ #include #include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" #include "gloo/transport/tcp/buffer.h" #include "gloo/transport/tcp/context.h" #include "gloo/transport/tcp/unbound_buffer.h" diff --git a/gloo/transport/tcp/socket.cc b/gloo/transport/tcp/socket.cc index 1b1270653..4c304b319 100644 --- a/gloo/transport/tcp/socket.cc +++ b/gloo/transport/tcp/socket.cc @@ -13,7 +13,7 @@ #include #include -#include +#include namespace gloo { namespace transport { diff --git a/gloo/transport/tcp/tls/context.cc b/gloo/transport/tcp/tls/context.cc index 141fdac3b..06a819097 100644 --- a/gloo/transport/tcp/tls/context.cc +++ b/gloo/transport/tcp/tls/context.cc @@ -8,7 +8,7 @@ #include "gloo/transport/tcp/tls/context.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/transport/tcp/tls/device.h" #include "gloo/transport/tcp/tls/openssl.h" #include "gloo/transport/tcp/tls/pair.h" diff --git a/gloo/transport/tcp/tls/pair.cc b/gloo/transport/tcp/tls/pair.cc index 3e0fcfde3..7c096ea8b 100644 --- a/gloo/transport/tcp/tls/pair.cc +++ b/gloo/transport/tcp/tls/pair.cc @@ -8,8 +8,8 @@ #include "gloo/transport/tcp/tls/pair.h" +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" #include "gloo/transport/tcp/buffer.h" #include "gloo/transport/tcp/tls/context.h" #include "gloo/transport/tcp/tls/device.h" diff --git a/gloo/transport/tcp/unbound_buffer.cc b/gloo/transport/tcp/unbound_buffer.cc index b8cac4679..59b484b30 100644 --- a/gloo/transport/tcp/unbound_buffer.cc +++ b/gloo/transport/tcp/unbound_buffer.cc @@ -8,8 +8,8 @@ #include "gloo/transport/tcp/unbound_buffer.h" +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" #include "gloo/transport/tcp/context.h" namespace gloo { diff --git a/gloo/transport/uv/address.cc b/gloo/transport/uv/address.cc index 9b69b9ece..f85d49b46 100644 --- a/gloo/transport/uv/address.cc +++ b/gloo/transport/uv/address.cc @@ -12,7 +12,7 @@ #include -#include +#include namespace gloo { namespace transport { diff --git a/gloo/transport/uv/context.cc b/gloo/transport/uv/context.cc index 606bc95af..1b342854f 100644 --- a/gloo/transport/uv/context.cc +++ b/gloo/transport/uv/context.cc @@ -8,8 +8,8 @@ #include +#include #include -#include #include #include #include diff --git a/gloo/transport/uv/device.cc b/gloo/transport/uv/device.cc index 1dc986433..c6af3fefe 100644 --- a/gloo/transport/uv/device.cc +++ b/gloo/transport/uv/device.cc @@ -22,7 +22,7 @@ #else #include // @manual #endif -#include +#include #include #include #include diff --git a/gloo/transport/uv/pair.cc b/gloo/transport/uv/pair.cc index 4db8bf32c..e1fea6839 100644 --- a/gloo/transport/uv/pair.cc +++ b/gloo/transport/uv/pair.cc @@ -11,8 +11,8 @@ #include #include +#include #include -#include #include #include #include diff --git a/gloo/transport/uv/unbound_buffer.cc b/gloo/transport/uv/unbound_buffer.cc index 593020b3c..f0cebf261 100644 --- a/gloo/transport/uv/unbound_buffer.cc +++ b/gloo/transport/uv/unbound_buffer.cc @@ -8,8 +8,8 @@ #include +#include #include -#include #include namespace gloo { diff --git a/gloo/types.h b/gloo/types.h index b9cb94680..b88093e80 100644 --- a/gloo/types.h +++ b/gloo/types.h @@ -339,4 +339,12 @@ inline float cpu_half2float(float16 h) { return *(float*)rp; } +/** + * @brief Format helper for spdlog/fmt - convert to a float32 and format as + * usual + */ +inline float format_as(float16 v) { + return cpu_half2float(v); +} + } // namespace gloo diff --git a/third_party/spdlog b/third_party/spdlog new file mode 160000 index 000000000..6fa36017c --- /dev/null +++ b/third_party/spdlog @@ -0,0 +1 @@ +Subproject commit 6fa36017cfd5731d617e1a934f0e5ea9c4445b13