From 77c5f2fb0a0409faf53406a663ff00f32cb07428 Mon Sep 17 00:00:00 2001
From: Alberto Madonna
Date: Wed, 1 Oct 2025 00:27:17 +0200
Subject: [PATCH 01/12] CE: Added pages with guidelines for images on Alps

---
 .../examples/guidelines-images.md             |  35 ++
 .../examples/image-comm-fwk.md                | 105 ++++
 .../container-engine/examples/image-mpich.md  | 578 ++++++++++++++++++
 .../examples/image-nccl-tests.md              | 183 ++++++
 .../examples/image-nvshmem.md                 | 237 +++++++
 .../container-engine/examples/image-ompi.md   | 576 +++++++++++++++++
 mkdocs.yml                                    |   6 +
 7 files changed, 1720 insertions(+)
 create mode 100644 docs/software/container-engine/examples/guidelines-images.md
 create mode 100644 docs/software/container-engine/examples/image-comm-fwk.md
 create mode 100644 docs/software/container-engine/examples/image-mpich.md
 create mode 100644 docs/software/container-engine/examples/image-nccl-tests.md
 create mode 100644 docs/software/container-engine/examples/image-nvshmem.md
 create mode 100644 docs/software/container-engine/examples/image-ompi.md

diff --git a/docs/software/container-engine/examples/guidelines-images.md b/docs/software/container-engine/examples/guidelines-images.md
new file mode 100644
index 00000000..f2d339fc
--- /dev/null
+++ b/docs/software/container-engine/examples/guidelines-images.md
@@ -0,0 +1,35 @@
+[](){#ref-ce-guidelines-images}
+# Guidelines for images on Alps
+
+This section offers guidelines for creating and using container images that achieve good performance on the Alps research infrastructure.
+It focuses on foundational components (such as communication libraries) that are essential for performant and effective use of Alps' capabilities, rather than on full application use cases.
+Synthetic benchmarks are also used to showcase quantitative performance.
+
+!!! important
+    The Containerfiles and examples provided in this section are intended to serve as a general reference and starting point.
+    They are not meant to represent all possible combinations and versions of software capable of running efficiently on Alps.
+
+    In the same vein, please note that the content presented here does not represent images officially supported by CSCS staff.
+
+Below is a summary of the software suggested and demonstrated throughout this section:
+
+- Base components:
+    - CUDA 12.8.1
+    - GDRCopy 2.5.1
+    - Libfabric 1.22.0
+    - UCX 1.19.0
+- MPI implementations:
+    - MPICH 4.3.1
+    - OpenMPI 5.0.8
+- Other programming libraries:
+    - NVSHMEM 3.4.5
+- Synthetic benchmarks:
+    - OSU Micro-benchmarks 7.5.1
+    - NCCL Tests 2.17.1
+
+The content is organized into pages describing container images that build incrementally upon each other:
+
+- a base image installing baseline libraries and frameworks (e.g. CUDA, Libfabric)
+- MPI implementations (MPICH, OpenMPI)
+- NVSHMEM
+- NCCL Tests
diff --git a/docs/software/container-engine/examples/image-comm-fwk.md b/docs/software/container-engine/examples/image-comm-fwk.md
new file mode 100644
index 00000000..8bd51735
--- /dev/null
+++ b/docs/software/container-engine/examples/image-comm-fwk.md
@@ -0,0 +1,105 @@
+[](){#ref-ce-guidelines-images-commfwk}
+# Communication frameworks image
+
+This page describes a container image providing foundational software components for achieving efficient execution on Alps nodes with NVIDIA GPUs.
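+
+If you prefer building this image yourself rather than pulling the prebuilt reference given below, a typical workflow on Alps is to build with Podman and import the result with enroot for use by the Container Engine.
+The following is a minimal sketch, assuming Podman storage is already configured for your user as described in the container image build documentation; the tag and output path are illustrative placeholders:
+
+```console
+$ podman build -f Containerfile -t comm-fwk:local .
+$ enroot import -x mount -o $SCRATCH/comm-fwk.sqsh podman://comm-fwk:local
+```
+
+The resulting SquashFS file can then be referenced in the `image` entry of an Environment Definition File (EDF).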
+ +The most important aspect to consider for performance of containerized applications is related to use of high-speed networks, +therefore this image mainly installs communication frameworks and libraries, besides general utility tools. +In particular, the [Libfabric](https://ofiwg.github.io/libfabric/) framework (also known as Open Fabrics Interfaces - OFI) is required to interface applications with the Slingshot high-speed network. + +At runtime, the container engine [CXI hook][ref-ce-cxi-hook] will replace the Libfabric libraries inside the container with the corresponding libraries on the host system. +This will ensure access to the Slingshot interconnect. + +This image is not intended to be used on its own, but to serve as a base to build higher-level software (e.g. MPI implementations) and application stacks. +For this reason, no performance results are provided in this page. + +A build of this image is currently hosted on the [Quay.io](https://quay.io/) registry at the following reference: +`quay.io/ethcscs/comm-fwk:ofi1.22-ucx1.19-cuda12.8`. +The image name `comm-fwk` is a shortened form of "communication frameworks". + +## Contents + +- Ubuntu 24.04 +- CUDA 12.8.1 +- GDRCopy 2.5.1 +- Libfabric 1.22.0 +- UCX 1.19.0 + +## Containerfile +```Dockerfile +ARG ubuntu_version=24.04 +ARG cuda_version=12.8.1 +FROM docker.io/nvidia/cuda:${cuda_version}-cudnn-devel-ubuntu${ubuntu_version} + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive \ + apt-get install -y \ + build-essential \ + ca-certificates \ + pkg-config \ + automake \ + autoconf \ + libtool \ + cmake \ + gdb \ + strace \ + wget \ + git \ + bzip2 \ + python3 \ + gfortran \ + rdma-core \ + numactl \ + libconfig-dev \ + libuv1-dev \ + libfuse-dev \ + libfuse3-dev \ + libyaml-dev \ + libnl-3-dev \ + libnuma-dev \ + libsensors-dev \ + libcurl4-openssl-dev \ + libjson-c-dev \ + libibverbs-dev \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* + +ARG gdrcopy_version=2.5.1 +RUN git clone --depth 1 --branch v${gdrcopy_version} https://github.com/NVIDIA/gdrcopy.git \ + && cd gdrcopy \ + && export CUDA_PATH=${CUDA_HOME:-$(echo $(which nvcc) | grep -o '.*cuda')} \ + && make CC=gcc CUDA=$CUDA_PATH lib \ + && make lib_install \ + && cd ../ && rm -rf gdrcopy + +# Install libfabric +ARG libfabric_version=1.22.0 +RUN git clone --branch v${libfabric_version} --depth 1 https://github.com/ofiwg/libfabric.git \ + && cd libfabric \ + && ./autogen.sh \ + && ./configure --prefix=/usr --with-cuda=/usr/local/cuda --enable-cuda-dlopen --enable-gdrcopy-dlopen --enable-efa \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -rf libfabric + +# Install UCX +ARG UCX_VERSION=1.19.0 +RUN wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${UCX_VERSION}.tar.gz \ + && tar xzf ucx-${UCX_VERSION}.tar.gz \ + && cd ucx-${UCX_VERSION} \ + && mkdir build \ + && cd build \ + && ../configure --prefix=/usr --with-cuda=/usr/local/cuda --with-gdrcopy=/usr/local --enable-mt --enable-devel-headers \ + && make -j$(nproc) \ + && make install \ + && cd ../.. \ + && rm -rf ucx-${UCX_VERSION}.tar.gz ucx-${UCX_VERSION} +``` + +## Notes +- The image is based on an official NVIDIA CUDA image, and therefore already provides the NCCL library, alongside a complete CUDA installation. +- Communication frameworks are built with explicit support for CUDA and GDRCopy. +- The libfabric EFA provider is included to leave open the possibility to experiment with derived images on AWS infrastructure as well. 
+- Although only the libfabric framework is required to support Alps' Slingshot network, this image also packages the UCX communication framework to allow building a broader set of software (e.g. some OpenSHMEM implementations) and supporting optimized Infiniband communication as well. diff --git a/docs/software/container-engine/examples/image-mpich.md b/docs/software/container-engine/examples/image-mpich.md new file mode 100644 index 00000000..2dd617cf --- /dev/null +++ b/docs/software/container-engine/examples/image-mpich.md @@ -0,0 +1,578 @@ +[](){#ref-ce-guidelines-images-mpich} +# MPICH image + +This page describes a container image featuring the MPICH library as MPI (Message Passing Interface) implementation, with support for CUDA and Libfabric. + +This image is based on the [communication frameworks image][ref-ce-guidelines-images-commfwk], and thus it is suited for hosts with NVIDIA GPUs, like Alps GH200 nodes. + +A build of this image is currently hosted on the [Quay.io](https://quay.io/) registry at the following reference: +`quay.io/ethcscs/mpich:4.3.1-ofi1.22-cuda12.8`. + +## Contents + +- Ubuntu 24.04 +- CUDA 12.8.1 +- GDRCopy 2.5.1 +- Libfabric 1.22.0 +- UCX 1.19.0 +- MPICH 4.3.1 + +## Containerfile +```Dockerfile +FROM quay.io/ethcscs/comm-fwk:ofi1.22-ucx1.19-cuda12.8 + +ARG MPI_VER=4.3.1 +RUN wget -q https://www.mpich.org/static/downloads/${MPI_VER}/mpich-${MPI_VER}.tar.gz \ + && tar xf mpich-${MPI_VER}.tar.gz \ + && cd mpich-${MPI_VER} \ + && ./autogen.sh \ + && ./configure --prefix=/usr --enable-fast=O3,ndebug \ + --disable-fortran --disable-cxx \ + --with-device=ch4:ofi --with-libfabric=/usr \ + --with-cuda=/usr/local/cuda \ + CFLAGS="-L/usr/local/cuda/targets/sbsa-linux/lib/stubs/ -lcuda" \ + CXXFLAGS="-L/usr/local/cuda/targets/sbsa-linux/lib/stubs/ -lcuda" \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -rf mpich-${MPI_VER}.tar.gz mpich-${MPI_VER} +``` + +!!! tip + This image builds MPICH without Fortran and C++ bindings. In general, C++ bindings are deprecated by the MPI standard. If you require the Fortran bindings, remove the `--disable-fortran` option in the MPICH `configure` command above. + + +## Performance examples + +In this section we demonstrate the performance of the previosly created MPICH image using it to build the OSU Micro-Benchmarks 7.5.1, and deploying the resulting image on Alps through the Container Engine to run a variety of benchmarks. + +A build of the image with the OSU benchmarks is available on the [Quay.io](https://quay.io/) registry at the following reference: +`quay.io/ethcscs/osu-mb:7.5-mpich4.3.1-ofi1.22-cuda12.8`. + +### OSU-MB Containerfile +```Dockerfile +FROM quay.io/ethcscs/mpich:4.3.1-ofi1.22-cuda12.8 + +ARG omb_version=7.5.1 +RUN wget -q http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-${omb_version}.tar.gz \ + && tar xf osu-micro-benchmarks-${omb_version}.tar.gz \ + && cd osu-micro-benchmarks-${omb_version} \ + && ldconfig /usr/local/cuda/targets/sbsa-linux/lib/stubs \ + && ./configure --prefix=/usr/local CC=$(which mpicc) CFLAGS="-O3 -lcuda -lnvidia-ml" \ + --enable-cuda --with-cuda-include=/usr/local/cuda/include \ + --with-cuda-libpath=/usr/local/cuda/lib64 \ + CXXFLAGS="-lmpi -lcuda" \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. 
\ + && rm -rf osu-micro-benchmarks-${omb_version} osu-micro-benchmarks-${omb_version}.tar.gz + +WORKDIR /usr/local/libexec/osu-micro-benchmarks/mpi +``` + +### Environment Definition File +```toml +image = "quay.io#ethcscs/osu-mb:7.5-mpich4.3.1-ofi1.22-cuda12.8" +``` + +### Notes + +- **Important:** To make sure that GPU-to-GPU performance is good for inter-node communication one must set the variable `MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1`. + This setting can negatively impact performance for other types of communication (e.g. intra-node CPU-to-CPU transfers). +- Since by default MPICH uses PMI-1 or PMI-2 for wire-up and communication between ranks, when using this image the `srun` option `--mpi=pmi2` must be used to run successful multi-rank jobs. + +### Results + +=== "Point-to-point bandwidth, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmi2 --environment=omb-mpich ./pt2pt/osu_bw --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.88 Pass + 2 1.76 Pass + 4 3.53 Pass + 8 7.07 Pass + 16 14.16 Pass + 32 27.76 Pass + 64 56.80 Pass + 128 113.27 Pass + 256 225.42 Pass + 512 445.70 Pass + 1024 883.96 Pass + 2048 1733.54 Pass + 4096 3309.75 Pass + 8192 6188.29 Pass + 16384 12415.59 Pass + 32768 19526.60 Pass + 65536 22624.33 Pass + 131072 23346.67 Pass + 262144 23671.41 Pass + 524288 23847.29 Pass + 1048576 23940.59 Pass + 2097152 23980.12 Pass + 4194304 24007.69 Pass + ``` + +=== "Point-to-point bandwidth, GPU-to-GPU memory, inter-node communication" + ```console + $ MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 srun -N2 --mpi=pmi2 --environment=omb-mpich ./pt2pt/osu_bw --validation D D + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.92 Pass + 2 1.80 Pass + 4 3.72 Pass + 8 7.45 Pass + 16 14.91 Pass + 32 29.66 Pass + 64 59.65 Pass + 128 119.08 Pass + 256 236.90 Pass + 512 467.70 Pass + 1024 930.74 Pass + 2048 1808.56 Pass + 4096 3461.06 Pass + 8192 6385.63 Pass + 16384 12768.18 Pass + 32768 19332.39 Pass + 65536 22547.35 Pass + 131072 23297.26 Pass + 262144 23652.07 Pass + 524288 23812.58 Pass + 1048576 23913.85 Pass + 2097152 23971.55 Pass + 4194304 23998.79 Pass + ``` + + +=== "Point-to-point bandwidth, CPU-to-CPU memory, intra-node communication" + ```console + $ srun -N1 -n2 --mpi=pmi2 --environment=omb-mpich ./pt2pt/osu_bw --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Bandwidth (MB/s) Validation + 1 1.28 Pass + 2 2.60 Pass + 4 5.20 Pass + 8 10.39 Pass + 16 20.85 Pass + 32 41.56 Pass + 64 83.23 Pass + 128 164.73 Pass + 256 326.92 Pass + 512 632.98 Pass + 1024 1209.82 Pass + 2048 2352.68 Pass + 4096 4613.67 Pass + 8192 8881.00 Pass + 16384 7435.51 Pass + 32768 9369.82 Pass + 65536 11644.51 Pass + 131072 13198.71 Pass + 262144 14058.41 Pass + 524288 12958.24 Pass + 1048576 12836.55 Pass + 2097152 13117.14 Pass + 4194304 13187.01 Pass + ``` + + +=== "Point-to-point bandwidth, GPU-to-GPU memory, intra-node communication" + ```console + $ srun -N1 -n2 --mpi=pmi2 --environment=omb-mpich ./pt2pt/osu_bw --validation D D + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.13 Pass + 2 0.27 Pass + 4 0.55 Pass + 8 1.10 Pass + 16 2.20 Pass + 32 4.40 Pass + 64 8.77 Pass + 128 17.50 Pass + 256 35.01 Pass + 512 70.14 Pass + 1024 140.35 Pass + 2048 278.91 Pass + 4096 555.96 Pass + 8192 1104.97 Pass + 16384 2214.87 Pass + 32768 4422.67 Pass + 65536 8833.18 Pass + 131072 17765.30 Pass + 262144 33834.24 Pass + 524288 59704.15 Pass + 1048576 84566.94 Pass + 2097152 102221.49 Pass + 4194304 113955.83 Pass + ``` + + +=== "Point-to-point bi-directional bandwidth, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmi2 --environment=omb-mpich ./pt2pt/osu_bibw --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bibw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bibw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI Bi-Directional Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 1.03 Pass + 2 2.07 Pass + 4 4.14 Pass + 8 8.28 Pass + 16 16.54 Pass + 32 33.07 Pass + 64 66.08 Pass + 128 131.65 Pass + 256 258.60 Pass + 512 518.60 Pass + 1024 1036.09 Pass + 2048 2072.16 Pass + 4096 4142.18 Pass + 8192 7551.70 Pass + 16384 14953.49 Pass + 32768 23871.35 Pass + 65536 33767.12 Pass + 131072 39284.40 Pass + 262144 42638.43 Pass + 524288 44602.52 Pass + 1048576 45621.16 Pass + 2097152 46159.65 Pass + 4194304 46433.80 Pass + ``` + + +=== "Point-to-point bi-directional bandwidth, GPU-to-GPU memory, inter-node communication" + ```console + $ MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 srun -N2 --mpi=pmi2 --environment=omb-mpich ./pt2pt/osu_bibw --validation D D + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bibw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bibw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI-CUDA Bi-Directional Bandwidth Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Bandwidth (MB/s) Validation + 1 1.05 Pass + 2 2.10 Pass + 4 4.20 Pass + 8 8.40 Pass + 16 16.84 Pass + 32 33.63 Pass + 64 67.01 Pass + 128 132.11 Pass + 256 258.74 Pass + 512 515.52 Pass + 1024 1025.44 Pass + 2048 2019.51 Pass + 4096 3844.87 Pass + 8192 6123.96 Pass + 16384 13244.25 Pass + 32768 22521.76 Pass + 65536 34040.97 Pass + 131072 39503.52 Pass + 262144 42827.91 Pass + 524288 44663.44 Pass + 1048576 45629.24 Pass + 2097152 46167.41 Pass + 4194304 46437.18 Pass + ``` + + +=== "Point-to-point latency, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmi2 --environment=omb-mpich ./pt2pt/osu_latency --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_latency: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_latency: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 3.00 Pass + 2 2.99 Pass + 4 2.99 Pass + 8 3.07 Pass + 16 2.99 Pass + 32 3.08 Pass + 64 3.01 Pass + 128 3.88 Pass + 256 4.43 Pass + 512 4.62 Pass + 1024 4.47 Pass + 2048 4.57 Pass + 4096 4.79 Pass + 8192 7.92 Pass + 16384 8.53 Pass + 32768 9.48 Pass + 65536 10.92 Pass + 131072 13.84 Pass + 262144 19.19 Pass + 524288 30.05 Pass + 1048576 51.73 Pass + 2097152 94.94 Pass + 4194304 181.46 Pass + ``` + + +=== "All-to-all collective latency, CPU-to-CPU memory, multiple nodes" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmi2 --environment=omb-mpich ./collective/osu_alltoall --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Avg Latency(us) Validation + 1 22.25 Pass + 2 22.34 Pass + 4 21.83 Pass + 8 21.72 Pass + 16 21.74 Pass + 32 21.71 Pass + 64 22.02 Pass + 128 22.35 Pass + 256 22.84 Pass + 512 23.42 Pass + 1024 24.61 Pass + 2048 24.99 Pass + 4096 26.02 Pass + 8192 29.17 Pass + 16384 68.81 Pass + 32768 95.63 Pass + 65536 181.42 Pass + 131072 306.83 Pass + 262144 526.50 Pass + 524288 960.52 Pass + 1048576 1823.52 Pass + ``` + + +=== "All-to-all collective latency, GPU-to-GPU memory, multiple nodes" + ```console + $ MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 srun -N2 --ntasks-per-node=4 --mpi=pmi2 --environment=omb-mpich ./collective/osu_alltoall --validation -d cuda + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI-CUDA All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 65.62 Pass + 2 65.51 Pass + 4 65.46 Pass + 8 65.40 Pass + 16 65.58 Pass + 32 64.97 Pass + 64 65.01 Pass + 128 65.31 Pass + 256 65.03 Pass + 512 65.14 Pass + 1024 65.67 Pass + 2048 66.23 Pass + 4096 66.69 Pass + 8192 67.47 Pass + 16384 85.99 Pass + 32768 103.15 Pass + 65536 120.40 Pass + 131072 135.64 Pass + 262144 162.24 Pass + 524288 213.84 Pass + 1048576 317.07 Pass + ``` + + +### Results without the CXI hook +On many Alps vClusters, the Container Engine is configured with the CXI hook enabled by default, enabling transparent access to the Slingshot interconnect. + +This section demonstrates the performance benefit of the CXI hook by explicitly disabling it through the EDF: +```console +$ cat .edf/omb-mpich-no-cxi.toml +image = "quay.io#ethcscs/osu-mb:7.5-mpich4.3.1-ofi1.22-cuda12.8" + +[annotations] +com.hooks.cxi.enabled="false" +``` + +=== "Point-to-point bandwidth, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmi2 --environment=omb-mpich-no-cxi ./pt2pt/osu_bw --validation + + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Bandwidth (MB/s) Validation + 1 0.14 Pass + 2 0.28 Pass + 4 0.56 Pass + 8 1.15 Pass + 16 2.32 Pass + 32 4.55 Pass + 64 9.36 Pass + 128 18.20 Pass + 256 20.26 Pass + 512 39.11 Pass + 1024 55.88 Pass + 2048 108.19 Pass + 4096 142.91 Pass + 8192 393.95 Pass + 16384 307.93 Pass + 32768 1205.61 Pass + 65536 1723.86 Pass + 131072 2376.59 Pass + 262144 2847.85 Pass + 524288 3277.75 Pass + 1048576 3580.23 Pass + 2097152 3697.47 Pass + 4194304 3764.11 Pass + ``` + +=== "Point-to-point bandwidth, GPU-to-GPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmi2 --environment=omb-mpich-no-cxi ./pt2pt/osu_bw --validation D D + + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.04 Pass + 2 0.08 Pass + 4 0.16 Pass + 8 0.31 Pass + 16 0.62 Pass + 32 1.24 Pass + 64 2.46 Pass + 128 4.80 Pass + 256 7.33 Pass + 512 14.40 Pass + 1024 24.43 Pass + 2048 47.68 Pass + 4096 85.40 Pass + 8192 161.68 Pass + 16384 306.15 Pass + 32768 520.57 Pass + 65536 818.99 Pass + 131072 1160.48 Pass + 262144 1436.44 Pass + 524288 1676.61 Pass + 1048576 2003.55 Pass + 2097152 2104.65 Pass + 4194304 2271.56 Pass + ``` + +=== "Point-to-point latency, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmi2 --environment=omb-mpich-no-cxi ./pt2pt/osu_latency --validation + + # OSU MPI Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 38.25 Pass + 2 38.58 Pass + 4 38.49 Pass + 8 38.43 Pass + 16 38.40 Pass + 32 38.49 Pass + 64 39.18 Pass + 128 39.23 Pass + 256 45.17 Pass + 512 53.49 Pass + 1024 59.60 Pass + 2048 48.83 Pass + 4096 50.84 Pass + 8192 51.45 Pass + 16384 52.35 Pass + 32768 58.92 Pass + 65536 74.88 Pass + 131072 100.32 Pass + 262144 135.35 Pass + 524288 219.52 Pass + 1048576 384.61 Pass + 2097152 706.79 Pass + 4194304 1341.79 Pass + ``` + + +=== "All-to-all collective latency, CPU-to-CPU memory, multiple nodes" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmi2 --environment=omb-mpich-no-cxi ./collective/osu_alltoall --validation + + # OSU MPI All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 169.19 Pass + 2 169.50 Pass + 4 170.35 Pass + 8 168.81 Pass + 16 169.71 Pass + 32 169.60 Pass + 64 169.47 Pass + 128 171.48 Pass + 256 334.47 Pass + 512 343.06 Pass + 1024 703.55 Pass + 2048 449.30 Pass + 4096 454.68 Pass + 8192 468.90 Pass + 16384 532.46 Pass + 32768 578.95 Pass + 65536 1164.92 Pass + 131072 1511.04 Pass + 262144 2287.48 Pass + 524288 3668.35 Pass + 1048576 6498.36 Pass + ``` + + +=== "All-to-all collective latency, GPU-to-GPU memory, multiple nodes" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmi2 --environment=omb-mpich-no-cxi ./collective/osu_alltoall --validation -d cuda + + # OSU MPI-CUDA All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Avg Latency(us) Validation + 1 276.29 Pass + 2 273.94 Pass + 4 273.53 Pass + 8 273.88 Pass + 16 274.83 Pass + 32 274.90 Pass + 64 276.85 Pass + 128 278.17 Pass + 256 413.21 Pass + 512 442.62 Pass + 1024 793.14 Pass + 2048 547.57 Pass + 4096 561.82 Pass + 8192 570.71 Pass + 16384 624.20 Pass + 32768 657.30 Pass + 65536 1168.43 Pass + 131072 1451.91 Pass + 262144 2049.24 Pass + 524288 3061.54 Pass + 1048576 5238.24 Pass + ``` diff --git a/docs/software/container-engine/examples/image-nccl-tests.md b/docs/software/container-engine/examples/image-nccl-tests.md new file mode 100644 index 00000000..3e1e4a54 --- /dev/null +++ b/docs/software/container-engine/examples/image-nccl-tests.md @@ -0,0 +1,183 @@ +[](){#ref-ce-guidelines-images-nccl-tests} +# NCCL Tests image + +This page describes a container image featuring the [NCCL Tests](https://github.com/NVIDIA/nccl-tests) to demonstrate how to efficiently execute NCCL-based containerized software on Alps. + +This image is based on the [OpenMPI image][ref-ce-guidelines-images-ompi], and thus it is suited for hosts with NVIDIA GPUs, like Alps GH200 nodes. + +A build of this image is currently hosted on the [Quay.io](https://quay.io/) registry at the following reference: +`quay.io/ethcscs/nccl-tests:2.17.1-ompi5.0.8-ofi1.22-cuda12.8`. + +## Contents + +- Ubuntu 24.04 +- CUDA 12.8.1 (includes NCCL) +- GDRCopy 2.5.1 +- Libfabric 1.22.0 +- UCX 1.19.0 +- OpenMPI 5.0.8 +- NCCL Tests 2.17.1 + +## Containerfile +```Dockerfile +FROM quay.io/ethcscs/ompi:5.0.8-ofi1.22-cuda12.8 + +ARG nccl_tests_version=2.17.1 +RUN wget -O nccl-tests-${nccl_tests_version}.tar.gz https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${nccl_tests_version}.tar.gz \ + && tar xf nccl-tests-${nccl_tests_version}.tar.gz \ + && cd nccl-tests-${nccl_tests_version} \ + && MPI=1 make -j$(nproc) \ + && cd .. \ + && rm -rf nccl-tests-${nccl_tests_version}.tar.gz +``` + +!!! note + This image builds NCCL tests with MPI support enabled. + +## Performance examples + +### Environment Definition File +```toml +image = "quay.io#ethcscs/nccl-tests:2.17.1-ompi5.0.8-ofi1.22-cuda12.8" + +[env] +PMIX_MCA_psec="native" + +[annotations] +com.hooks.aws_ofi_nccl.enabled = "true" +com.hooks.aws_ofi_nccl.variant = "cuda12" +``` + +### Notes + +- Since OpenMPI uses PMIx for wire-up and communication between ranks, when using this image the `srun` option `--mpi=pmix` must be used to run successful multi-rank jobs. +- NCCL requires the presence of the [AWS OFI NCCL plugin](https://github.com/aws/aws-ofi-nccl) in order to correctly interface with Libfabric and (through the latter) the Slingshot interconnect. Therefore, for optimal performance the [related CE hook][ref-ce-aws-ofi-hook] must be enabled and set to match the CUDA version in the container. +- Libfabric itself is usually injected by the [CXI hook][ref-ce-cxi-hook], which is enabled by default on several Alps vClusters. 
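+
+To check that the plugin is actually being used at runtime, one option is to enable NCCL's standard debug output in the EDF shown above.
+The sketch below simply adds `NCCL_DEBUG` to that EDF; it is meant for verification only and is not required for performance:
+
+```toml
+image = "quay.io#ethcscs/nccl-tests:2.17.1-ompi5.0.8-ofi1.22-cuda12.8"
+
+[env]
+PMIX_MCA_psec="native"
+NCCL_DEBUG="INFO"
+
+[annotations]
+com.hooks.aws_ofi_nccl.enabled = "true"
+com.hooks.aws_ofi_nccl.variant = "cuda12"
+```
+
+With this setting, the NCCL initialization messages report which network plugin and Libfabric provider were selected, making it easy to spot runs where the hook was not applied.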
+ +### Results + +=== "All-reduce latency test on 2 nodes, 8 GPUs" + ```console + $ srun -N2 -t5 --mpi=pmix --ntasks-per-node=4 --environment=nccl-test-ompi /nccl-tests-2.17.1/build/all_reduce_perf -b 8 -e 128M -f 2 + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + # Collective test starting: all_reduce_perf + # nThread 1 nGpus 1 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 1 iters: 20 agg iters: 1 validation: 1 graph: 0 + # + # Using devices + # Rank 0 Group 0 Pid 204199 on nid005471 device 0 [0009:01:00] NVIDIA GH200 120GB + # Rank 1 Group 0 Pid 204200 on nid005471 device 1 [0019:01:00] NVIDIA GH200 120GB + # Rank 2 Group 0 Pid 204201 on nid005471 device 2 [0029:01:00] NVIDIA GH200 120GB + # Rank 3 Group 0 Pid 204202 on nid005471 device 3 [0039:01:00] NVIDIA GH200 120GB + # Rank 4 Group 0 Pid 155254 on nid005487 device 0 [0009:01:00] NVIDIA GH200 120GB + # Rank 5 Group 0 Pid 155255 on nid005487 device 1 [0019:01:00] NVIDIA GH200 120GB + # Rank 6 Group 0 Pid 155256 on nid005487 device 2 [0029:01:00] NVIDIA GH200 120GB + # Rank 7 Group 0 Pid 155257 on nid005487 device 3 [0039:01:00] NVIDIA GH200 120GB + # + # out-of-place in-place + # size count type redop root time algbw busbw #wrong time algbw busbw #wrong + # (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8 2 float sum -1 17.93 0.00 0.00 0 17.72 0.00 0.00 0 + 16 4 float sum -1 17.65 0.00 0.00 0 17.63 0.00 0.00 0 + 32 8 float sum -1 17.54 0.00 0.00 0 17.43 0.00 0.00 0 + 64 16 float sum -1 19.27 0.00 0.01 0 19.21 0.00 0.01 0 + 128 32 float sum -1 18.86 0.01 0.01 0 18.67 0.01 0.01 0 + 256 64 float sum -1 18.83 0.01 0.02 0 19.02 0.01 0.02 0 + 512 128 float sum -1 19.72 0.03 0.05 0 19.40 0.03 0.05 0 + 1024 256 float sum -1 20.35 0.05 0.09 0 20.32 0.05 0.09 0 + 2048 512 float sum -1 22.07 0.09 0.16 0 21.72 0.09 0.17 0 + 4096 1024 float sum -1 31.97 0.13 0.22 0 31.58 0.13 0.23 0 + 8192 2048 float sum -1 37.21 0.22 0.39 0 35.84 0.23 0.40 0 + 16384 4096 float sum -1 37.29 0.44 0.77 0 36.53 0.45 0.78 0 + 32768 8192 float sum -1 39.61 0.83 1.45 0 37.09 0.88 1.55 0 + 65536 16384 float sum -1 61.03 1.07 1.88 0 68.45 0.96 1.68 0 + 131072 32768 float sum -1 81.41 1.61 2.82 0 72.94 1.80 3.14 0 + 262144 65536 float sum -1 127.0 2.06 3.61 0 108.9 2.41 4.21 0 + 524288 131072 float sum -1 170.3 3.08 5.39 0 349.6 1.50 2.62 0 + 1048576 262144 float 
sum -1 164.3 6.38 11.17 0 187.7 5.59 9.77 0 + 2097152 524288 float sum -1 182.1 11.51 20.15 0 180.6 11.61 20.32 0 + 4194304 1048576 float sum -1 292.7 14.33 25.08 0 295.4 14.20 24.85 0 + 8388608 2097152 float sum -1 344.5 24.35 42.61 0 345.7 24.27 42.47 0 + 16777216 4194304 float sum -1 461.7 36.34 63.59 0 454.0 36.95 64.67 0 + 33554432 8388608 float sum -1 686.5 48.88 85.54 0 686.6 48.87 85.52 0 + 67108864 16777216 float sum -1 1090.5 61.54 107.69 0 1083.5 61.94 108.39 0 + 134217728 33554432 float sum -1 1916.4 70.04 122.57 0 1907.8 70.35 123.11 0 + # Out of bounds values : 0 OK + # Avg bus bandwidth : 19.7866 + # + # Collective test concluded: all_reduce_perf + ``` + +### Results without the AWS OFI NCCL hook +This section demonstrates the performance benefit of the AWS OFI NCCL hook by not enabling it through the EDF: +```console +$ cat ~/.edf/nccl-test-ompi-no-awsofinccl.toml +image = "quay.io#ethcscs/nccl-tests:2.17.1-ompi5.0.8-ofi1.22-cuda12.8" + +[env] +PMIX_MCA_psec="native" +``` + +=== "All-reduce latency test on 2 nodes, 8 GPUs" + ```console + $ srun -N2 -t5 --mpi=pmix --ntasks-per-node=4 --environment=nccl-test-ompi /nccl-tests-2.17.1/build/all_reduce_perf -b 8 -e 128M -f 2 + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + # Collective test starting: all_reduce_perf + # nThread 1 nGpus 1 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 1 iters: 20 agg iters: 1 validation: 1 graph: 0 + # + # Using devices + # Rank 0 Group 0 Pid 202829 on nid005471 device 0 [0009:01:00] NVIDIA GH200 120GB + # Rank 1 Group 0 Pid 202830 on nid005471 device 1 [0019:01:00] NVIDIA GH200 120GB + # Rank 2 Group 0 Pid 202831 on nid005471 device 2 [0029:01:00] NVIDIA GH200 120GB + # Rank 3 Group 0 Pid 202832 on nid005471 device 3 [0039:01:00] NVIDIA GH200 120GB + # Rank 4 Group 0 Pid 154517 on nid005487 device 0 [0009:01:00] NVIDIA GH200 120GB + # Rank 5 Group 0 Pid 154518 on nid005487 device 1 [0019:01:00] NVIDIA GH200 120GB + # Rank 6 Group 0 Pid 154519 on nid005487 device 2 [0029:01:00] NVIDIA GH200 120GB + # Rank 7 Group 0 Pid 154520 on nid005487 device 3 [0039:01:00] NVIDIA GH200 120GB + # + # out-of-place in-place + # size count type redop root time algbw busbw #wrong time algbw busbw #wrong + # (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8 2 float sum -1 85.47 0.00 0.00 0 53.44 0.00 0.00 0 + 
16 4 float sum -1 52.41 0.00 0.00 0 51.11 0.00 0.00 0 + 32 8 float sum -1 50.45 0.00 0.00 0 50.40 0.00 0.00 0 + 64 16 float sum -1 62.58 0.00 0.00 0 50.70 0.00 0.00 0 + 128 32 float sum -1 50.94 0.00 0.00 0 50.77 0.00 0.00 0 + 256 64 float sum -1 50.76 0.01 0.01 0 51.77 0.00 0.01 0 + 512 128 float sum -1 163.2 0.00 0.01 0 357.5 0.00 0.00 0 + 1024 256 float sum -1 373.0 0.00 0.00 0 59.31 0.02 0.03 0 + 2048 512 float sum -1 53.22 0.04 0.07 0 52.58 0.04 0.07 0 + 4096 1024 float sum -1 55.95 0.07 0.13 0 56.63 0.07 0.13 0 + 8192 2048 float sum -1 58.52 0.14 0.24 0 58.62 0.14 0.24 0 + 16384 4096 float sum -1 108.7 0.15 0.26 0 107.8 0.15 0.27 0 + 32768 8192 float sum -1 184.1 0.18 0.31 0 183.5 0.18 0.31 0 + 65536 16384 float sum -1 325.0 0.20 0.35 0 325.4 0.20 0.35 0 + 131072 32768 float sum -1 592.7 0.22 0.39 0 591.5 0.22 0.39 0 + 262144 65536 float sum -1 942.0 0.28 0.49 0 941.4 0.28 0.49 0 + 524288 131072 float sum -1 1143.1 0.46 0.80 0 1138.0 0.46 0.81 0 + 1048576 262144 float sum -1 1502.2 0.70 1.22 0 1478.9 0.71 1.24 0 + 2097152 524288 float sum -1 921.8 2.28 3.98 0 899.8 2.33 4.08 0 + 4194304 1048576 float sum -1 1443.1 2.91 5.09 0 1432.7 2.93 5.12 0 + 8388608 2097152 float sum -1 2437.7 3.44 6.02 0 2417.0 3.47 6.07 0 + 16777216 4194304 float sum -1 5036.9 3.33 5.83 0 5003.6 3.35 5.87 0 + 33554432 8388608 float sum -1 17388 1.93 3.38 0 17275 1.94 3.40 0 + 67108864 16777216 float sum -1 21253 3.16 5.53 0 21180 3.17 5.54 0 + 134217728 33554432 float sum -1 43293 3.10 5.43 0 43396 3.09 5.41 0 + # Out of bounds values : 0 OK + # Avg bus bandwidth : 1.58767 + # + # Collective test concluded: all_reduce_perf + ``` diff --git a/docs/software/container-engine/examples/image-nvshmem.md b/docs/software/container-engine/examples/image-nvshmem.md new file mode 100644 index 00000000..1ced3ef5 --- /dev/null +++ b/docs/software/container-engine/examples/image-nvshmem.md @@ -0,0 +1,237 @@ +[](){#ref-ce-guidelines-images-nvshmem} +# NVSHMEM image + +This page describes a container image featuring the [NVSHMEM](https://developer.nvidia.com/nvshmem) parallel programming library with support for Libfabric, and demonstrates how to efficiently run said image on Alps. + +This image is based on the [OpenMPI image][ref-ce-guidelines-images-ompi], and thus it is suited for hosts with NVIDIA GPUs, like Alps GH200 nodes. + +A build of this image is currently hosted on the [Quay.io](https://quay.io/) registry at the following reference: +`quay.io/ethcscs/nvshmem:3.4.5-ompi5.0.8-ofi1.22-cuda12.8`. 
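+
+As an orientation for building your own NVSHMEM applications on top of this image, the sketch below shows a minimal, hypothetical program using only the core host API (it is not shipped with the image, and assumes one GPU per PE):
+
+```cuda
+// hello_nvshmem.cu -- hypothetical example, not part of the image
+#include <cstdio>
+#include <cuda_runtime.h>
+#include <nvshmem.h>
+#include <nvshmemx.h>
+
+int main() {
+    nvshmem_init();                                          // bootstrap (PMIx in this image)
+    int mype      = nvshmem_my_pe();                         // global PE id
+    int npes      = nvshmem_n_pes();                         // total number of PEs
+    int mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);  // PE id within the node
+    cudaSetDevice(mype_node);                                // one GPU per PE
+
+    // Symmetric allocation on the GPU, accessible by remote PEs
+    double *buf = static_cast<double *>(nvshmem_malloc(1024 * sizeof(double)));
+    nvshmem_barrier_all();
+
+    printf("PE %d of %d (node-local rank %d) initialized NVSHMEM\n", mype, npes, mype_node);
+
+    nvshmem_free(buf);
+    nvshmem_finalize();
+    return 0;
+}
+```
+
+Such a program would be compiled with `nvcc` against the NVSHMEM installation inside the container (the performance tests used later on this page live under `/usr/local/nvshmem`; the exact include and linker flags depend on how NVSHMEM was built), and launched in the same way as those tests, e.g. `srun --mpi=pmix --environment=nvshmem ./hello_nvshmem`.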
+ +## Contents + +- Ubuntu 24.04 +- CUDA 12.8.1 (includes NCCL) +- GDRCopy 2.5.1 +- Libfabric 1.22.0 +- UCX 1.19.0 +- OpenMPI 5.0.8 +- NVSHMEM 3.4.5 + +## Containerfile +```Dockerfile +FROM quay.io/ethcscs/ompi:5.0.8-ofi1.22-cuda12.8 + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive \ + apt-get install -y \ + python3-venv \ + python3-dev \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* \ + && rm /usr/lib/python3.12/EXTERNALLY-MANAGED + +# Build NVSHMEM from source +RUN wget -q https://developer.download.nvidia.com/compute/redist/nvshmem/3.4.5/source/nvshmem_src_cuda12-all-all-3.4.5.tar.gz \ + && tar -xvf nvshmem_src_cuda12-all-all-3.4.5.tar.gz \ + && cd nvshmem_src \ + && NVSHMEM_BUILD_EXAMPLES=0 \ + NVSHMEM_BUILD_TESTS=1 \ + NVSHMEM_DEBUG=0 \ + NVSHMEM_DEVEL=0 \ + NVSHMEM_DEFAULT_PMI2=0 \ + NVSHMEM_DEFAULT_PMIX=1 \ + NVSHMEM_DISABLE_COLL_POLL=1 \ + NVSHMEM_ENABLE_ALL_DEVICE_INLINING=0 \ + NVSHMEM_GPU_COLL_USE_LDST=0 \ + NVSHMEM_LIBFABRIC_SUPPORT=1 \ + NVSHMEM_MPI_SUPPORT=1 \ + NVSHMEM_MPI_IS_OMPI=1 \ + NVSHMEM_NVTX=1 \ + NVSHMEM_PMIX_SUPPORT=1 \ + NVSHMEM_SHMEM_SUPPORT=1 \ + NVSHMEM_TEST_STATIC_LIB=0 \ + NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ + NVSHMEM_TRACE=0 \ + NVSHMEM_USE_DLMALLOC=0 \ + NVSHMEM_USE_NCCL=1 \ + NVSHMEM_USE_GDRCOPY=1 \ + NVSHMEM_VERBOSE=0 \ + NVSHMEM_DEFAULT_UCX=0 \ + NVSHMEM_UCX_SUPPORT=0 \ + NVSHMEM_IBGDA_SUPPORT=0 \ + NVSHMEM_IBGDA_SUPPORT_GPUMEM_ONLY=0 \ + NVSHMEM_IBDEVX_SUPPORT=0 \ + NVSHMEM_IBRC_SUPPORT=0 \ + LIBFABRIC_HOME=/usr \ + NCCL_HOME=/usr \ + GDRCOPY_HOME=/usr/local \ + MPI_HOME=/usr \ + SHMEM_HOME=/usr \ + NVSHMEM_HOME=/usr \ + cmake . \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -r nvshmem_src nvshmem_src_cuda12-all-all-3.4.5.tar.gz +``` + +!!! note + - This image also builds the performance tests bundled with NVSHMEM (`NVSHMEM_BUILD_TESTS=1`) to demonstrate performance below. The performance tests, in turn, require the installation of Python dependencies. When building images intended solely for production purposes, you may exclude both those elements. + - Notice that NVSHMEM is configured with support for Libfabric explicitly enabled (`NVSHMEM_LIBFABRIC_SUPPORT=1`). + - Since this image is meant primarily to run on Alps, NVSHMEM is built without support for UCX and Infiniband components. + - Since this image uses OpenMPI (which provides PMIx) as MPI implementation, NVSHMEM is also configured to default to PMIx for bootstrapping (`NVSHMEM_PMIX_SUPPORT=1`). + +## Performance examples + +### Environment Definition File +```toml +image = "quay.io#ethcscs/nvshmem:3.4.5-ompi5.0.8-ofi1.22-cuda12.8" + +[env] +PMIX_MCA_psec="native" +NVSHMEM_REMOTE_TRANSPORT="libfabric" +NVSHMEM_LIBFABRIC_PROVIDER="cxi" +NVSHMEM_DISABLE_CUDA_VMM="1" + +[annotations] +com.hooks.aws_ofi_nccl.enabled = "true" +com.hooks.aws_ofi_nccl.variant = "cuda12" +``` + +### Notes + +- NVSHMEM's `libfabric` transport does not support VMM yet, so VMM must be disabled by setting the environment variable `NVSHMEM_DISABLE_CUDA_VMM=1`. +- Since NVSHMEM has been configured in the Containerfile to use PMIx for bootstrapping, when using this image the `srun` option `--mpi=pmix` must be used to run successful multi-rank jobs. +- Other bootstrapping methods (including different PMI implementations) can be specified for NVSHMEM throug the related [environment variables](https://docs.nvidia.com/nvshmem/api/gen/env.html#bootstrap-options). When bootstrapping through PMI or MPI through Slurm, ensure that the PMI implementation used by Slurm (i.e. 
`srun --mpi` option) matches the one expected by NVSHMEM or the MPI library. +- NCCL requires the presence of the [AWS OFI NCCL plugin](https://github.com/aws/aws-ofi-nccl) in order to correctly interface with Libfabric and (through the latter) the Slingshot interconnect. Therefore, for optimal performance the [related CE hook][ref-ce-aws-ofi-hook] must be enabled and set to match the CUDA version in the container. +- Libfabric itself is usually injected by the [CXI hook][ref-ce-cxi-hook], which is enabled by default on several Alps vClusters. + +### Results + +=== "All-to-all latency test on 2 nodes, 8 GPUs" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=nvshmem /usr/local/nvshmem/bin/perftest/device/coll/alltoall_latency + Runtime options after parsing command line arguments + min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0 + Note: Above is full list of options, any given test will use only a subset of these variables. + mype: 6 mype_node: 2 device name: NVIDIA GH200 120GB bus id: 1 + Runtime options after parsing command line arguments + min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0 + Note: Above is full list of options, any given test will use only a subset of these variables. + mype: 5 mype_node: 1 device name: NVIDIA GH200 120GB bus id: 1 + Runtime options after parsing command line arguments + min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0 + Note: Above is full list of options, any given test will use only a subset of these variables. + mype: 7 mype_node: 3 device name: NVIDIA GH200 120GB bus id: 1 + Runtime options after parsing command line arguments + min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0 + Note: Above is full list of options, any given test will use only a subset of these variables. + mype: 4 mype_node: 0 device name: NVIDIA GH200 120GB bus id: 1 + Runtime options after parsing command line arguments + min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0 + Note: Above is full list of options, any given test will use only a subset of these variables. 
+ mype: 0 mype_node: 0 device name: NVIDIA GH200 120GB bus id: 1 + #alltoall_device + size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) + 32 8 32-bit thread 116.220796 0.000 0.000 + 64 16 32-bit thread 112.700796 0.001 0.000 + 128 32 32-bit thread 113.571203 0.001 0.001 + 256 64 32-bit thread 111.123204 0.002 0.002 + 512 128 32-bit thread 111.075199 0.005 0.004 + 1024 256 32-bit thread 110.131204 0.009 0.008 + 2048 512 32-bit thread 111.030400 0.018 0.016 + 4096 1024 32-bit thread 110.985601 0.037 0.032 + 8192 2048 32-bit thread 111.039996 0.074 0.065 + #alltoall_device + size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) + 32 8 32-bit warp 89.801598 0.000 0.000 + 64 16 32-bit warp 90.563202 0.001 0.001 + 128 32 32-bit warp 89.830399 0.001 0.001 + 256 64 32-bit warp 88.863999 0.003 0.003 + 512 128 32-bit warp 89.686400 0.006 0.005 + 1024 256 32-bit warp 88.908798 0.012 0.010 + 2048 512 32-bit warp 88.819200 0.023 0.020 + 4096 1024 32-bit warp 89.670402 0.046 0.040 + 8192 2048 32-bit warp 88.889599 0.092 0.081 + 16384 4096 32-bit warp 88.972801 0.184 0.161 + 32768 8192 32-bit warp 89.564800 0.366 0.320 + 65536 16384 32-bit warp 89.888000 0.729 0.638 + #alltoall_device + size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) + 32 8 32-bit block 89.747202 0.000 0.000 + 64 16 32-bit block 88.086402 0.001 0.001 + 128 32 32-bit block 87.254399 0.001 0.001 + 256 64 32-bit block 87.401599 0.003 0.003 + 512 128 32-bit block 88.095999 0.006 0.005 + 1024 256 32-bit block 87.273598 0.012 0.010 + 2048 512 32-bit block 88.086402 0.023 0.020 + 4096 1024 32-bit block 88.940799 0.046 0.040 + 8192 2048 32-bit block 88.095999 0.093 0.081 + 16384 4096 32-bit block 87.247998 0.188 0.164 + 32768 8192 32-bit block 88.976002 0.368 0.322 + 65536 16384 32-bit block 88.121599 0.744 0.651 + 131072 32768 32-bit block 90.579200 1.447 1.266 + 262144 65536 32-bit block 91.360003 2.869 2.511 + 524288 131072 32-bit block 101.145601 5.183 4.536 + 1048576 262144 32-bit block 111.052799 9.442 8.262 + 2097152 524288 32-bit block 137.164795 15.289 13.378 + 4194304 1048576 32-bit block 183.171201 22.898 20.036 + #alltoall_device + size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) + 64 8 64-bit thread 111.955202 0.001 0.001 + 128 16 64-bit thread 113.420796 0.001 0.001 + 256 32 64-bit thread 108.508801 0.002 0.002 + 512 64 64-bit thread 110.204804 0.005 0.004 + 1024 128 64-bit thread 109.487998 0.009 0.008 + 2048 256 64-bit thread 109.462404 0.019 0.016 + 4096 512 64-bit thread 110.156798 0.037 0.033 + 8192 1024 64-bit thread 109.401596 0.075 0.066 + 16384 2048 64-bit thread 108.591998 0.151 0.132 + #alltoall_device + size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) + 64 8 64-bit warp 88.896000 0.001 0.001 + 128 16 64-bit warp 89.679998 0.001 0.001 + 256 32 64-bit warp 88.950402 0.003 0.003 + 512 64 64-bit warp 89.606398 0.006 0.005 + 1024 128 64-bit warp 89.775997 0.011 0.010 + 2048 256 64-bit warp 88.838398 0.023 0.020 + 4096 512 64-bit warp 90.671998 0.045 0.040 + 8192 1024 64-bit warp 89.699203 0.091 0.080 + 16384 2048 64-bit warp 89.011198 0.184 0.161 + 32768 4096 64-bit warp 89.622402 0.366 0.320 + 65536 8192 64-bit warp 88.905603 0.737 0.645 + 131072 16384 64-bit warp 89.766401 1.460 1.278 + #alltoall_device + size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) + 64 8 64-bit block 89.788800 0.001 0.001 + 128 16 64-bit block 88.012803 0.001 0.001 + 256 32 64-bit block 87.353599 0.003 0.003 + 512 64 64-bit block 88.000000 0.006 0.005 + 1024 128 64-bit block 87.225598 
0.012 0.010 + 2048 256 64-bit block 87.225598 0.023 0.021 + 4096 512 64-bit block 87.168002 0.047 0.041 + 8192 1024 64-bit block 88.067198 0.093 0.081 + 16384 2048 64-bit block 88.863999 0.184 0.161 + 32768 4096 64-bit block 88.723201 0.369 0.323 + 65536 8192 64-bit block 87.993598 0.745 0.652 + 131072 16384 64-bit block 88.783997 1.476 1.292 + 262144 32768 64-bit block 91.366398 2.869 2.511 + 524288 65536 64-bit block 102.060795 5.137 4.495 + 1048576 131072 64-bit block 111.846399 9.375 8.203 + 2097152 262144 64-bit block 137.107205 15.296 13.384 + 4194304 524288 64-bit block 183.100796 22.907 20.044 + Runtime options after parsing command line arguments + min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0 + Note: Above is full list of options, any given test will use only a subset of these variables. + mype: 3 mype_node: 3 device name: NVIDIA GH200 120GB bus id: 1 + Runtime options after parsing command line arguments + min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0 + Note: Above is full list of options, any given test will use only a subset of these variables. + mype: 2 mype_node: 2 device name: NVIDIA GH200 120GB bus id: 1 + Runtime options after parsing command line arguments + min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0 + Note: Above is full list of options, any given test will use only a subset of these variables. + mype: 1 mype_node: 1 device name: NVIDIA GH200 120GB bus id: 1 + ``` diff --git a/docs/software/container-engine/examples/image-ompi.md b/docs/software/container-engine/examples/image-ompi.md new file mode 100644 index 00000000..c1287478 --- /dev/null +++ b/docs/software/container-engine/examples/image-ompi.md @@ -0,0 +1,576 @@ +[](){#ref-ce-guidelines-images-ompi} +# OpenMPI image + +This page describes a container image featuring the OpenMPI library as MPI (Message Passing Interface) implementation, with support for CUDA, Libfabric and UCX. + +This image is based on the [communication frameworks image][ref-ce-guidelines-images-commfwk], and thus it is suited for hosts with NVIDIA GPUs, like Alps GH200 nodes. + +A build of this image is currently hosted on the [Quay.io](https://quay.io/) registry at the following reference: +`quay.io/ethcscs/ompi:5.0.8-ofi1.22-cuda12.8`. 
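+
+A minimal Environment Definition File (EDF) for running this image directly could look like the sketch below; the `PMIX_MCA_psec` setting matches the one used for the benchmark image later on this page:
+
+```toml
+image = "quay.io#ethcscs/ompi:5.0.8-ofi1.22-cuda12.8"
+
+[env]
+PMIX_MCA_psec="native"
+```
+
+As noted in the performance section below, multi-rank jobs with this image must be launched with the `srun` option `--mpi=pmix`.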
+ +## Contents + +- Ubuntu 24.04 +- CUDA 12.8.1 +- GDRCopy 2.5.1 +- Libfabric 1.22.0 +- UCX 1.19.0 +- OpenMPI 5.0.8 + +## Containerfile +```Dockerfile +FROM quay.io/ethcscs/comm-fwk:ofi1.22-ucx1.19-cuda12.8 + +ARG OMPI_VER=5.0.8 +RUN wget -q https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-${OMPI_VER}.tar.gz \ + && tar xf openmpi-${OMPI_VER}.tar.gz \ + && cd openmpi-${OMPI_VER} \ + && ./configure --prefix=/usr --with-ofi=/usr --with-ucx=/usr --enable-oshmem \ + --with-cuda=/usr/local/cuda --with-cuda-libdir=/usr/local/cuda/lib64/stubs \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -rf openmpi-${OMPI_VER}.tar.gz openmpi-${OMPI_VER} +``` + +!!! note + This image builds OpenSHMEM as part of the OpenMPI installation. This can be useful to support other SHMEM implementations like NVSHMEM. + +## Performance examples + +In this section we demonstrate the performance of the previosly created OpenMPI image using it to build the OSU Micro-Benchmarks 7.5.1, and deploying the resulting image on Alps through the Container Engine to run a variety of benchmarks. + +A build of the image with the OSU benchmarks is available on the [Quay.io](https://quay.io/) registry at the following reference: +`quay.io/ethcscs/osu-mb:7.5-ompi5.0.8-ofi1.22-cuda12.8`. + +### OSU-MB Containerfile +```Dockerfile +FROM quay.io/ethcscs/ompi:5.0.8-ofi1.22-cuda12.8 + +ARG omb_version=7.5.1 +RUN wget -q http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-${omb_version}.tar.gz \ + && tar xf osu-micro-benchmarks-${omb_version}.tar.gz \ + && cd osu-micro-benchmarks-${omb_version} \ + && ldconfig /usr/local/cuda/targets/sbsa-linux/lib/stubs \ + && ./configure --prefix=/usr/local CC=$(which mpicc) CFLAGS="-O3 -lcuda -lnvidia-ml" \ + --enable-cuda --with-cuda-include=/usr/local/cuda/include \ + --with-cuda-libpath=/usr/local/cuda/lib64 \ + CXXFLAGS="-lmpi -lcuda" \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -rf osu-micro-benchmarks-${omb_version} osu-micro-benchmarks-${omb_version}.tar.gz + +WORKDIR /usr/local/libexec/osu-micro-benchmarks/mpi +``` + +### Environment Definition File +```toml +image = "quay.io#ethcscs/osu-mb:7.5-ompi5.0.8-ofi1.22-cuda12.8" + +[env] +PMIX_MCA_psec="native" +``` + +### Notes + +- Since OpenMPI uses PMIx for wire-up and communication between ranks, when using this image the `srun` option `--mpi=pmix` must be used to run successful multi-rank jobs. + +### Results + +=== "Point-to-point bandwidth, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bw --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Bandwidth (MB/s) Validation + 1 0.95 Pass + 2 1.90 Pass + 4 3.80 Pass + 8 7.61 Pass + 16 15.21 Pass + 32 30.47 Pass + 64 60.72 Pass + 128 121.56 Pass + 256 242.28 Pass + 512 484.54 Pass + 1024 968.30 Pass + 2048 1943.99 Pass + 4096 3870.29 Pass + 8192 6972.95 Pass + 16384 13922.36 Pass + 32768 18835.52 Pass + 65536 22049.82 Pass + 131072 23136.20 Pass + 262144 23555.35 Pass + 524288 23758.39 Pass + 1048576 23883.95 Pass + 2097152 23949.94 Pass + 4194304 23982.18 Pass + ``` + +=== "Point-to-point bandwidth, GPU-to-GPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bw --validation D D + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.90 Pass + 2 1.82 Pass + 4 3.65 Pass + 8 7.30 Pass + 16 14.56 Pass + 32 29.03 Pass + 64 57.49 Pass + 128 118.30 Pass + 256 227.18 Pass + 512 461.26 Pass + 1024 926.30 Pass + 2048 1820.46 Pass + 4096 3611.70 Pass + 8192 6837.89 Pass + 16384 13361.25 Pass + 32768 18037.71 Pass + 65536 22019.46 Pass + 131072 23104.58 Pass + 262144 23542.71 Pass + 524288 23758.69 Pass + 1048576 23881.02 Pass + 2097152 23955.49 Pass + 4194304 23989.54 Pass + ``` + + +=== "Point-to-point bandwidth, CPU-to-CPU memory, intra-node communication" + ```console + $ srun -N1 -n2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bw --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.96 Pass + 2 1.92 Pass + 4 3.85 Pass + 8 7.68 Pass + 16 15.40 Pass + 32 30.78 Pass + 64 61.26 Pass + 128 122.23 Pass + 256 240.96 Pass + 512 483.12 Pass + 1024 966.52 Pass + 2048 1938.09 Pass + 4096 3873.67 Pass + 8192 7100.56 Pass + 16384 14170.44 Pass + 32768 18607.68 Pass + 65536 21993.95 Pass + 131072 23082.11 Pass + 262144 23546.09 Pass + 524288 23745.05 Pass + 1048576 23879.79 Pass + 2097152 23947.23 Pass + 4194304 23980.15 Pass + ``` + + +=== "Point-to-point bandwidth, GPU-to-GPU memory, intra-node communication" + ```console + $ srun -N1 -n2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bw --validation D D + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Bandwidth (MB/s) Validation + 1 0.91 Pass + 2 1.83 Pass + 4 3.73 Pass + 8 7.47 Pass + 16 14.99 Pass + 32 29.98 Pass + 64 59.72 Pass + 128 119.13 Pass + 256 241.88 Pass + 512 481.52 Pass + 1024 963.60 Pass + 2048 1917.15 Pass + 4096 3840.96 Pass + 8192 6942.05 Pass + 16384 13911.45 Pass + 32768 18379.14 Pass + 65536 21761.73 Pass + 131072 23069.72 Pass + 262144 23543.98 Pass + 524288 23750.83 Pass + 1048576 23882.44 Pass + 2097152 23951.34 Pass + 4194304 23989.44 Pass + ``` + + +=== "Point-to-point bi-directional bandwidth, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bibw --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bibw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bibw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI Bi-Directional Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.93 Pass + 2 1.94 Pass + 4 3.89 Pass + 8 7.77 Pass + 16 15.61 Pass + 32 30.94 Pass + 64 62.10 Pass + 128 123.73 Pass + 256 247.77 Pass + 512 495.33 Pass + 1024 988.33 Pass + 2048 1977.44 Pass + 4096 3953.82 Pass + 8192 7252.82 Pass + 16384 14434.94 Pass + 32768 23610.53 Pass + 65536 33290.72 Pass + 131072 39024.03 Pass + 262144 42508.16 Pass + 524288 44482.65 Pass + 1048576 45575.40 Pass + 2097152 46124.45 Pass + 4194304 46417.59 Pass + ``` + + +=== "Point-to-point bi-directional bandwidth, GPU-to-GPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bibw --validation D D + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bibw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bibw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI-CUDA Bi-Directional Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.97 Pass + 2 1.94 Pass + 4 3.89 Pass + 8 7.75 Pass + 16 15.55 Pass + 32 31.11 Pass + 64 61.95 Pass + 128 123.35 Pass + 256 250.91 Pass + 512 500.80 Pass + 1024 1002.29 Pass + 2048 2003.24 Pass + 4096 4014.15 Pass + 8192 7289.11 Pass + 16384 14717.42 Pass + 32768 22467.65 Pass + 65536 33136.69 Pass + 131072 38970.21 Pass + 262144 42501.28 Pass + 524288 44466.34 Pass + 1048576 45554.48 Pass + 2097152 46124.56 Pass + 4194304 46417.53 Pass + ``` + + +=== "Point-to-point latency, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_latency --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_latency: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_latency: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI Latency Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Avg Latency(us) Validation + 1 3.34 Pass + 2 3.34 Pass + 4 3.35 Pass + 8 3.34 Pass + 16 3.33 Pass + 32 3.34 Pass + 64 3.33 Pass + 128 4.32 Pass + 256 4.36 Pass + 512 4.40 Pass + 1024 4.46 Pass + 2048 4.61 Pass + 4096 4.89 Pass + 8192 8.31 Pass + 16384 8.95 Pass + 32768 9.76 Pass + 65536 11.16 Pass + 131072 13.98 Pass + 262144 19.41 Pass + 524288 30.21 Pass + 1048576 52.12 Pass + 2097152 95.26 Pass + 4194304 182.39 Pass + ``` + + +=== "All-to-all collective latency, CPU-to-CPU memory, multiple nodes" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=omb-ompi ./collective/osu_alltoall --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Avg Latency(us) Validation + 1 12.46 Pass + 2 12.05 Pass + 4 11.99 Pass + 8 11.84 Pass + 16 11.87 Pass + 32 11.84 Pass + 64 11.95 Pass + 128 12.22 Pass + 256 13.21 Pass + 512 13.23 Pass + 1024 13.37 Pass + 2048 13.52 Pass + 4096 13.88 Pass + 8192 17.32 Pass + 16384 18.98 Pass + 32768 23.72 Pass + 65536 36.53 Pass + 131072 62.96 Pass + 262144 119.44 Pass + 524288 236.43 Pass + 1048576 519.85 Pass + ``` + + +=== "All-to-all collective latency, GPU-to-GPU memory, multiple nodes" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=omb-ompi ./collective/osu_alltoall --validation -d cuda + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI-CUDA All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 22.26 Pass + 2 22.08 Pass + 4 22.15 Pass + 8 22.19 Pass + 16 22.25 Pass + 32 22.11 Pass + 64 22.22 Pass + 128 21.98 Pass + 256 22.19 Pass + 512 22.20 Pass + 1024 22.37 Pass + 2048 22.58 Pass + 4096 22.99 Pass + 8192 27.22 Pass + 16384 28.55 Pass + 32768 32.60 Pass + 65536 44.88 Pass + 131072 70.15 Pass + 262144 123.30 Pass + 524288 234.89 Pass + 1048576 486.89 Pass + ``` + + +### Results without the CXI hook +On many Alps vClusters, the Container Engine is configured with the CXI hook enabled by default, enabling transparent access to the Slingshot interconnect. + +This section demonstrates the performance benefit of the CXI hook by explicitly disabling it through the EDF: +```console +$ cat .edf/omb-ompi-no-cxi.toml +image = "quay.io#ethcscs/osu-mb:7.5-ompi5.0.8-ofi1.22-cuda12.8" + +[env] +PMIX_MCA_psec="native" + +[annotations] +com.hooks.cxi.enabled="false" +``` + +=== "Point-to-point bandwidth, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi-no-cxi ./pt2pt/osu_bw --validation + + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Bandwidth (MB/s) Validation + 1 0.16 Pass + 2 0.32 Pass + 4 0.65 Pass + 8 1.31 Pass + 16 2.59 Pass + 32 5.26 Pass + 64 10.37 Pass + 128 20.91 Pass + 256 41.49 Pass + 512 74.26 Pass + 1024 123.99 Pass + 2048 213.82 Pass + 4096 356.13 Pass + 8192 468.55 Pass + 16384 505.89 Pass + 32768 549.59 Pass + 65536 2170.64 Pass + 131072 2137.95 Pass + 262144 2469.63 Pass + 524288 2731.85 Pass + 1048576 2919.18 Pass + 2097152 3047.21 Pass + 4194304 3121.42 Pass + ``` + +=== "Point-to-point bandwidth, GPU-to-GPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi-no-cxi ./pt2pt/osu_bw --validation D D + + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.06 Pass + 2 0.12 Pass + 4 0.24 Pass + 8 0.48 Pass + 16 0.95 Pass + 32 1.91 Pass + 64 3.85 Pass + 128 7.57 Pass + 256 15.28 Pass + 512 19.87 Pass + 1024 53.06 Pass + 2048 97.29 Pass + 4096 180.73 Pass + 8192 343.75 Pass + 16384 473.72 Pass + 32768 530.81 Pass + 65536 1268.51 Pass + 131072 1080.83 Pass + 262144 1435.36 Pass + 524288 1526.12 Pass + 1048576 1727.31 Pass + 2097152 1755.61 Pass + 4194304 1802.75 Pass + ``` + +=== "Point-to-point latency, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi-no-cxi ./pt2pt/osu_latency --validation + + # OSU MPI Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 28.92 Pass + 2 28.99 Pass + 4 29.07 Pass + 8 29.13 Pass + 16 29.48 Pass + 32 29.18 Pass + 64 29.39 Pass + 128 30.11 Pass + 256 32.10 Pass + 512 34.07 Pass + 1024 38.36 Pass + 2048 61.00 Pass + 4096 81.04 Pass + 8192 80.11 Pass + 16384 126.99 Pass + 32768 124.97 Pass + 65536 123.84 Pass + 131072 207.48 Pass + 262144 252.43 Pass + 524288 319.47 Pass + 1048576 497.84 Pass + 2097152 956.03 Pass + 4194304 1455.18 Pass + ``` + + +=== "All-to-all collective latency, CPU-to-CPU memory, multiple nodes" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=omb-ompi-no-cxi ./collective/osu_alltoall --validation + + # OSU MPI All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 137.85 Pass + 2 133.47 Pass + 4 134.03 Pass + 8 131.14 Pass + 16 134.45 Pass + 32 135.35 Pass + 64 137.21 Pass + 128 137.03 Pass + 256 139.90 Pass + 512 140.70 Pass + 1024 165.05 Pass + 2048 197.14 Pass + 4096 255.02 Pass + 8192 335.75 Pass + 16384 543.12 Pass + 32768 928.81 Pass + 65536 782.28 Pass + 131072 1812.95 Pass + 262144 2284.26 Pass + 524288 3213.63 Pass + 1048576 5688.27 Pass + ``` + + +=== "All-to-all collective latency, GPU-to-GPU memory, multiple nodes" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=omb-ompi-no-cxi ./collective/osu_alltoall --validation -d cuda + + # OSU MPI-CUDA All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Avg Latency(us) Validation + 1 186.92 Pass + 2 180.80 Pass + 4 180.72 Pass + 8 179.45 Pass + 16 209.53 Pass + 32 181.73 Pass + 64 182.20 Pass + 128 182.84 Pass + 256 188.29 Pass + 512 189.35 Pass + 1024 237.31 Pass + 2048 231.73 Pass + 4096 298.73 Pass + 8192 396.10 Pass + 16384 589.72 Pass + 32768 983.72 Pass + 65536 786.48 Pass + 131072 1127.39 Pass + 262144 2144.57 Pass + 524288 3107.62 Pass + 1048576 5545.28 Pass + ``` diff --git a/mkdocs.yml b/mkdocs.yml index e02a565f..e1642376 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -59,6 +59,12 @@ nav: - 'Using the Container Engine': software/container-engine/run.md - 'Hooks and native resources': software/container-engine/resource-hook.md - 'EDF reference': software/container-engine/edf.md + - 'Guidelines for images on Alps': software/container-engine/examples/guidelines-images.md + - software/container-engine/examples/image-comm-fwk.md + - software/container-engine/examples/image-mpich.md + - software/container-engine/examples/image-compi.md + - software/container-engine/examples/image-nccl-tests.md + - software/container-engine/examples/image-nvshmem.md - 'Known issues': software/container-engine/known-issue.md - 'Building and Installing Software': - build-install/index.md From 5d22873a230d9de8b41f407327add2dcfb099dc5 Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Wed, 1 Oct 2025 00:40:46 +0200 Subject: [PATCH 02/12] Fixed mkdocs table of contents --- mkdocs.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index e1642376..df39f44d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -59,12 +59,13 @@ nav: - 'Using the Container Engine': software/container-engine/run.md - 'Hooks and native resources': software/container-engine/resource-hook.md - 'EDF reference': software/container-engine/edf.md - - 'Guidelines for images on Alps': software/container-engine/examples/guidelines-images.md - - software/container-engine/examples/image-comm-fwk.md - - software/container-engine/examples/image-mpich.md - - software/container-engine/examples/image-compi.md - - software/container-engine/examples/image-nccl-tests.md - - software/container-engine/examples/image-nvshmem.md + - 'Guidelines for images on Alps': + - software/container-engine/examples/guidelines-images.md + - 'Communication frameworks image': software/container-engine/examples/image-comm-fwk.md + - 'MPICH image': software/container-engine/examples/image-mpich.md + - 'OpenMPI image': software/container-engine/examples/image-ompi.md + - 'NCCL Tests image': software/container-engine/examples/image-nccl-tests.md + - 'NVSHMEM image': software/container-engine/examples/image-nvshmem.md - 'Known issues': software/container-engine/known-issue.md - 'Building and Installing Software': - build-install/index.md From 9504522ce4f9214de7069ba627772f26ec8f39db Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Wed, 1 Oct 2025 00:45:36 +0200 Subject: [PATCH 03/12] Fixed typos --- docs/software/container-engine/examples/image-mpich.md | 2 +- docs/software/container-engine/examples/image-nvshmem.md | 2 +- docs/software/container-engine/examples/image-ompi.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/software/container-engine/examples/image-mpich.md b/docs/software/container-engine/examples/image-mpich.md index 2dd617cf..c0016934 100644 --- a/docs/software/container-engine/examples/image-mpich.md +++ b/docs/software/container-engine/examples/image-mpich.md @@ -45,7 +45,7 @@ RUN wget -q 
https://www.mpich.org/static/downloads/${MPI_VER}/mpich-${MPI_VER}.t ## Performance examples -In this section we demonstrate the performance of the previosly created MPICH image using it to build the OSU Micro-Benchmarks 7.5.1, and deploying the resulting image on Alps through the Container Engine to run a variety of benchmarks. +In this section we demonstrate the performance of the previously created MPICH image using it to build the OSU Micro-Benchmarks 7.5.1, and deploying the resulting image on Alps through the Container Engine to run a variety of benchmarks. A build of the image with the OSU benchmarks is available on the [Quay.io](https://quay.io/) registry at the following reference: `quay.io/ethcscs/osu-mb:7.5-mpich4.3.1-ofi1.22-cuda12.8`. diff --git a/docs/software/container-engine/examples/image-nvshmem.md b/docs/software/container-engine/examples/image-nvshmem.md index 1ced3ef5..d3e9ec27 100644 --- a/docs/software/container-engine/examples/image-nvshmem.md +++ b/docs/software/container-engine/examples/image-nvshmem.md @@ -104,7 +104,7 @@ com.hooks.aws_ofi_nccl.variant = "cuda12" - NVSHMEM's `libfabric` transport does not support VMM yet, so VMM must be disabled by setting the environment variable `NVSHMEM_DISABLE_CUDA_VMM=1`. - Since NVSHMEM has been configured in the Containerfile to use PMIx for bootstrapping, when using this image the `srun` option `--mpi=pmix` must be used to run successful multi-rank jobs. -- Other bootstrapping methods (including different PMI implementations) can be specified for NVSHMEM throug the related [environment variables](https://docs.nvidia.com/nvshmem/api/gen/env.html#bootstrap-options). When bootstrapping through PMI or MPI through Slurm, ensure that the PMI implementation used by Slurm (i.e. `srun --mpi` option) matches the one expected by NVSHMEM or the MPI library. +- Other bootstrapping methods (including different PMI implementations) can be specified for NVSHMEM through the related [environment variables](https://docs.nvidia.com/nvshmem/api/gen/env.html#bootstrap-options). When bootstrapping through PMI or MPI through Slurm, ensure that the PMI implementation used by Slurm (i.e. `srun --mpi` option) matches the one expected by NVSHMEM or the MPI library. - NCCL requires the presence of the [AWS OFI NCCL plugin](https://github.com/aws/aws-ofi-nccl) in order to correctly interface with Libfabric and (through the latter) the Slingshot interconnect. Therefore, for optimal performance the [related CE hook][ref-ce-aws-ofi-hook] must be enabled and set to match the CUDA version in the container. - Libfabric itself is usually injected by the [CXI hook][ref-ce-cxi-hook], which is enabled by default on several Alps vClusters. diff --git a/docs/software/container-engine/examples/image-ompi.md b/docs/software/container-engine/examples/image-ompi.md index c1287478..2f122b8d 100644 --- a/docs/software/container-engine/examples/image-ompi.md +++ b/docs/software/container-engine/examples/image-ompi.md @@ -39,7 +39,7 @@ RUN wget -q https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-${OMPI_V ## Performance examples -In this section we demonstrate the performance of the previosly created OpenMPI image using it to build the OSU Micro-Benchmarks 7.5.1, and deploying the resulting image on Alps through the Container Engine to run a variety of benchmarks. 
+In this section we demonstrate the performance of the previously created OpenMPI image using it to build the OSU Micro-Benchmarks 7.5.1, and deploying the resulting image on Alps through the Container Engine to run a variety of benchmarks. A build of the image with the OSU benchmarks is available on the [Quay.io](https://quay.io/) registry at the following reference: `quay.io/ethcscs/osu-mb:7.5-ompi5.0.8-ofi1.22-cuda12.8`. From 713f8b40234e9d2abf7bb9faa167c78151135a3b Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Wed, 1 Oct 2025 00:46:46 +0200 Subject: [PATCH 04/12] Fixed typo --- docs/software/container-engine/examples/guidelines-images.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/software/container-engine/examples/guidelines-images.md b/docs/software/container-engine/examples/guidelines-images.md index f2d339fc..1b3ddc12 100644 --- a/docs/software/container-engine/examples/guidelines-images.md +++ b/docs/software/container-engine/examples/guidelines-images.md @@ -1,7 +1,7 @@ [](){#ref-ce-guidelines-images} # Guidelines for images on Alps -This section offers some guidelines about creating and using container images that achieve good performance on the Alps reseach infrastructure. +This section offers some guidelines about creating and using container images that achieve good performance on the Alps research infrastructure. The section focuses on foundational components (such as communication libraries) which are essential to enabling performant effective usage of Alps' capabilities, rather than full application use cases. Synthetic benchmarks are also used to showcase quantitative performance. From b10b99fe27c4b474524d7a46fa301297038dd28f Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Wed, 1 Oct 2025 00:50:34 +0200 Subject: [PATCH 05/12] Improved content organization for CE image guidelines --- .../image-comm-fwk.md | 0 .../{examples => guidelines-images}/image-mpich.md | 0 .../image-nccl-tests.md | 0 .../{examples => guidelines-images}/image-nvshmem.md | 0 .../{examples => guidelines-images}/image-ompi.md | 0 .../index.md} | 0 mkdocs.yml | 12 ++++++------ 7 files changed, 6 insertions(+), 6 deletions(-) rename docs/software/container-engine/{examples => guidelines-images}/image-comm-fwk.md (100%) rename docs/software/container-engine/{examples => guidelines-images}/image-mpich.md (100%) rename docs/software/container-engine/{examples => guidelines-images}/image-nccl-tests.md (100%) rename docs/software/container-engine/{examples => guidelines-images}/image-nvshmem.md (100%) rename docs/software/container-engine/{examples => guidelines-images}/image-ompi.md (100%) rename docs/software/container-engine/{examples/guidelines-images.md => guidelines-images/index.md} (100%) diff --git a/docs/software/container-engine/examples/image-comm-fwk.md b/docs/software/container-engine/guidelines-images/image-comm-fwk.md similarity index 100% rename from docs/software/container-engine/examples/image-comm-fwk.md rename to docs/software/container-engine/guidelines-images/image-comm-fwk.md diff --git a/docs/software/container-engine/examples/image-mpich.md b/docs/software/container-engine/guidelines-images/image-mpich.md similarity index 100% rename from docs/software/container-engine/examples/image-mpich.md rename to docs/software/container-engine/guidelines-images/image-mpich.md diff --git a/docs/software/container-engine/examples/image-nccl-tests.md b/docs/software/container-engine/guidelines-images/image-nccl-tests.md similarity index 100% rename from 
docs/software/container-engine/examples/image-nccl-tests.md rename to docs/software/container-engine/guidelines-images/image-nccl-tests.md diff --git a/docs/software/container-engine/examples/image-nvshmem.md b/docs/software/container-engine/guidelines-images/image-nvshmem.md similarity index 100% rename from docs/software/container-engine/examples/image-nvshmem.md rename to docs/software/container-engine/guidelines-images/image-nvshmem.md diff --git a/docs/software/container-engine/examples/image-ompi.md b/docs/software/container-engine/guidelines-images/image-ompi.md similarity index 100% rename from docs/software/container-engine/examples/image-ompi.md rename to docs/software/container-engine/guidelines-images/image-ompi.md diff --git a/docs/software/container-engine/examples/guidelines-images.md b/docs/software/container-engine/guidelines-images/index.md similarity index 100% rename from docs/software/container-engine/examples/guidelines-images.md rename to docs/software/container-engine/guidelines-images/index.md diff --git a/mkdocs.yml b/mkdocs.yml index df39f44d..5511b08e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -60,12 +60,12 @@ nav: - 'Hooks and native resources': software/container-engine/resource-hook.md - 'EDF reference': software/container-engine/edf.md - 'Guidelines for images on Alps': - - software/container-engine/examples/guidelines-images.md - - 'Communication frameworks image': software/container-engine/examples/image-comm-fwk.md - - 'MPICH image': software/container-engine/examples/image-mpich.md - - 'OpenMPI image': software/container-engine/examples/image-ompi.md - - 'NCCL Tests image': software/container-engine/examples/image-nccl-tests.md - - 'NVSHMEM image': software/container-engine/examples/image-nvshmem.md + - software/container-engine/guidelines-images/index.md + - 'Communication frameworks image': software/container-engine/guidelines-images/image-comm-fwk.md + - 'MPICH image': software/container-engine/guidelines-images/image-mpich.md + - 'OpenMPI image': software/container-engine/guidelines-images/image-ompi.md + - 'NCCL Tests image': software/container-engine/guidelines-images/image-nccl-tests.md + - 'NVSHMEM image': software/container-engine/guidelines-images/image-nvshmem.md - 'Known issues': software/container-engine/known-issue.md - 'Building and Installing Software': - build-install/index.md From 5d74d5988d402d5b3676d45702835662cef3ac3b Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Wed, 1 Oct 2025 00:56:28 +0200 Subject: [PATCH 06/12] Fixed code blocks --- docs/software/container-engine/guidelines-images/image-mpich.md | 2 +- docs/software/container-engine/guidelines-images/image-ompi.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/software/container-engine/guidelines-images/image-mpich.md b/docs/software/container-engine/guidelines-images/image-mpich.md index c0016934..79fadecf 100644 --- a/docs/software/container-engine/guidelines-images/image-mpich.md +++ b/docs/software/container-engine/guidelines-images/image-mpich.md @@ -421,7 +421,7 @@ com.hooks.cxi.enabled="false" === "Point-to-point bandwidth, CPU-to-CPU memory, inter-node communication" ```console - $ srun -N2 --mpi=pmi2 --environment=omb-mpich-no-cxi ./pt2pt/osu_bw --validation + $ srun -N2 --mpi=pmi2 --environment=omb-mpich-no-cxi ./pt2pt/osu_bw --validation # OSU MPI Bandwidth Test v7.5 # Datatype: MPI_CHAR. 
diff --git a/docs/software/container-engine/guidelines-images/image-ompi.md b/docs/software/container-engine/guidelines-images/image-ompi.md index 2f122b8d..344440aa 100644 --- a/docs/software/container-engine/guidelines-images/image-ompi.md +++ b/docs/software/container-engine/guidelines-images/image-ompi.md @@ -419,7 +419,7 @@ com.hooks.cxi.enabled="false" === "Point-to-point bandwidth, CPU-to-CPU memory, inter-node communication" ```console - $ srun -N2 --mpi=pmix --environment=omb-ompi-no-cxi ./pt2pt/osu_bw --validation + $ srun -N2 --mpi=pmix --environment=omb-ompi-no-cxi ./pt2pt/osu_bw --validation # OSU MPI Bandwidth Test v7.5 # Datatype: MPI_CHAR. From 0bab4b206fac08c14e5f773545e74170e0e38ae6 Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Wed, 1 Oct 2025 19:11:49 +0200 Subject: [PATCH 07/12] Updated allowed words in spelling checker --- .github/actions/spelling/allow.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt index 750049ed..516a474f 100644 --- a/.github/actions/spelling/allow.txt +++ b/.github/actions/spelling/allow.txt @@ -16,9 +16,11 @@ CWP CXI Ceph Containerfile +Containerfiles DNS Dockerfiles Dufourspitze +EFA EMPA ETHZ Ehrenfest @@ -75,6 +77,7 @@ MeteoSwiss NAMD NICs NVMe +NVSHMEM Nordend OpenFabrics OAuth @@ -101,6 +104,7 @@ ROCm RPA Roboto Roothaan +SHMEM SSHService STMV Scopi From 7236858863dbf5ce81f5298c438d4372b9c76c73 Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Mon, 6 Oct 2025 16:22:36 +0200 Subject: [PATCH 08/12] Apply suggestions from code review Co-authored-by: Rocco Meli --- .../container-engine/guidelines-images/image-comm-fwk.md | 6 +++--- .../container-engine/guidelines-images/image-nvshmem.md | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/software/container-engine/guidelines-images/image-comm-fwk.md b/docs/software/container-engine/guidelines-images/image-comm-fwk.md index 8bd51735..1ca39ab5 100644 --- a/docs/software/container-engine/guidelines-images/image-comm-fwk.md +++ b/docs/software/container-engine/guidelines-images/image-comm-fwk.md @@ -5,9 +5,9 @@ This page describes a container image providing foundational software components The most important aspect to consider for performance of containerized applications is related to use of high-speed networks, therefore this image mainly installs communication frameworks and libraries, besides general utility tools. -In particular, the [Libfabric](https://ofiwg.github.io/libfabric/) framework (also known as Open Fabrics Interfaces - OFI) is required to interface applications with the Slingshot high-speed network. +In particular, the [libfabric](https://ofiwg.github.io/libfabric/) framework (also known as Open Fabrics Interfaces - OFI) is required to interface applications with the Slingshot high-speed network. -At runtime, the container engine [CXI hook][ref-ce-cxi-hook] will replace the Libfabric libraries inside the container with the corresponding libraries on the host system. +At runtime, the container engine [CXI hook][ref-ce-cxi-hook] will replace the libfabric libraries inside the container with the corresponding libraries on the host system. This will ensure access to the Slingshot interconnect. This image is not intended to be used on its own, but to serve as a base to build higher-level software (e.g. MPI implementations) and application stacks. 
@@ -101,5 +101,5 @@ RUN wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${ ## Notes - The image is based on an official NVIDIA CUDA image, and therefore already provides the NCCL library, alongside a complete CUDA installation. - Communication frameworks are built with explicit support for CUDA and GDRCopy. -- The libfabric EFA provider is included to leave open the possibility to experiment with derived images on AWS infrastructure as well. +- The libfabric [EFA](https://aws.amazon.com/hpc/efa/) provider is included to leave open the possibility to experiment with derived images on AWS infrastructure as well. - Although only the libfabric framework is required to support Alps' Slingshot network, this image also packages the UCX communication framework to allow building a broader set of software (e.g. some OpenSHMEM implementations) and supporting optimized Infiniband communication as well. diff --git a/docs/software/container-engine/guidelines-images/image-nvshmem.md b/docs/software/container-engine/guidelines-images/image-nvshmem.md index d3e9ec27..08b13668 100644 --- a/docs/software/container-engine/guidelines-images/image-nvshmem.md +++ b/docs/software/container-engine/guidelines-images/image-nvshmem.md @@ -1,7 +1,7 @@ [](){#ref-ce-guidelines-images-nvshmem} # NVSHMEM image -This page describes a container image featuring the [NVSHMEM](https://developer.nvidia.com/nvshmem) parallel programming library with support for Libfabric, and demonstrates how to efficiently run said image on Alps. +This page describes a container image featuring the [NVSHMEM](https://developer.nvidia.com/nvshmem) parallel programming library with support for libfabric, and demonstrates how to efficiently run said image on Alps. This image is based on the [OpenMPI image][ref-ce-guidelines-images-ompi], and thus it is suited for hosts with NVIDIA GPUs, like Alps GH200 nodes. @@ -79,7 +79,7 @@ RUN wget -q https://developer.download.nvidia.com/compute/redist/nvshmem/3.4.5/s !!! note - This image also builds the performance tests bundled with NVSHMEM (`NVSHMEM_BUILD_TESTS=1`) to demonstrate performance below. The performance tests, in turn, require the installation of Python dependencies. When building images intended solely for production purposes, you may exclude both those elements. - - Notice that NVSHMEM is configured with support for Libfabric explicitly enabled (`NVSHMEM_LIBFABRIC_SUPPORT=1`). + - Notice that NVSHMEM is configured with support for libfabric explicitly enabled (`NVSHMEM_LIBFABRIC_SUPPORT=1`). - Since this image is meant primarily to run on Alps, NVSHMEM is built without support for UCX and Infiniband components. - Since this image uses OpenMPI (which provides PMIx) as MPI implementation, NVSHMEM is also configured to default to PMIx for bootstrapping (`NVSHMEM_PMIX_SUPPORT=1`). 
From ca271ed885534f4124c39dd3852bfeeaa41d209f Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Mon, 6 Oct 2025 16:49:43 +0200 Subject: [PATCH 09/12] CE image guidelines: add links to subpages --- docs/software/container-engine/guidelines-images/index.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/software/container-engine/guidelines-images/index.md b/docs/software/container-engine/guidelines-images/index.md index 1b3ddc12..87feed5e 100644 --- a/docs/software/container-engine/guidelines-images/index.md +++ b/docs/software/container-engine/guidelines-images/index.md @@ -29,7 +29,7 @@ Below is a summary of the software suggested and demonstrated throughout this se The content is organized in pages which detail container images building incrementally upon each other: -- a base image installing baseline libraries and frameworks (e.g. CUDA, libfabric) -- MPI implementations (MPICH, OpenMPI) -- NVSHMEM -- NCCL tests +- a [base image][ref-ce-guidelines-images-commfwk] installing baseline libraries and frameworks (e.g. CUDA, libfabric) +- MPI implementations ([MPICH][ref-ce-guidelines-images-mpich], [OpenMPI][ref-ce-guidelines-images-ompi]) +- [NVSHMEM][ref-ce-guidelines-images-nvshmem] +- [NCCL tests][ref-ce-guidelines-images-nccl-tests] From 1e919521bd3c904e13d1c7f4da036e2bf3962ed2 Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Mon, 6 Oct 2025 17:16:41 +0200 Subject: [PATCH 10/12] CE image guidelines: add code block notes for PMIx settings --- .../guidelines-images/image-nccl-tests.md | 4 +++- .../container-engine/guidelines-images/image-nvshmem.md | 8 +++++--- .../container-engine/guidelines-images/image-ompi.md | 4 +++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/docs/software/container-engine/guidelines-images/image-nccl-tests.md b/docs/software/container-engine/guidelines-images/image-nccl-tests.md index 3e1e4a54..3f0801df 100644 --- a/docs/software/container-engine/guidelines-images/image-nccl-tests.md +++ b/docs/software/container-engine/guidelines-images/image-nccl-tests.md @@ -41,13 +41,15 @@ RUN wget -O nccl-tests-${nccl_tests_version}.tar.gz https://github.com/NVIDIA/nc image = "quay.io#ethcscs/nccl-tests:2.17.1-ompi5.0.8-ofi1.22-cuda12.8" [env] -PMIX_MCA_psec="native" +PMIX_MCA_psec="native" # (1)! [annotations] com.hooks.aws_ofi_nccl.enabled = "true" com.hooks.aws_ofi_nccl.variant = "cuda12" ``` +1. Ensures PMIx uses the same security domain as Slurm. Otherwise PMIx will print warnings at startup. + ### Notes - Since OpenMPI uses PMIx for wire-up and communication between ranks, when using this image the `srun` option `--mpi=pmix` must be used to run successful multi-rank jobs. diff --git a/docs/software/container-engine/guidelines-images/image-nvshmem.md b/docs/software/container-engine/guidelines-images/image-nvshmem.md index 08b13668..41406424 100644 --- a/docs/software/container-engine/guidelines-images/image-nvshmem.md +++ b/docs/software/container-engine/guidelines-images/image-nvshmem.md @@ -90,19 +90,21 @@ RUN wget -q https://developer.download.nvidia.com/compute/redist/nvshmem/3.4.5/s image = "quay.io#ethcscs/nvshmem:3.4.5-ompi5.0.8-ofi1.22-cuda12.8" [env] -PMIX_MCA_psec="native" +PMIX_MCA_psec="native" # (1)! NVSHMEM_REMOTE_TRANSPORT="libfabric" NVSHMEM_LIBFABRIC_PROVIDER="cxi" -NVSHMEM_DISABLE_CUDA_VMM="1" +NVSHMEM_DISABLE_CUDA_VMM="1" # (2)! [annotations] com.hooks.aws_ofi_nccl.enabled = "true" com.hooks.aws_ofi_nccl.variant = "cuda12" ``` +1. Ensures PMIx uses the same security domain as Slurm. 
Otherwise PMIx will print warnings at startup. +2. NVSHMEM's `libfabric` transport does not support VMM yet, so VMM must be disabled by setting the environment variable `NVSHMEM_DISABLE_CUDA_VMM=1`. + ### Notes -- NVSHMEM's `libfabric` transport does not support VMM yet, so VMM must be disabled by setting the environment variable `NVSHMEM_DISABLE_CUDA_VMM=1`. - Since NVSHMEM has been configured in the Containerfile to use PMIx for bootstrapping, when using this image the `srun` option `--mpi=pmix` must be used to run successful multi-rank jobs. - Other bootstrapping methods (including different PMI implementations) can be specified for NVSHMEM through the related [environment variables](https://docs.nvidia.com/nvshmem/api/gen/env.html#bootstrap-options). When bootstrapping through PMI or MPI through Slurm, ensure that the PMI implementation used by Slurm (i.e. `srun --mpi` option) matches the one expected by NVSHMEM or the MPI library. - NCCL requires the presence of the [AWS OFI NCCL plugin](https://github.com/aws/aws-ofi-nccl) in order to correctly interface with Libfabric and (through the latter) the Slingshot interconnect. Therefore, for optimal performance the [related CE hook][ref-ce-aws-ofi-hook] must be enabled and set to match the CUDA version in the container. diff --git a/docs/software/container-engine/guidelines-images/image-ompi.md b/docs/software/container-engine/guidelines-images/image-ompi.md index 344440aa..07622b14 100644 --- a/docs/software/container-engine/guidelines-images/image-ompi.md +++ b/docs/software/container-engine/guidelines-images/image-ompi.md @@ -71,9 +71,11 @@ WORKDIR /usr/local/libexec/osu-micro-benchmarks/mpi image = "quay.io#ethcscs/osu-mb:7.5-ompi5.0.8-ofi1.22-cuda12.8" [env] -PMIX_MCA_psec="native" +PMIX_MCA_psec="native" # (1)! ``` +1. Ensures PMIx uses the same security domain as Slurm. Otherwise PMIx will print warnings at startup. + ### Notes - Since OpenMPI uses PMIx for wire-up and communication between ranks, when using this image the `srun` option `--mpi=pmix` must be used to run successful multi-rank jobs. From a69d336c0cc8ea1264df156295a92a70a8bcfd99 Mon Sep 17 00:00:00 2001 From: bcumming Date: Fri, 24 Oct 2025 09:41:31 +0200 Subject: [PATCH 11/12] wip --- docs/software/communication/index.md | 15 +++++++++++++-- mkdocs.yml | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/docs/software/communication/index.md b/docs/software/communication/index.md index 5d961d77..b241df0e 100644 --- a/docs/software/communication/index.md +++ b/docs/software/communication/index.md @@ -1,7 +1,18 @@ [](){#ref-software-communication} # Communication Libraries -CSCS provides common communication libraries optimized for the [Slingshot 11 network on Alps][ref-alps-hsn]. +Communication libraries are used by scientific and AI workloads to communicate between processes. +The communication libraries used by workloads need to be built and configured correctly to get the best performance. +Broadly speaking, there are two levels of communication: + +* **intra-node** communication between two processes on the same node. +* **inter-node** communication between different nodes, which requires + +Inter-node communication requires sending and receiving data over the [Slingshot 11 network][ref-alps-hsn] that connects nodes on Alps. +Communication libraries, like MPI and NCCL, need to be configured to use the [libfabric][ref-communication-libfabric] library that has an optimised back end for Slingshot 11. 
+ +CSCS provides communication libraries optimised for libfabric and slingshot in uenv, and guidance on how to configure container images similarly. +This section of the documentation provides advice on how to build and install software to use these libraries, and how to deploy them. For most scientific applications relying on MPI, [Cray MPICH][ref-communication-cray-mpich] is recommended. [MPICH][ref-communication-mpich] and [OpenMPI][ref-communication-openmpi] may also be used, with limitations. @@ -12,9 +23,9 @@ NCCL and RCCL have to be configured with a plugin using [libfabric][ref-communic See the individual pages for each library for information on how to use and best configure the libraries. +* [libfabric][ref-communication-libfabric] * [Cray MPICH][ref-communication-cray-mpich] * [MPICH][ref-communication-mpich] * [OpenMPI][ref-communication-openmpi] * [NCCL][ref-communication-nccl] * [RCCL][ref-communication-rccl] -* [libfabric][ref-communication-libfabric] diff --git a/mkdocs.yml b/mkdocs.yml index bcdb0235..2b1ca54b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -111,12 +111,12 @@ nav: - 'WRF': software/cw/wrf.md - 'Communication Libraries': - software/communication/index.md + - 'libfabric': software/communication/libfabric.md - 'Cray MPICH': software/communication/cray-mpich.md - 'MPICH': software/communication/mpich.md - 'OpenMPI': software/communication/openmpi.md - 'NCCL': software/communication/nccl.md - 'RCCL': software/communication/rccl.md - - 'libfabric': software/communication/libfabric.md - 'Commercial software': - software/commercial/index.md - 'Matlab': software/commercial/matlab.md From 5df469c155a316e7b6026cdf623fc18e1dde66d5 Mon Sep 17 00:00:00 2001 From: bcumming Date: Mon, 27 Oct 2025 20:11:11 +0100 Subject: [PATCH 12/12] refactor comms index; integrated base image into libfabric --- docs/software/communication/index.md | 58 +++++++-- docs/software/communication/libfabric.md | 147 ++++++++++++++++++++++- 2 files changed, 192 insertions(+), 13 deletions(-) diff --git a/docs/software/communication/index.md b/docs/software/communication/index.md index b241df0e..22b9aca1 100644 --- a/docs/software/communication/index.md +++ b/docs/software/communication/index.md @@ -1,15 +1,26 @@ [](){#ref-software-communication} # Communication Libraries +!!! todo "list of ideas to integrate in this page" + * communication libraries are part of the "base" or "core" layer in your environment, alongside compilers and cuda (on NVIDIA GPU systems). + * we provide base containers that start with compilers+CUDA + * have a section "installing/getting comm libs": + * CE (build your own) and uenv (it comes with the label) sub-sections + * Conda, pre-built (ORCA, ANSYS, etc) + Communication libraries are used by scientific and AI workloads to communicate between processes. The communication libraries used by workloads need to be built and configured correctly to get the best performance. Broadly speaking, there are two levels of communication: * **intra-node** communication between two processes on the same node. -* **inter-node** communication between different nodes, which requires +* **inter-node** communication between different nodes, over the [Slingshot 11 network][ref-alps-hsn] that connects nodes on Alps.. -Inter-node communication requires sending and receiving data over the [Slingshot 11 network][ref-alps-hsn] that connects nodes on Alps. 
Communication libraries, like MPI and NCCL, need to be configured to use the [libfabric][ref-communication-libfabric] library that has an optimised back end for Slingshot 11.
+
+As such, they are part of the base layer of libraries and tools required to fully utilize the hardware on Alps:
+
+* **CPU**: compilers with support for building applications optimized for the CPU architecture on the node.
+* **GPU**: CUDA and ROCm provide compilers and runtime libraries for NVIDIA and AMD GPUs respectively.
+* **Network**: libfabric, MPI, NCCL/RCCL, and NVSHMEM need to be configured for the Slingshot network.
 
 CSCS provides communication libraries optimised for libfabric and slingshot in uenv, and guidance on how to configure container images similarly.
 This section of the documentation provides advice on how to build and install software to use these libraries, and how to deploy them.
@@ -23,9 +34,40 @@ NCCL and RCCL have to be configured with a plugin using [libfabric][ref-communic
 
 See the individual pages for each library for information on how to use and best configure the libraries.
 
-* [libfabric][ref-communication-libfabric]
-* [Cray MPICH][ref-communication-cray-mpich]
-* [MPICH][ref-communication-mpich]
-* [OpenMPI][ref-communication-openmpi]
-* [NCCL][ref-communication-nccl]
-* [RCCL][ref-communication-rccl]
+
+
+- __Low Level__
+
+    Learn about the base installation of libfabric and its dependencies.
+
+    [:octicons-arrow-right-24: libfabric][ref-communication-libfabric]
+
+
+
+
+- __MPI__
+
+    Cray MPICH is the most optimized and best tested MPI implementation on Alps, and is provided by uenv.
+
+    [:octicons-arrow-right-24: Cray MPICH][ref-communication-cray-mpich]
+
+    For compatibility in containers:
+
+    [:octicons-arrow-right-24: MPICH][ref-communication-mpich]
+
+    OpenMPI can also be built in containers or in uenv:
+
+    [:octicons-arrow-right-24: OpenMPI][ref-communication-openmpi]
+
+
+
+
+- __Machine Learning__
+
+    NCCL and RCCL provide GPU-aware collective communication for NVIDIA and AMD GPUs respectively.
+
+    [:octicons-arrow-right-24: NCCL][ref-communication-nccl]
+
+    [:octicons-arrow-right-24: RCCL][ref-communication-rccl]
+
+
diff --git a/docs/software/communication/libfabric.md b/docs/software/communication/libfabric.md index a8dd80d8..5ef434d3 100644 --- a/docs/software/communication/libfabric.md +++ b/docs/software/communication/libfabric.md @@ -1,16 +1,153 @@ [](){#ref-communication-libfabric} # Libfabric -[Libfabric](https://ofiwg.github.io/libfabric/), or Open Fabrics Interfaces (OFI), is a low level networking library that abstracts away various networking backends. -It is used by Cray MPICH, and can be used together with OpenMPI, NCCL, and RCCL to make use of the [Slingshot network on Alps][ref-alps-hsn]. +[Libfabric](https://ofiwg.github.io/libfabric/), or Open Fabrics Interfaces (OFI), is a low-level networking library that provides an abstract interface for networks. +Libfabric has backends for different network types, and is the interface chosen by HPE for the [Slingshot network on Alps][ref-alps-hsn], and by AWS for their [EFA network interface](https://aws.amazon.com/hpc/efa/). + +To fully take advantage of the network on Alps: + +* libfabric and its dependencies must be availailable in your environment (uenv or container); +* and, communication libraries like Cray MPICH, OpenMPI, NCCL, and RCCL have to be built or configured to use libfabric. + +??? question "What about UCX?" + [Unified Communication X (UCX)](https://openucx.org/) is a low level library that targets the same layer as libfabric. + Specifically, it provides an open, standards-based, networking API. + + By targetting UCX and libfabric, MPI and NCCL do not need to implement low-level support for each network hardware. + + A downside of having two standards instead of one, is that pre-built software (for example Conda packages and Containers) have versions of MPI built for UCX, which does not provide a back end for Slingshot 11. + Trying to run these images will lead to errors, or very poor performance. ## Using libfabric +### uenv + If you are using a uenv provided by CSCS, such as [prgenv-gnu][ref-uenv-prgenv-gnu], [Cray MPICH][ref-communication-cray-mpich] is linked to libfabric and the high speed network will be used. No changes are required in applications. -If you are using containers, the system libfabric can be loaded into your container using the [CXI hook provided by the container engine][ref-ce-cxi-hook]. -Using the hook is essential to make full use of the Alps network. +### Container Engine + +If you are using [containers][ref-container-engine], the simplest approach is to load libfabric into your container using the [CXI hook provided by the container engine][ref-ce-cxi-hook]. + +Alternatively, it is possible to build libfabric and its dependencies into your container. + +!!! example "Installing libfabric in a container for NVIDIA nodes" + The following lines demonstrate how to configure and + + Note that it is assumed that CUDA has already been installed on the system. 
+ ```Dockerfile + # Install libfabric + ARG gdrcopy_version=2.5.1 + RUN git clone --depth 1 --branch v${gdrcopy_version} https://github.com/NVIDIA/gdrcopy.git \ + && cd gdrcopy \ + && export CUDA_PATH=${CUDA_HOME:-$(echo $(which nvcc) | grep -o '.*cuda')} \ + && make CC=gcc CUDA=$CUDA_PATH lib \ + && make lib_install \ + && cd ../ && rm -rf gdrcopy + + # Install libfabric + ARG libfabric_version=1.22.0 + RUN git clone --branch v${libfabric_version} --depth 1 https://github.com/ofiwg/libfabric.git \ + && cd libfabric \ + && ./autogen.sh \ + && ./configure --prefix=/usr --with-cuda=/usr/local/cuda --enable-cuda-dlopen \ + --enable-gdrcopy-dlopen --enable-efa \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -rf libfabric + ``` + +!!! todo + In the above recipe `CUDA_PATH` is "calculated" for gdrcopy, and just hard coded to `/usr/loca/cuda` for libfabric. + How about just hard-coding it everywhere, to simplify the recipe? + +!!! todo + Should we include the EFA and UCX support here? It is not needed to run on Alps, and might confuse readers. + +??? note "The full containerfile for GH200" + + The containerfile below is based on the NVIDIA CUDA image, which provides a complete CUDA installation. + + - Communication frameworks are built with explicit support for CUDA and GDRCopy. + + Some additional features are enabled to increase the portability of the container to non-Alps systems: + + - The libfabric [EFA](https://aws.amazon.com/hpc/efa/) provider is configured using the `--enable-efa` compatibility for derived images on AWS infrastructure. + - this image also packages the UCX communication framework to allow building a broader set of software (e.g. some OpenSHMEM implementations) and supporting optimized Infiniband communication as well. + + ``` + ARG ubuntu_version=24.04 + ARG cuda_version=12.8.1 + FROM docker.io/nvidia/cuda:${cuda_version}-cudnn-devel-ubuntu${ubuntu_version} + + RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive \ + apt-get install -y \ + build-essential \ + ca-certificates \ + pkg-config \ + automake \ + autoconf \ + libtool \ + cmake \ + gdb \ + strace \ + wget \ + git \ + bzip2 \ + python3 \ + gfortran \ + rdma-core \ + numactl \ + libconfig-dev \ + libuv1-dev \ + libfuse-dev \ + libfuse3-dev \ + libyaml-dev \ + libnl-3-dev \ + libnuma-dev \ + libsensors-dev \ + libcurl4-openssl-dev \ + libjson-c-dev \ + libibverbs-dev \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* + + ARG gdrcopy_version=2.5.1 + RUN git clone --depth 1 --branch v${gdrcopy_version} https://github.com/NVIDIA/gdrcopy.git \ + && cd gdrcopy \ + && export CUDA_PATH=${CUDA_HOME:-$(echo $(which nvcc) | grep -o '.*cuda')} \ + && make CC=gcc CUDA=$CUDA_PATH lib \ + && make lib_install \ + && cd ../ && rm -rf gdrcopy + + # Install libfabric + ARG libfabric_version=1.22.0 + RUN git clone --branch v${libfabric_version} --depth 1 https://github.com/ofiwg/libfabric.git \ + && cd libfabric \ + && ./autogen.sh \ + && ./configure --prefix=/usr --with-cuda=/usr/local/cuda --enable-cuda-dlopen --enable-gdrcopy-dlopen --enable-efa \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. 
\ + && rm -rf libfabric + + # Install UCX + ARG UCX_VERSION=1.19.0 + RUN wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${UCX_VERSION}.tar.gz \ + && tar xzf ucx-${UCX_VERSION}.tar.gz \ + && cd ucx-${UCX_VERSION} \ + && mkdir build \ + && cd build \ + && ../configure --prefix=/usr --with-cuda=/usr/local/cuda --with-gdrcopy=/usr/local --enable-mt --enable-devel-headers \ + && make -j$(nproc) \ + && make install \ + && cd ../.. \ + && rm -rf ucx-${UCX_VERSION}.tar.gz ucx-${UCX_VERSION} + ``` ## Tuning libfabric @@ -21,4 +158,4 @@ Note that the exact version deployed on Alps may differ, and not all options may See the [Cray MPICH known issues page][ref-communication-cray-mpich-known-issues] for issues when using Cray MPICH together with libfabric. !!! todo - More options? + - add environment variable tuning guide
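+
+As a starting point for the tuning guide called for above, the following sketch shows how libfabric runtime options can be set through an EDF.
+The variables used are libfabric's standard logging controls; the specific values and the choice of image (the OSU benchmark image from the container image guidelines) are illustrative assumptions rather than tuned recommendations.
+
+```toml
+image = "quay.io#ethcscs/osu-mb:7.5-ompi5.0.8-ofi1.22-cuda12.8"
+
+[env]
+# Raise libfabric log verbosity while investigating provider selection (assumed value; drop for production runs).
+FI_LOG_LEVEL="warn"
+# Restrict log output to the CXI provider used for Slingshot on Alps.
+FI_LOG_PROV="cxi"
+```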