From 77c5f2fb0a0409faf53406a663ff00f32cb07428 Mon Sep 17 00:00:00 2001
From: Alberto Madonna
Date: Wed, 1 Oct 2025 00:27:17 +0200
Subject: [PATCH 01/12] CE: Added pages with guidelines for images on Alps

---
 .../examples/guidelines-images.md             |  35 ++
 .../examples/image-comm-fwk.md                | 105 ++++
 .../container-engine/examples/image-mpich.md  | 578 ++++++++++++++++++
 .../examples/image-nccl-tests.md              | 183 ++++++
 .../examples/image-nvshmem.md                 | 237 +++++++
 .../container-engine/examples/image-ompi.md   | 576 +++++++++++++++++
 mkdocs.yml                                    |   6 +
 7 files changed, 1720 insertions(+)
 create mode 100644 docs/software/container-engine/examples/guidelines-images.md
 create mode 100644 docs/software/container-engine/examples/image-comm-fwk.md
 create mode 100644 docs/software/container-engine/examples/image-mpich.md
 create mode 100644 docs/software/container-engine/examples/image-nccl-tests.md
 create mode 100644 docs/software/container-engine/examples/image-nvshmem.md
 create mode 100644 docs/software/container-engine/examples/image-ompi.md

diff --git a/docs/software/container-engine/examples/guidelines-images.md b/docs/software/container-engine/examples/guidelines-images.md
new file mode 100644
index 00000000..f2d339fc
--- /dev/null
+++ b/docs/software/container-engine/examples/guidelines-images.md
@@ -0,0 +1,35 @@
+[](){#ref-ce-guidelines-images}
+# Guidelines for images on Alps
+
+This section offers guidelines for creating and using container images that achieve good performance on the Alps research infrastructure.
+It focuses on foundational components (such as communication libraries) that are essential for performant and effective use of Alps' capabilities, rather than on full application use cases.
+Synthetic benchmarks are also used to showcase quantitative performance.
+
+!!! important
+    The Containerfiles and examples provided in this section are intended to serve as a general reference and starting point.
+    They are not meant to represent all possible combinations and versions of software capable of running efficiently on Alps.
+
+    In the same vein, please note that the content presented here does not represent images officially supported by CSCS staff.
+
+Below is a summary of the software suggested and demonstrated throughout this section:
+
+- Base components:
+    - CUDA 12.8.1
+    - GDRCopy 2.5.1
+    - Libfabric 1.22.0
+    - UCX 1.19.0
+- MPI implementations:
+    - MPICH 4.3.1
+    - OpenMPI 5.0.8
+- Other programming libraries:
+    - NVSHMEM 3.4.5
+- Synthetic benchmarks:
+    - OSU Micro-benchmarks 7.5.1
+    - NCCL Tests 2.17.1
+
+The content is organized into pages describing container images that build incrementally upon each other:
+
+- a base image installing baseline libraries and frameworks (e.g. CUDA, Libfabric)
+- MPI implementations (MPICH, OpenMPI)
+- NVSHMEM
+- NCCL Tests
diff --git a/docs/software/container-engine/examples/image-comm-fwk.md b/docs/software/container-engine/examples/image-comm-fwk.md
new file mode 100644
index 00000000..8bd51735
--- /dev/null
+++ b/docs/software/container-engine/examples/image-comm-fwk.md
@@ -0,0 +1,105 @@
+[](){#ref-ce-guidelines-images-commfwk}
+# Communication frameworks image
+
+This page describes a container image providing foundational software components for achieving efficient execution on Alps nodes with NVIDIA GPUs.
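+
+If you prefer building this image yourself rather than pulling the prebuilt reference given below, a typical workflow on Alps is to build with Podman and import the result with enroot for use by the Container Engine.
+The following is a minimal sketch, assuming Podman storage is already configured for your user as described in the container image build documentation; the tag and output path are illustrative placeholders:
+
+```console
+$ podman build -f Containerfile -t comm-fwk:local .
+$ enroot import -x mount -o $SCRATCH/comm-fwk.sqsh podman://comm-fwk:local
+```
+
+The resulting SquashFS file can then be referenced in the `image` entry of an Environment Definition File (EDF).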
+ +The most important aspect to consider for performance of containerized applications is related to use of high-speed networks, +therefore this image mainly installs communication frameworks and libraries, besides general utility tools. +In particular, the [Libfabric](https://ofiwg.github.io/libfabric/) framework (also known as Open Fabrics Interfaces - OFI) is required to interface applications with the Slingshot high-speed network. + +At runtime, the container engine [CXI hook][ref-ce-cxi-hook] will replace the Libfabric libraries inside the container with the corresponding libraries on the host system. +This will ensure access to the Slingshot interconnect. + +This image is not intended to be used on its own, but to serve as a base to build higher-level software (e.g. MPI implementations) and application stacks. +For this reason, no performance results are provided in this page. + +A build of this image is currently hosted on the [Quay.io](https://quay.io/) registry at the following reference: +`quay.io/ethcscs/comm-fwk:ofi1.22-ucx1.19-cuda12.8`. +The image name `comm-fwk` is a shortened form of "communication frameworks". + +## Contents + +- Ubuntu 24.04 +- CUDA 12.8.1 +- GDRCopy 2.5.1 +- Libfabric 1.22.0 +- UCX 1.19.0 + +## Containerfile +```Dockerfile +ARG ubuntu_version=24.04 +ARG cuda_version=12.8.1 +FROM docker.io/nvidia/cuda:${cuda_version}-cudnn-devel-ubuntu${ubuntu_version} + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive \ + apt-get install -y \ + build-essential \ + ca-certificates \ + pkg-config \ + automake \ + autoconf \ + libtool \ + cmake \ + gdb \ + strace \ + wget \ + git \ + bzip2 \ + python3 \ + gfortran \ + rdma-core \ + numactl \ + libconfig-dev \ + libuv1-dev \ + libfuse-dev \ + libfuse3-dev \ + libyaml-dev \ + libnl-3-dev \ + libnuma-dev \ + libsensors-dev \ + libcurl4-openssl-dev \ + libjson-c-dev \ + libibverbs-dev \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* + +ARG gdrcopy_version=2.5.1 +RUN git clone --depth 1 --branch v${gdrcopy_version} https://github.com/NVIDIA/gdrcopy.git \ + && cd gdrcopy \ + && export CUDA_PATH=${CUDA_HOME:-$(echo $(which nvcc) | grep -o '.*cuda')} \ + && make CC=gcc CUDA=$CUDA_PATH lib \ + && make lib_install \ + && cd ../ && rm -rf gdrcopy + +# Install libfabric +ARG libfabric_version=1.22.0 +RUN git clone --branch v${libfabric_version} --depth 1 https://github.com/ofiwg/libfabric.git \ + && cd libfabric \ + && ./autogen.sh \ + && ./configure --prefix=/usr --with-cuda=/usr/local/cuda --enable-cuda-dlopen --enable-gdrcopy-dlopen --enable-efa \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -rf libfabric + +# Install UCX +ARG UCX_VERSION=1.19.0 +RUN wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${UCX_VERSION}.tar.gz \ + && tar xzf ucx-${UCX_VERSION}.tar.gz \ + && cd ucx-${UCX_VERSION} \ + && mkdir build \ + && cd build \ + && ../configure --prefix=/usr --with-cuda=/usr/local/cuda --with-gdrcopy=/usr/local --enable-mt --enable-devel-headers \ + && make -j$(nproc) \ + && make install \ + && cd ../.. \ + && rm -rf ucx-${UCX_VERSION}.tar.gz ucx-${UCX_VERSION} +``` + +## Notes +- The image is based on an official NVIDIA CUDA image, and therefore already provides the NCCL library, alongside a complete CUDA installation. +- Communication frameworks are built with explicit support for CUDA and GDRCopy. +- The libfabric EFA provider is included to leave open the possibility to experiment with derived images on AWS infrastructure as well. 
+- Although only the libfabric framework is required to support Alps' Slingshot network, this image also packages the UCX communication framework to allow building a broader set of software (e.g. some OpenSHMEM implementations) and supporting optimized Infiniband communication as well. diff --git a/docs/software/container-engine/examples/image-mpich.md b/docs/software/container-engine/examples/image-mpich.md new file mode 100644 index 00000000..2dd617cf --- /dev/null +++ b/docs/software/container-engine/examples/image-mpich.md @@ -0,0 +1,578 @@ +[](){#ref-ce-guidelines-images-mpich} +# MPICH image + +This page describes a container image featuring the MPICH library as MPI (Message Passing Interface) implementation, with support for CUDA and Libfabric. + +This image is based on the [communication frameworks image][ref-ce-guidelines-images-commfwk], and thus it is suited for hosts with NVIDIA GPUs, like Alps GH200 nodes. + +A build of this image is currently hosted on the [Quay.io](https://quay.io/) registry at the following reference: +`quay.io/ethcscs/mpich:4.3.1-ofi1.22-cuda12.8`. + +## Contents + +- Ubuntu 24.04 +- CUDA 12.8.1 +- GDRCopy 2.5.1 +- Libfabric 1.22.0 +- UCX 1.19.0 +- MPICH 4.3.1 + +## Containerfile +```Dockerfile +FROM quay.io/ethcscs/comm-fwk:ofi1.22-ucx1.19-cuda12.8 + +ARG MPI_VER=4.3.1 +RUN wget -q https://www.mpich.org/static/downloads/${MPI_VER}/mpich-${MPI_VER}.tar.gz \ + && tar xf mpich-${MPI_VER}.tar.gz \ + && cd mpich-${MPI_VER} \ + && ./autogen.sh \ + && ./configure --prefix=/usr --enable-fast=O3,ndebug \ + --disable-fortran --disable-cxx \ + --with-device=ch4:ofi --with-libfabric=/usr \ + --with-cuda=/usr/local/cuda \ + CFLAGS="-L/usr/local/cuda/targets/sbsa-linux/lib/stubs/ -lcuda" \ + CXXFLAGS="-L/usr/local/cuda/targets/sbsa-linux/lib/stubs/ -lcuda" \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -rf mpich-${MPI_VER}.tar.gz mpich-${MPI_VER} +``` + +!!! tip + This image builds MPICH without Fortran and C++ bindings. In general, C++ bindings are deprecated by the MPI standard. If you require the Fortran bindings, remove the `--disable-fortran` option in the MPICH `configure` command above. + + +## Performance examples + +In this section we demonstrate the performance of the previosly created MPICH image using it to build the OSU Micro-Benchmarks 7.5.1, and deploying the resulting image on Alps through the Container Engine to run a variety of benchmarks. + +A build of the image with the OSU benchmarks is available on the [Quay.io](https://quay.io/) registry at the following reference: +`quay.io/ethcscs/osu-mb:7.5-mpich4.3.1-ofi1.22-cuda12.8`. + +### OSU-MB Containerfile +```Dockerfile +FROM quay.io/ethcscs/mpich:4.3.1-ofi1.22-cuda12.8 + +ARG omb_version=7.5.1 +RUN wget -q http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-${omb_version}.tar.gz \ + && tar xf osu-micro-benchmarks-${omb_version}.tar.gz \ + && cd osu-micro-benchmarks-${omb_version} \ + && ldconfig /usr/local/cuda/targets/sbsa-linux/lib/stubs \ + && ./configure --prefix=/usr/local CC=$(which mpicc) CFLAGS="-O3 -lcuda -lnvidia-ml" \ + --enable-cuda --with-cuda-include=/usr/local/cuda/include \ + --with-cuda-libpath=/usr/local/cuda/lib64 \ + CXXFLAGS="-lmpi -lcuda" \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. 
\ + && rm -rf osu-micro-benchmarks-${omb_version} osu-micro-benchmarks-${omb_version}.tar.gz + +WORKDIR /usr/local/libexec/osu-micro-benchmarks/mpi +``` + +### Environment Definition File +```toml +image = "quay.io#ethcscs/osu-mb:7.5-mpich4.3.1-ofi1.22-cuda12.8" +``` + +### Notes + +- **Important:** To make sure that GPU-to-GPU performance is good for inter-node communication one must set the variable `MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1`. + This setting can negatively impact performance for other types of communication (e.g. intra-node CPU-to-CPU transfers). +- Since by default MPICH uses PMI-1 or PMI-2 for wire-up and communication between ranks, when using this image the `srun` option `--mpi=pmi2` must be used to run successful multi-rank jobs. + +### Results + +=== "Point-to-point bandwidth, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmi2 --environment=omb-mpich ./pt2pt/osu_bw --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.88 Pass + 2 1.76 Pass + 4 3.53 Pass + 8 7.07 Pass + 16 14.16 Pass + 32 27.76 Pass + 64 56.80 Pass + 128 113.27 Pass + 256 225.42 Pass + 512 445.70 Pass + 1024 883.96 Pass + 2048 1733.54 Pass + 4096 3309.75 Pass + 8192 6188.29 Pass + 16384 12415.59 Pass + 32768 19526.60 Pass + 65536 22624.33 Pass + 131072 23346.67 Pass + 262144 23671.41 Pass + 524288 23847.29 Pass + 1048576 23940.59 Pass + 2097152 23980.12 Pass + 4194304 24007.69 Pass + ``` + +=== "Point-to-point bandwidth, GPU-to-GPU memory, inter-node communication" + ```console + $ MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 srun -N2 --mpi=pmi2 --environment=omb-mpich ./pt2pt/osu_bw --validation D D + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.92 Pass + 2 1.80 Pass + 4 3.72 Pass + 8 7.45 Pass + 16 14.91 Pass + 32 29.66 Pass + 64 59.65 Pass + 128 119.08 Pass + 256 236.90 Pass + 512 467.70 Pass + 1024 930.74 Pass + 2048 1808.56 Pass + 4096 3461.06 Pass + 8192 6385.63 Pass + 16384 12768.18 Pass + 32768 19332.39 Pass + 65536 22547.35 Pass + 131072 23297.26 Pass + 262144 23652.07 Pass + 524288 23812.58 Pass + 1048576 23913.85 Pass + 2097152 23971.55 Pass + 4194304 23998.79 Pass + ``` + + +=== "Point-to-point bandwidth, CPU-to-CPU memory, intra-node communication" + ```console + $ srun -N1 -n2 --mpi=pmi2 --environment=omb-mpich ./pt2pt/osu_bw --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Bandwidth (MB/s) Validation + 1 1.28 Pass + 2 2.60 Pass + 4 5.20 Pass + 8 10.39 Pass + 16 20.85 Pass + 32 41.56 Pass + 64 83.23 Pass + 128 164.73 Pass + 256 326.92 Pass + 512 632.98 Pass + 1024 1209.82 Pass + 2048 2352.68 Pass + 4096 4613.67 Pass + 8192 8881.00 Pass + 16384 7435.51 Pass + 32768 9369.82 Pass + 65536 11644.51 Pass + 131072 13198.71 Pass + 262144 14058.41 Pass + 524288 12958.24 Pass + 1048576 12836.55 Pass + 2097152 13117.14 Pass + 4194304 13187.01 Pass + ``` + + +=== "Point-to-point bandwidth, GPU-to-GPU memory, intra-node communication" + ```console + $ srun -N1 -n2 --mpi=pmi2 --environment=omb-mpich ./pt2pt/osu_bw --validation D D + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.13 Pass + 2 0.27 Pass + 4 0.55 Pass + 8 1.10 Pass + 16 2.20 Pass + 32 4.40 Pass + 64 8.77 Pass + 128 17.50 Pass + 256 35.01 Pass + 512 70.14 Pass + 1024 140.35 Pass + 2048 278.91 Pass + 4096 555.96 Pass + 8192 1104.97 Pass + 16384 2214.87 Pass + 32768 4422.67 Pass + 65536 8833.18 Pass + 131072 17765.30 Pass + 262144 33834.24 Pass + 524288 59704.15 Pass + 1048576 84566.94 Pass + 2097152 102221.49 Pass + 4194304 113955.83 Pass + ``` + + +=== "Point-to-point bi-directional bandwidth, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmi2 --environment=omb-mpich ./pt2pt/osu_bibw --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bibw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bibw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI Bi-Directional Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 1.03 Pass + 2 2.07 Pass + 4 4.14 Pass + 8 8.28 Pass + 16 16.54 Pass + 32 33.07 Pass + 64 66.08 Pass + 128 131.65 Pass + 256 258.60 Pass + 512 518.60 Pass + 1024 1036.09 Pass + 2048 2072.16 Pass + 4096 4142.18 Pass + 8192 7551.70 Pass + 16384 14953.49 Pass + 32768 23871.35 Pass + 65536 33767.12 Pass + 131072 39284.40 Pass + 262144 42638.43 Pass + 524288 44602.52 Pass + 1048576 45621.16 Pass + 2097152 46159.65 Pass + 4194304 46433.80 Pass + ``` + + +=== "Point-to-point bi-directional bandwidth, GPU-to-GPU memory, inter-node communication" + ```console + $ MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 srun -N2 --mpi=pmi2 --environment=omb-mpich ./pt2pt/osu_bibw --validation D D + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bibw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bibw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI-CUDA Bi-Directional Bandwidth Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Bandwidth (MB/s) Validation + 1 1.05 Pass + 2 2.10 Pass + 4 4.20 Pass + 8 8.40 Pass + 16 16.84 Pass + 32 33.63 Pass + 64 67.01 Pass + 128 132.11 Pass + 256 258.74 Pass + 512 515.52 Pass + 1024 1025.44 Pass + 2048 2019.51 Pass + 4096 3844.87 Pass + 8192 6123.96 Pass + 16384 13244.25 Pass + 32768 22521.76 Pass + 65536 34040.97 Pass + 131072 39503.52 Pass + 262144 42827.91 Pass + 524288 44663.44 Pass + 1048576 45629.24 Pass + 2097152 46167.41 Pass + 4194304 46437.18 Pass + ``` + + +=== "Point-to-point latency, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmi2 --environment=omb-mpich ./pt2pt/osu_latency --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_latency: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_latency: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 3.00 Pass + 2 2.99 Pass + 4 2.99 Pass + 8 3.07 Pass + 16 2.99 Pass + 32 3.08 Pass + 64 3.01 Pass + 128 3.88 Pass + 256 4.43 Pass + 512 4.62 Pass + 1024 4.47 Pass + 2048 4.57 Pass + 4096 4.79 Pass + 8192 7.92 Pass + 16384 8.53 Pass + 32768 9.48 Pass + 65536 10.92 Pass + 131072 13.84 Pass + 262144 19.19 Pass + 524288 30.05 Pass + 1048576 51.73 Pass + 2097152 94.94 Pass + 4194304 181.46 Pass + ``` + + +=== "All-to-all collective latency, CPU-to-CPU memory, multiple nodes" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmi2 --environment=omb-mpich ./collective/osu_alltoall --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Avg Latency(us) Validation + 1 22.25 Pass + 2 22.34 Pass + 4 21.83 Pass + 8 21.72 Pass + 16 21.74 Pass + 32 21.71 Pass + 64 22.02 Pass + 128 22.35 Pass + 256 22.84 Pass + 512 23.42 Pass + 1024 24.61 Pass + 2048 24.99 Pass + 4096 26.02 Pass + 8192 29.17 Pass + 16384 68.81 Pass + 32768 95.63 Pass + 65536 181.42 Pass + 131072 306.83 Pass + 262144 526.50 Pass + 524288 960.52 Pass + 1048576 1823.52 Pass + ``` + + +=== "All-to-all collective latency, GPU-to-GPU memory, multiple nodes" + ```console + $ MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 srun -N2 --ntasks-per-node=4 --mpi=pmi2 --environment=omb-mpich ./collective/osu_alltoall --validation -d cuda + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI-CUDA All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 65.62 Pass + 2 65.51 Pass + 4 65.46 Pass + 8 65.40 Pass + 16 65.58 Pass + 32 64.97 Pass + 64 65.01 Pass + 128 65.31 Pass + 256 65.03 Pass + 512 65.14 Pass + 1024 65.67 Pass + 2048 66.23 Pass + 4096 66.69 Pass + 8192 67.47 Pass + 16384 85.99 Pass + 32768 103.15 Pass + 65536 120.40 Pass + 131072 135.64 Pass + 262144 162.24 Pass + 524288 213.84 Pass + 1048576 317.07 Pass + ``` + + +### Results without the CXI hook +On many Alps vClusters, the Container Engine is configured with the CXI hook enabled by default, enabling transparent access to the Slingshot interconnect. + +This section demonstrates the performance benefit of the CXI hook by explicitly disabling it through the EDF: +```console +$ cat .edf/omb-mpich-no-cxi.toml +image = "quay.io#ethcscs/osu-mb:7.5-mpich4.3.1-ofi1.22-cuda12.8" + +[annotations] +com.hooks.cxi.enabled="false" +``` + +=== "Point-to-point bandwidth, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmi2 --environment=omb-mpich-no-cxi ./pt2pt/osu_bw --validation + + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Bandwidth (MB/s) Validation + 1 0.14 Pass + 2 0.28 Pass + 4 0.56 Pass + 8 1.15 Pass + 16 2.32 Pass + 32 4.55 Pass + 64 9.36 Pass + 128 18.20 Pass + 256 20.26 Pass + 512 39.11 Pass + 1024 55.88 Pass + 2048 108.19 Pass + 4096 142.91 Pass + 8192 393.95 Pass + 16384 307.93 Pass + 32768 1205.61 Pass + 65536 1723.86 Pass + 131072 2376.59 Pass + 262144 2847.85 Pass + 524288 3277.75 Pass + 1048576 3580.23 Pass + 2097152 3697.47 Pass + 4194304 3764.11 Pass + ``` + +=== "Point-to-point bandwidth, GPU-to-GPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmi2 --environment=omb-mpich-no-cxi ./pt2pt/osu_bw --validation D D + + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.04 Pass + 2 0.08 Pass + 4 0.16 Pass + 8 0.31 Pass + 16 0.62 Pass + 32 1.24 Pass + 64 2.46 Pass + 128 4.80 Pass + 256 7.33 Pass + 512 14.40 Pass + 1024 24.43 Pass + 2048 47.68 Pass + 4096 85.40 Pass + 8192 161.68 Pass + 16384 306.15 Pass + 32768 520.57 Pass + 65536 818.99 Pass + 131072 1160.48 Pass + 262144 1436.44 Pass + 524288 1676.61 Pass + 1048576 2003.55 Pass + 2097152 2104.65 Pass + 4194304 2271.56 Pass + ``` + +=== "Point-to-point latency, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmi2 --environment=omb-mpich-no-cxi ./pt2pt/osu_latency --validation + + # OSU MPI Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 38.25 Pass + 2 38.58 Pass + 4 38.49 Pass + 8 38.43 Pass + 16 38.40 Pass + 32 38.49 Pass + 64 39.18 Pass + 128 39.23 Pass + 256 45.17 Pass + 512 53.49 Pass + 1024 59.60 Pass + 2048 48.83 Pass + 4096 50.84 Pass + 8192 51.45 Pass + 16384 52.35 Pass + 32768 58.92 Pass + 65536 74.88 Pass + 131072 100.32 Pass + 262144 135.35 Pass + 524288 219.52 Pass + 1048576 384.61 Pass + 2097152 706.79 Pass + 4194304 1341.79 Pass + ``` + + +=== "All-to-all collective latency, CPU-to-CPU memory, multiple nodes" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmi2 --environment=omb-mpich-no-cxi ./collective/osu_alltoall --validation + + # OSU MPI All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 169.19 Pass + 2 169.50 Pass + 4 170.35 Pass + 8 168.81 Pass + 16 169.71 Pass + 32 169.60 Pass + 64 169.47 Pass + 128 171.48 Pass + 256 334.47 Pass + 512 343.06 Pass + 1024 703.55 Pass + 2048 449.30 Pass + 4096 454.68 Pass + 8192 468.90 Pass + 16384 532.46 Pass + 32768 578.95 Pass + 65536 1164.92 Pass + 131072 1511.04 Pass + 262144 2287.48 Pass + 524288 3668.35 Pass + 1048576 6498.36 Pass + ``` + + +=== "All-to-all collective latency, GPU-to-GPU memory, multiple nodes" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmi2 --environment=omb-mpich-no-cxi ./collective/osu_alltoall --validation -d cuda + + # OSU MPI-CUDA All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Avg Latency(us) Validation + 1 276.29 Pass + 2 273.94 Pass + 4 273.53 Pass + 8 273.88 Pass + 16 274.83 Pass + 32 274.90 Pass + 64 276.85 Pass + 128 278.17 Pass + 256 413.21 Pass + 512 442.62 Pass + 1024 793.14 Pass + 2048 547.57 Pass + 4096 561.82 Pass + 8192 570.71 Pass + 16384 624.20 Pass + 32768 657.30 Pass + 65536 1168.43 Pass + 131072 1451.91 Pass + 262144 2049.24 Pass + 524288 3061.54 Pass + 1048576 5238.24 Pass + ``` diff --git a/docs/software/container-engine/examples/image-nccl-tests.md b/docs/software/container-engine/examples/image-nccl-tests.md new file mode 100644 index 00000000..3e1e4a54 --- /dev/null +++ b/docs/software/container-engine/examples/image-nccl-tests.md @@ -0,0 +1,183 @@ +[](){#ref-ce-guidelines-images-nccl-tests} +# NCCL Tests image + +This page describes a container image featuring the [NCCL Tests](https://github.com/NVIDIA/nccl-tests) to demonstrate how to efficiently execute NCCL-based containerized software on Alps. + +This image is based on the [OpenMPI image][ref-ce-guidelines-images-ompi], and thus it is suited for hosts with NVIDIA GPUs, like Alps GH200 nodes. + +A build of this image is currently hosted on the [Quay.io](https://quay.io/) registry at the following reference: +`quay.io/ethcscs/nccl-tests:2.17.1-ompi5.0.8-ofi1.22-cuda12.8`. + +## Contents + +- Ubuntu 24.04 +- CUDA 12.8.1 (includes NCCL) +- GDRCopy 2.5.1 +- Libfabric 1.22.0 +- UCX 1.19.0 +- OpenMPI 5.0.8 +- NCCL Tests 2.17.1 + +## Containerfile +```Dockerfile +FROM quay.io/ethcscs/ompi:5.0.8-ofi1.22-cuda12.8 + +ARG nccl_tests_version=2.17.1 +RUN wget -O nccl-tests-${nccl_tests_version}.tar.gz https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${nccl_tests_version}.tar.gz \ + && tar xf nccl-tests-${nccl_tests_version}.tar.gz \ + && cd nccl-tests-${nccl_tests_version} \ + && MPI=1 make -j$(nproc) \ + && cd .. \ + && rm -rf nccl-tests-${nccl_tests_version}.tar.gz +``` + +!!! note + This image builds NCCL tests with MPI support enabled. + +## Performance examples + +### Environment Definition File +```toml +image = "quay.io#ethcscs/nccl-tests:2.17.1-ompi5.0.8-ofi1.22-cuda12.8" + +[env] +PMIX_MCA_psec="native" + +[annotations] +com.hooks.aws_ofi_nccl.enabled = "true" +com.hooks.aws_ofi_nccl.variant = "cuda12" +``` + +### Notes + +- Since OpenMPI uses PMIx for wire-up and communication between ranks, when using this image the `srun` option `--mpi=pmix` must be used to run successful multi-rank jobs. +- NCCL requires the presence of the [AWS OFI NCCL plugin](https://github.com/aws/aws-ofi-nccl) in order to correctly interface with Libfabric and (through the latter) the Slingshot interconnect. Therefore, for optimal performance the [related CE hook][ref-ce-aws-ofi-hook] must be enabled and set to match the CUDA version in the container. +- Libfabric itself is usually injected by the [CXI hook][ref-ce-cxi-hook], which is enabled by default on several Alps vClusters. 
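+
+To check that the plugin is actually being used at runtime, one option is to enable NCCL's standard debug output in the EDF shown above.
+The sketch below simply adds `NCCL_DEBUG` to that EDF; it is meant for verification only and is not required for performance:
+
+```toml
+image = "quay.io#ethcscs/nccl-tests:2.17.1-ompi5.0.8-ofi1.22-cuda12.8"
+
+[env]
+PMIX_MCA_psec="native"
+NCCL_DEBUG="INFO"
+
+[annotations]
+com.hooks.aws_ofi_nccl.enabled = "true"
+com.hooks.aws_ofi_nccl.variant = "cuda12"
+```
+
+With this setting, the NCCL initialization messages report which network plugin and Libfabric provider were selected, making it easy to spot runs where the hook was not applied.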
+ +### Results + +=== "All-reduce latency test on 2 nodes, 8 GPUs" + ```console + $ srun -N2 -t5 --mpi=pmix --ntasks-per-node=4 --environment=nccl-test-ompi /nccl-tests-2.17.1/build/all_reduce_perf -b 8 -e 128M -f 2 + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + # Collective test starting: all_reduce_perf + # nThread 1 nGpus 1 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 1 iters: 20 agg iters: 1 validation: 1 graph: 0 + # + # Using devices + # Rank 0 Group 0 Pid 204199 on nid005471 device 0 [0009:01:00] NVIDIA GH200 120GB + # Rank 1 Group 0 Pid 204200 on nid005471 device 1 [0019:01:00] NVIDIA GH200 120GB + # Rank 2 Group 0 Pid 204201 on nid005471 device 2 [0029:01:00] NVIDIA GH200 120GB + # Rank 3 Group 0 Pid 204202 on nid005471 device 3 [0039:01:00] NVIDIA GH200 120GB + # Rank 4 Group 0 Pid 155254 on nid005487 device 0 [0009:01:00] NVIDIA GH200 120GB + # Rank 5 Group 0 Pid 155255 on nid005487 device 1 [0019:01:00] NVIDIA GH200 120GB + # Rank 6 Group 0 Pid 155256 on nid005487 device 2 [0029:01:00] NVIDIA GH200 120GB + # Rank 7 Group 0 Pid 155257 on nid005487 device 3 [0039:01:00] NVIDIA GH200 120GB + # + # out-of-place in-place + # size count type redop root time algbw busbw #wrong time algbw busbw #wrong + # (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8 2 float sum -1 17.93 0.00 0.00 0 17.72 0.00 0.00 0 + 16 4 float sum -1 17.65 0.00 0.00 0 17.63 0.00 0.00 0 + 32 8 float sum -1 17.54 0.00 0.00 0 17.43 0.00 0.00 0 + 64 16 float sum -1 19.27 0.00 0.01 0 19.21 0.00 0.01 0 + 128 32 float sum -1 18.86 0.01 0.01 0 18.67 0.01 0.01 0 + 256 64 float sum -1 18.83 0.01 0.02 0 19.02 0.01 0.02 0 + 512 128 float sum -1 19.72 0.03 0.05 0 19.40 0.03 0.05 0 + 1024 256 float sum -1 20.35 0.05 0.09 0 20.32 0.05 0.09 0 + 2048 512 float sum -1 22.07 0.09 0.16 0 21.72 0.09 0.17 0 + 4096 1024 float sum -1 31.97 0.13 0.22 0 31.58 0.13 0.23 0 + 8192 2048 float sum -1 37.21 0.22 0.39 0 35.84 0.23 0.40 0 + 16384 4096 float sum -1 37.29 0.44 0.77 0 36.53 0.45 0.78 0 + 32768 8192 float sum -1 39.61 0.83 1.45 0 37.09 0.88 1.55 0 + 65536 16384 float sum -1 61.03 1.07 1.88 0 68.45 0.96 1.68 0 + 131072 32768 float sum -1 81.41 1.61 2.82 0 72.94 1.80 3.14 0 + 262144 65536 float sum -1 127.0 2.06 3.61 0 108.9 2.41 4.21 0 + 524288 131072 float sum -1 170.3 3.08 5.39 0 349.6 1.50 2.62 0 + 1048576 262144 float 
sum -1 164.3 6.38 11.17 0 187.7 5.59 9.77 0 + 2097152 524288 float sum -1 182.1 11.51 20.15 0 180.6 11.61 20.32 0 + 4194304 1048576 float sum -1 292.7 14.33 25.08 0 295.4 14.20 24.85 0 + 8388608 2097152 float sum -1 344.5 24.35 42.61 0 345.7 24.27 42.47 0 + 16777216 4194304 float sum -1 461.7 36.34 63.59 0 454.0 36.95 64.67 0 + 33554432 8388608 float sum -1 686.5 48.88 85.54 0 686.6 48.87 85.52 0 + 67108864 16777216 float sum -1 1090.5 61.54 107.69 0 1083.5 61.94 108.39 0 + 134217728 33554432 float sum -1 1916.4 70.04 122.57 0 1907.8 70.35 123.11 0 + # Out of bounds values : 0 OK + # Avg bus bandwidth : 19.7866 + # + # Collective test concluded: all_reduce_perf + ``` + +### Results without the AWS OFI NCCL hook +This section demonstrates the performance benefit of the AWS OFI NCCL hook by not enabling it through the EDF: +```console +$ cat ~/.edf/nccl-test-ompi-no-awsofinccl.toml +image = "quay.io#ethcscs/nccl-tests:2.17.1-ompi5.0.8-ofi1.22-cuda12.8" + +[env] +PMIX_MCA_psec="native" +``` + +=== "All-reduce latency test on 2 nodes, 8 GPUs" + ```console + $ srun -N2 -t5 --mpi=pmix --ntasks-per-node=4 --environment=nccl-test-ompi /nccl-tests-2.17.1/build/all_reduce_perf -b 8 -e 128M -f 2 + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /nccl-tests-2.17.1/build/all_reduce_perf: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + # Collective test starting: all_reduce_perf + # nThread 1 nGpus 1 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 1 iters: 20 agg iters: 1 validation: 1 graph: 0 + # + # Using devices + # Rank 0 Group 0 Pid 202829 on nid005471 device 0 [0009:01:00] NVIDIA GH200 120GB + # Rank 1 Group 0 Pid 202830 on nid005471 device 1 [0019:01:00] NVIDIA GH200 120GB + # Rank 2 Group 0 Pid 202831 on nid005471 device 2 [0029:01:00] NVIDIA GH200 120GB + # Rank 3 Group 0 Pid 202832 on nid005471 device 3 [0039:01:00] NVIDIA GH200 120GB + # Rank 4 Group 0 Pid 154517 on nid005487 device 0 [0009:01:00] NVIDIA GH200 120GB + # Rank 5 Group 0 Pid 154518 on nid005487 device 1 [0019:01:00] NVIDIA GH200 120GB + # Rank 6 Group 0 Pid 154519 on nid005487 device 2 [0029:01:00] NVIDIA GH200 120GB + # Rank 7 Group 0 Pid 154520 on nid005487 device 3 [0039:01:00] NVIDIA GH200 120GB + # + # out-of-place in-place + # size count type redop root time algbw busbw #wrong time algbw busbw #wrong + # (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8 2 float sum -1 85.47 0.00 0.00 0 53.44 0.00 0.00 0 + 
16 4 float sum -1 52.41 0.00 0.00 0 51.11 0.00 0.00 0 + 32 8 float sum -1 50.45 0.00 0.00 0 50.40 0.00 0.00 0 + 64 16 float sum -1 62.58 0.00 0.00 0 50.70 0.00 0.00 0 + 128 32 float sum -1 50.94 0.00 0.00 0 50.77 0.00 0.00 0 + 256 64 float sum -1 50.76 0.01 0.01 0 51.77 0.00 0.01 0 + 512 128 float sum -1 163.2 0.00 0.01 0 357.5 0.00 0.00 0 + 1024 256 float sum -1 373.0 0.00 0.00 0 59.31 0.02 0.03 0 + 2048 512 float sum -1 53.22 0.04 0.07 0 52.58 0.04 0.07 0 + 4096 1024 float sum -1 55.95 0.07 0.13 0 56.63 0.07 0.13 0 + 8192 2048 float sum -1 58.52 0.14 0.24 0 58.62 0.14 0.24 0 + 16384 4096 float sum -1 108.7 0.15 0.26 0 107.8 0.15 0.27 0 + 32768 8192 float sum -1 184.1 0.18 0.31 0 183.5 0.18 0.31 0 + 65536 16384 float sum -1 325.0 0.20 0.35 0 325.4 0.20 0.35 0 + 131072 32768 float sum -1 592.7 0.22 0.39 0 591.5 0.22 0.39 0 + 262144 65536 float sum -1 942.0 0.28 0.49 0 941.4 0.28 0.49 0 + 524288 131072 float sum -1 1143.1 0.46 0.80 0 1138.0 0.46 0.81 0 + 1048576 262144 float sum -1 1502.2 0.70 1.22 0 1478.9 0.71 1.24 0 + 2097152 524288 float sum -1 921.8 2.28 3.98 0 899.8 2.33 4.08 0 + 4194304 1048576 float sum -1 1443.1 2.91 5.09 0 1432.7 2.93 5.12 0 + 8388608 2097152 float sum -1 2437.7 3.44 6.02 0 2417.0 3.47 6.07 0 + 16777216 4194304 float sum -1 5036.9 3.33 5.83 0 5003.6 3.35 5.87 0 + 33554432 8388608 float sum -1 17388 1.93 3.38 0 17275 1.94 3.40 0 + 67108864 16777216 float sum -1 21253 3.16 5.53 0 21180 3.17 5.54 0 + 134217728 33554432 float sum -1 43293 3.10 5.43 0 43396 3.09 5.41 0 + # Out of bounds values : 0 OK + # Avg bus bandwidth : 1.58767 + # + # Collective test concluded: all_reduce_perf + ``` diff --git a/docs/software/container-engine/examples/image-nvshmem.md b/docs/software/container-engine/examples/image-nvshmem.md new file mode 100644 index 00000000..1ced3ef5 --- /dev/null +++ b/docs/software/container-engine/examples/image-nvshmem.md @@ -0,0 +1,237 @@ +[](){#ref-ce-guidelines-images-nvshmem} +# NVSHMEM image + +This page describes a container image featuring the [NVSHMEM](https://developer.nvidia.com/nvshmem) parallel programming library with support for Libfabric, and demonstrates how to efficiently run said image on Alps. + +This image is based on the [OpenMPI image][ref-ce-guidelines-images-ompi], and thus it is suited for hosts with NVIDIA GPUs, like Alps GH200 nodes. + +A build of this image is currently hosted on the [Quay.io](https://quay.io/) registry at the following reference: +`quay.io/ethcscs/nvshmem:3.4.5-ompi5.0.8-ofi1.22-cuda12.8`. 
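+
+As an orientation for building your own NVSHMEM applications on top of this image, the sketch below shows a minimal, hypothetical program using only the core host API (it is not shipped with the image, and assumes one GPU per PE):
+
+```cuda
+// hello_nvshmem.cu -- hypothetical example, not part of the image
+#include <cstdio>
+#include <cuda_runtime.h>
+#include <nvshmem.h>
+#include <nvshmemx.h>
+
+int main() {
+    nvshmem_init();                                          // bootstrap (PMIx in this image)
+    int mype      = nvshmem_my_pe();                         // global PE id
+    int npes      = nvshmem_n_pes();                         // total number of PEs
+    int mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);  // PE id within the node
+    cudaSetDevice(mype_node);                                // one GPU per PE
+
+    // Symmetric allocation on the GPU, accessible by remote PEs
+    double *buf = static_cast<double *>(nvshmem_malloc(1024 * sizeof(double)));
+    nvshmem_barrier_all();
+
+    printf("PE %d of %d (node-local rank %d) initialized NVSHMEM\n", mype, npes, mype_node);
+
+    nvshmem_free(buf);
+    nvshmem_finalize();
+    return 0;
+}
+```
+
+Such a program would be compiled with `nvcc` against the NVSHMEM installation inside the container (the performance tests used later on this page live under `/usr/local/nvshmem`; the exact include and linker flags depend on how NVSHMEM was built), and launched in the same way as those tests, e.g. `srun --mpi=pmix --environment=nvshmem ./hello_nvshmem`.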
+ +## Contents + +- Ubuntu 24.04 +- CUDA 12.8.1 (includes NCCL) +- GDRCopy 2.5.1 +- Libfabric 1.22.0 +- UCX 1.19.0 +- OpenMPI 5.0.8 +- NVSHMEM 3.4.5 + +## Containerfile +```Dockerfile +FROM quay.io/ethcscs/ompi:5.0.8-ofi1.22-cuda12.8 + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive \ + apt-get install -y \ + python3-venv \ + python3-dev \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* \ + && rm /usr/lib/python3.12/EXTERNALLY-MANAGED + +# Build NVSHMEM from source +RUN wget -q https://developer.download.nvidia.com/compute/redist/nvshmem/3.4.5/source/nvshmem_src_cuda12-all-all-3.4.5.tar.gz \ + && tar -xvf nvshmem_src_cuda12-all-all-3.4.5.tar.gz \ + && cd nvshmem_src \ + && NVSHMEM_BUILD_EXAMPLES=0 \ + NVSHMEM_BUILD_TESTS=1 \ + NVSHMEM_DEBUG=0 \ + NVSHMEM_DEVEL=0 \ + NVSHMEM_DEFAULT_PMI2=0 \ + NVSHMEM_DEFAULT_PMIX=1 \ + NVSHMEM_DISABLE_COLL_POLL=1 \ + NVSHMEM_ENABLE_ALL_DEVICE_INLINING=0 \ + NVSHMEM_GPU_COLL_USE_LDST=0 \ + NVSHMEM_LIBFABRIC_SUPPORT=1 \ + NVSHMEM_MPI_SUPPORT=1 \ + NVSHMEM_MPI_IS_OMPI=1 \ + NVSHMEM_NVTX=1 \ + NVSHMEM_PMIX_SUPPORT=1 \ + NVSHMEM_SHMEM_SUPPORT=1 \ + NVSHMEM_TEST_STATIC_LIB=0 \ + NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ + NVSHMEM_TRACE=0 \ + NVSHMEM_USE_DLMALLOC=0 \ + NVSHMEM_USE_NCCL=1 \ + NVSHMEM_USE_GDRCOPY=1 \ + NVSHMEM_VERBOSE=0 \ + NVSHMEM_DEFAULT_UCX=0 \ + NVSHMEM_UCX_SUPPORT=0 \ + NVSHMEM_IBGDA_SUPPORT=0 \ + NVSHMEM_IBGDA_SUPPORT_GPUMEM_ONLY=0 \ + NVSHMEM_IBDEVX_SUPPORT=0 \ + NVSHMEM_IBRC_SUPPORT=0 \ + LIBFABRIC_HOME=/usr \ + NCCL_HOME=/usr \ + GDRCOPY_HOME=/usr/local \ + MPI_HOME=/usr \ + SHMEM_HOME=/usr \ + NVSHMEM_HOME=/usr \ + cmake . \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -r nvshmem_src nvshmem_src_cuda12-all-all-3.4.5.tar.gz +``` + +!!! note + - This image also builds the performance tests bundled with NVSHMEM (`NVSHMEM_BUILD_TESTS=1`) to demonstrate performance below. The performance tests, in turn, require the installation of Python dependencies. When building images intended solely for production purposes, you may exclude both those elements. + - Notice that NVSHMEM is configured with support for Libfabric explicitly enabled (`NVSHMEM_LIBFABRIC_SUPPORT=1`). + - Since this image is meant primarily to run on Alps, NVSHMEM is built without support for UCX and Infiniband components. + - Since this image uses OpenMPI (which provides PMIx) as MPI implementation, NVSHMEM is also configured to default to PMIx for bootstrapping (`NVSHMEM_PMIX_SUPPORT=1`). + +## Performance examples + +### Environment Definition File +```toml +image = "quay.io#ethcscs/nvshmem:3.4.5-ompi5.0.8-ofi1.22-cuda12.8" + +[env] +PMIX_MCA_psec="native" +NVSHMEM_REMOTE_TRANSPORT="libfabric" +NVSHMEM_LIBFABRIC_PROVIDER="cxi" +NVSHMEM_DISABLE_CUDA_VMM="1" + +[annotations] +com.hooks.aws_ofi_nccl.enabled = "true" +com.hooks.aws_ofi_nccl.variant = "cuda12" +``` + +### Notes + +- NVSHMEM's `libfabric` transport does not support VMM yet, so VMM must be disabled by setting the environment variable `NVSHMEM_DISABLE_CUDA_VMM=1`. +- Since NVSHMEM has been configured in the Containerfile to use PMIx for bootstrapping, when using this image the `srun` option `--mpi=pmix` must be used to run successful multi-rank jobs. +- Other bootstrapping methods (including different PMI implementations) can be specified for NVSHMEM throug the related [environment variables](https://docs.nvidia.com/nvshmem/api/gen/env.html#bootstrap-options). When bootstrapping through PMI or MPI through Slurm, ensure that the PMI implementation used by Slurm (i.e. 
`srun --mpi` option) matches the one expected by NVSHMEM or the MPI library. +- NCCL requires the presence of the [AWS OFI NCCL plugin](https://github.com/aws/aws-ofi-nccl) in order to correctly interface with Libfabric and (through the latter) the Slingshot interconnect. Therefore, for optimal performance the [related CE hook][ref-ce-aws-ofi-hook] must be enabled and set to match the CUDA version in the container. +- Libfabric itself is usually injected by the [CXI hook][ref-ce-cxi-hook], which is enabled by default on several Alps vClusters. + +### Results + +=== "All-to-all latency test on 2 nodes, 8 GPUs" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=nvshmem /usr/local/nvshmem/bin/perftest/device/coll/alltoall_latency + Runtime options after parsing command line arguments + min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0 + Note: Above is full list of options, any given test will use only a subset of these variables. + mype: 6 mype_node: 2 device name: NVIDIA GH200 120GB bus id: 1 + Runtime options after parsing command line arguments + min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0 + Note: Above is full list of options, any given test will use only a subset of these variables. + mype: 5 mype_node: 1 device name: NVIDIA GH200 120GB bus id: 1 + Runtime options after parsing command line arguments + min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0 + Note: Above is full list of options, any given test will use only a subset of these variables. + mype: 7 mype_node: 3 device name: NVIDIA GH200 120GB bus id: 1 + Runtime options after parsing command line arguments + min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0 + Note: Above is full list of options, any given test will use only a subset of these variables. + mype: 4 mype_node: 0 device name: NVIDIA GH200 120GB bus id: 1 + Runtime options after parsing command line arguments + min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0 + Note: Above is full list of options, any given test will use only a subset of these variables. 
+ mype: 0 mype_node: 0 device name: NVIDIA GH200 120GB bus id: 1 + #alltoall_device + size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) + 32 8 32-bit thread 116.220796 0.000 0.000 + 64 16 32-bit thread 112.700796 0.001 0.000 + 128 32 32-bit thread 113.571203 0.001 0.001 + 256 64 32-bit thread 111.123204 0.002 0.002 + 512 128 32-bit thread 111.075199 0.005 0.004 + 1024 256 32-bit thread 110.131204 0.009 0.008 + 2048 512 32-bit thread 111.030400 0.018 0.016 + 4096 1024 32-bit thread 110.985601 0.037 0.032 + 8192 2048 32-bit thread 111.039996 0.074 0.065 + #alltoall_device + size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) + 32 8 32-bit warp 89.801598 0.000 0.000 + 64 16 32-bit warp 90.563202 0.001 0.001 + 128 32 32-bit warp 89.830399 0.001 0.001 + 256 64 32-bit warp 88.863999 0.003 0.003 + 512 128 32-bit warp 89.686400 0.006 0.005 + 1024 256 32-bit warp 88.908798 0.012 0.010 + 2048 512 32-bit warp 88.819200 0.023 0.020 + 4096 1024 32-bit warp 89.670402 0.046 0.040 + 8192 2048 32-bit warp 88.889599 0.092 0.081 + 16384 4096 32-bit warp 88.972801 0.184 0.161 + 32768 8192 32-bit warp 89.564800 0.366 0.320 + 65536 16384 32-bit warp 89.888000 0.729 0.638 + #alltoall_device + size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) + 32 8 32-bit block 89.747202 0.000 0.000 + 64 16 32-bit block 88.086402 0.001 0.001 + 128 32 32-bit block 87.254399 0.001 0.001 + 256 64 32-bit block 87.401599 0.003 0.003 + 512 128 32-bit block 88.095999 0.006 0.005 + 1024 256 32-bit block 87.273598 0.012 0.010 + 2048 512 32-bit block 88.086402 0.023 0.020 + 4096 1024 32-bit block 88.940799 0.046 0.040 + 8192 2048 32-bit block 88.095999 0.093 0.081 + 16384 4096 32-bit block 87.247998 0.188 0.164 + 32768 8192 32-bit block 88.976002 0.368 0.322 + 65536 16384 32-bit block 88.121599 0.744 0.651 + 131072 32768 32-bit block 90.579200 1.447 1.266 + 262144 65536 32-bit block 91.360003 2.869 2.511 + 524288 131072 32-bit block 101.145601 5.183 4.536 + 1048576 262144 32-bit block 111.052799 9.442 8.262 + 2097152 524288 32-bit block 137.164795 15.289 13.378 + 4194304 1048576 32-bit block 183.171201 22.898 20.036 + #alltoall_device + size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) + 64 8 64-bit thread 111.955202 0.001 0.001 + 128 16 64-bit thread 113.420796 0.001 0.001 + 256 32 64-bit thread 108.508801 0.002 0.002 + 512 64 64-bit thread 110.204804 0.005 0.004 + 1024 128 64-bit thread 109.487998 0.009 0.008 + 2048 256 64-bit thread 109.462404 0.019 0.016 + 4096 512 64-bit thread 110.156798 0.037 0.033 + 8192 1024 64-bit thread 109.401596 0.075 0.066 + 16384 2048 64-bit thread 108.591998 0.151 0.132 + #alltoall_device + size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) + 64 8 64-bit warp 88.896000 0.001 0.001 + 128 16 64-bit warp 89.679998 0.001 0.001 + 256 32 64-bit warp 88.950402 0.003 0.003 + 512 64 64-bit warp 89.606398 0.006 0.005 + 1024 128 64-bit warp 89.775997 0.011 0.010 + 2048 256 64-bit warp 88.838398 0.023 0.020 + 4096 512 64-bit warp 90.671998 0.045 0.040 + 8192 1024 64-bit warp 89.699203 0.091 0.080 + 16384 2048 64-bit warp 89.011198 0.184 0.161 + 32768 4096 64-bit warp 89.622402 0.366 0.320 + 65536 8192 64-bit warp 88.905603 0.737 0.645 + 131072 16384 64-bit warp 89.766401 1.460 1.278 + #alltoall_device + size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) + 64 8 64-bit block 89.788800 0.001 0.001 + 128 16 64-bit block 88.012803 0.001 0.001 + 256 32 64-bit block 87.353599 0.003 0.003 + 512 64 64-bit block 88.000000 0.006 0.005 + 1024 128 64-bit block 87.225598 
0.012 0.010 + 2048 256 64-bit block 87.225598 0.023 0.021 + 4096 512 64-bit block 87.168002 0.047 0.041 + 8192 1024 64-bit block 88.067198 0.093 0.081 + 16384 2048 64-bit block 88.863999 0.184 0.161 + 32768 4096 64-bit block 88.723201 0.369 0.323 + 65536 8192 64-bit block 87.993598 0.745 0.652 + 131072 16384 64-bit block 88.783997 1.476 1.292 + 262144 32768 64-bit block 91.366398 2.869 2.511 + 524288 65536 64-bit block 102.060795 5.137 4.495 + 1048576 131072 64-bit block 111.846399 9.375 8.203 + 2097152 262144 64-bit block 137.107205 15.296 13.384 + 4194304 524288 64-bit block 183.100796 22.907 20.044 + Runtime options after parsing command line arguments + min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0 + Note: Above is full list of options, any given test will use only a subset of these variables. + mype: 3 mype_node: 3 device name: NVIDIA GH200 120GB bus id: 1 + Runtime options after parsing command line arguments + min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0 + Note: Above is full list of options, any given test will use only a subset of these variables. + mype: 2 mype_node: 2 device name: NVIDIA GH200 120GB bus id: 1 + Runtime options after parsing command line arguments + min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0 + Note: Above is full list of options, any given test will use only a subset of these variables. + mype: 1 mype_node: 1 device name: NVIDIA GH200 120GB bus id: 1 + ``` diff --git a/docs/software/container-engine/examples/image-ompi.md b/docs/software/container-engine/examples/image-ompi.md new file mode 100644 index 00000000..c1287478 --- /dev/null +++ b/docs/software/container-engine/examples/image-ompi.md @@ -0,0 +1,576 @@ +[](){#ref-ce-guidelines-images-ompi} +# OpenMPI image + +This page describes a container image featuring the OpenMPI library as MPI (Message Passing Interface) implementation, with support for CUDA, Libfabric and UCX. + +This image is based on the [communication frameworks image][ref-ce-guidelines-images-commfwk], and thus it is suited for hosts with NVIDIA GPUs, like Alps GH200 nodes. + +A build of this image is currently hosted on the [Quay.io](https://quay.io/) registry at the following reference: +`quay.io/ethcscs/ompi:5.0.8-ofi1.22-cuda12.8`. 
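+
+A minimal Environment Definition File (EDF) for running this image directly could look like the sketch below; the `PMIX_MCA_psec` setting matches the one used for the benchmark image later on this page:
+
+```toml
+image = "quay.io#ethcscs/ompi:5.0.8-ofi1.22-cuda12.8"
+
+[env]
+PMIX_MCA_psec="native"
+```
+
+As noted in the performance section below, multi-rank jobs with this image must be launched with the `srun` option `--mpi=pmix`.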
+ +## Contents + +- Ubuntu 24.04 +- CUDA 12.8.1 +- GDRCopy 2.5.1 +- Libfabric 1.22.0 +- UCX 1.19.0 +- OpenMPI 5.0.8 + +## Containerfile +```Dockerfile +FROM quay.io/ethcscs/comm-fwk:ofi1.22-ucx1.19-cuda12.8 + +ARG OMPI_VER=5.0.8 +RUN wget -q https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-${OMPI_VER}.tar.gz \ + && tar xf openmpi-${OMPI_VER}.tar.gz \ + && cd openmpi-${OMPI_VER} \ + && ./configure --prefix=/usr --with-ofi=/usr --with-ucx=/usr --enable-oshmem \ + --with-cuda=/usr/local/cuda --with-cuda-libdir=/usr/local/cuda/lib64/stubs \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -rf openmpi-${OMPI_VER}.tar.gz openmpi-${OMPI_VER} +``` + +!!! note + This image builds OpenSHMEM as part of the OpenMPI installation. This can be useful to support other SHMEM implementations like NVSHMEM. + +## Performance examples + +In this section we demonstrate the performance of the previosly created OpenMPI image using it to build the OSU Micro-Benchmarks 7.5.1, and deploying the resulting image on Alps through the Container Engine to run a variety of benchmarks. + +A build of the image with the OSU benchmarks is available on the [Quay.io](https://quay.io/) registry at the following reference: +`quay.io/ethcscs/osu-mb:7.5-ompi5.0.8-ofi1.22-cuda12.8`. + +### OSU-MB Containerfile +```Dockerfile +FROM quay.io/ethcscs/ompi:5.0.8-ofi1.22-cuda12.8 + +ARG omb_version=7.5.1 +RUN wget -q http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-${omb_version}.tar.gz \ + && tar xf osu-micro-benchmarks-${omb_version}.tar.gz \ + && cd osu-micro-benchmarks-${omb_version} \ + && ldconfig /usr/local/cuda/targets/sbsa-linux/lib/stubs \ + && ./configure --prefix=/usr/local CC=$(which mpicc) CFLAGS="-O3 -lcuda -lnvidia-ml" \ + --enable-cuda --with-cuda-include=/usr/local/cuda/include \ + --with-cuda-libpath=/usr/local/cuda/lib64 \ + CXXFLAGS="-lmpi -lcuda" \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -rf osu-micro-benchmarks-${omb_version} osu-micro-benchmarks-${omb_version}.tar.gz + +WORKDIR /usr/local/libexec/osu-micro-benchmarks/mpi +``` + +### Environment Definition File +```toml +image = "quay.io#ethcscs/osu-mb:7.5-ompi5.0.8-ofi1.22-cuda12.8" + +[env] +PMIX_MCA_psec="native" +``` + +### Notes + +- Since OpenMPI uses PMIx for wire-up and communication between ranks, when using this image the `srun` option `--mpi=pmix` must be used to run successful multi-rank jobs. + +### Results + +=== "Point-to-point bandwidth, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bw --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Bandwidth (MB/s) Validation + 1 0.95 Pass + 2 1.90 Pass + 4 3.80 Pass + 8 7.61 Pass + 16 15.21 Pass + 32 30.47 Pass + 64 60.72 Pass + 128 121.56 Pass + 256 242.28 Pass + 512 484.54 Pass + 1024 968.30 Pass + 2048 1943.99 Pass + 4096 3870.29 Pass + 8192 6972.95 Pass + 16384 13922.36 Pass + 32768 18835.52 Pass + 65536 22049.82 Pass + 131072 23136.20 Pass + 262144 23555.35 Pass + 524288 23758.39 Pass + 1048576 23883.95 Pass + 2097152 23949.94 Pass + 4194304 23982.18 Pass + ``` + +=== "Point-to-point bandwidth, GPU-to-GPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bw --validation D D + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.90 Pass + 2 1.82 Pass + 4 3.65 Pass + 8 7.30 Pass + 16 14.56 Pass + 32 29.03 Pass + 64 57.49 Pass + 128 118.30 Pass + 256 227.18 Pass + 512 461.26 Pass + 1024 926.30 Pass + 2048 1820.46 Pass + 4096 3611.70 Pass + 8192 6837.89 Pass + 16384 13361.25 Pass + 32768 18037.71 Pass + 65536 22019.46 Pass + 131072 23104.58 Pass + 262144 23542.71 Pass + 524288 23758.69 Pass + 1048576 23881.02 Pass + 2097152 23955.49 Pass + 4194304 23989.54 Pass + ``` + + +=== "Point-to-point bandwidth, CPU-to-CPU memory, intra-node communication" + ```console + $ srun -N1 -n2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bw --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.96 Pass + 2 1.92 Pass + 4 3.85 Pass + 8 7.68 Pass + 16 15.40 Pass + 32 30.78 Pass + 64 61.26 Pass + 128 122.23 Pass + 256 240.96 Pass + 512 483.12 Pass + 1024 966.52 Pass + 2048 1938.09 Pass + 4096 3873.67 Pass + 8192 7100.56 Pass + 16384 14170.44 Pass + 32768 18607.68 Pass + 65536 21993.95 Pass + 131072 23082.11 Pass + 262144 23546.09 Pass + 524288 23745.05 Pass + 1048576 23879.79 Pass + 2097152 23947.23 Pass + 4194304 23980.15 Pass + ``` + + +=== "Point-to-point bandwidth, GPU-to-GPU memory, intra-node communication" + ```console + $ srun -N1 -n2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bw --validation D D + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Bandwidth (MB/s) Validation + 1 0.91 Pass + 2 1.83 Pass + 4 3.73 Pass + 8 7.47 Pass + 16 14.99 Pass + 32 29.98 Pass + 64 59.72 Pass + 128 119.13 Pass + 256 241.88 Pass + 512 481.52 Pass + 1024 963.60 Pass + 2048 1917.15 Pass + 4096 3840.96 Pass + 8192 6942.05 Pass + 16384 13911.45 Pass + 32768 18379.14 Pass + 65536 21761.73 Pass + 131072 23069.72 Pass + 262144 23543.98 Pass + 524288 23750.83 Pass + 1048576 23882.44 Pass + 2097152 23951.34 Pass + 4194304 23989.44 Pass + ``` + + +=== "Point-to-point bi-directional bandwidth, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bibw --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bibw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bibw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI Bi-Directional Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.93 Pass + 2 1.94 Pass + 4 3.89 Pass + 8 7.77 Pass + 16 15.61 Pass + 32 30.94 Pass + 64 62.10 Pass + 128 123.73 Pass + 256 247.77 Pass + 512 495.33 Pass + 1024 988.33 Pass + 2048 1977.44 Pass + 4096 3953.82 Pass + 8192 7252.82 Pass + 16384 14434.94 Pass + 32768 23610.53 Pass + 65536 33290.72 Pass + 131072 39024.03 Pass + 262144 42508.16 Pass + 524288 44482.65 Pass + 1048576 45575.40 Pass + 2097152 46124.45 Pass + 4194304 46417.59 Pass + ``` + + +=== "Point-to-point bi-directional bandwidth, GPU-to-GPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bibw --validation D D + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bibw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_bibw: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI-CUDA Bi-Directional Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.97 Pass + 2 1.94 Pass + 4 3.89 Pass + 8 7.75 Pass + 16 15.55 Pass + 32 31.11 Pass + 64 61.95 Pass + 128 123.35 Pass + 256 250.91 Pass + 512 500.80 Pass + 1024 1002.29 Pass + 2048 2003.24 Pass + 4096 4014.15 Pass + 8192 7289.11 Pass + 16384 14717.42 Pass + 32768 22467.65 Pass + 65536 33136.69 Pass + 131072 38970.21 Pass + 262144 42501.28 Pass + 524288 44466.34 Pass + 1048576 45554.48 Pass + 2097152 46124.56 Pass + 4194304 46417.53 Pass + ``` + + +=== "Point-to-point latency, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_latency --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_latency: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./pt2pt/osu_latency: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI Latency Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Avg Latency(us) Validation + 1 3.34 Pass + 2 3.34 Pass + 4 3.35 Pass + 8 3.34 Pass + 16 3.33 Pass + 32 3.34 Pass + 64 3.33 Pass + 128 4.32 Pass + 256 4.36 Pass + 512 4.40 Pass + 1024 4.46 Pass + 2048 4.61 Pass + 4096 4.89 Pass + 8192 8.31 Pass + 16384 8.95 Pass + 32768 9.76 Pass + 65536 11.16 Pass + 131072 13.98 Pass + 262144 19.41 Pass + 524288 30.21 Pass + 1048576 52.12 Pass + 2097152 95.26 Pass + 4194304 182.39 Pass + ``` + + +=== "All-to-all collective latency, CPU-to-CPU memory, multiple nodes" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=omb-ompi ./collective/osu_alltoall --validation + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Avg Latency(us) Validation + 1 12.46 Pass + 2 12.05 Pass + 4 11.99 Pass + 8 11.84 Pass + 16 11.87 Pass + 32 11.84 Pass + 64 11.95 Pass + 128 12.22 Pass + 256 13.21 Pass + 512 13.23 Pass + 1024 13.37 Pass + 2048 13.52 Pass + 4096 13.88 Pass + 8192 17.32 Pass + 16384 18.98 Pass + 32768 23.72 Pass + 65536 36.53 Pass + 131072 62.96 Pass + 262144 119.44 Pass + 524288 236.43 Pass + 1048576 519.85 Pass + ``` + + +=== "All-to-all collective latency, GPU-to-GPU memory, multiple nodes" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=omb-ompi ./collective/osu_alltoall --validation -d cuda + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + + # OSU MPI-CUDA All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 22.26 Pass + 2 22.08 Pass + 4 22.15 Pass + 8 22.19 Pass + 16 22.25 Pass + 32 22.11 Pass + 64 22.22 Pass + 128 21.98 Pass + 256 22.19 Pass + 512 22.20 Pass + 1024 22.37 Pass + 2048 22.58 Pass + 4096 22.99 Pass + 8192 27.22 Pass + 16384 28.55 Pass + 32768 32.60 Pass + 65536 44.88 Pass + 131072 70.15 Pass + 262144 123.30 Pass + 524288 234.89 Pass + 1048576 486.89 Pass + ``` + + +### Results without the CXI hook +On many Alps vClusters, the Container Engine is configured with the CXI hook enabled by default, enabling transparent access to the Slingshot interconnect. + +This section demonstrates the performance benefit of the CXI hook by explicitly disabling it through the EDF: +```console +$ cat .edf/omb-ompi-no-cxi.toml +image = "quay.io#ethcscs/osu-mb:7.5-ompi5.0.8-ofi1.22-cuda12.8" + +[env] +PMIX_MCA_psec="native" + +[annotations] +com.hooks.cxi.enabled="false" +``` + +=== "Point-to-point bandwidth, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi-no-cxi ./pt2pt/osu_bw --validation + + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Bandwidth (MB/s) Validation + 1 0.16 Pass + 2 0.32 Pass + 4 0.65 Pass + 8 1.31 Pass + 16 2.59 Pass + 32 5.26 Pass + 64 10.37 Pass + 128 20.91 Pass + 256 41.49 Pass + 512 74.26 Pass + 1024 123.99 Pass + 2048 213.82 Pass + 4096 356.13 Pass + 8192 468.55 Pass + 16384 505.89 Pass + 32768 549.59 Pass + 65536 2170.64 Pass + 131072 2137.95 Pass + 262144 2469.63 Pass + 524288 2731.85 Pass + 1048576 2919.18 Pass + 2097152 3047.21 Pass + 4194304 3121.42 Pass + ``` + +=== "Point-to-point bandwidth, GPU-to-GPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi-no-cxi ./pt2pt/osu_bw --validation D D + + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.06 Pass + 2 0.12 Pass + 4 0.24 Pass + 8 0.48 Pass + 16 0.95 Pass + 32 1.91 Pass + 64 3.85 Pass + 128 7.57 Pass + 256 15.28 Pass + 512 19.87 Pass + 1024 53.06 Pass + 2048 97.29 Pass + 4096 180.73 Pass + 8192 343.75 Pass + 16384 473.72 Pass + 32768 530.81 Pass + 65536 1268.51 Pass + 131072 1080.83 Pass + 262144 1435.36 Pass + 524288 1526.12 Pass + 1048576 1727.31 Pass + 2097152 1755.61 Pass + 4194304 1802.75 Pass + ``` + +=== "Point-to-point latency, CPU-to-CPU memory, inter-node communication" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi-no-cxi ./pt2pt/osu_latency --validation + + # OSU MPI Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 28.92 Pass + 2 28.99 Pass + 4 29.07 Pass + 8 29.13 Pass + 16 29.48 Pass + 32 29.18 Pass + 64 29.39 Pass + 128 30.11 Pass + 256 32.10 Pass + 512 34.07 Pass + 1024 38.36 Pass + 2048 61.00 Pass + 4096 81.04 Pass + 8192 80.11 Pass + 16384 126.99 Pass + 32768 124.97 Pass + 65536 123.84 Pass + 131072 207.48 Pass + 262144 252.43 Pass + 524288 319.47 Pass + 1048576 497.84 Pass + 2097152 956.03 Pass + 4194304 1455.18 Pass + ``` + + +=== "All-to-all collective latency, CPU-to-CPU memory, multiple nodes" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=omb-ompi-no-cxi ./collective/osu_alltoall --validation + + # OSU MPI All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 137.85 Pass + 2 133.47 Pass + 4 134.03 Pass + 8 131.14 Pass + 16 134.45 Pass + 32 135.35 Pass + 64 137.21 Pass + 128 137.03 Pass + 256 139.90 Pass + 512 140.70 Pass + 1024 165.05 Pass + 2048 197.14 Pass + 4096 255.02 Pass + 8192 335.75 Pass + 16384 543.12 Pass + 32768 928.81 Pass + 65536 782.28 Pass + 131072 1812.95 Pass + 262144 2284.26 Pass + 524288 3213.63 Pass + 1048576 5688.27 Pass + ``` + + +=== "All-to-all collective latency, GPU-to-GPU memory, multiple nodes" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=omb-ompi-no-cxi ./collective/osu_alltoall --validation -d cuda + + # OSU MPI-CUDA All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Avg Latency(us) Validation + 1 186.92 Pass + 2 180.80 Pass + 4 180.72 Pass + 8 179.45 Pass + 16 209.53 Pass + 32 181.73 Pass + 64 182.20 Pass + 128 182.84 Pass + 256 188.29 Pass + 512 189.35 Pass + 1024 237.31 Pass + 2048 231.73 Pass + 4096 298.73 Pass + 8192 396.10 Pass + 16384 589.72 Pass + 32768 983.72 Pass + 65536 786.48 Pass + 131072 1127.39 Pass + 262144 2144.57 Pass + 524288 3107.62 Pass + 1048576 5545.28 Pass + ``` diff --git a/mkdocs.yml b/mkdocs.yml index e02a565f..e1642376 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -59,6 +59,12 @@ nav: - 'Using the Container Engine': software/container-engine/run.md - 'Hooks and native resources': software/container-engine/resource-hook.md - 'EDF reference': software/container-engine/edf.md + - 'Guidelines for images on Alps': software/container-engine/examples/guidelines-images.md + - software/container-engine/examples/image-comm-fwk.md + - software/container-engine/examples/image-mpich.md + - software/container-engine/examples/image-compi.md + - software/container-engine/examples/image-nccl-tests.md + - software/container-engine/examples/image-nvshmem.md - 'Known issues': software/container-engine/known-issue.md - 'Building and Installing Software': - build-install/index.md From 5d22873a230d9de8b41f407327add2dcfb099dc5 Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Wed, 1 Oct 2025 00:40:46 +0200 Subject: [PATCH 02/12] Fixed mkdocs table of contents --- mkdocs.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index e1642376..df39f44d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -59,12 +59,13 @@ nav: - 'Using the Container Engine': software/container-engine/run.md - 'Hooks and native resources': software/container-engine/resource-hook.md - 'EDF reference': software/container-engine/edf.md - - 'Guidelines for images on Alps': software/container-engine/examples/guidelines-images.md - - software/container-engine/examples/image-comm-fwk.md - - software/container-engine/examples/image-mpich.md - - software/container-engine/examples/image-compi.md - - software/container-engine/examples/image-nccl-tests.md - - software/container-engine/examples/image-nvshmem.md + - 'Guidelines for images on Alps': + - software/container-engine/examples/guidelines-images.md + - 'Communication frameworks image': software/container-engine/examples/image-comm-fwk.md + - 'MPICH image': software/container-engine/examples/image-mpich.md + - 'OpenMPI image': software/container-engine/examples/image-ompi.md + - 'NCCL Tests image': software/container-engine/examples/image-nccl-tests.md + - 'NVSHMEM image': software/container-engine/examples/image-nvshmem.md - 'Known issues': software/container-engine/known-issue.md - 'Building and Installing Software': - build-install/index.md From 9504522ce4f9214de7069ba627772f26ec8f39db Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Wed, 1 Oct 2025 00:45:36 +0200 Subject: [PATCH 03/12] Fixed typos --- docs/software/container-engine/examples/image-mpich.md | 2 +- docs/software/container-engine/examples/image-nvshmem.md | 2 +- docs/software/container-engine/examples/image-ompi.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/software/container-engine/examples/image-mpich.md b/docs/software/container-engine/examples/image-mpich.md index 2dd617cf..c0016934 100644 --- a/docs/software/container-engine/examples/image-mpich.md +++ b/docs/software/container-engine/examples/image-mpich.md @@ -45,7 +45,7 @@ RUN wget -q 
https://www.mpich.org/static/downloads/${MPI_VER}/mpich-${MPI_VER}.t ## Performance examples -In this section we demonstrate the performance of the previosly created MPICH image using it to build the OSU Micro-Benchmarks 7.5.1, and deploying the resulting image on Alps through the Container Engine to run a variety of benchmarks. +In this section we demonstrate the performance of the previously created MPICH image using it to build the OSU Micro-Benchmarks 7.5.1, and deploying the resulting image on Alps through the Container Engine to run a variety of benchmarks. A build of the image with the OSU benchmarks is available on the [Quay.io](https://quay.io/) registry at the following reference: `quay.io/ethcscs/osu-mb:7.5-mpich4.3.1-ofi1.22-cuda12.8`. diff --git a/docs/software/container-engine/examples/image-nvshmem.md b/docs/software/container-engine/examples/image-nvshmem.md index 1ced3ef5..d3e9ec27 100644 --- a/docs/software/container-engine/examples/image-nvshmem.md +++ b/docs/software/container-engine/examples/image-nvshmem.md @@ -104,7 +104,7 @@ com.hooks.aws_ofi_nccl.variant = "cuda12" - NVSHMEM's `libfabric` transport does not support VMM yet, so VMM must be disabled by setting the environment variable `NVSHMEM_DISABLE_CUDA_VMM=1`. - Since NVSHMEM has been configured in the Containerfile to use PMIx for bootstrapping, when using this image the `srun` option `--mpi=pmix` must be used to run successful multi-rank jobs. -- Other bootstrapping methods (including different PMI implementations) can be specified for NVSHMEM throug the related [environment variables](https://docs.nvidia.com/nvshmem/api/gen/env.html#bootstrap-options). When bootstrapping through PMI or MPI through Slurm, ensure that the PMI implementation used by Slurm (i.e. `srun --mpi` option) matches the one expected by NVSHMEM or the MPI library. +- Other bootstrapping methods (including different PMI implementations) can be specified for NVSHMEM through the related [environment variables](https://docs.nvidia.com/nvshmem/api/gen/env.html#bootstrap-options). When bootstrapping through PMI or MPI through Slurm, ensure that the PMI implementation used by Slurm (i.e. `srun --mpi` option) matches the one expected by NVSHMEM or the MPI library. - NCCL requires the presence of the [AWS OFI NCCL plugin](https://github.com/aws/aws-ofi-nccl) in order to correctly interface with Libfabric and (through the latter) the Slingshot interconnect. Therefore, for optimal performance the [related CE hook][ref-ce-aws-ofi-hook] must be enabled and set to match the CUDA version in the container. - Libfabric itself is usually injected by the [CXI hook][ref-ce-cxi-hook], which is enabled by default on several Alps vClusters. diff --git a/docs/software/container-engine/examples/image-ompi.md b/docs/software/container-engine/examples/image-ompi.md index c1287478..2f122b8d 100644 --- a/docs/software/container-engine/examples/image-ompi.md +++ b/docs/software/container-engine/examples/image-ompi.md @@ -39,7 +39,7 @@ RUN wget -q https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-${OMPI_V ## Performance examples -In this section we demonstrate the performance of the previosly created OpenMPI image using it to build the OSU Micro-Benchmarks 7.5.1, and deploying the resulting image on Alps through the Container Engine to run a variety of benchmarks. 
+In this section we demonstrate the performance of the previously created OpenMPI image using it to build the OSU Micro-Benchmarks 7.5.1, and deploying the resulting image on Alps through the Container Engine to run a variety of benchmarks. A build of the image with the OSU benchmarks is available on the [Quay.io](https://quay.io/) registry at the following reference: `quay.io/ethcscs/osu-mb:7.5-ompi5.0.8-ofi1.22-cuda12.8`. From 713f8b40234e9d2abf7bb9faa167c78151135a3b Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Wed, 1 Oct 2025 00:46:46 +0200 Subject: [PATCH 04/12] Fixed typo --- docs/software/container-engine/examples/guidelines-images.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/software/container-engine/examples/guidelines-images.md b/docs/software/container-engine/examples/guidelines-images.md index f2d339fc..1b3ddc12 100644 --- a/docs/software/container-engine/examples/guidelines-images.md +++ b/docs/software/container-engine/examples/guidelines-images.md @@ -1,7 +1,7 @@ [](){#ref-ce-guidelines-images} # Guidelines for images on Alps -This section offers some guidelines about creating and using container images that achieve good performance on the Alps reseach infrastructure. +This section offers some guidelines about creating and using container images that achieve good performance on the Alps research infrastructure. The section focuses on foundational components (such as communication libraries) which are essential to enabling performant effective usage of Alps' capabilities, rather than full application use cases. Synthetic benchmarks are also used to showcase quantitative performance. From b10b99fe27c4b474524d7a46fa301297038dd28f Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Wed, 1 Oct 2025 00:50:34 +0200 Subject: [PATCH 05/12] Improved content organization for CE image guidelines --- .../image-comm-fwk.md | 0 .../{examples => guidelines-images}/image-mpich.md | 0 .../image-nccl-tests.md | 0 .../{examples => guidelines-images}/image-nvshmem.md | 0 .../{examples => guidelines-images}/image-ompi.md | 0 .../index.md} | 0 mkdocs.yml | 12 ++++++------ 7 files changed, 6 insertions(+), 6 deletions(-) rename docs/software/container-engine/{examples => guidelines-images}/image-comm-fwk.md (100%) rename docs/software/container-engine/{examples => guidelines-images}/image-mpich.md (100%) rename docs/software/container-engine/{examples => guidelines-images}/image-nccl-tests.md (100%) rename docs/software/container-engine/{examples => guidelines-images}/image-nvshmem.md (100%) rename docs/software/container-engine/{examples => guidelines-images}/image-ompi.md (100%) rename docs/software/container-engine/{examples/guidelines-images.md => guidelines-images/index.md} (100%) diff --git a/docs/software/container-engine/examples/image-comm-fwk.md b/docs/software/container-engine/guidelines-images/image-comm-fwk.md similarity index 100% rename from docs/software/container-engine/examples/image-comm-fwk.md rename to docs/software/container-engine/guidelines-images/image-comm-fwk.md diff --git a/docs/software/container-engine/examples/image-mpich.md b/docs/software/container-engine/guidelines-images/image-mpich.md similarity index 100% rename from docs/software/container-engine/examples/image-mpich.md rename to docs/software/container-engine/guidelines-images/image-mpich.md diff --git a/docs/software/container-engine/examples/image-nccl-tests.md b/docs/software/container-engine/guidelines-images/image-nccl-tests.md similarity index 100% rename from 
docs/software/container-engine/examples/image-nccl-tests.md rename to docs/software/container-engine/guidelines-images/image-nccl-tests.md diff --git a/docs/software/container-engine/examples/image-nvshmem.md b/docs/software/container-engine/guidelines-images/image-nvshmem.md similarity index 100% rename from docs/software/container-engine/examples/image-nvshmem.md rename to docs/software/container-engine/guidelines-images/image-nvshmem.md diff --git a/docs/software/container-engine/examples/image-ompi.md b/docs/software/container-engine/guidelines-images/image-ompi.md similarity index 100% rename from docs/software/container-engine/examples/image-ompi.md rename to docs/software/container-engine/guidelines-images/image-ompi.md diff --git a/docs/software/container-engine/examples/guidelines-images.md b/docs/software/container-engine/guidelines-images/index.md similarity index 100% rename from docs/software/container-engine/examples/guidelines-images.md rename to docs/software/container-engine/guidelines-images/index.md diff --git a/mkdocs.yml b/mkdocs.yml index df39f44d..5511b08e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -60,12 +60,12 @@ nav: - 'Hooks and native resources': software/container-engine/resource-hook.md - 'EDF reference': software/container-engine/edf.md - 'Guidelines for images on Alps': - - software/container-engine/examples/guidelines-images.md - - 'Communication frameworks image': software/container-engine/examples/image-comm-fwk.md - - 'MPICH image': software/container-engine/examples/image-mpich.md - - 'OpenMPI image': software/container-engine/examples/image-ompi.md - - 'NCCL Tests image': software/container-engine/examples/image-nccl-tests.md - - 'NVSHMEM image': software/container-engine/examples/image-nvshmem.md + - software/container-engine/guidelines-images/index.md + - 'Communication frameworks image': software/container-engine/guidelines-images/image-comm-fwk.md + - 'MPICH image': software/container-engine/guidelines-images/image-mpich.md + - 'OpenMPI image': software/container-engine/guidelines-images/image-ompi.md + - 'NCCL Tests image': software/container-engine/guidelines-images/image-nccl-tests.md + - 'NVSHMEM image': software/container-engine/guidelines-images/image-nvshmem.md - 'Known issues': software/container-engine/known-issue.md - 'Building and Installing Software': - build-install/index.md From 5d74d5988d402d5b3676d45702835662cef3ac3b Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Wed, 1 Oct 2025 00:56:28 +0200 Subject: [PATCH 06/12] Fixed code blocks --- docs/software/container-engine/guidelines-images/image-mpich.md | 2 +- docs/software/container-engine/guidelines-images/image-ompi.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/software/container-engine/guidelines-images/image-mpich.md b/docs/software/container-engine/guidelines-images/image-mpich.md index c0016934..79fadecf 100644 --- a/docs/software/container-engine/guidelines-images/image-mpich.md +++ b/docs/software/container-engine/guidelines-images/image-mpich.md @@ -421,7 +421,7 @@ com.hooks.cxi.enabled="false" === "Point-to-point bandwidth, CPU-to-CPU memory, inter-node communication" ```console - $ srun -N2 --mpi=pmi2 --environment=omb-mpich-no-cxi ./pt2pt/osu_bw --validation + $ srun -N2 --mpi=pmi2 --environment=omb-mpich-no-cxi ./pt2pt/osu_bw --validation # OSU MPI Bandwidth Test v7.5 # Datatype: MPI_CHAR. 
diff --git a/docs/software/container-engine/guidelines-images/image-ompi.md b/docs/software/container-engine/guidelines-images/image-ompi.md index 2f122b8d..344440aa 100644 --- a/docs/software/container-engine/guidelines-images/image-ompi.md +++ b/docs/software/container-engine/guidelines-images/image-ompi.md @@ -419,7 +419,7 @@ com.hooks.cxi.enabled="false" === "Point-to-point bandwidth, CPU-to-CPU memory, inter-node communication" ```console - $ srun -N2 --mpi=pmix --environment=omb-ompi-no-cxi ./pt2pt/osu_bw --validation + $ srun -N2 --mpi=pmix --environment=omb-ompi-no-cxi ./pt2pt/osu_bw --validation # OSU MPI Bandwidth Test v7.5 # Datatype: MPI_CHAR. From 0bab4b206fac08c14e5f773545e74170e0e38ae6 Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Wed, 1 Oct 2025 19:11:49 +0200 Subject: [PATCH 07/12] Updated allowed words in spelling checker --- .github/actions/spelling/allow.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt index 750049ed..516a474f 100644 --- a/.github/actions/spelling/allow.txt +++ b/.github/actions/spelling/allow.txt @@ -16,9 +16,11 @@ CWP CXI Ceph Containerfile +Containerfiles DNS Dockerfiles Dufourspitze +EFA EMPA ETHZ Ehrenfest @@ -75,6 +77,7 @@ MeteoSwiss NAMD NICs NVMe +NVSHMEM Nordend OpenFabrics OAuth @@ -101,6 +104,7 @@ ROCm RPA Roboto Roothaan +SHMEM SSHService STMV Scopi From 7236858863dbf5ce81f5298c438d4372b9c76c73 Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Mon, 6 Oct 2025 16:22:36 +0200 Subject: [PATCH 08/12] Apply suggestions from code review Co-authored-by: Rocco Meli --- .../container-engine/guidelines-images/image-comm-fwk.md | 6 +++--- .../container-engine/guidelines-images/image-nvshmem.md | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/software/container-engine/guidelines-images/image-comm-fwk.md b/docs/software/container-engine/guidelines-images/image-comm-fwk.md index 8bd51735..1ca39ab5 100644 --- a/docs/software/container-engine/guidelines-images/image-comm-fwk.md +++ b/docs/software/container-engine/guidelines-images/image-comm-fwk.md @@ -5,9 +5,9 @@ This page describes a container image providing foundational software components The most important aspect to consider for performance of containerized applications is related to use of high-speed networks, therefore this image mainly installs communication frameworks and libraries, besides general utility tools. -In particular, the [Libfabric](https://ofiwg.github.io/libfabric/) framework (also known as Open Fabrics Interfaces - OFI) is required to interface applications with the Slingshot high-speed network. +In particular, the [libfabric](https://ofiwg.github.io/libfabric/) framework (also known as Open Fabrics Interfaces - OFI) is required to interface applications with the Slingshot high-speed network. -At runtime, the container engine [CXI hook][ref-ce-cxi-hook] will replace the Libfabric libraries inside the container with the corresponding libraries on the host system. +At runtime, the container engine [CXI hook][ref-ce-cxi-hook] will replace the libfabric libraries inside the container with the corresponding libraries on the host system. This will ensure access to the Slingshot interconnect. This image is not intended to be used on its own, but to serve as a base to build higher-level software (e.g. MPI implementations) and application stacks. 
@@ -101,5 +101,5 @@ RUN wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${ ## Notes - The image is based on an official NVIDIA CUDA image, and therefore already provides the NCCL library, alongside a complete CUDA installation. - Communication frameworks are built with explicit support for CUDA and GDRCopy. -- The libfabric EFA provider is included to leave open the possibility to experiment with derived images on AWS infrastructure as well. +- The libfabric [EFA](https://aws.amazon.com/hpc/efa/) provider is included to leave open the possibility to experiment with derived images on AWS infrastructure as well. - Although only the libfabric framework is required to support Alps' Slingshot network, this image also packages the UCX communication framework to allow building a broader set of software (e.g. some OpenSHMEM implementations) and supporting optimized Infiniband communication as well. diff --git a/docs/software/container-engine/guidelines-images/image-nvshmem.md b/docs/software/container-engine/guidelines-images/image-nvshmem.md index d3e9ec27..08b13668 100644 --- a/docs/software/container-engine/guidelines-images/image-nvshmem.md +++ b/docs/software/container-engine/guidelines-images/image-nvshmem.md @@ -1,7 +1,7 @@ [](){#ref-ce-guidelines-images-nvshmem} # NVSHMEM image -This page describes a container image featuring the [NVSHMEM](https://developer.nvidia.com/nvshmem) parallel programming library with support for Libfabric, and demonstrates how to efficiently run said image on Alps. +This page describes a container image featuring the [NVSHMEM](https://developer.nvidia.com/nvshmem) parallel programming library with support for libfabric, and demonstrates how to efficiently run said image on Alps. This image is based on the [OpenMPI image][ref-ce-guidelines-images-ompi], and thus it is suited for hosts with NVIDIA GPUs, like Alps GH200 nodes. @@ -79,7 +79,7 @@ RUN wget -q https://developer.download.nvidia.com/compute/redist/nvshmem/3.4.5/s !!! note - This image also builds the performance tests bundled with NVSHMEM (`NVSHMEM_BUILD_TESTS=1`) to demonstrate performance below. The performance tests, in turn, require the installation of Python dependencies. When building images intended solely for production purposes, you may exclude both those elements. - - Notice that NVSHMEM is configured with support for Libfabric explicitly enabled (`NVSHMEM_LIBFABRIC_SUPPORT=1`). + - Notice that NVSHMEM is configured with support for libfabric explicitly enabled (`NVSHMEM_LIBFABRIC_SUPPORT=1`). - Since this image is meant primarily to run on Alps, NVSHMEM is built without support for UCX and Infiniband components. - Since this image uses OpenMPI (which provides PMIx) as MPI implementation, NVSHMEM is also configured to default to PMIx for bootstrapping (`NVSHMEM_PMIX_SUPPORT=1`). 
From ca271ed885534f4124c39dd3852bfeeaa41d209f Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Mon, 6 Oct 2025 16:49:43 +0200 Subject: [PATCH 09/12] CE image guidelines: add links to subpages --- docs/software/container-engine/guidelines-images/index.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/software/container-engine/guidelines-images/index.md b/docs/software/container-engine/guidelines-images/index.md index 1b3ddc12..87feed5e 100644 --- a/docs/software/container-engine/guidelines-images/index.md +++ b/docs/software/container-engine/guidelines-images/index.md @@ -29,7 +29,7 @@ Below is a summary of the software suggested and demonstrated throughout this se The content is organized in pages which detail container images building incrementally upon each other: -- a base image installing baseline libraries and frameworks (e.g. CUDA, libfabric) -- MPI implementations (MPICH, OpenMPI) -- NVSHMEM -- NCCL tests +- a [base image][ref-ce-guidelines-images-commfwk] installing baseline libraries and frameworks (e.g. CUDA, libfabric) +- MPI implementations ([MPICH][ref-ce-guidelines-images-mpich], [OpenMPI][ref-ce-guidelines-images-ompi]) +- [NVSHMEM][ref-ce-guidelines-images-nvshmem] +- [NCCL tests][ref-ce-guidelines-images-nccl-tests] From 1e919521bd3c904e13d1c7f4da036e2bf3962ed2 Mon Sep 17 00:00:00 2001 From: Alberto Madonna Date: Mon, 6 Oct 2025 17:16:41 +0200 Subject: [PATCH 10/12] CE image guidelines: add code block notes for PMIx settings --- .../guidelines-images/image-nccl-tests.md | 4 +++- .../container-engine/guidelines-images/image-nvshmem.md | 8 +++++--- .../container-engine/guidelines-images/image-ompi.md | 4 +++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/docs/software/container-engine/guidelines-images/image-nccl-tests.md b/docs/software/container-engine/guidelines-images/image-nccl-tests.md index 3e1e4a54..3f0801df 100644 --- a/docs/software/container-engine/guidelines-images/image-nccl-tests.md +++ b/docs/software/container-engine/guidelines-images/image-nccl-tests.md @@ -41,13 +41,15 @@ RUN wget -O nccl-tests-${nccl_tests_version}.tar.gz https://github.com/NVIDIA/nc image = "quay.io#ethcscs/nccl-tests:2.17.1-ompi5.0.8-ofi1.22-cuda12.8" [env] -PMIX_MCA_psec="native" +PMIX_MCA_psec="native" # (1)! [annotations] com.hooks.aws_ofi_nccl.enabled = "true" com.hooks.aws_ofi_nccl.variant = "cuda12" ``` +1. Ensures PMIx uses the same security domain as Slurm. Otherwise PMIx will print warnings at startup. + ### Notes - Since OpenMPI uses PMIx for wire-up and communication between ranks, when using this image the `srun` option `--mpi=pmix` must be used to run successful multi-rank jobs. diff --git a/docs/software/container-engine/guidelines-images/image-nvshmem.md b/docs/software/container-engine/guidelines-images/image-nvshmem.md index 08b13668..41406424 100644 --- a/docs/software/container-engine/guidelines-images/image-nvshmem.md +++ b/docs/software/container-engine/guidelines-images/image-nvshmem.md @@ -90,19 +90,21 @@ RUN wget -q https://developer.download.nvidia.com/compute/redist/nvshmem/3.4.5/s image = "quay.io#ethcscs/nvshmem:3.4.5-ompi5.0.8-ofi1.22-cuda12.8" [env] -PMIX_MCA_psec="native" +PMIX_MCA_psec="native" # (1)! NVSHMEM_REMOTE_TRANSPORT="libfabric" NVSHMEM_LIBFABRIC_PROVIDER="cxi" -NVSHMEM_DISABLE_CUDA_VMM="1" +NVSHMEM_DISABLE_CUDA_VMM="1" # (2)! [annotations] com.hooks.aws_ofi_nccl.enabled = "true" com.hooks.aws_ofi_nccl.variant = "cuda12" ``` +1. Ensures PMIx uses the same security domain as Slurm. 
Otherwise PMIx will print warnings at startup. +2. NVSHMEM's `libfabric` transport does not support VMM yet, so VMM must be disabled by setting the environment variable `NVSHMEM_DISABLE_CUDA_VMM=1`. + ### Notes -- NVSHMEM's `libfabric` transport does not support VMM yet, so VMM must be disabled by setting the environment variable `NVSHMEM_DISABLE_CUDA_VMM=1`. - Since NVSHMEM has been configured in the Containerfile to use PMIx for bootstrapping, when using this image the `srun` option `--mpi=pmix` must be used to run successful multi-rank jobs. - Other bootstrapping methods (including different PMI implementations) can be specified for NVSHMEM through the related [environment variables](https://docs.nvidia.com/nvshmem/api/gen/env.html#bootstrap-options). When bootstrapping through PMI or MPI through Slurm, ensure that the PMI implementation used by Slurm (i.e. `srun --mpi` option) matches the one expected by NVSHMEM or the MPI library. - NCCL requires the presence of the [AWS OFI NCCL plugin](https://github.com/aws/aws-ofi-nccl) in order to correctly interface with Libfabric and (through the latter) the Slingshot interconnect. Therefore, for optimal performance the [related CE hook][ref-ce-aws-ofi-hook] must be enabled and set to match the CUDA version in the container. diff --git a/docs/software/container-engine/guidelines-images/image-ompi.md b/docs/software/container-engine/guidelines-images/image-ompi.md index 344440aa..07622b14 100644 --- a/docs/software/container-engine/guidelines-images/image-ompi.md +++ b/docs/software/container-engine/guidelines-images/image-ompi.md @@ -71,9 +71,11 @@ WORKDIR /usr/local/libexec/osu-micro-benchmarks/mpi image = "quay.io#ethcscs/osu-mb:7.5-ompi5.0.8-ofi1.22-cuda12.8" [env] -PMIX_MCA_psec="native" +PMIX_MCA_psec="native" # (1)! ``` +1. Ensures PMIx uses the same security domain as Slurm. Otherwise PMIx will print warnings at startup. + ### Notes - Since OpenMPI uses PMIx for wire-up and communication between ranks, when using this image the `srun` option `--mpi=pmix` must be used to run successful multi-rank jobs. From a69d336c0cc8ea1264df156295a92a70a8bcfd99 Mon Sep 17 00:00:00 2001 From: bcumming Date: Fri, 24 Oct 2025 09:41:31 +0200 Subject: [PATCH 11/12] wip --- docs/software/communication/index.md | 15 +++++++++++++-- mkdocs.yml | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/docs/software/communication/index.md b/docs/software/communication/index.md index 5d961d77..b241df0e 100644 --- a/docs/software/communication/index.md +++ b/docs/software/communication/index.md @@ -1,7 +1,18 @@ [](){#ref-software-communication} # Communication Libraries -CSCS provides common communication libraries optimized for the [Slingshot 11 network on Alps][ref-alps-hsn]. +Communication libraries are used by scientific and AI workloads to communicate between processes. +The communication libraries used by workloads need to be built and configured correctly to get the best performance. +Broadly speaking, there are two levels of communication: + +* **intra-node** communication between two processes on the same node. +* **inter-node** communication between different nodes, which requires + +Inter-node communication requires sending and receiving data over the [Slingshot 11 network][ref-alps-hsn] that connects nodes on Alps. +Communication libraries, like MPI and NCCL, need to be configured to use the [libfabric][ref-communication-libfabric] library that has an optimised back end for Slingshot 11. 
+ +CSCS provides communication libraries optimised for libfabric and slingshot in uenv, and guidance on how to configure container images similarly. +This section of the documentation provides advice on how to build and install software to use these libraries, and how to deploy them. For most scientific applications relying on MPI, [Cray MPICH][ref-communication-cray-mpich] is recommended. [MPICH][ref-communication-mpich] and [OpenMPI][ref-communication-openmpi] may also be used, with limitations. @@ -12,9 +23,9 @@ NCCL and RCCL have to be configured with a plugin using [libfabric][ref-communic See the individual pages for each library for information on how to use and best configure the libraries. +* [libfabric][ref-communication-libfabric] * [Cray MPICH][ref-communication-cray-mpich] * [MPICH][ref-communication-mpich] * [OpenMPI][ref-communication-openmpi] * [NCCL][ref-communication-nccl] * [RCCL][ref-communication-rccl] -* [libfabric][ref-communication-libfabric] diff --git a/mkdocs.yml b/mkdocs.yml index bcdb0235..2b1ca54b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -111,12 +111,12 @@ nav: - 'WRF': software/cw/wrf.md - 'Communication Libraries': - software/communication/index.md + - 'libfabric': software/communication/libfabric.md - 'Cray MPICH': software/communication/cray-mpich.md - 'MPICH': software/communication/mpich.md - 'OpenMPI': software/communication/openmpi.md - 'NCCL': software/communication/nccl.md - 'RCCL': software/communication/rccl.md - - 'libfabric': software/communication/libfabric.md - 'Commercial software': - software/commercial/index.md - 'Matlab': software/commercial/matlab.md From 5df469c155a316e7b6026cdf623fc18e1dde66d5 Mon Sep 17 00:00:00 2001 From: bcumming Date: Mon, 27 Oct 2025 20:11:11 +0100 Subject: [PATCH 12/12] refactor comms index; integrated base image into libfabric --- docs/software/communication/index.md | 58 +++++++-- docs/software/communication/libfabric.md | 147 ++++++++++++++++++++++- 2 files changed, 192 insertions(+), 13 deletions(-) diff --git a/docs/software/communication/index.md b/docs/software/communication/index.md index b241df0e..22b9aca1 100644 --- a/docs/software/communication/index.md +++ b/docs/software/communication/index.md @@ -1,15 +1,26 @@ [](){#ref-software-communication} # Communication Libraries +!!! todo "list of ideas to integrate in this page" + * communication libraries are part of the "base" or "core" layer in your environment, alongside compilers and cuda (on NVIDIA GPU systems). + * we provide base containers that start with compilers+CUDA + * have a section "installing/getting comm libs": + * CE (build your own) and uenv (it comes with the label) sub-sections + * Conda, pre-built (ORCA, ANSYS, etc) + Communication libraries are used by scientific and AI workloads to communicate between processes. The communication libraries used by workloads need to be built and configured correctly to get the best performance. Broadly speaking, there are two levels of communication: * **intra-node** communication between two processes on the same node. -* **inter-node** communication between different nodes, which requires +* **inter-node** communication between different nodes, over the [Slingshot 11 network][ref-alps-hsn] that connects nodes on Alps.. -Inter-node communication requires sending and receiving data over the [Slingshot 11 network][ref-alps-hsn] that connects nodes on Alps. 
Communication libraries, like MPI and NCCL, need to be configured to use the [libfabric][ref-communication-libfabric] library that has an optimised back end for Slingshot 11.
+
+As such, they are part of the base layer of libraries and tools required to fully utilize the hardware on Alps:
+
+* **CPU**: compilers with support for building applications optimized for the CPU architecture on the node.
+* **GPU**: CUDA and ROCm provide compilers and runtime libraries for NVIDIA and AMD GPUs respectively.
+* **Network**: libfabric, MPI, NCCL/RCCL, and NVSHMEM need to be configured for the Slingshot network.
 
 CSCS provides communication libraries optimised for libfabric and slingshot in uenv, and guidance on how to configure container images similarly.
 This section of the documentation provides advice on how to build and install software to use these libraries, and how to deploy them.
@@ -23,9 +34,40 @@ NCCL and RCCL have to be configured with a plugin using [libfabric][ref-communic
 
 See the individual pages for each library for information on how to use and best configure the libraries.
 
-* [libfabric][ref-communication-libfabric]
-* [Cray MPICH][ref-communication-cray-mpich]
-* [MPICH][ref-communication-mpich]
-* [OpenMPI][ref-communication-openmpi]
-* [NCCL][ref-communication-nccl]
-* [RCCL][ref-communication-rccl]
+
+
+- __Low Level__
+
+    Learn about the base installation of libfabric and its dependencies.
+
+    [:octicons-arrow-right-24: libfabric][ref-communication-libfabric]
+
+
+
+
+- __MPI__
+
+    Cray MPICH is the most optimized and best tested MPI implementation on Alps, and is provided by uenv.
+
+    [:octicons-arrow-right-24: Cray MPICH][ref-communication-cray-mpich]
+
+    For compatibility in containers:
+
+    [:octicons-arrow-right-24: MPICH][ref-communication-mpich]
+
+    OpenMPI can also be built in containers or in uenv:
+
+    [:octicons-arrow-right-24: OpenMPI][ref-communication-openmpi]
+
+
+
+
+- __Machine Learning__
+
+    NCCL and RCCL provide GPU-aware collective communication for NVIDIA and AMD GPUs respectively.
+
+    [:octicons-arrow-right-24: NCCL][ref-communication-nccl]
+
+    [:octicons-arrow-right-24: RCCL][ref-communication-rccl]
+
+
diff --git a/docs/software/communication/libfabric.md b/docs/software/communication/libfabric.md index a8dd80d8..5ef434d3 100644 --- a/docs/software/communication/libfabric.md +++ b/docs/software/communication/libfabric.md @@ -1,16 +1,153 @@ [](){#ref-communication-libfabric} # Libfabric -[Libfabric](https://ofiwg.github.io/libfabric/), or Open Fabrics Interfaces (OFI), is a low level networking library that abstracts away various networking backends. -It is used by Cray MPICH, and can be used together with OpenMPI, NCCL, and RCCL to make use of the [Slingshot network on Alps][ref-alps-hsn]. +[Libfabric](https://ofiwg.github.io/libfabric/), or Open Fabrics Interfaces (OFI), is a low-level networking library that provides an abstract interface for networks. +Libfabric has backends for different network types, and is the interface chosen by HPE for the [Slingshot network on Alps][ref-alps-hsn], and by AWS for their [EFA network interface](https://aws.amazon.com/hpc/efa/). + +To fully take advantage of the network on Alps: + +* libfabric and its dependencies must be availailable in your environment (uenv or container); +* and, communication libraries like Cray MPICH, OpenMPI, NCCL, and RCCL have to be built or configured to use libfabric. + +??? question "What about UCX?" + [Unified Communication X (UCX)](https://openucx.org/) is a low level library that targets the same layer as libfabric. + Specifically, it provides an open, standards-based, networking API. + + By targetting UCX and libfabric, MPI and NCCL do not need to implement low-level support for each network hardware. + + A downside of having two standards instead of one, is that pre-built software (for example Conda packages and Containers) have versions of MPI built for UCX, which does not provide a back end for Slingshot 11. + Trying to run these images will lead to errors, or very poor performance. ## Using libfabric +### uenv + If you are using a uenv provided by CSCS, such as [prgenv-gnu][ref-uenv-prgenv-gnu], [Cray MPICH][ref-communication-cray-mpich] is linked to libfabric and the high speed network will be used. No changes are required in applications. -If you are using containers, the system libfabric can be loaded into your container using the [CXI hook provided by the container engine][ref-ce-cxi-hook]. -Using the hook is essential to make full use of the Alps network. +### Container Engine + +If you are using [containers][ref-container-engine], the simplest approach is to load libfabric into your container using the [CXI hook provided by the container engine][ref-ce-cxi-hook]. + +Alternatively, it is possible to build libfabric and its dependencies into your container. + +!!! example "Installing libfabric in a container for NVIDIA nodes" + The following lines demonstrate how to configure and + + Note that it is assumed that CUDA has already been installed on the system. 
+ ```Dockerfile + # Install libfabric + ARG gdrcopy_version=2.5.1 + RUN git clone --depth 1 --branch v${gdrcopy_version} https://github.com/NVIDIA/gdrcopy.git \ + && cd gdrcopy \ + && export CUDA_PATH=${CUDA_HOME:-$(echo $(which nvcc) | grep -o '.*cuda')} \ + && make CC=gcc CUDA=$CUDA_PATH lib \ + && make lib_install \ + && cd ../ && rm -rf gdrcopy + + # Install libfabric + ARG libfabric_version=1.22.0 + RUN git clone --branch v${libfabric_version} --depth 1 https://github.com/ofiwg/libfabric.git \ + && cd libfabric \ + && ./autogen.sh \ + && ./configure --prefix=/usr --with-cuda=/usr/local/cuda --enable-cuda-dlopen \ + --enable-gdrcopy-dlopen --enable-efa \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -rf libfabric + ``` + +!!! todo + In the above recipe `CUDA_PATH` is "calculated" for gdrcopy, and just hard coded to `/usr/loca/cuda` for libfabric. + How about just hard-coding it everywhere, to simplify the recipe? + +!!! todo + Should we include the EFA and UCX support here? It is not needed to run on Alps, and might confuse readers. + +??? note "The full containerfile for GH200" + + The containerfile below is based on the NVIDIA CUDA image, which provides a complete CUDA installation. + + - Communication frameworks are built with explicit support for CUDA and GDRCopy. + + Some additional features are enabled to increase the portability of the container to non-Alps systems: + + - The libfabric [EFA](https://aws.amazon.com/hpc/efa/) provider is configured using the `--enable-efa` compatibility for derived images on AWS infrastructure. + - this image also packages the UCX communication framework to allow building a broader set of software (e.g. some OpenSHMEM implementations) and supporting optimized Infiniband communication as well. + + ``` + ARG ubuntu_version=24.04 + ARG cuda_version=12.8.1 + FROM docker.io/nvidia/cuda:${cuda_version}-cudnn-devel-ubuntu${ubuntu_version} + + RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive \ + apt-get install -y \ + build-essential \ + ca-certificates \ + pkg-config \ + automake \ + autoconf \ + libtool \ + cmake \ + gdb \ + strace \ + wget \ + git \ + bzip2 \ + python3 \ + gfortran \ + rdma-core \ + numactl \ + libconfig-dev \ + libuv1-dev \ + libfuse-dev \ + libfuse3-dev \ + libyaml-dev \ + libnl-3-dev \ + libnuma-dev \ + libsensors-dev \ + libcurl4-openssl-dev \ + libjson-c-dev \ + libibverbs-dev \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* + + ARG gdrcopy_version=2.5.1 + RUN git clone --depth 1 --branch v${gdrcopy_version} https://github.com/NVIDIA/gdrcopy.git \ + && cd gdrcopy \ + && export CUDA_PATH=${CUDA_HOME:-$(echo $(which nvcc) | grep -o '.*cuda')} \ + && make CC=gcc CUDA=$CUDA_PATH lib \ + && make lib_install \ + && cd ../ && rm -rf gdrcopy + + # Install libfabric + ARG libfabric_version=1.22.0 + RUN git clone --branch v${libfabric_version} --depth 1 https://github.com/ofiwg/libfabric.git \ + && cd libfabric \ + && ./autogen.sh \ + && ./configure --prefix=/usr --with-cuda=/usr/local/cuda --enable-cuda-dlopen --enable-gdrcopy-dlopen --enable-efa \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. 
\ + && rm -rf libfabric + + # Install UCX + ARG UCX_VERSION=1.19.0 + RUN wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${UCX_VERSION}.tar.gz \ + && tar xzf ucx-${UCX_VERSION}.tar.gz \ + && cd ucx-${UCX_VERSION} \ + && mkdir build \ + && cd build \ + && ../configure --prefix=/usr --with-cuda=/usr/local/cuda --with-gdrcopy=/usr/local --enable-mt --enable-devel-headers \ + && make -j$(nproc) \ + && make install \ + && cd ../.. \ + && rm -rf ucx-${UCX_VERSION}.tar.gz ucx-${UCX_VERSION} + ``` ## Tuning libfabric @@ -21,4 +158,4 @@ Note that the exact version deployed on Alps may differ, and not all options may See the [Cray MPICH known issues page][ref-communication-cray-mpich-known-issues] for issues when using Cray MPICH together with libfabric. !!! todo - More options? + - add environment variable tuning guide
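+
+As a starting point for the tuning guide called for above, the following sketch shows how libfabric runtime options can be set through an EDF.
+The variables used are libfabric's standard logging controls; the specific values and the choice of image (the OSU benchmark image from the container image guidelines) are illustrative assumptions rather than tuned recommendations.
+
+```toml
+image = "quay.io#ethcscs/osu-mb:7.5-ompi5.0.8-ofi1.22-cuda12.8"
+
+[env]
+# Raise libfabric log verbosity while investigating provider selection (assumed value; drop for production runs).
+FI_LOG_LEVEL="warn"
+# Restrict log output to the CXI provider used for Slingshot on Alps.
+FI_LOG_PROV="cxi"
+```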