From 69e8fd246078e608cb1f84c71aa4970028898b23 Mon Sep 17 00:00:00 2001 From: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com> Date: Tue, 11 Nov 2025 18:21:43 +0100 Subject: [PATCH 1/4] [None][chore]: reduce the layers of the `devel` docker image Signed-off-by: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com> --- docker/Dockerfile.multi | 65 +++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 35 deletions(-) diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index ebb3f152652..0ea360754b8 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -44,48 +44,41 @@ COPY docker/common/install.sh \ docker/common/install_ucx.sh \ docker/common/install_nixl.sh \ docker/common/install_etcd.sh \ - docker/common/install_mooncake.sh \ ./ -RUN GITHUB_MIRROR=${GITHUB_MIRROR} \ - PYTHON_VERSION=${PYTHON_VERSION} \ - bash ./install.sh --base && rm install_base.sh - -RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --cmake && rm install_cmake.sh - -RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --ccache && rm install_ccache.sh - -RUN bash ./install.sh --cuda_toolkit && rm install_cuda_toolkit.sh - ARG TRT_VER ARG CUDA_VER ARG CUDNN_VER ARG NCCL_VER ARG CUBLAS_VER -RUN TRT_VER=${TRT_VER} \ +ARG TORCH_INSTALL_TYPE="skip" +RUN GITHUB_MIRROR=${GITHUB_MIRROR} \ + PYTHON_VERSION=${PYTHON_VERSION} \ + TRT_VER=${TRT_VER} \ CUDA_VER=${CUDA_VER} \ CUDNN_VER=${CUDNN_VER} \ NCCL_VER=${NCCL_VER} \ CUBLAS_VER=${CUBLAS_VER} \ - bash ./install.sh --tensorrt && rm install_tensorrt.sh - -RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --polygraphy && rm install_polygraphy.sh - -RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --mpi4py && rm install_mpi4py.sh - -ARG TORCH_INSTALL_TYPE="skip" -RUN TORCH_INSTALL_TYPE=${TORCH_INSTALL_TYPE} bash ./install.sh --pytorch && rm install_pytorch.sh - -RUN bash ./install.sh --opencv && rm install.sh - -# Install UCX first -RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_ucx.sh && rm install_ucx.sh - -# Install NIXL -RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_nixl.sh && rm install_nixl.sh - -# Install etcd -RUN bash ./install_etcd.sh && rm install_etcd.sh + TORCH_INSTALL_TYPE=${TORCH_INSTALL_TYPE} \ + bash ./install.sh --base --cmake --ccache --cuda_toolkit --tensorrt --polygraphy --mpi4py --pytorch --opencv && \ + rm install_base.sh && \ + rm install_cmake.sh && \ + rm install_ccache.sh && \ + rm install_cuda_toolkit.sh && \ + rm install_tensorrt.sh && \ + rm install_polygraphy.sh && \ + rm install_mpi4py.sh && \ + rm install_pytorch.sh && \ + rm install.sh + +# Install UCX, NIXL, etcd +# TODO: Combine these into the main install.sh script +RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_ucx.sh && \ + GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_nixl.sh && \ + bash ./install_etcd.sh && \ + rm install_ucx.sh && \ + rm install_nixl.sh && \ + rm install_etcd.sh FROM ${TRITON_IMAGE}:${TRITON_BASE_TAG} AS triton @@ -100,16 +93,18 @@ COPY --from=triton /opt/tritonserver/caches /opt/tritonserver/caches # Copy all installation scripts at once to reduce layers COPY docker/common/install_triton.sh \ + docker/common/install_mooncake.sh \ ./ -RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_triton.sh && rm install_triton.sh - # Install Mooncake, after triton handles boost requirement -RUN if [ -f /etc/redhat-release ]; then \ +RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_triton.sh && \ + if [ -f /etc/redhat-release ]; then \ echo "Rocky8 detected, skipping mooncake installation"; \ else \ bash ./install_mooncake.sh; \ - fi && rm install_mooncake.sh + fi && \ + rm install_triton.sh && \ + rm install_mooncake.sh FROM ${DEVEL_IMAGE} AS wheel WORKDIR /src/tensorrt_llm From 62fcbfc5e1fd751b872abb04e3b8fbdc91cf4786 Mon Sep 17 00:00:00 2001 From: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com> Date: Thu, 13 Nov 2025 18:27:05 +0100 Subject: [PATCH 2/4] Fix build problems for rockylinux Signed-off-by: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com> --- docker/Dockerfile.multi | 7 ++++--- docker/Makefile | 6 ++++++ docker/common/install.sh | 6 +++--- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index 0ea360754b8..6125b19ab00 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -13,9 +13,10 @@ LABEL com.nvidia.eula="https://www.nvidia.com/en-us/agreements/enterprise-softwa LABEL com.nvidia.ai-terms="https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/" # https://www.gnu.org/software/bash/manual/html_node/Bash-Startup-Files.html -# The default values come from `nvcr.io/nvidia/pytorch` -ENV BASH_ENV=${BASH_ENV:-/etc/bash.bashrc} -ENV ENV=${ENV:-/etc/shinit_v2} +ARG SH_ENV="/etc/shinit_v2" +ENV ENV=${SH_ENV} +ARG BASH_ENV="/etc/bash.bashrc" +ENV BASH_ENV=${BASH_ENV} ARG GITHUB_MIRROR="" RUN echo "Using GitHub mirror: $GITHUB_MIRROR" diff --git a/docker/Makefile b/docker/Makefile index b51ae8dfc25..c3fc54435d8 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -56,6 +56,10 @@ NGC_STAGING_REPO ?= nvcr.io/nvstaging/tensorrt-llm NGC_REPO ?= nvcr.io/nvidia/tensorrt-llm NGC_USE_STAGING ?= 0 NGC_AUTO_REPO ?= $(if $(filter 1,$(NGC_USE_STAGING)),$(NGC_STAGING_REPO),$(NGC_REPO)) +SH_ENV ?= $(shell docker inspect --format='{{range .Config.Env}}{{println .}}{{end}}' $(BASE_IMAGE):$(BASE_TAG) \ + | grep '^ENV=' | sed 's/^[^=]*=//' 2>/dev/null) +BASH_ENV ?= $(shell docker inspect --format='{{range .Config.Env}}{{println .}}{{end}}' $(BASE_IMAGE):$(BASE_TAG) \ + | grep '^BASH_ENV=' | sed 's/^[^=]*=//' 2>/dev/null) define add_local_user docker build \ @@ -97,6 +101,8 @@ endef $(if $(GIT_COMMIT), --build-arg GIT_COMMIT="$(GIT_COMMIT)") \ $(if $(GITHUB_MIRROR), --build-arg GITHUB_MIRROR="$(GITHUB_MIRROR)") \ $(if $(PYTHON_VERSION), --build-arg PYTHON_VERSION="$(PYTHON_VERSION)") \ + $(if $(SH_ENV), --build-arg SH_ENV="$(SH_ENV)") \ + $(if $(BASH_ENV), --build-arg BASH_ENV="$(BASH_ENV)") \ $(if $(STAGE), --target $(STAGE)) \ --file Dockerfile.multi \ --tag $(IMAGE_WITH_TAG) \ diff --git a/docker/common/install.sh b/docker/common/install.sh index 8ad8c694f13..0de962c9afc 100755 --- a/docker/common/install.sh +++ b/docker/common/install.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -Eeo pipefail shopt -s nullglob trap 'echo "[install.sh] Error on line $LINENO" >&2' ERR @@ -125,7 +125,7 @@ fi if [ $opencv -eq 1 ]; then echo "Installing OpenCV..." - pip3 uninstall -y opencv + bash -c "pip3 uninstall -y opencv" rm -rf /usr/local/lib/python3*/dist-packages/cv2/ - pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir + bash -c "pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir" fi From 99dca35e05764ca62c25cb88972d58aa264ea87e Mon Sep 17 00:00:00 2001 From: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com> Date: Fri, 14 Nov 2025 10:29:24 +0100 Subject: [PATCH 3/4] Fix evaluation of Makefile variables Signed-off-by: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com> --- docker/Makefile | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/docker/Makefile b/docker/Makefile index c3fc54435d8..519dbbda13d 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -56,10 +56,6 @@ NGC_STAGING_REPO ?= nvcr.io/nvstaging/tensorrt-llm NGC_REPO ?= nvcr.io/nvidia/tensorrt-llm NGC_USE_STAGING ?= 0 NGC_AUTO_REPO ?= $(if $(filter 1,$(NGC_USE_STAGING)),$(NGC_STAGING_REPO),$(NGC_REPO)) -SH_ENV ?= $(shell docker inspect --format='{{range .Config.Env}}{{println .}}{{end}}' $(BASE_IMAGE):$(BASE_TAG) \ - | grep '^ENV=' | sed 's/^[^=]*=//' 2>/dev/null) -BASH_ENV ?= $(shell docker inspect --format='{{range .Config.Env}}{{println .}}{{end}}' $(BASE_IMAGE):$(BASE_TAG) \ - | grep '^BASH_ENV=' | sed 's/^[^=]*=//' 2>/dev/null) define add_local_user docker build \ @@ -79,8 +75,16 @@ define rewrite_tag $(shell echo $(IMAGE_WITH_TAG) | sed "s/\/tensorrt-llm:/\/tensorrt-llm-staging:/g") endef +base_pull: + @echo "Pulling base image: $(BASE_IMAGE):$(BASE_TAG)" + docker pull $(BASE_IMAGE):$(BASE_TAG) + %_build: DEVEL_IMAGE = $(if $(findstring 1,$(JENKINS_DEVEL)),$(shell . ../jenkins/current_image_tags.properties && echo $$LLM_DOCKER_IMAGE)) -%_build: +%_build: SH_ENV = $(shell docker inspect --format='{{range .Config.Env}}{{println .}}{{end}}' $(BASE_IMAGE):$(BASE_TAG) \ + | grep '^ENV=' | sed 's/^[^=]*=//' 2>/dev/null) +%_build: BASH_ENV = $(shell docker inspect --format='{{range .Config.Env}}{{println .}}{{end}}' $(BASE_IMAGE):$(BASE_TAG) \ + | grep '^BASH_ENV=' | sed 's/^[^=]*=//' 2>/dev/null) +%_build: base_pull @echo "Building docker image: $(IMAGE_WITH_TAG)" docker buildx build $(DOCKER_BUILD_OPTS) $(DOCKER_BUILD_ARGS) \ --progress $(DOCKER_PROGRESS) \ From a403a49b01b8e7b7694dc1008967463f78b63d83 Mon Sep 17 00:00:00 2001 From: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com> Date: Fri, 14 Nov 2025 13:40:39 +0100 Subject: [PATCH 4/4] Update the docker devel images. Signed-off-by: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com> --- jenkins/current_image_tags.properties | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index ca519defa68..a5e727f8a3e 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -13,7 +13,7 @@ # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511110140-8447 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511110140-8447 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202511110140-8447 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202511110140-8447 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511141224-9077 +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511141224-9077 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202511141224-9077 +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202511141224-9077