diff --git a/constraints.txt b/constraints.txt index d4b78a25670..e276ea19527 100644 --- a/constraints.txt +++ b/constraints.txt @@ -1,2 +1,5 @@ -# These vulnerabilities were inherited from the base image (pytorch:25.10-py3) and should be removed when the base image +# These vulnerabilities were inherited from the base image (pytorch:25.06-py3) and should be removed when the base image # is updated. + +# WAR against https://github.com/advisories/GHSA-8qvm-5x2c-j2w7 +protobuf>=4.25.8 diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index dfbcfecb719..baaf32245e4 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -1,9 +1,8 @@ # Multi-stage Dockerfile ARG BASE_IMAGE=nvcr.io/nvidia/pytorch ARG TRITON_IMAGE=nvcr.io/nvidia/tritonserver -ARG BASE_TAG=25.10-py3 -# [TODO] Update to NVIDIA Triton 25.10 when it's available -ARG TRITON_BASE_TAG=25.09-py3 +ARG BASE_TAG=25.08-py3 +ARG TRITON_BASE_TAG=25.08-py3 ARG DEVEL_IMAGE=devel FROM ${BASE_IMAGE}:${BASE_TAG} AS base @@ -41,9 +40,6 @@ COPY docker/common/install.sh \ docker/common/install_polygraphy.sh \ docker/common/install_mpi4py.sh \ docker/common/install_pytorch.sh \ - docker/common/install_ucx.sh \ - docker/common/install_nixl.sh \ - docker/common/install_etcd.sh \ ./ RUN GITHUB_MIRROR=${GITHUB_MIRROR} \ @@ -75,15 +71,36 @@ RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --mpi4py && rm install_mpi4 ARG TORCH_INSTALL_TYPE="skip" RUN TORCH_INSTALL_TYPE=${TORCH_INSTALL_TYPE} bash ./install.sh --pytorch && rm install_pytorch.sh -RUN bash ./install.sh --opencv && rm install.sh +RUN bash ./install.sh --opencv && bash ./install.sh --protobuf && rm install.sh + +# wait for new triton to be published +# Rename pytorch_triton package to triton +RUN if [ -f /etc/redhat-release ]; then \ + echo "Rocky8 detected, skipping symlink and ldconfig steps"; \ + else \ + cd /usr/local/lib/python3.12/dist-packages/ && \ + ls -la | grep pytorch_triton && \ + mv pytorch_triton-3.3.1+gitc8757738.dist-info 
triton-3.3.1+gitc8757738.dist-info && \ + cd triton-3.3.1+gitc8757738.dist-info && \ + echo "Current directory: $(pwd)" && \ + echo "Files in directory:" && \ + ls -la && \ + sed -i 's/^Name: pytorch-triton/Name: triton/' METADATA && \ + sed -i 's|pytorch_triton-3.3.1+gitc8757738.dist-info/|triton-3.3.1+gitc8757738.dist-info/|g' RECORD && \ + echo "METADATA after update:" && \ + grep "^Name:" METADATA; \ + fi # Install UCX first +COPY docker/common/install_ucx.sh install_ucx.sh RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_ucx.sh && rm install_ucx.sh # Install NIXL +COPY docker/common/install_nixl.sh install_nixl.sh RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_nixl.sh && rm install_nixl.sh # Install etcd +COPY docker/common/install_etcd.sh install_etcd.sh RUN bash ./install_etcd.sh && rm install_etcd.sh FROM ${TRITON_IMAGE}:${TRITON_BASE_TAG} AS triton @@ -99,6 +116,9 @@ COPY --from=triton /opt/tritonserver/caches /opt/tritonserver/caches # Copy all installation scripts at once to reduce layers COPY docker/common/install_triton.sh \ + docker/common/install_ucx.sh \ + docker/common/install_nixl.sh \ + docker/common/install_etcd.sh \ ./ RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_triton.sh && rm install_triton.sh diff --git a/docker/Makefile b/docker/Makefile index b51ae8dfc25..fc1faee012c 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -192,17 +192,16 @@ jenkins-rockylinux8_%: PYTHON_VERSION_TAG_ID = $(if $(findstring 3.12,${PYTHON_V jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell . 
../jenkins/current_image_tags.properties && echo $$LLM_ROCKYLINUX8_${PYTHON_VERSION_TAG_ID}_DOCKER_IMAGE) jenkins-rockylinux8_%: STAGE = tritondevel jenkins-rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda -# [TODO] Update to NVIDIA CUDA 13.0.2 when it's available -jenkins-rockylinux8_%: BASE_TAG = 13.0.1-devel-rockylinux8 +jenkins-rockylinux8_%: BASE_TAG = 13.0.0-devel-rockylinux8 rockylinux8_%: STAGE = tritondevel rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda -rockylinux8_%: BASE_TAG = 13.0.1-devel-rockylinux8 +rockylinux8_%: BASE_TAG = 13.0.0-devel-rockylinux8 # For x86_64 and aarch64 ubuntu22_%: STAGE = tritondevel ubuntu22_%: BASE_IMAGE = nvcr.io/nvidia/cuda -ubuntu22_%: BASE_TAG = 13.0.1-devel-ubuntu22.04 +ubuntu22_%: BASE_TAG = 13.0.0-devel-ubuntu22.04 trtllm_%: STAGE = release trtllm_%: PUSH_TO_STAGING := 0 diff --git a/docker/common/install.sh b/docker/common/install.sh index 8ad8c694f13..eaea3c64391 100755 --- a/docker/common/install.sh +++ b/docker/common/install.sh @@ -16,6 +16,7 @@ polygraphy=0 mpi4py=0 pytorch=0 opencv=0 +protobuf=0 while [[ $# -gt 0 ]]; do case $1 in @@ -55,6 +56,10 @@ while [[ $# -gt 0 ]]; do opencv=1 shift 1 ;; + --protobuf) + protobuf=1 + shift 1 + ;; --all) base=1 cmake=1 @@ -65,6 +70,7 @@ while [[ $# -gt 0 ]]; do mpi4py=1 pytorch=1 opencv=1 + protobuf=1 shift 1 ;; *) @@ -129,3 +135,10 @@ if [ $opencv -eq 1 ]; then rm -rf /usr/local/lib/python3*/dist-packages/cv2/ pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir fi + +# WARs against security issues inherited from pytorch:25.06 +# * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7 +if [ $protobuf -eq 1 ]; then + pip3 install --upgrade --no-cache-dir \ + "protobuf>=4.25.8" +fi diff --git a/docker/common/install_cuda_toolkit.sh b/docker/common/install_cuda_toolkit.sh index 0dc5cb305aa..ae41ef1fc8e 100644 --- a/docker/common/install_cuda_toolkit.sh +++ b/docker/common/install_cuda_toolkit.sh @@ -5,7 +5,7 @@ set -ex # This script is used for 
reinstalling CUDA on Rocky Linux 8 with the run file. # CUDA version is usually aligned with the latest NGC CUDA image tag. # Only use when public CUDA image is not ready. -CUDA_VER="13.0.2_580.95.05" +CUDA_VER="13.0.0_580.65.06" CUDA_VER_SHORT="${CUDA_VER%_*}" NVCC_VERSION_OUTPUT=$(nvcc --version) diff --git a/docker/common/install_mpi4py.sh b/docker/common/install_mpi4py.sh index dd0c3d71a83..33299dad533 100644 --- a/docker/common/install_mpi4py.sh +++ b/docker/common/install_mpi4py.sh @@ -27,15 +27,12 @@ diff --git a/src/mpi4py/futures/_lib.py b/src/mpi4py/futures/_lib.py index f14934d1..eebfb8fc 100644 --- a/src/mpi4py/futures/_lib.py +++ b/src/mpi4py/futures/_lib.py -@@ -278,6 +278,43 @@ def _manager_comm(pool, options, comm, full=True): +@@ -278,6 +278,40 @@ def _manager_comm(pool, options, comm, full=True): def _manager_split(pool, options, comm, root): + if(os.getenv("TRTLLM_USE_MPI_KVCACHE")=="1"): -+ try: -+ from cuda.bindings import runtime as cudart -+ except ImportError: -+ from cuda import cudart ++ from cuda import cudart + has_slurm_rank=False + has_ompi_rank=False + slurm_rank=0 @@ -74,10 +71,6 @@ index f14934d1..eebfb8fc 100644 EOF # Install with pip and clean up cache -ARCH=$(uname -m) -if [ "$ARCH" = "aarch64" ]; then - pip3 install --no-cache-dir Cython==0.29.37 -fi pip3 install --no-cache-dir "$TMP_DIR/mpi4py-${MPI4PY_VERSION}" # Clean up diff --git a/docker/common/install_pytorch.sh b/docker/common/install_pytorch.sh index 069b26846c8..ca707843489 100644 --- a/docker/common/install_pytorch.sh +++ b/docker/common/install_pytorch.sh @@ -4,8 +4,8 @@ set -ex # Use latest stable version from https://pypi.org/project/torch/#history # and closest to the version specified in -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 -TORCH_VERSION="2.9.0" +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-08.html#rel-25-08 +TORCH_VERSION="2.8.0" SYSTEM_ID=$(grep -oP '(?<=^ID=).+' 
/etc/os-release | tr -d '"') prepare_environment() { @@ -69,8 +69,8 @@ install_from_pypi() { if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi if [ "$ARCH" = "aarch64" ];then ARCH="sbsa";fi - pip3 uninstall -y torch torchvision - pip3 install torch==${TORCH_VERSION} torchvision --index-url https://download.pytorch.org/whl/cu130 + pip3 uninstall -y torch torchvision torchaudio + pip3 install torch==${TORCH_VERSION} torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 } case "$1" in diff --git a/docker/common/install_tensorrt.sh b/docker/common/install_tensorrt.sh index 3887be6fa26..3006723a97a 100644 --- a/docker/common/install_tensorrt.sh +++ b/docker/common/install_tensorrt.sh @@ -2,20 +2,23 @@ set -ex -TRT_VER="10.13.3.9" +TRT_VER="10.13.2.6" # Align with the pre-installed cuDNN / cuBLAS / NCCL versions from -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 -CUDA_VER="13.0" # 13.0.2 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-08.html#rel-25-08 +CUDA_VER="13.0" # 13.0.0 # Keep the installation for cuDNN if users want to install PyTorch with source codes. # PyTorch 2.x can compile with cuDNN v9. -CUDNN_VER="9.14.0.64-1" +CUDNN_VER="9.12.0.46-1" +# NCCL version 2.26.x, used in the NGC PyTorch 25.05 image, has a performance regression issue. +# The fixes are in NCCL version 2.27.5; NCCL_VER below pins 2.27.7. NCCL_VER="2.27.7-1+cuda13.0" -CUBLAS_VER="13.1.0.3-1" +# Use cuBLAS version 13.0.0.19 instead. 
+CUBLAS_VER="13.0.0.19-1" # Align with the pre-installed CUDA / NVCC / NVRTC versions from # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html -NVRTC_VER="13.0.88-1" -CUDA_RUNTIME="13.0.96-1" -CUDA_DRIVER_VERSION="580.95.05-1.el8" +NVRTC_VER="13.0.48-1" +CUDA_RUNTIME="13.0.48-1" +CUDA_DRIVER_VERSION="580.65.06-1.el8" for i in "$@"; do case $i in diff --git a/docs/source/installation/build-from-source-linux.md b/docs/source/installation/build-from-source-linux.md index 19dab71c769..7b94aa88119 100644 --- a/docs/source/installation/build-from-source-linux.md +++ b/docs/source/installation/build-from-source-linux.md @@ -147,6 +147,11 @@ check . ## Build TensorRT LLM +```{tip} +:name: build-from-source-tip-cuda-version +TensorRT LLM 1.1 supports both CUDA 12.9 and 13.0, but some dependency changes are required. The `requirements.txt` contains the dependencies needed by CUDA 13.0. If you are using CUDA 12.9, uncomment the lines that end with `# <For CUDA 12.9>` and comment out the line immediately following each of them. +``` + ### Option 1: Full Build with C++ Compilation The following command compiles the C++ code and packages the compiled libraries along with the Python files into a wheel. When developing C++ code, you need this full build command to apply your code changes. diff --git a/docs/source/installation/linux.md b/docs/source/installation/linux.md index 2aae24e6af0..e04510d1ac4 100644 --- a/docs/source/installation/linux.md +++ b/docs/source/installation/linux.md @@ -12,9 +12,14 @@ Install CUDA Toolkit following the [CUDA Installation Guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/) and make sure `CUDA_HOME` environment variable is properly set. + ```{tip} + :name: installation-linux-tip-cuda-version + TensorRT LLM 1.1 supports both CUDA 12.9 and 13.0. The wheel package release only supports CUDA 12.9, while CUDA 13.0 is supported only through the NGC container release. + ``` + ```bash - # By default, PyTorch CUDA 12.8 package is installed. 
Install PyTorch CUDA 13.0 package to align with the CUDA version used for building TensorRT LLM wheels. - pip3 install torch==2.9.0 torchvision --index-url https://download.pytorch.org/whl/cu130 + # Optional step: Only required for NVIDIA Blackwell GPUs and SBSA platform + pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 sudo apt-get -y install libopenmpi-dev @@ -22,6 +27,8 @@ sudo apt-get -y install libzmq3-dev ``` + PyTorch CUDA 12.8 package is required for supporting NVIDIA Blackwell GPUs and SBSA platform. On prior GPUs or Linux x86_64 platform, this extra installation is not required. + ```{tip} Instead of manually installing the preqrequisites as described above, it is also possible to use the pre-built [TensorRT LLM Develop container diff --git a/docs/source/legacy/reference/support-matrix.md b/docs/source/legacy/reference/support-matrix.md index 1dc59fcfa0b..910aa428670 100644 --- a/docs/source/legacy/reference/support-matrix.md +++ b/docs/source/legacy/reference/support-matrix.md @@ -152,7 +152,7 @@ The following table shows the supported software for TensorRT-LLM. 
* - - Software Compatibility * - Container - - [25.10](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) + - [25.08](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) * - TensorRT - [10.13](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html) * - Precision diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy index 20637c7da38..d661d9be245 100644 --- a/jenkins/Build.groovy +++ b/jenkins/Build.groovy @@ -16,6 +16,9 @@ AARCH64_TRIPLE = "aarch64-linux-gnu" LLM_DOCKER_IMAGE = env.dockerImage +LLM_DOCKER_IMAGE_12_9 = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383" +LLM_SBSA_DOCKER_IMAGE_12_9 = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383" + // Always use x86_64 image for agent AGENT_IMAGE = env.dockerImage.replace("aarch64", "x86_64") @@ -37,6 +40,9 @@ def BUILD_JOBS_FOR_CONFIG = "buildJobsForConfig" @Field def CONFIG_LINUX_X86_64_VANILLA = "linux_x86_64_Vanilla" +@Field +def CONFIG_LINUX_X86_64_VANILLA_CU12 = "linux_x86_64_Vanilla_CU12" + @Field def CONFIG_LINUX_X86_64_SINGLE_DEVICE = "linux_x86_64_SingleDevice" @@ -46,6 +52,9 @@ def CONFIG_LINUX_X86_64_LLVM = "linux_x86_64_LLVM" @Field def CONFIG_LINUX_AARCH64 = "linux_aarch64" +@Field +def CONFIG_LINUX_AARCH64_CU12 = "linux_aarch64_CU12" + @Field def CONFIG_LINUX_AARCH64_LLVM = "linux_aarch64_LLVM" @@ -64,6 +73,11 @@ def BUILD_CONFIGS = [ (TARNAME) : "TensorRT-LLM.tar.gz", (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real", ], + (CONFIG_LINUX_X86_64_VANILLA_CU12) : [ + (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --micro_benchmarks", + (TARNAME) : "TensorRT-LLM-CU12.tar.gz", + (WHEEL_ARCHS): 
"80-real;86-real;89-real;90-real;100-real;103-real;120-real", + ], (CONFIG_LINUX_X86_64_PYBIND) : [ (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --micro_benchmarks", (TARNAME) : "pybind-TensorRT-LLM.tar.gz", @@ -85,6 +99,12 @@ def BUILD_CONFIGS = [ (WHEEL_ARCHS): "90-real;100-real;103-real;120-real", (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA ], + (CONFIG_LINUX_AARCH64_CU12): [ + (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON", + (TARNAME) : "TensorRT-LLM-GH200-CU12.tar.gz", + (WHEEL_ARCHS): "90-real;100-real;103-real;120-real", + (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA + ], (CONFIG_LINUX_AARCH64_PYBIND): [ (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl", (TARNAME) : "pybind-TensorRT-LLM-GH200.tar.gz", @@ -434,6 +454,9 @@ def runLLMBuild(pipeline, buildFlags, tarName, is_linux_x86_64) pipArgs = "" } + if (tarName.contains("CU12")) { + trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${LLM_ROOT} && sed -i '/^# .*\$/ {s/^# //; n; s/^/# /}' requirements.txt && cat requirements.txt") + } // install python package trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${LLM_ROOT} && pip3 install -r requirements-dev.txt ${pipArgs}") @@ -454,7 +477,10 @@ def runLLMBuild(pipeline, buildFlags, tarName, is_linux_x86_64) def llmPath = sh (script: "realpath ${LLM_ROOT}",returnStdout: true).trim() // TODO: Remove after the cmake version is upgraded to 3.31.8 // Get triton tag from docker/dockerfile.multi - def tritonShortTag = "r25.09" + def tritonShortTag = "r25.08" + if (tarName.contains("CU12")) { + tritonShortTag = "r25.06" + } sh "cd ${LLM_ROOT}/triton_backend/inflight_batcher_llm && mkdir build && cd build && cmake .. 
-DTRTLLM_DIR=${llmPath} -DTRITON_COMMON_REPO_TAG=${tritonShortTag} -DTRITON_CORE_REPO_TAG=${tritonShortTag} -DTRITON_THIRD_PARTY_REPO_TAG=${tritonShortTag} -DTRITON_BACKEND_REPO_TAG=${tritonShortTag} -DUSE_CXX11_ABI=ON && make -j${buildJobs} install" // Step 3: packaging wheels into tarfile @@ -544,9 +570,14 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars) wheelDockerImage = env.dockerImage } + def LLM_DOCKER_IMAGE_CU12 = cpu_arch == AARCH64_TRIPLE ? LLM_SBSA_DOCKER_IMAGE_12_9 : LLM_DOCKER_IMAGE_12_9 + buildConfigs = [ "Build TRT-LLM": [LLM_DOCKER_IMAGE] + prepareLLMBuild( pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64 : CONFIG_LINUX_X86_64_VANILLA), + // Disable CUDA12 build for too slow to build (cost > 5 hours on SBSA) + "Build TRT-LLM CUDA12": [LLM_DOCKER_IMAGE_CU12] + prepareLLMBuild( + pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_CU12 : CONFIG_LINUX_X86_64_VANILLA_CU12), "Build TRT-LLM LLVM": [LLM_DOCKER_IMAGE] + prepareLLMBuild( pipeline, cpu_arch == AARCH64_TRIPLE ? 
CONFIG_LINUX_AARCH64_LLVM : CONFIG_LINUX_X86_64_LLVM), "Build TRT-LLM Pybind": [LLM_DOCKER_IMAGE] + prepareLLMBuild( diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index d71d510e36c..832cbdbe694 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -38,8 +38,16 @@ LLM_DOCKER_IMAGE = env.dockerImage LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = env.wheelDockerImagePy310 LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = env.wheelDockerImagePy312 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE_12_9="urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202509091430-7383" +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE_12_9="urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202509091430-7383" + +LLM_DOCKER_IMAGE_12_9 = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383" +LLM_SBSA_DOCKER_IMAGE_12_9 = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383" + +DLFW_IMAGE_12_9 = "urm.nvidia.com/docker/nvidia/pytorch:25.06-py3" + // DLFW torch image -DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.10-py3" +DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.08-py3" //Ubuntu base image UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04" @@ -56,6 +64,9 @@ def TARNAME = "tarName" @Field def VANILLA_CONFIG = "Vanilla" +@Field +def VANILLA_CONFIG_CU12 = "Vanilla_CU12" + @Field def SINGLE_DEVICE_CONFIG = "SingleDevice" @@ -65,6 +76,9 @@ def LLVM_CONFIG = "LLVM" @Field def LINUX_AARCH64_CONFIG = "linux_aarch64" +@Field +def LINUX_AARCH64_CONFIG_CU12 = "linux_aarch64_CU12" + @Field def PYBIND_CONFIG = "Pybind" @@ -72,9 +86,11 @@ def PYBIND_CONFIG = "Pybind" def BUILD_CONFIGS = [ // Vanilla TARNAME is used for packaging in runLLMPackage (VANILLA_CONFIG) : [(TARNAME) : 
"TensorRT-LLM.tar.gz"], + (VANILLA_CONFIG_CU12) : [(TARNAME) : "TensorRT-LLM-CU12.tar.gz"], (SINGLE_DEVICE_CONFIG) : [(TARNAME) : "single-device-TensorRT-LLM.tar.gz"], (LLVM_CONFIG) : [(TARNAME) : "llvm-TensorRT-LLM.tar.gz"], (LINUX_AARCH64_CONFIG) : [(TARNAME) : "TensorRT-LLM-GH200.tar.gz"], + (LINUX_AARCH64_CONFIG_CU12) : [(TARNAME) : "TensorRT-LLM-GH200-CU12.tar.gz"], (PYBIND_CONFIG) : [(TARNAME) : "pybind-TensorRT-LLM.tar.gz"], ] @@ -1527,7 +1543,7 @@ def launchTestListCheck(pipeline) sh "tar -zxf ${tarName}" def llmPath = sh (script: "realpath .", returnStdout: true).trim() def llmSrc = "${llmPath}/TensorRT-LLM/src" - sh "NVIDIA_TRITON_SERVER_VERSION=25.09 LLM_ROOT=${llmSrc} LLM_BACKEND_ROOT=${llmSrc}/triton_backend python3 ${llmSrc}/scripts/check_test_list.py --l0 --qa --waive" + sh "NVIDIA_TRITON_SERVER_VERSION=25.04 LLM_ROOT=${llmSrc} LLM_BACKEND_ROOT=${llmSrc}/triton_backend python3 ${llmSrc}/scripts/check_test_list.py --l0 --qa --waive" } catch (InterruptedException e) { throw e } catch (Exception e) { @@ -2072,6 +2088,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO if (env.alternativeTRT) { sh "cd ${llmSrc} && sed -i 's#tensorrt~=.*\$#tensorrt#g' requirements.txt && cat requirements.txt" } + if (stageName.contains("-CU12")) { + trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmSrc} && sed -i '/^# .*\$/ {s/^# //; n; s/^/# /}' requirements.txt && cat requirements.txt") + } trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmSrc} && pip3 install --retries 1 -r requirements-dev.txt") if (stageName.contains("-Ray-")) { trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install ray[default]") @@ -2357,7 +2376,7 @@ def checkPipInstall(pipeline, wheel_path) } -def runLLMBuild(pipeline, cpu_arch, reinstall_dependencies=false, wheel_path="", cpver="cp312") +def runLLMBuild(pipeline, cpu_arch, reinstall_dependencies=false, wheel_path="", cpver="cp312", is_cu12=false) { sh "pwd && ls -alh" sh "env | 
sort" @@ -2367,6 +2386,9 @@ def runLLMBuild(pipeline, cpu_arch, reinstall_dependencies=false, wheel_path="", if (env.alternativeTRT) { sh "cd tensorrt_llm/ && sed -i 's#tensorrt~=.*\$#tensorrt#g' requirements.txt && cat requirements.txt" } + if (is_cu12) { + trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd tensorrt_llm/ && sed -i '/^# .*\$/ {s/^# //; n; s/^/# /}' requirements.txt && cat requirements.txt") + } // Random sleep to avoid resource contention sleep(10 * Math.random()) @@ -2661,7 +2683,7 @@ def launchTestJobs(pipeline, testFilter) "RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 2, 2, 4], ] - parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), { + parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(key.contains("-CU12-") ? LLM_DOCKER_IMAGE_12_9 : LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), { def config = VANILLA_CONFIG if (key.contains("single-device")) { config = SINGLE_DEVICE_CONFIG @@ -2672,6 +2694,9 @@ def launchTestJobs(pipeline, testFilter) if (key.contains("Pybind")) { config = PYBIND_CONFIG } + if (key.contains("-CU12-")) { + config = VANILLA_CONFIG_CU12 + } runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3]) }]]} fullSet = parallelJobs.keySet() @@ -2702,6 +2727,9 @@ def launchTestJobs(pipeline, testFilter) if (key.contains("llvm")) { config = LLVM_CONFIG } + if (key.contains("-CU12-")) { + config = VANILLA_CONFIG_CU12 + } runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false) }]]} @@ -2740,7 +2768,7 @@ def launchTestJobs(pipeline, testFilter) fullSet += multiNodesSBSAConfigs.keySet() if (env.targetArch == AARCH64_TRIPLE) { - parallelJobs = 
SBSATestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "arm64"), { + parallelJobs = SBSATestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(key.contains("-CU12-") ? LLM_SBSA_DOCKER_IMAGE_12_9 : LLM_DOCKER_IMAGE, values[0], "arm64"), { runLLMTestlistOnPlatform(pipeline, values[0], values[1], LINUX_AARCH64_CONFIG, false, key, values[2], values[3]) }]]} @@ -2795,50 +2823,50 @@ def launchTestJobs(pipeline, testFilter) // Python version and OS for sanity check x86SanityCheckConfigs = [ "PY312-DLFW": [ - LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE, + LLM_DOCKER_IMAGE, "B200_PCIe", X86_64_TRIPLE, false, - "dlfw/", + "cuda13/", DLFW_IMAGE, false, ], - "PY310-UB2204": [ - LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE, + "PY310-UB2204-CU12": [ + LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE_12_9, "A10", X86_64_TRIPLE, true, "", UBUNTU_22_04_IMAGE, - true, // Extra install PyTorch CUDA 13.0 package to align with the CUDA version used for building TensorRT LLM wheels. 
+ false, ], - "PY312-UB2404": [ - LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE, + "PY312-UB2404-CU12": [ + LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE_12_9, "RTX5090", X86_64_TRIPLE, true, "", UBUNTU_24_04_IMAGE, - true, // Extra PyTorch CUDA 13.0 install + true, // Extra PyTorch CUDA 12.8 install ], ] aarch64SanityCheckConfigs = [ - "PY312-UB2404": [ - LLM_DOCKER_IMAGE, + "PY312-UB2404-CU12": [ + LLM_SBSA_DOCKER_IMAGE_12_9, "GH200", AARCH64_TRIPLE, false, "", UBUNTU_24_04_IMAGE, - true, // Extra PyTorch CUDA 13.0 install + true, // Extra PyTorch CUDA 12.8 install ], "PY312-DLFW": [ LLM_DOCKER_IMAGE, "GH200", AARCH64_TRIPLE, false, - "dlfw/", + "cuda13/", DLFW_IMAGE, false, ], @@ -2892,7 +2920,7 @@ def launchTestJobs(pipeline, testFilter) env = ["LD_LIBRARY_PATH+=:/usr/local/cuda/compat"] } withEnv(env) { - wheelName = runLLMBuild(pipeline, cpu_arch, values[3], wheelPath, cpver) + wheelName = runLLMBuild(pipeline, cpu_arch, values[3], wheelPath, cpver, key.contains("CU12")) } } @@ -2917,7 +2945,7 @@ def launchTestJobs(pipeline, testFilter) echo "###### Prerequisites Start ######" echoNodeAndGpuInfo(pipeline, toStageName(values[1], key)) // Clean up the pip constraint file from the base NGC PyTorch image. 
- if (values[5] == DLFW_IMAGE) { + if (values[5] == DLFW_IMAGE || values[5] == DLFW_IMAGE_12_9) { trtllm_utils.llmExecStepWithRetry(pipeline, script: "[ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true") } trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y python3-pip git rsync curl wget") @@ -2925,17 +2953,47 @@ def launchTestJobs(pipeline, testFilter) trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 config set global.break-system-packages true") trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install requests") trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 uninstall -y tensorrt") - if (values[5] != DLFW_IMAGE) { + if (values[5] != DLFW_IMAGE && values[5] != DLFW_IMAGE_12_9) { def ubuntu_version = key.contains("UB2404") ? "ubuntu2404" : "ubuntu2204" def platform = cpu_arch == X86_64_TRIPLE ? "x86_64" : "sbsa" trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget https://developer.download.nvidia.com/compute/cuda/repos/${ubuntu_version}/${platform}/cuda-keyring_1.1-1_all.deb") trtllm_utils.llmExecStepWithRetry(pipeline, script: "dpkg -i cuda-keyring_1.1-1_all.deb") - trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-13-0") + if (key.contains("CU12")) { + trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-12-9") + } else { + trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-13-0") + } } - // Extra PyTorch CUDA 13.0 install for all bare-metal environments (Default PyTorch is for CUDA 12.8) + if (key.contains("CU12")) { + trtllm_utils.llmExecStepWithRetry(pipeline, script: "sed -i '/^# .*\$/ {s/^# //; n; s/^/# /}' ${LLM_ROOT}/requirements.txt") + sh "cat ${LLM_ROOT}/requirements.txt" + } + // Extra PyTorch CUDA 12.8 install for SBSA platform and Blackwell GPUs bare-metal environments if (values[6]) { - echo "###### Extra 
PyTorch CUDA 13.0 install Start ######" - trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.0 torchvision --index-url https://download.pytorch.org/whl/cu130") + echo "###### Extra PyTorch CUDA 12.8 install Start ######" + if (key.contains("CU12")) { + trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128") + } else { + trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128") + } + } + + // TODO: Remove this after public triton supports CUDA 13. + if (key == "PY312-DLFW" && values[2] == X86_64_TRIPLE) { + trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install https://download.pytorch.org/whl/nightly/pytorch_triton-3.3.1%2Bgitc8757738-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl") + sh """ + cd /usr/local/lib/python3.12/dist-packages/ && \ + ls -la | grep pytorch_triton && \ + mv pytorch_triton-3.3.1+gitc8757738.dist-info triton-3.3.1+gitc8757738.dist-info && \ + cd triton-3.3.1+gitc8757738.dist-info && \ + echo "Current directory: \$(pwd)" && \ + echo "Files in directory:" && \ + ls -la && \ + sed -i 's/^Name: pytorch-triton/Name: triton/' METADATA && \ + sed -i 's|pytorch_triton-3.3.1+gitc8757738.dist-info/|triton-3.3.1+gitc8757738.dist-info/|g' RECORD && \ + echo "METADATA after update:" && \ + grep "^Name:" METADATA + """ } def libEnv = [] @@ -2958,9 +3016,9 @@ def launchTestJobs(pipeline, testFilter) } echo "###### Run LLMAPI tests Start ######" - def config = VANILLA_CONFIG + def config = key.contains("CU12") ? VANILLA_CONFIG_CU12 : VANILLA_CONFIG if (cpu_arch == AARCH64_TRIPLE) { - config = LINUX_AARCH64_CONFIG + config = key.contains("CU12") ? 
LINUX_AARCH64_CONFIG_CU12 : LINUX_AARCH64_CONFIG } withEnv(libEnv) { sh "env | sort" diff --git a/jenkins/controlCCache.groovy b/jenkins/controlCCache.groovy index 74ebebdeb03..bc34d88e4d0 100644 --- a/jenkins/controlCCache.groovy +++ b/jenkins/controlCCache.groovy @@ -1,7 +1,7 @@ import java.lang.InterruptedException -DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202510291120-8621" +DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501" def createKubernetesPodConfig(image, arch = "amd64") { diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index fc097cc9e4e..347265f9860 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -13,7 +13,11 @@ # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. 
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511021230-8838 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511021230-8838 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202511021230-8838 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202511021230-8838 +LLM_DOCKER_IMAGE_12_9=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383 +LLM_SBSA_DOCKER_IMAGE_12_9=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE_12_9=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202509091430-7383 +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE_12_9=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202509091430-7383 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.08-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511101900-9039 +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.08-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511101900-9039 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.0-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202511101900-9039 
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.0-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202511101900-9039 diff --git a/requirements.txt b/requirements.txt index 69f9c305ebc..7763317712f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ ---extra-index-url https://download.pytorch.org/whl/cu130 +--extra-index-url https://download.pytorch.org/whl/cu128 -c constraints.txt accelerate>=1.7.0 build colored +# cuda-python>=12,<13 # cuda-python>=13 diffusers>=0.27.0 lark @@ -13,19 +14,24 @@ onnx_graphsurgeon>=0.5.2 openai polygraphy psutil +# nvidia-ml-py>=12,<13 # nvidia-ml-py>=13 pulp pandas h5py==3.12.1 StrEnum sentencepiece>=0.1.99 +# tensorrt~=10.11.0 # tensorrt~=10.13.0 -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 uses 2.9.0a0. -torch>=2.9.0a0,<=2.9.0 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-08.html#rel-25-08 uses 2.8.0a0. 
def _swap_cuda_requirement_lines(lines, marker):
    """Return a copy of *lines* with marker-tagged alternatives switched on.

    A line is a "tagged alternative" when it is a ``#`` comment that contains
    *marker*.  Such a line is uncommented (its first ``"# "`` is removed) and
    the requirement on the following line is commented out in its place.
    """
    swapped = []
    index = 0
    total = len(lines)
    while index < total:
        line = lines[index]
        index += 1
        if not (marker in line and line.strip().startswith("#")):
            swapped.append(line)
            continue

        enabled = line.replace("# ", "", 1)
        print(f"Enable CUDA 12.9 dependency: {enabled.strip()}")
        swapped.append(enabled)

        # The tagged line may be the last one in the file; only look ahead
        # when there really is a following line.
        if index >= total:
            continue
        follower = lines[index]
        stripped = follower.strip()
        # Only a live requirement is disabled: blank lines and lines that are
        # already comments are left for the next loop iteration to copy as-is.
        if stripped and not stripped.startswith("#"):
            print(f"Disable CUDA 13 dependency: # {stripped}")
            swapped.append("# " + follower)
            index += 1
    return swapped


def modify_requirements_for_cuda(requirements_file=None,
                                 cuda_version=None,
                                 marker="<For CUDA 12.9>"):
    """Rewrite requirements.txt in place so a CUDA 12 wheel gets CUDA 12 pins.

    Args:
        requirements_file: Path of the requirements file to rewrite.  When
            omitted, ``project_dir / "requirements.txt"`` is used, where
            ``project_dir`` is a name taken from the enclosing scope.
        cuda_version: CUDA version string to test.  When omitted, the
            ``CUDA_VERSION`` environment variable is read (empty if unset).
        marker: Text that tags a commented-out CUDA 12 alternative.
            NOTE(review): the marker literal was empty in the reviewed text,
            which made the check match every comment line; the default used
            here is presumed -- confirm it against requirements.txt.

    Returns:
        True when a CUDA 12 version was detected and the file was rewritten,
        False when the file was left untouched.

    Raises:
        ValueError: If *marker* is empty, because ``"" in line`` is true for
            every line and would rewrite all comments in the file.
    """
    if not marker:
        raise ValueError("marker must be a non-empty string")
    if cuda_version is None:
        cuda_version = os.environ.get("CUDA_VERSION", "")
    if not cuda_version.startswith("12."):
        return False

    if requirements_file is None:
        requirements_file = project_dir / "requirements.txt"

    print(
        "Detected CUDA 12 environment, modifying requirements.txt for wheel build..."
    )
    with open(requirements_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    modified_lines = _swap_cuda_requirement_lines(lines, marker)
    with open(requirements_file, 'w', encoding='utf-8') as f:
        f.writelines(modified_lines)
    return True
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[latency] SKIP (https://nvbugs/5481198) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[throughput] SKIP (https://nvbugs/5481198) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5503479) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5630310) full:L20/accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized SKIP (https://nvbugs/5542862) full:L20/accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 SKIP (https://nvbugs/5542862) full:L40S/accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized SKIP (https://nvbugs/5542862) @@ -418,3 +417,4 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKI test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/5647825) test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-0.8-image] SKIP (https://nvbugs/5644190) test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] SKIP (https://nvbugs/5568836) +disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin] SKIP (https://nvbugs/5633340) diff --git a/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py b/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py index 91b906241df..9712830c26c 100644 --- a/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py +++ b/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py @@ -343,7 +343,6 @@ def 
test_nemotron_h_cuda_graph_overlap_scheduler(): ) -@pytest.mark.skip(reason="https://nvbugs/5626259") def test_nemotron_h_chunked_prefill(): # Long prompts (~100 tokens) to make sure chunked prefill is enabled # (At the time of development, tokens_per_block isn't configurable from the LLM API, diff --git a/tests/unittest/_torch/thop/parallel/test_mamba2_chunk_ss_update.py b/tests/unittest/_torch/thop/parallel/test_mamba2_chunk_ss_update.py index 17c28f75fac..0f6a8724c43 100644 --- a/tests/unittest/_torch/thop/parallel/test_mamba2_chunk_ss_update.py +++ b/tests/unittest/_torch/thop/parallel/test_mamba2_chunk_ss_update.py @@ -31,7 +31,6 @@ from tensorrt_llm.llmapi.utils import get_total_gpu_memory -@pytest.mark.skip(reason="https://nvbugs/5606178") @pytest.mark.parametrize( "dim, headdim, ngroups, dstate, req_type, dtype, batch_size, max_seq_len, has_z, remove_padding, paged_cache, use_initial_states", # dim parametrization @@ -363,7 +362,6 @@ def test_mamba2_chunk_scan_selective_state_update(dim, headdim, ngroups, dstate, atol=atol[dtype]) -@pytest.mark.skip(reason="https://nvbugs/5606178") @pytest.mark.parametrize("mamba_chunk_size", [8, 256]) @pytest.mark.parametrize("seqlens", [ (16, 2, 8, 13),