diff --git a/constraints.txt b/constraints.txt
index d4b78a25670..e276ea19527 100644
--- a/constraints.txt
+++ b/constraints.txt
@@ -1,2 +1,5 @@
-# These vulnerabilities were inherited from the base image (pytorch:25.10-py3) and should be removed when the base image
+# These vulnerabilities were inherited from the base image (pytorch:25.06-py3) and should be removed when the base image
 # is updated.
+
+# WAR against https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
+protobuf>=4.25.8
diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi
index dfbcfecb719..baaf32245e4 100644
--- a/docker/Dockerfile.multi
+++ b/docker/Dockerfile.multi
@@ -1,9 +1,8 @@
 # Multi-stage Dockerfile
 ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
 ARG TRITON_IMAGE=nvcr.io/nvidia/tritonserver
-ARG BASE_TAG=25.10-py3
-# [TODO] Update to NVIDIA Triton 25.10 when it's available
-ARG TRITON_BASE_TAG=25.09-py3
+ARG BASE_TAG=25.08-py3
+ARG TRITON_BASE_TAG=25.08-py3
 ARG DEVEL_IMAGE=devel
 
 FROM ${BASE_IMAGE}:${BASE_TAG} AS base
@@ -41,9 +40,6 @@ COPY docker/common/install.sh \
      docker/common/install_polygraphy.sh \
      docker/common/install_mpi4py.sh \
      docker/common/install_pytorch.sh \
-     docker/common/install_ucx.sh \
-     docker/common/install_nixl.sh \
-     docker/common/install_etcd.sh \
      ./
 
 RUN GITHUB_MIRROR=${GITHUB_MIRROR} \
@@ -75,15 +71,36 @@ RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --mpi4py && rm install_mpi4
 ARG TORCH_INSTALL_TYPE="skip"
 RUN TORCH_INSTALL_TYPE=${TORCH_INSTALL_TYPE} bash ./install.sh --pytorch && rm install_pytorch.sh
 
-RUN bash ./install.sh --opencv && rm install.sh
+RUN bash ./install.sh --opencv && bash ./install.sh --protobuf && rm install.sh
+
+# TODO: Remove this step once a new triton release is published.
+# Rename pytorch_triton package to triton
+RUN if [ -f /etc/redhat-release ]; then \
+        echo "Rocky8 detected, skipping symlink and ldconfig steps"; \
+    else \
+        cd /usr/local/lib/python3.12/dist-packages/ && \
+        ls -la | grep pytorch_triton && \
+        mv pytorch_triton-3.3.1+gitc8757738.dist-info triton-3.3.1+gitc8757738.dist-info && \
+        cd triton-3.3.1+gitc8757738.dist-info && \
+        echo "Current directory: $(pwd)" && \
+        echo "Files in directory:" && \
+        ls -la && \
+        sed -i 's/^Name: pytorch-triton/Name: triton/' METADATA && \
+        sed -i 's|pytorch_triton-3.3.1+gitc8757738.dist-info/|triton-3.3.1+gitc8757738.dist-info/|g' RECORD && \
+        echo "METADATA after update:" && \
+        grep "^Name:" METADATA; \
+    fi
 
 # Install UCX first
+COPY docker/common/install_ucx.sh install_ucx.sh
 RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_ucx.sh && rm install_ucx.sh
 
 # Install NIXL
+COPY docker/common/install_nixl.sh install_nixl.sh
 RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_nixl.sh && rm install_nixl.sh
 
 # Install etcd
+COPY docker/common/install_etcd.sh install_etcd.sh
 RUN bash ./install_etcd.sh && rm install_etcd.sh
 
 FROM ${TRITON_IMAGE}:${TRITON_BASE_TAG} AS triton
@@ -99,6 +116,9 @@ COPY --from=triton /opt/tritonserver/caches /opt/tritonserver/caches
 
 # Copy all installation scripts at once to reduce layers
 COPY docker/common/install_triton.sh \
+     docker/common/install_ucx.sh \
+     docker/common/install_nixl.sh \
+     docker/common/install_etcd.sh \
      ./
 
 RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_triton.sh && rm install_triton.sh
diff --git a/docker/Makefile b/docker/Makefile
index b51ae8dfc25..fc1faee012c 100644
--- a/docker/Makefile
+++ b/docker/Makefile
@@ -192,17 +192,16 @@ jenkins-rockylinux8_%: PYTHON_VERSION_TAG_ID = $(if $(findstring 3.12,${PYTHON_V
 jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell . ../jenkins/current_image_tags.properties && echo $$LLM_ROCKYLINUX8_${PYTHON_VERSION_TAG_ID}_DOCKER_IMAGE)
 jenkins-rockylinux8_%: STAGE = tritondevel
 jenkins-rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda
-# [TODO] Update to NVIDIA CUDA 13.0.2 when it's available
-jenkins-rockylinux8_%: BASE_TAG = 13.0.1-devel-rockylinux8
+jenkins-rockylinux8_%: BASE_TAG = 13.0.0-devel-rockylinux8
 
 rockylinux8_%: STAGE = tritondevel
 rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda
-rockylinux8_%: BASE_TAG = 13.0.1-devel-rockylinux8
+rockylinux8_%: BASE_TAG = 13.0.0-devel-rockylinux8
 
 # For x86_64 and aarch64
 ubuntu22_%: STAGE = tritondevel
 ubuntu22_%: BASE_IMAGE = nvcr.io/nvidia/cuda
-ubuntu22_%: BASE_TAG = 13.0.1-devel-ubuntu22.04
+ubuntu22_%: BASE_TAG = 13.0.0-devel-ubuntu22.04
 
 trtllm_%: STAGE = release
 trtllm_%: PUSH_TO_STAGING := 0
diff --git a/docker/common/install.sh b/docker/common/install.sh
index 8ad8c694f13..eaea3c64391 100755
--- a/docker/common/install.sh
+++ b/docker/common/install.sh
@@ -16,6 +16,7 @@ polygraphy=0
 mpi4py=0
 pytorch=0
 opencv=0
+protobuf=0
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -55,6 +56,10 @@ while [[ $# -gt 0 ]]; do
             opencv=1
             shift 1
             ;;
+        --protobuf)
+            protobuf=1
+            shift 1
+            ;;
         --all)
             base=1
             cmake=1
@@ -65,6 +70,7 @@ while [[ $# -gt 0 ]]; do
             mpi4py=1
             pytorch=1
             opencv=1
+            protobuf=1
             shift 1
             ;;
         *)
@@ -129,3 +135,10 @@ if [ $opencv -eq 1 ]; then
     rm -rf /usr/local/lib/python3*/dist-packages/cv2/
     pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
 fi
+
+# Workarounds (WARs) for security issues inherited from pytorch:25.06
+# * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
+if [ $protobuf -eq 1 ]; then
+    pip3 install --upgrade --no-cache-dir \
+    "protobuf>=4.25.8"
+fi
diff --git a/docker/common/install_cuda_toolkit.sh b/docker/common/install_cuda_toolkit.sh
index 0dc5cb305aa..ae41ef1fc8e 100644
--- a/docker/common/install_cuda_toolkit.sh
+++ b/docker/common/install_cuda_toolkit.sh
@@ -5,7 +5,7 @@ set -ex
 # This script is used for reinstalling CUDA on Rocky Linux 8 with the run file.
 # CUDA version is usually aligned with the latest NGC CUDA image tag.
 # Only use when public CUDA image is not ready.
-CUDA_VER="13.0.2_580.95.05"
+CUDA_VER="13.0.0_580.65.06"
 CUDA_VER_SHORT="${CUDA_VER%_*}"
 
 NVCC_VERSION_OUTPUT=$(nvcc --version)
diff --git a/docker/common/install_mpi4py.sh b/docker/common/install_mpi4py.sh
index dd0c3d71a83..33299dad533 100644
--- a/docker/common/install_mpi4py.sh
+++ b/docker/common/install_mpi4py.sh
@@ -27,15 +27,12 @@ diff --git a/src/mpi4py/futures/_lib.py b/src/mpi4py/futures/_lib.py
 index f14934d1..eebfb8fc 100644
 --- a/src/mpi4py/futures/_lib.py
 +++ b/src/mpi4py/futures/_lib.py
-@@ -278,6 +278,43 @@ def _manager_comm(pool, options, comm, full=True):
+@@ -278,6 +278,40 @@ def _manager_comm(pool, options, comm, full=True):
 
 
  def _manager_split(pool, options, comm, root):
 +    if(os.getenv("TRTLLM_USE_MPI_KVCACHE")=="1"):
-+        try:
-+            from cuda.bindings import runtime as cudart
-+        except ImportError:
-+            from cuda import cudart
++        from cuda import cudart
 +        has_slurm_rank=False
 +        has_ompi_rank=False
 +        slurm_rank=0
@@ -74,10 +71,6 @@ index f14934d1..eebfb8fc 100644
 EOF
 
 # Install with pip and clean up cache
-ARCH=$(uname -m)
-if [ "$ARCH" = "aarch64" ]; then
-    pip3 install --no-cache-dir Cython==0.29.37
-fi
 pip3 install --no-cache-dir "$TMP_DIR/mpi4py-${MPI4PY_VERSION}"
 
 # Clean up
diff --git a/docker/common/install_pytorch.sh b/docker/common/install_pytorch.sh
index 069b26846c8..ca707843489 100644
--- a/docker/common/install_pytorch.sh
+++ b/docker/common/install_pytorch.sh
@@ -4,8 +4,8 @@ set -ex
 
 # Use latest stable version from https://pypi.org/project/torch/#history
 # and closest to the version specified in
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10
-TORCH_VERSION="2.9.0"
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-08.html#rel-25-08
+TORCH_VERSION="2.8.0"
 SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
 
 prepare_environment() {
@@ -69,8 +69,8 @@ install_from_pypi() {
     if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
     if [ "$ARCH" = "aarch64" ];then ARCH="sbsa";fi
 
-    pip3 uninstall -y torch torchvision
-    pip3 install torch==${TORCH_VERSION} torchvision --index-url https://download.pytorch.org/whl/cu130
+    pip3 uninstall -y torch torchvision torchaudio
+    pip3 install torch==${TORCH_VERSION} torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
 }
 
 case "$1" in
diff --git a/docker/common/install_tensorrt.sh b/docker/common/install_tensorrt.sh
index 3887be6fa26..3006723a97a 100644
--- a/docker/common/install_tensorrt.sh
+++ b/docker/common/install_tensorrt.sh
@@ -2,20 +2,23 @@
 
 set -ex
 
-TRT_VER="10.13.3.9"
+TRT_VER="10.13.2.6"
 # Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10
-CUDA_VER="13.0" # 13.0.2
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-08.html#rel-25-08
+CUDA_VER="13.0" # 13.0.0
 # Keep the installation for cuDNN if users want to install PyTorch with source codes.
 # PyTorch 2.x can compile with cuDNN v9.
-CUDNN_VER="9.14.0.64-1"
+CUDNN_VER="9.12.0.46-1"
+# NCCL version 2.26.x, used in the NGC PyTorch 25.05 image, has a performance regression issue.
+# Use NCCL version 2.27.5 or later, which has the fixes.
 NCCL_VER="2.27.7-1+cuda13.0"
-CUBLAS_VER="13.1.0.3-1"
+# Use cuBLAS version 13.0.0.19 instead.
+CUBLAS_VER="13.0.0.19-1"
 # Align with the pre-installed CUDA / NVCC / NVRTC versions from
 # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html
-NVRTC_VER="13.0.88-1"
-CUDA_RUNTIME="13.0.96-1"
-CUDA_DRIVER_VERSION="580.95.05-1.el8"
+NVRTC_VER="13.0.48-1"
+CUDA_RUNTIME="13.0.48-1"
+CUDA_DRIVER_VERSION="580.65.06-1.el8"
 
 for i in "$@"; do
     case $i in
diff --git a/docs/source/installation/build-from-source-linux.md b/docs/source/installation/build-from-source-linux.md
index 19dab71c769..7b94aa88119 100644
--- a/docs/source/installation/build-from-source-linux.md
+++ b/docs/source/installation/build-from-source-linux.md
@@ -147,6 +147,11 @@ check <https://github.com/NVIDIA/TensorRT-LLM/tree/main/docker>.
 
 ## Build TensorRT LLM
 
+```{tip}
+:name: build-from-source-tip-cuda-version
+TensorRT LLM 1.1 supports both CUDA 12.9 and 13.0, but some dependency changes are required. The `requirements.txt` file contains the dependencies needed for CUDA 13.0. If you are using CUDA 12.9, please uncomment the lines that end with `# <For CUDA 12.9>` and comment out the line immediately following each of them.
+```
+
 ### Option 1: Full Build with C++ Compilation
 
 The following command compiles the C++ code and packages the compiled libraries along with the Python files into a wheel. When developing C++ code, you need this full build command to apply your code changes.
diff --git a/docs/source/installation/linux.md b/docs/source/installation/linux.md
index 2aae24e6af0..e04510d1ac4 100644
--- a/docs/source/installation/linux.md
+++ b/docs/source/installation/linux.md
@@ -12,9 +12,14 @@
    Install CUDA Toolkit following the [CUDA Installation Guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/) and
    make sure `CUDA_HOME` environment variable is properly set.
 
+   ```{tip}
+   :name: installation-linux-tip-cuda-version
+   TensorRT LLM 1.1 supports both CUDA 12.9 and 13.0. The wheel package release only supports CUDA 12.9, while CUDA 13.0 is only supported through the NGC container release.
+   ```
+
    ```bash
-   # By default, PyTorch CUDA 12.8 package is installed. Install PyTorch CUDA 13.0 package to align with the CUDA version used for building TensorRT LLM wheels.
-   pip3 install torch==2.9.0 torchvision --index-url https://download.pytorch.org/whl/cu130
+   # Optional step: Only required for NVIDIA Blackwell GPUs and the SBSA platform
+   pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
 
    sudo apt-get -y install libopenmpi-dev
    
@@ -22,6 +27,8 @@
    sudo apt-get -y install libzmq3-dev
    ```
 
+   The PyTorch CUDA 12.8 package is required to support NVIDIA Blackwell GPUs and the SBSA platform. On earlier GPUs or the Linux x86_64 platform, this extra installation is not required.
+
    ```{tip}
    Instead of manually installing the preqrequisites as described
    above, it is also possible to use the pre-built [TensorRT LLM Develop container
diff --git a/docs/source/legacy/reference/support-matrix.md b/docs/source/legacy/reference/support-matrix.md
index 1dc59fcfa0b..910aa428670 100644
--- a/docs/source/legacy/reference/support-matrix.md
+++ b/docs/source/legacy/reference/support-matrix.md
@@ -152,7 +152,7 @@ The following table shows the supported software for TensorRT-LLM.
 * -
   - Software Compatibility
 * - Container
-  - [25.10](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
+  - [25.08](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
 * - TensorRT
   - [10.13](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
 * - Precision
diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy
index 20637c7da38..d661d9be245 100644
--- a/jenkins/Build.groovy
+++ b/jenkins/Build.groovy
@@ -16,6 +16,9 @@ AARCH64_TRIPLE = "aarch64-linux-gnu"
 
 LLM_DOCKER_IMAGE = env.dockerImage
 
+LLM_DOCKER_IMAGE_12_9 = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383"
+LLM_SBSA_DOCKER_IMAGE_12_9 = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383"
+
 // Always use x86_64 image for agent
 AGENT_IMAGE = env.dockerImage.replace("aarch64", "x86_64")
 
@@ -37,6 +40,9 @@ def BUILD_JOBS_FOR_CONFIG = "buildJobsForConfig"
 @Field
 def CONFIG_LINUX_X86_64_VANILLA = "linux_x86_64_Vanilla"
 
+@Field
+def CONFIG_LINUX_X86_64_VANILLA_CU12 = "linux_x86_64_Vanilla_CU12"
+
 @Field
 def CONFIG_LINUX_X86_64_SINGLE_DEVICE = "linux_x86_64_SingleDevice"
 
@@ -46,6 +52,9 @@ def CONFIG_LINUX_X86_64_LLVM = "linux_x86_64_LLVM"
 @Field
 def CONFIG_LINUX_AARCH64 = "linux_aarch64"
 
+@Field
+def CONFIG_LINUX_AARCH64_CU12 = "linux_aarch64_CU12"
+
 @Field
 def CONFIG_LINUX_AARCH64_LLVM = "linux_aarch64_LLVM"
 
@@ -64,6 +73,11 @@ def BUILD_CONFIGS = [
     (TARNAME) : "TensorRT-LLM.tar.gz",
     (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real",
   ],
+  (CONFIG_LINUX_X86_64_VANILLA_CU12) : [
+    (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --micro_benchmarks",
+    (TARNAME) : "TensorRT-LLM-CU12.tar.gz",
+    (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;103-real;120-real",
+  ],
   (CONFIG_LINUX_X86_64_PYBIND) : [
     (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --micro_benchmarks",
     (TARNAME) : "pybind-TensorRT-LLM.tar.gz",
@@ -85,6 +99,12 @@ def BUILD_CONFIGS = [
     (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
     (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
   ],
+  (CONFIG_LINUX_AARCH64_CU12): [
+    (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON",
+    (TARNAME) : "TensorRT-LLM-GH200-CU12.tar.gz",
+    (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
+    (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
+  ],
   (CONFIG_LINUX_AARCH64_PYBIND): [
     (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl",
     (TARNAME) : "pybind-TensorRT-LLM-GH200.tar.gz",
@@ -434,6 +454,9 @@ def runLLMBuild(pipeline, buildFlags, tarName, is_linux_x86_64)
         pipArgs = ""
     }
 
+    if (tarName.contains("CU12")) {
+        trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${LLM_ROOT} && sed -i '/^# .*<For CUDA 12\\.9>\$/ {s/^# //; n; s/^/# /}' requirements.txt && cat requirements.txt")
+    }
     // install python package
     trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${LLM_ROOT} && pip3 install -r requirements-dev.txt ${pipArgs}")
 
@@ -454,7 +477,10 @@ def runLLMBuild(pipeline, buildFlags, tarName, is_linux_x86_64)
     def llmPath = sh (script: "realpath ${LLM_ROOT}",returnStdout: true).trim()
     // TODO: Remove after the cmake version is upgraded to 3.31.8
     // Get triton tag from docker/dockerfile.multi
-    def tritonShortTag = "r25.09"
+    def tritonShortTag = "r25.08"
+    if (tarName.contains("CU12")) {
+        tritonShortTag = "r25.06"
+    }
     sh "cd ${LLM_ROOT}/triton_backend/inflight_batcher_llm && mkdir build && cd build && cmake .. -DTRTLLM_DIR=${llmPath} -DTRITON_COMMON_REPO_TAG=${tritonShortTag} -DTRITON_CORE_REPO_TAG=${tritonShortTag} -DTRITON_THIRD_PARTY_REPO_TAG=${tritonShortTag} -DTRITON_BACKEND_REPO_TAG=${tritonShortTag} -DUSE_CXX11_ABI=ON && make -j${buildJobs} install"
 
     // Step 3: packaging wheels into tarfile
@@ -544,9 +570,14 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars)
         wheelDockerImage = env.dockerImage
     }
 
+    def LLM_DOCKER_IMAGE_CU12 = cpu_arch == AARCH64_TRIPLE ? LLM_SBSA_DOCKER_IMAGE_12_9 : LLM_DOCKER_IMAGE_12_9
+
     buildConfigs = [
         "Build TRT-LLM": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
             pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64 : CONFIG_LINUX_X86_64_VANILLA),
+        // NOTE(review): the CUDA12 build is slow (cost > 5 hours on SBSA); the entry below is still enabled -- confirm whether it should be disabled
+        "Build TRT-LLM CUDA12": [LLM_DOCKER_IMAGE_CU12] + prepareLLMBuild(
+            pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_CU12 : CONFIG_LINUX_X86_64_VANILLA_CU12),
         "Build TRT-LLM LLVM": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
             pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_LLVM : CONFIG_LINUX_X86_64_LLVM),
         "Build TRT-LLM Pybind": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index d71d510e36c..832cbdbe694 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -38,8 +38,16 @@ LLM_DOCKER_IMAGE = env.dockerImage
 LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = env.wheelDockerImagePy310
 LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = env.wheelDockerImagePy312
 
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE_12_9="urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202509091430-7383"
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE_12_9="urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202509091430-7383"
+
+LLM_DOCKER_IMAGE_12_9 = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383"
+LLM_SBSA_DOCKER_IMAGE_12_9 = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383"
+
+DLFW_IMAGE_12_9 = "urm.nvidia.com/docker/nvidia/pytorch:25.06-py3"
+
 // DLFW torch image
-DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.10-py3"
+DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.08-py3"
 
 //Ubuntu base image
 UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04"
@@ -56,6 +64,9 @@ def TARNAME = "tarName"
 @Field
 def VANILLA_CONFIG = "Vanilla"
 
+@Field
+def VANILLA_CONFIG_CU12 = "Vanilla_CU12"
+
 @Field
 def SINGLE_DEVICE_CONFIG = "SingleDevice"
 
@@ -65,6 +76,9 @@ def LLVM_CONFIG = "LLVM"
 @Field
 def LINUX_AARCH64_CONFIG = "linux_aarch64"
 
+@Field
+def LINUX_AARCH64_CONFIG_CU12 = "linux_aarch64_CU12"
+
 @Field
 def PYBIND_CONFIG = "Pybind"
 
@@ -72,9 +86,11 @@ def PYBIND_CONFIG = "Pybind"
 def BUILD_CONFIGS = [
   // Vanilla TARNAME is used for packaging in runLLMPackage
   (VANILLA_CONFIG) : [(TARNAME) : "TensorRT-LLM.tar.gz"],
+  (VANILLA_CONFIG_CU12) : [(TARNAME) : "TensorRT-LLM-CU12.tar.gz"],
   (SINGLE_DEVICE_CONFIG) : [(TARNAME) : "single-device-TensorRT-LLM.tar.gz"],
   (LLVM_CONFIG) : [(TARNAME) : "llvm-TensorRT-LLM.tar.gz"],
   (LINUX_AARCH64_CONFIG) : [(TARNAME) : "TensorRT-LLM-GH200.tar.gz"],
+  (LINUX_AARCH64_CONFIG_CU12) : [(TARNAME) : "TensorRT-LLM-GH200-CU12.tar.gz"],
   (PYBIND_CONFIG) : [(TARNAME) : "pybind-TensorRT-LLM.tar.gz"],
 ]
 
@@ -1527,7 +1543,7 @@ def launchTestListCheck(pipeline)
             sh "tar -zxf ${tarName}"
             def llmPath = sh (script: "realpath .", returnStdout: true).trim()
             def llmSrc = "${llmPath}/TensorRT-LLM/src"
-            sh "NVIDIA_TRITON_SERVER_VERSION=25.09 LLM_ROOT=${llmSrc} LLM_BACKEND_ROOT=${llmSrc}/triton_backend python3 ${llmSrc}/scripts/check_test_list.py --l0 --qa --waive"
+            sh "NVIDIA_TRITON_SERVER_VERSION=25.04 LLM_ROOT=${llmSrc} LLM_BACKEND_ROOT=${llmSrc}/triton_backend python3 ${llmSrc}/scripts/check_test_list.py --l0 --qa --waive"
         } catch (InterruptedException e) {
             throw e
         } catch (Exception e) {
@@ -2072,6 +2088,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
         if (env.alternativeTRT) {
             sh "cd ${llmSrc} && sed -i 's#tensorrt~=.*\$#tensorrt#g' requirements.txt && cat requirements.txt"
         }
+        if (stageName.contains("-CU12")) {
+            trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmSrc} && sed -i '/^# .*<For CUDA 12\\.9>\$/ {s/^# //; n; s/^/# /}' requirements.txt && cat requirements.txt")
+        }
         trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmSrc} && pip3 install --retries 1 -r requirements-dev.txt")
         if (stageName.contains("-Ray-")) {
             trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install ray[default]")
@@ -2357,7 +2376,7 @@ def checkPipInstall(pipeline, wheel_path)
 }
 
 
-def runLLMBuild(pipeline, cpu_arch, reinstall_dependencies=false, wheel_path="", cpver="cp312")
+def runLLMBuild(pipeline, cpu_arch, reinstall_dependencies=false, wheel_path="", cpver="cp312", is_cu12=false)
 {
     sh "pwd && ls -alh"
     sh "env | sort"
@@ -2367,6 +2386,9 @@ def runLLMBuild(pipeline, cpu_arch, reinstall_dependencies=false, wheel_path="",
     if (env.alternativeTRT) {
         sh "cd tensorrt_llm/ && sed -i 's#tensorrt~=.*\$#tensorrt#g' requirements.txt && cat requirements.txt"
     }
+    if (is_cu12) {
+        trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd tensorrt_llm/ && sed -i '/^# .*<For CUDA 12\\.9>\$/ {s/^# //; n; s/^/# /}' requirements.txt && cat requirements.txt")
+    }
 
     // Random sleep to avoid resource contention
     sleep(10 * Math.random())
@@ -2661,7 +2683,7 @@ def launchTestJobs(pipeline, testFilter)
         "RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 2, 2, 4],
     ]
 
-    parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
+    parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(key.contains("-CU12-") ? LLM_DOCKER_IMAGE_12_9 : LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
         def config = VANILLA_CONFIG
         if (key.contains("single-device")) {
             config = SINGLE_DEVICE_CONFIG
@@ -2672,6 +2694,9 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("Pybind")) {
             config = PYBIND_CONFIG
         }
+        if (key.contains("-CU12-")) {
+            config = VANILLA_CONFIG_CU12
+        }
         runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3])
     }]]}
     fullSet = parallelJobs.keySet()
@@ -2702,6 +2727,9 @@ def launchTestJobs(pipeline, testFilter)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
+        if (key.contains("-CU12-")) {
+            config = VANILLA_CONFIG_CU12
+        }
         runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
     }]]}
 
@@ -2740,7 +2768,7 @@ def launchTestJobs(pipeline, testFilter)
     fullSet += multiNodesSBSAConfigs.keySet()
 
     if (env.targetArch == AARCH64_TRIPLE) {
-        parallelJobs = SBSATestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "arm64"), {
+        parallelJobs = SBSATestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(key.contains("-CU12-") ? LLM_SBSA_DOCKER_IMAGE_12_9 : LLM_DOCKER_IMAGE, values[0], "arm64"), {
             runLLMTestlistOnPlatform(pipeline, values[0], values[1], LINUX_AARCH64_CONFIG, false, key, values[2], values[3])
         }]]}
 
@@ -2795,50 +2823,50 @@ def launchTestJobs(pipeline, testFilter)
     // Python version and OS for sanity check
     x86SanityCheckConfigs = [
         "PY312-DLFW": [
-            LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE,
+            LLM_DOCKER_IMAGE,
             "B200_PCIe",
             X86_64_TRIPLE,
             false,
-            "dlfw/",
+            "cuda13/",
             DLFW_IMAGE,
             false,
         ],
-        "PY310-UB2204": [
-            LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE,
+        "PY310-UB2204-CU12": [
+            LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE_12_9,
             "A10",
             X86_64_TRIPLE,
             true,
             "",
             UBUNTU_22_04_IMAGE,
-            true, // Extra install PyTorch CUDA 13.0 package to align with the CUDA version used for building TensorRT LLM wheels.
+            false,
         ],
-        "PY312-UB2404": [
-            LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE,
+        "PY312-UB2404-CU12": [
+            LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE_12_9,
             "RTX5090",
             X86_64_TRIPLE,
             true,
             "",
             UBUNTU_24_04_IMAGE,
-            true, // Extra PyTorch CUDA 13.0 install
+            true, // Extra PyTorch CUDA 12.8 install
         ],
     ]
 
     aarch64SanityCheckConfigs = [
-        "PY312-UB2404": [
-            LLM_DOCKER_IMAGE,
+        "PY312-UB2404-CU12": [
+            LLM_SBSA_DOCKER_IMAGE_12_9,
             "GH200",
             AARCH64_TRIPLE,
             false,
             "",
             UBUNTU_24_04_IMAGE,
-            true, // Extra PyTorch CUDA 13.0 install
+            true, // Extra PyTorch CUDA 12.8 install
         ],
         "PY312-DLFW": [
             LLM_DOCKER_IMAGE,
             "GH200",
             AARCH64_TRIPLE,
             false,
-            "dlfw/",
+            "cuda13/",
             DLFW_IMAGE,
             false,
         ],
@@ -2892,7 +2920,7 @@ def launchTestJobs(pipeline, testFilter)
                     env = ["LD_LIBRARY_PATH+=:/usr/local/cuda/compat"]
                 }
                 withEnv(env) {
-                    wheelName = runLLMBuild(pipeline, cpu_arch, values[3], wheelPath, cpver)
+                    wheelName = runLLMBuild(pipeline, cpu_arch, values[3], wheelPath, cpver, key.contains("CU12"))
                 }
             }
 
@@ -2917,7 +2945,7 @@ def launchTestJobs(pipeline, testFilter)
                         echo "###### Prerequisites Start ######"
                         echoNodeAndGpuInfo(pipeline, toStageName(values[1], key))
                         // Clean up the pip constraint file from the base NGC PyTorch image.
-                        if (values[5] == DLFW_IMAGE) {
+                        if (values[5] == DLFW_IMAGE || values[5] == DLFW_IMAGE_12_9) {
                             trtllm_utils.llmExecStepWithRetry(pipeline, script: "[ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true")
                         }
                         trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y python3-pip git rsync curl wget")
@@ -2925,17 +2953,47 @@ def launchTestJobs(pipeline, testFilter)
                         trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 config set global.break-system-packages true")
                         trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install requests")
                         trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 uninstall -y tensorrt")
-                        if (values[5] != DLFW_IMAGE) {
+                        if (values[5] != DLFW_IMAGE && values[5] != DLFW_IMAGE_12_9) {
                             def ubuntu_version = key.contains("UB2404") ? "ubuntu2404" : "ubuntu2204"
                             def platform = cpu_arch == X86_64_TRIPLE ? "x86_64" : "sbsa"
                             trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget https://developer.download.nvidia.com/compute/cuda/repos/${ubuntu_version}/${platform}/cuda-keyring_1.1-1_all.deb")
                             trtllm_utils.llmExecStepWithRetry(pipeline, script: "dpkg -i cuda-keyring_1.1-1_all.deb")
-                            trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-13-0")
+                            if (key.contains("CU12")) {
+                                trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-12-9")
+                            } else {
+                                trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-13-0")
+                            }
                         }
-                        // Extra PyTorch CUDA 13.0 install for all bare-metal environments (Default PyTorch is for CUDA 12.8)
+                        if (key.contains("CU12")) {
+                            trtllm_utils.llmExecStepWithRetry(pipeline, script: "sed -i '/^# .*<For CUDA 12\\.9>\$/ {s/^# //; n; s/^/# /}' ${LLM_ROOT}/requirements.txt")
+                            sh "cat ${LLM_ROOT}/requirements.txt"
+                        }
+                        // Extra PyTorch CUDA 12.8 install for bare-metal environments on the SBSA platform or with Blackwell GPUs
                         if (values[6]) {
-                            echo "###### Extra PyTorch CUDA 13.0 install Start ######"
-                            trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.0 torchvision --index-url https://download.pytorch.org/whl/cu130")
+                            echo "###### Extra PyTorch CUDA 12.8 install Start ######"
+                            if (key.contains("CU12")) {
+                                trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128")
+                            } else {
+                                trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.8.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128")
+                            }
+                        }
+
+                        // TODO: Remove this after public triton supports CUDA 13.
+                        if (key == "PY312-DLFW" && values[2] == X86_64_TRIPLE) {
+                            trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install https://download.pytorch.org/whl/nightly/pytorch_triton-3.3.1%2Bgitc8757738-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl")
+                            sh """
+                                cd /usr/local/lib/python3.12/dist-packages/ && \
+                                ls -la | grep pytorch_triton && \
+                                mv pytorch_triton-3.3.1+gitc8757738.dist-info triton-3.3.1+gitc8757738.dist-info && \
+                                cd triton-3.3.1+gitc8757738.dist-info && \
+                                echo "Current directory: \$(pwd)" && \
+                                echo "Files in directory:" && \
+                                ls -la && \
+                                sed -i 's/^Name: pytorch-triton/Name: triton/' METADATA && \
+                                sed -i 's|pytorch_triton-3.3.1+gitc8757738.dist-info/|triton-3.3.1+gitc8757738.dist-info/|g' RECORD && \
+                                echo "METADATA after update:" && \
+                                grep "^Name:" METADATA
+                            """
                         }
 
                         def libEnv = []
@@ -2958,9 +3016,9 @@ def launchTestJobs(pipeline, testFilter)
                         }
                         echo "###### Run LLMAPI tests Start ######"
 
-                        def config = VANILLA_CONFIG
+                        def config = key.contains("CU12") ? VANILLA_CONFIG_CU12 : VANILLA_CONFIG
                         if (cpu_arch == AARCH64_TRIPLE) {
-                            config = LINUX_AARCH64_CONFIG
+                            config = key.contains("CU12") ? LINUX_AARCH64_CONFIG_CU12 : LINUX_AARCH64_CONFIG
                         }
                         withEnv(libEnv) {
                             sh "env | sort"
diff --git a/jenkins/controlCCache.groovy b/jenkins/controlCCache.groovy
index 74ebebdeb03..bc34d88e4d0 100644
--- a/jenkins/controlCCache.groovy
+++ b/jenkins/controlCCache.groovy
@@ -1,7 +1,7 @@
 
 import java.lang.InterruptedException
 
-DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202510291120-8621"
+DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508130930-6501"
 
 def createKubernetesPodConfig(image, arch = "amd64")
 {
diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties
index fc097cc9e4e..347265f9860 100644
--- a/jenkins/current_image_tags.properties
+++ b/jenkins/current_image_tags.properties
@@ -13,7 +13,11 @@
 #     images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
 IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm
 
-LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511021230-8838
-LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511021230-8838
-LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202511021230-8838
-LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202511021230-8838
+LLM_DOCKER_IMAGE_12_9=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383
+LLM_SBSA_DOCKER_IMAGE_12_9=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE_12_9=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202509091430-7383
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE_12_9=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202509091430-7383
+LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.08-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511101900-9039
+LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.08-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511101900-9039
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.0-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202511101900-9039
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.0-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202511101900-9039
diff --git a/requirements.txt b/requirements.txt
index 69f9c305ebc..7763317712f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,9 @@
---extra-index-url https://download.pytorch.org/whl/cu130
+--extra-index-url https://download.pytorch.org/whl/cu128
 -c constraints.txt
 accelerate>=1.7.0
 build
 colored
+# cuda-python>=12,<13  # <For CUDA 12.9>
 cuda-python>=13
 diffusers>=0.27.0
 lark
@@ -13,19 +14,24 @@ onnx_graphsurgeon>=0.5.2
 openai
 polygraphy
 psutil
+# nvidia-ml-py>=12,<13  # <For CUDA 12.9>
 nvidia-ml-py>=13
 pulp
 pandas
 h5py==3.12.1
 StrEnum
 sentencepiece>=0.1.99
+# tensorrt~=10.11.0  # <For CUDA 12.9>
 tensorrt~=10.13.0
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 uses 2.9.0a0.
-torch>=2.9.0a0,<=2.9.0
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-08.html#rel-25-08 uses 2.8.0a0.
+# torch>=2.7.1,<=2.8.0a0  # <For CUDA 12.9>
+torch>=2.8.0a0,<=2.8.0
 torchvision
-nvidia-modelopt[torch]~=0.37.0
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 uses 2.27.7
-nvidia-nccl-cu13==2.27.7
+nvidia-modelopt[torch]~=0.33.0
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-08.html#rel-25-08 uses 2.27.7
+# nvidia-nccl-cu12  # <For CUDA 12.9>
+nvidia-nccl-cu13
+# nvidia-cuda-nvrtc-cu12  # <For CUDA 12.9>
 nvidia-cuda-nvrtc
 transformers==4.56.0
 prometheus_client
@@ -33,7 +39,7 @@ prometheus_fastapi_instrumentator
 pydantic>=2.9.1
 pydantic-settings[yaml]
 omegaconf
-pillow
+pillow==10.3.0
 wheel<=0.45.1
 optimum
 # evaluate needs datasets>=2.0.0 which triggers datasets>3.1.0 which is not stable: https://github.com/huggingface/datasets/issues/7467
@@ -64,11 +70,12 @@ ninja
 etcd3 @ git+https://github.com/kragniz/python-etcd3.git@e58a899579ba416449c4e225b61f039457c8072a
 blake3
 soundfile
-triton==3.5.0; platform_machine == "x86_64"
+triton==3.3.1; platform_machine == "x86_64"
 tiktoken
 blobfile
 openai-harmony==0.0.4
 nvidia-cutlass-dsl==4.2.1; python_version >= "3.10"
+numba-cuda>=0.19.0 # WAR for nvbugs/5501820
 plotly
 numexpr<2.14.0 # WAR for attempted use of nonexistent numpy.typing
 partial_json_parser
diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py
index dda7da6d0c4..e2c8f41ec30 100755
--- a/scripts/build_wheel.py
+++ b/scripts/build_wheel.py
@@ -946,6 +946,52 @@ def get_binding_lib(subdirectory, name):
             # and validating python changes in the whl.
             clear_folder(dist_dir)
 
+        # Modify requirements.txt for wheel build based on CUDA version
+        def modify_requirements_for_cuda():
+            """Switch requirements.txt to its CUDA 12.9 dependency set.
+
+            Only acts when the CUDA_VERSION environment variable starts with
+            "12.". Each commented line tagged "<For CUDA 12.9>" is enabled by
+            dropping its first "# ", and the line right after it (the
+            CUDA 13 counterpart) is commented out. The file is rewritten in
+            place.
+
+            Returns:
+                True if requirements.txt was rewritten, False otherwise.
+            """
+            requirements_file = project_dir / "requirements.txt"
+            if not os.environ.get("CUDA_VERSION", "").startswith("12."):
+                return False
+            print(
+                "Detected CUDA 12 environment, modifying requirements.txt for wheel build..."
+            )
+            with open(requirements_file, 'r', encoding='utf-8') as f:
+                lines = f.readlines()
+            modified_lines = []
+            i = 0
+            while i < len(lines):
+                line = lines[i]
+                if "<For CUDA 12.9>" in line and line.strip().startswith("#"):
+                    new_line = line.replace("# ", "", 1)
+                    print(f"Enable CUDA 12.9 dependency: {new_line.strip()}")
+                    modified_lines.append(new_line)
+                    # A tagged line at the very end of the file has no CUDA 13
+                    # counterpart below it; guard against running off the list.
+                    if i + 1 < len(lines):
+                        print(
+                            f"Disable CUDA 13 dependency: # {lines[i + 1].strip()}"
+                        )
+                        modified_lines.append("# " + lines[i + 1])
+                        i += 1
+                else:
+                    modified_lines.append(line)
+                i += 1
+            with open(requirements_file, 'w', encoding='utf-8') as f:
+                f.writelines(modified_lines)
+            return True
+
+        modify_requirements_for_cuda()
+
         build_run(
             f'\"{venv_python}\" -m build {project_dir} --skip-dependency-check --no-isolation --wheel --outdir "{dist_dir}"'
         )
diff --git a/tensorrt_llm/_utils.py b/tensorrt_llm/_utils.py
index 68229e4150d..a3bf1024eae 100644
--- a/tensorrt_llm/_utils.py
+++ b/tensorrt_llm/_utils.py
@@ -1189,12 +1189,7 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> None:
 def torch_pybind11_abi() -> str:
     global TORCH_PYBIND11_ABI
     if TORCH_PYBIND11_ABI is None:
-        if hasattr(torch._C, '_PYBIND11_COMPILER_TYPE'):
-            # Old pybind11 abi string before torch 2.9.0
-            TORCH_PYBIND11_ABI = f"{torch._C._PYBIND11_COMPILER_TYPE}{torch._C._PYBIND11_STDLIB}{torch._C._PYBIND11_BUILD_ABI}"
-        else:
-            # New pybind11 abi string since torch 2.9.0
-            TORCH_PYBIND11_ABI = f"system_libstdcpp_gxx_abi_1xxx_use_cxx11_abi_{int(torch.compiled_with_cxx11_abi())}"
+        TORCH_PYBIND11_ABI = f"{torch._C._PYBIND11_COMPILER_TYPE}{torch._C._PYBIND11_STDLIB}{torch._C._PYBIND11_BUILD_ABI}"  # pre-torch-2.9 ABI string; torch._C attributes are read without a hasattr() guard
     return TORCH_PYBIND11_ABI
 
 
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 7b82594e524..0069e1906d5 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -281,7 +281,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[latency] SKIP (https://nvbugs/5481198)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[throughput] SKIP (https://nvbugs/5481198)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5503479)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5630310)
 full:L20/accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized SKIP (https://nvbugs/5542862)
 full:L20/accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8 SKIP (https://nvbugs/5542862)
 full:L40S/accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized SKIP (https://nvbugs/5542862)
@@ -418,3 +417,4 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKI
 test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/5647825)
 test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-0.8-image] SKIP (https://nvbugs/5644190)
 test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] SKIP (https://nvbugs/5568836)
+disaggregated/test_auto_scaling.py::test_disagg_server_restart[etcd-round_robin] SKIP (https://nvbugs/5633340)
diff --git a/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py b/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py
index 91b906241df..9712830c26c 100644
--- a/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py
+++ b/tests/unittest/_torch/modeling/test_modeling_nemotron_h.py
@@ -343,7 +343,6 @@ def test_nemotron_h_cuda_graph_overlap_scheduler():
         )
 
 
-@pytest.mark.skip(reason="https://nvbugs/5626259")
 def test_nemotron_h_chunked_prefill():
     # Long prompts (~100 tokens) to make sure chunked prefill is enabled
     # (At the time of development, tokens_per_block isn't configurable from the LLM API,
diff --git a/tests/unittest/_torch/thop/parallel/test_mamba2_chunk_ss_update.py b/tests/unittest/_torch/thop/parallel/test_mamba2_chunk_ss_update.py
index 17c28f75fac..0f6a8724c43 100644
--- a/tests/unittest/_torch/thop/parallel/test_mamba2_chunk_ss_update.py
+++ b/tests/unittest/_torch/thop/parallel/test_mamba2_chunk_ss_update.py
@@ -31,7 +31,6 @@
 from tensorrt_llm.llmapi.utils import get_total_gpu_memory
 
 
-@pytest.mark.skip(reason="https://nvbugs/5606178")
 @pytest.mark.parametrize(
     "dim, headdim, ngroups, dstate, req_type, dtype, batch_size, max_seq_len, has_z, remove_padding, paged_cache, use_initial_states",
     # dim parametrization
@@ -363,7 +362,6 @@ def test_mamba2_chunk_scan_selective_state_update(dim, headdim, ngroups, dstate,
                                atol=atol[dtype])
 
 
-@pytest.mark.skip(reason="https://nvbugs/5606178")
 @pytest.mark.parametrize("mamba_chunk_size", [8, 256])
 @pytest.mark.parametrize("seqlens", [
     (16, 2, 8, 13),