NVIDIA
diff --git a/‎.gitignore‎
Lines changed: 0 additions & 1 deletion b/‎.gitignore‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎CHANGELOG.md‎
Lines changed: 320 additions & 187 deletions b/‎CHANGELOG.md‎
Lines changed: 320 additions & 187 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 11 additions & 25 deletions b/‎CMakeLists.txt‎
Lines changed: 11 additions & 25 deletions
diff --git a/‎README.md‎
Lines changed: 168 additions & 147 deletions b/‎README.md‎
Lines changed: 168 additions & 147 deletions
diff --git a/‎VERSION‎
Lines changed: 1 addition & 1 deletion b/‎VERSION‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cmake/toolchains/cmake_aarch64_cross.toolchain‎
Lines changed: 2 additions & 0 deletions b/‎cmake/toolchains/cmake_aarch64_cross.toolchain‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎demo/BERT/README.md‎
Lines changed: 345 additions & 313 deletions b/‎demo/BERT/README.md‎
Lines changed: 345 additions & 313 deletions
diff --git a/‎demo/DeBERTa/README.md‎
Lines changed: 3 additions & 3 deletions b/‎demo/DeBERTa/README.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎demo/Diffusion/.gitignore‎
Lines changed: 5 additions & 3 deletions b/‎demo/Diffusion/.gitignore‎
Lines changed: 5 additions & 3 deletions
diff --git a/‎demo/Diffusion/README.md‎
Lines changed: 40 additions & 53 deletions b/‎demo/Diffusion/README.md‎
Lines changed: 40 additions & 53 deletions
@@ -3,7 +3,6 @@ build/
 /demo/BERT/engines
 /demo/BERT/squad/*.json
 /docker/jetpack_files/*
-*.nvmk
 *.sln
 *.vcxproj
 externals/
 
@@ -1,5 +1,5 @@
 #
-# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -176,43 +176,29 @@ set(CUDA_LIBRARIES ${CUDART_LIB})
 if (DEFINED GPU_ARCHS)
   message(STATUS "GPU_ARCHS defined as ${GPU_ARCHS}. Generating CUDA code for SM ${GPU_ARCHS}")
   separate_arguments(GPU_ARCHS)
+  foreach(SM IN LISTS GPU_ARCHS)  # mirror every user-requested SM into CMAKE_CUDA_ARCHITECTURES
+    list(APPEND CMAKE_CUDA_ARCHITECTURES ${SM})  # must dereference: a bare SM appends the literal text "SM"
+  endforeach()
 else()
-  list(APPEND GPU_ARCHS
-      75
-    )
-
-  find_file(IS_L4T_NATIVE nv_tegra_release PATHS /env/)
-  set (IS_L4T_CROSS "False")
-  if (DEFINED ENV{IS_L4T_CROSS})
-    set(IS_L4T_CROSS $ENV{IS_L4T_CROSS})
+  list(APPEND CMAKE_CUDA_ARCHITECTURES 72 75 80 86 87 89 90)
+  
+  if(CUDA_VERSION VERSION_GREATER_EQUAL 12.8)
+      list(APPEND CMAKE_CUDA_ARCHITECTURES 100 120)
   endif()
 
-  if (IS_L4T_NATIVE OR ${IS_L4T_CROSS} STREQUAL "True")
-    # Only Orin (SM87) supported
-    list(APPEND GPU_ARCHS 87)
-  endif()
-
-  if (CUDA_VERSION VERSION_GREATER_EQUAL 11.0)
-    # Ampere GPU (SM80) support is only available in CUDA versions > 11.0
-    list(APPEND GPU_ARCHS 80)
-  endif()
-  if (CUDA_VERSION VERSION_GREATER_EQUAL 11.1)
-    list(APPEND GPU_ARCHS 86)
-  endif()
-
-  message(STATUS "GPU_ARCHS is not defined. Generating CUDA code for default SMs: ${GPU_ARCHS}")
+  message(STATUS "GPU_ARCHS is not defined. Generating CUDA code for default SMs: ${CMAKE_CUDA_ARCHITECTURES}")
 endif()
 set(BERT_GENCODES)
 # Generate SASS for each architecture
-foreach(arch ${GPU_ARCHS})
+foreach(arch ${CMAKE_CUDA_ARCHITECTURES})
     if (${arch} GREATER_EQUAL 75)
         set(BERT_GENCODES "${BERT_GENCODES} -gencode arch=compute_${arch},code=sm_${arch}")
     endif()
     set(GENCODES "${GENCODES} -gencode arch=compute_${arch},code=sm_${arch}")
 endforeach()
 
 # Generate PTX for the last architecture in the list.
-list(GET GPU_ARCHS -1 LATEST_SM)
+list(GET CMAKE_CUDA_ARCHITECTURES -1 LATEST_SM)
 set(GENCODES "${GENCODES} -gencode arch=compute_${LATEST_SM},code=compute_${LATEST_SM}")
 if (${LATEST_SM} GREATER_EQUAL 75)
     set(BERT_GENCODES "${BERT_GENCODES} -gencode arch=compute_${LATEST_SM},code=compute_${LATEST_SM}")
 
@@ -1 +1 @@
-10.8.0.43
+10.9.0.34
@@ -53,3 +53,5 @@ set(CMAKE_CUDA_COMPILER_FORCED TRUE)
 set(CUDA_LIBS -L${CUDA_ROOT}/lib)
 
 set(ADDITIONAL_PLATFORM_LIB_FLAGS ${CUDA_LIBS} -lcublas -lcudart -lstdc++ -lm)
+
+link_directories(${CUDA_ROOT}/lib)
@@ -75,7 +75,7 @@ Note that the performance gap between BERT's self-attention and DeBERTa's disent
 ## Environment Setup
 It is recommended to use docker for reproducing the following steps. Follow the setup steps in TensorRT OSS [README](https://github.com/NVIDIA/TensorRT#setting-up-the-build-environment) to build and launch the container and build OSS:
 
-**Example: Ubuntu 20.04 on x86-64 with cuda-12.5 (default)**
+**Example: Ubuntu 20.04 on x86-64 with cuda-12.8 (default)**
 ```bash
 # Download this TensorRT OSS repo
 git clone -b main https://github.com/nvidia/TensorRT TensorRT
@@ -84,10 +84,10 @@ git submodule update --init --recursive
 
 ## at root of TensorRT OSS
 # build container
-./docker/build.sh --file docker/ubuntu-20.04.Dockerfile --tag tensorrt-ubuntu20.04-cuda12.5
+./docker/build.sh --file docker/ubuntu-20.04.Dockerfile --tag tensorrt-ubuntu20.04-cuda12.8
 
 # launch container
-./docker/launch.sh --tag tensorrt-ubuntu20.04-cuda12.5 --gpus all
+./docker/launch.sh --tag tensorrt-ubuntu20.04-cuda12.8 --gpus all
 
 ## now inside container
 # build OSS (only required for pre-8.4.3 TensorRT versions)
 
@@ -1,4 +1,6 @@
 __pycache__/
-onnx/*.onnx
-engine/*.plan
-output/*.png
+onnx/
+engine/
+output/
+pytorch_model/
+artifacts_cache/