
Commit ab4e992

ishwar-raut1 authored and ankan-ban committed
TRT RTX EP changes (microsoft#25015)
### Description

- Don't use the CUDA runtime API to set the device when a user compute stream is already provided.
- Expose an option to set a limit on the maximum shared memory TensorRT can use.
- Fixed compilation issues caused by deprecated APIs.
- Small test fix.

Co-authored-by: Ankan Banerjee <anbanerjee@nvidia.com>
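For context, here is a minimal sketch of how an application might opt into the new behavior from the ONNX Runtime C++ API. The option keys (`device_id`, `has_user_compute_stream`, `user_compute_stream`, `nv_max_shared_mem_size`) come from `nv_provider_options.h` in this commit; the provider name string passed to the generic `AppendExecutionProvider` call and the pointer-as-decimal-string encoding of the stream are assumptions for illustration, not confirmed by this diff.

```cpp
#include <string>
#include <unordered_map>

#include <cuda_runtime_api.h>
#include <onnxruntime_cxx_api.h>

int main() {
  // The app owns its CUDA context and stream; with has_user_compute_stream set,
  // the EP no longer calls cudaSetDevice() on its own.
  cudaSetDevice(0);
  cudaStream_t stream{};
  cudaStreamCreate(&stream);

  Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "trt_rtx_demo"};
  Ort::SessionOptions session_options;

  // Option keys match nv_provider_options.h; provider name and stream encoding are assumptions.
  std::unordered_map<std::string, std::string> provider_options{
      {"device_id", "0"},
      {"has_user_compute_stream", "1"},
      {"user_compute_stream", std::to_string(reinterpret_cast<size_t>(stream))},
      {"nv_max_shared_mem_size", std::to_string(48 * 1024)},  // cap TensorRT tactics at 48 KiB shared memory
  };
  session_options.AppendExecutionProvider("NvTensorRtRtx", provider_options);

  Ort::Session session{env, ORT_TSTR("model.onnx"), session_options};
  // ... bind IO and run inference on `stream` ...
  cudaStreamDestroy(stream);
  return 0;
}
```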
1 parent 8e49420 commit ab4e992

File tree

6 files changed: 29 additions, 32 deletions


include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,7 @@
  * - `kHasUserComputeStream`: Indicates whether a user-provided compute stream is used.
  * - `kUserComputeStream`: Specifies the user-provided compute stream.
  * - `kMaxWorkspaceSize`: Sets the maximum workspace size for GPU memory allocation.
+ * - `kMaxSharedMemSize`: Sets the maximum amount of shared memory that TensorRT kernels are allowed to use.
  * - `kDumpSubgraphs`: Enables or disables dumping of subgraphs for debugging.
  * - `kDetailedBuildLog`: Enables or disables detailed build logs for debugging.
  * - `kProfilesMinShapes`: Specifies the minimum shapes for profiling.
@@ -24,6 +25,7 @@ constexpr const char* kDeviceId = "device_id";
 constexpr const char* kHasUserComputeStream = "has_user_compute_stream";
 constexpr const char* kUserComputeStream = "user_compute_stream";
 constexpr const char* kMaxWorkspaceSize = "nv_max_workspace_size";
+constexpr const char* kMaxSharedMemSize = "nv_max_shared_mem_size";
 constexpr const char* kDumpSubgraphs = "nv_dump_subgraphs";
 constexpr const char* kDetailedBuildLog = "nv_detailed_build_log";
 constexpr const char* kProfilesMinShapes = "nv_profile_min_shapes";

onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc

Lines changed: 20 additions & 27 deletions
@@ -117,7 +117,6 @@ void Impl_Cast(
 }
 }  // namespace cuda
 
-#if NV_TENSORRT_MAJOR >= 10
 void* OutputAllocator::reallocateOutputAsync(char const* /*tensorName*/, void* /*currentMemory*/, uint64_t size,
                                              uint64_t /*alignment*/, cudaStream_t /*stream*/) noexcept {
   // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr
@@ -134,25 +133,6 @@ void* OutputAllocator::reallocateOutputAsync(char const* /*tensorName*/, void* /
   // if cudaMalloc fails, returns nullptr.
   return outputPtr;
 }
-#else
-// Only override this method when TensorRT <= 8.6
-void* OutputAllocator::reallocateOutput(char const* /*tensorName*/, void* /*currentMemory*/, uint64_t size,
-                                        uint64_t /*alignment*/) noexcept {
-  // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr
-  // even for empty tensors, so allocate a dummy byte.
-  size = std::max(size, static_cast<uint64_t>(1));
-  if (size > allocated_size) {
-    cudaFree(outputPtr);
-    outputPtr = nullptr;
-    allocated_size = 0;
-    if (cudaMalloc(&outputPtr, size) == cudaSuccess) {
-      allocated_size = size;
-    }
-  }
-  // if cudaMalloc fails, returns nullptr.
-  return outputPtr;
-}
-#endif
 
 void OutputAllocator::notifyShape(char const* /*tensorName*/, nvinfer1::Dims const& dims) noexcept {
   output_shapes.clear();
@@ -912,6 +892,7 @@ Status BindKernelOutput(Ort::KernelContext& ctx,
 }
 
 NvExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, bool has_user_compute_stream, cudaStream_t stream) {
+  // TODO: figure out if PerThreadContext is used at all. If not, just clean it up.
   if (has_user_compute_stream) {
     CUDA_CALL_THROW(cudaSetDevice(device_id));
     (void)(stream);
@@ -1046,8 +1027,16 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info)
       info_(info),
       device_id_(info.device_id) {
   InitProviderOrtApi();
+
   // TODO(maximlianm) remove this since we should be able to compile an AOT context file without GPU
-  CUDA_CALL_THROW(cudaSetDevice(device_id_));
+
+  if (!info.has_user_compute_stream) {
+    // If the app is passing in a compute stream, it already has initialized cuda and created a context.
+    // Calling cudaSetDevice() will set the default context in the current thread
+    // which may not be compatible with the stream created by the app.
+    CUDA_CALL_THROW(cudaSetDevice(device_id_));
+  }
+
   cudaDeviceProp prop;
   CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
   compute_capability_ = GetComputeCapacity(prop);
@@ -1068,6 +1057,7 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info)
   max_partition_iterations_ = info.max_partition_iterations;
   min_subgraph_size_ = info.min_subgraph_size;
   max_workspace_size_ = info.max_workspace_size;
+  max_shared_mem_size_ = info.max_shared_mem_size;
   dump_subgraphs_ = info.dump_subgraphs;
   weight_stripped_engine_enable_ = info.weight_stripped_engine_enable;
   onnx_model_folder_path_ = info.onnx_model_folder_path;
@@ -2294,6 +2284,9 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
   if (max_workspace_size_ > 0) {
     trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_);
   }
+  if (max_shared_mem_size_ > 0) {
+    trt_config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kTACTIC_SHARED_MEMORY, max_shared_mem_size_);
+  }
   // Only set default compute capabilities if user hasn't explicitly configured them
   constexpr int kDefaultNumComputeCapabilities = 1;  // Default number of compute capabilities for Turing support
   if (trt_config->getNbComputeCapabilities() == 0) {
@@ -2587,7 +2580,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
 #pragma warning(push)
 #pragma warning(disable : 4996)
 #endif
-  size_t mem_size = trt_engine->getDeviceMemorySize();
+  size_t mem_size = trt_engine->getDeviceMemorySizeV2();
 #if defined(_MSC_VER)
 #pragma warning(pop)
 #endif
@@ -2841,7 +2834,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
 #pragma warning(push)
 #pragma warning(disable : 4996)
 #endif
-  size_t mem_size = trt_engine->getDeviceMemorySize();
+  size_t mem_size = trt_engine->getDeviceMemorySizeV2();
 #if defined(_MSC_VER)
 #pragma warning(pop)
 #endif
@@ -2923,7 +2916,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
   if (cuda_graph_enable_ && !IsGraphCaptured(0)) {
     if (IsGraphCaptureAllowed()) {
       CaptureEnd(0);
-      // CUDA work issued to a capturing stream doesnt actually run on the GPU,
+      // CUDA work issued to a capturing stream doesn't actually run on the GPU,
       // so run the captured graph here to actually execute the work.
       ORT_RETURN_IF_ERROR(ReplayGraph(0));
     } else {
@@ -2973,7 +2966,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra
 #pragma warning(push)
 #pragma warning(disable : 4996)
 #endif
-  size_t mem_size = trt_engine->getDeviceMemorySize();
+  size_t mem_size = trt_engine->getDeviceMemorySizeV2();
 #if defined(_MSC_VER)
 #pragma warning(pop)
 #endif
@@ -3155,7 +3148,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra
 #pragma warning(push)
 #pragma warning(disable : 4996)
 #endif
-  size_t mem_size = trt_engine->getDeviceMemorySize();
+  size_t mem_size = trt_engine->getDeviceMemorySizeV2();
 #if defined(_MSC_VER)
 #pragma warning(pop)
 #endif
@@ -3237,7 +3230,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra
   if (cuda_graph_enable_ && !IsGraphCaptured(0)) {
    if (IsGraphCaptureAllowed()) {
      CaptureEnd(0);
-      // CUDA work issued to a capturing stream doesnt actually run on the GPU,
+      // CUDA work issued to a capturing stream doesn't actually run on the GPU,
      // so run the captured graph here to actually execute the work.
      ORT_RETURN_IF_ERROR(ReplayGraph(0));
    } else {
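As a quick reference for what the new `nv_max_shared_mem_size` option does inside the EP, here is a condensed sketch of the two memory-pool limits applied to the TensorRT builder config in `CreateNodeComputeInfoFromGraph` above. The helper name and its standalone framing are illustrative only; the actual logic lives inline in the EP as shown in the hunk.

```cpp
#include <NvInfer.h>

// Illustrative helper (not EP code): apply the provider-controlled limits to a
// TensorRT builder config, mirroring the calls in CreateNodeComputeInfoFromGraph.
void ApplyMemoryLimits(nvinfer1::IBuilderConfig& config,
                       size_t max_workspace_size,     // nv_max_workspace_size
                       size_t max_shared_mem_size) {  // nv_max_shared_mem_size (new in this commit)
  if (max_workspace_size > 0) {
    // Caps the scratch GPU memory TensorRT may allocate for the engine.
    config.setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size);
  }
  if (max_shared_mem_size > 0) {
    // Caps the shared memory that TensorRT kernel tactics are allowed to use,
    // useful when the engine must co-exist with the app's own kernels on the GPU.
    config.setMemoryPoolLimit(nvinfer1::MemoryPoolType::kTACTIC_SHARED_MEMORY, max_shared_mem_size);
  }
}
```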

onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h

Lines changed: 2 additions & 4 deletions
@@ -78,11 +78,8 @@ using unique_pointer = std::unique_ptr<T, TensorrtInferDeleter>;
 //
 class OutputAllocator : public nvinfer1::IOutputAllocator {
  public:
-#if NV_TENSORRT_MAJOR >= 10
   void* reallocateOutputAsync(char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment, cudaStream_t stream) noexcept override;
-#else
-  void* reallocateOutput(char const* tensorName, void* currentMemory, uint64_t size, uint64_t alignment) noexcept override;
-#endif
+
   void notifyShape(char const* tensorName, nvinfer1::Dims const& dims) noexcept override;
 
   void* getBuffer() {
@@ -238,6 +235,7 @@ class NvExecutionProvider : public IExecutionProvider {
   int max_partition_iterations_ = 1000;
   size_t min_subgraph_size_ = 1;
   size_t max_workspace_size_ = 0;
+  size_t max_shared_mem_size_ = 0;
   bool force_sequential_engine_build_ = false;
   bool dump_subgraphs_ = false;
   bool engine_cache_enable_ = false;

onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.cc

Lines changed: 2 additions & 0 deletions
@@ -41,6 +41,7 @@ NvExecutionProviderInfo NvExecutionProviderInfo::FromProviderOptions(const Provi
             return Status::OK();
           })
       .AddAssignmentToReference(nv::provider_option_names::kMaxWorkspaceSize, info.max_workspace_size)
+      .AddAssignmentToReference(nv::provider_option_names::kMaxSharedMemSize, info.max_shared_mem_size)
       .AddAssignmentToReference(nv::provider_option_names::kDumpSubgraphs, info.dump_subgraphs)
       .AddAssignmentToReference(nv::provider_option_names::kDetailedBuildLog, info.detailed_build_log)
       .AddAssignmentToReference(nv::provider_option_names::kProfilesMinShapes, info.profile_min_shapes)
@@ -90,6 +91,7 @@ ProviderOptions NvExecutionProviderInfo::ToProviderOptions(const NvExecutionProv
       {nv::provider_option_names::kHasUserComputeStream, MakeStringWithClassicLocale(info.has_user_compute_stream)},
       {nv::provider_option_names::kUserComputeStream, MakeStringWithClassicLocale(reinterpret_cast<size_t>(info.user_compute_stream))},
       {nv::provider_option_names::kMaxWorkspaceSize, MakeStringWithClassicLocale(info.max_workspace_size)},
+      {nv::provider_option_names::kMaxSharedMemSize, MakeStringWithClassicLocale(info.max_shared_mem_size)},
       {nv::provider_option_names::kDumpSubgraphs, MakeStringWithClassicLocale(info.dump_subgraphs)},
       {nv::provider_option_names::kDetailedBuildLog, MakeStringWithClassicLocale(info.detailed_build_log)},
       {nv::provider_option_names::kProfilesMinShapes, MakeStringWithClassicLocale(info.profile_min_shapes)},

onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.h

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ struct NvExecutionProviderInfo {
   int max_partition_iterations{1000};
   int min_subgraph_size{1};
   size_t max_workspace_size{0};
+  size_t max_shared_mem_size{0};
   bool dump_subgraphs{false};
   std::string engine_cache_path{""};
   bool weight_stripped_engine_enable{false};

onnxruntime/test/providers/nv_tensorrt_rtx/nv_basic_test.cc

Lines changed: 2 additions & 1 deletion
@@ -365,7 +365,8 @@ TEST(NvExecutionProviderTest, ContextEmbedAndReloadDataDynamic) {
 TYPED_TEST(NvExecutionProviderTest, IOTypeTests) {
   std::string dtype_name = this->getTypeAsName();
   ASSERT_FALSE(dtype_name.empty());
-  PathString model_name = ORT_TSTR("nv_execution_provider_" + dtype_name + ".onnx");
+  const std::string model_name_str = "nv_execution_provider_" + dtype_name + ".onnx";
+  const PathString model_name = ToPathString(model_name_str);
   std::string graph_name = "test" + dtype_name;
   std::vector<int> dims = {1, -1, -1};
