Commit e22be64

Added initial implementation
1 parent a93266a commit e22be64

9 files changed: +157 -10 lines changed

core/runtime/TRTEngine.cpp

Lines changed: 31 additions & 6 deletions
@@ -61,7 +61,8 @@ TRTEngine::TRTEngine(
     const Platform& target_platform,
     bool hardware_compatible,
     bool requires_output_allocator,
-    const std::string& serialized_metadata)
+    const std::string& serialized_metadata,
+    const ResourceAllocationStrategy& resource_allocation_strategy)
     : TRTEngine(
           "deserialized_trt",
           serialized_engine,
@@ -71,7 +72,8 @@ TRTEngine::TRTEngine(
           target_platform,
           hardware_compatible,
           requires_output_allocator,
-          serialized_metadata) {}
+          serialized_metadata,
+          resource_allocation_strategy) {}
 
 TRTEngine::TRTEngine(std::vector<std::string> serialized_info)
     : TRTEngine(
@@ -83,7 +85,8 @@ TRTEngine::TRTEngine(std::vector<std::string> serialized_info)
           Platform(serialized_info[TARGET_PLATFORM_IDX]),
           static_cast<bool>(std::stoi(serialized_info[HW_COMPATIBLE_IDX])),
           static_cast<bool>(std::stoi(serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX])),
-          serialized_info[SERIALIZED_METADATA_IDX]) {}
+          serialized_info[SERIALIZED_METADATA_IDX],
+          resource_allocation_strategy_from_string(serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX])) {}
 
 TRTEngine::TRTEngine(
     const std::string& mod_name,
@@ -94,7 +97,8 @@ TRTEngine::TRTEngine(
     const Platform& target_platform,
     bool hardware_compatible,
     bool requires_output_allocator,
-    const std::string& serialized_metadata) {
+    const std::string& serialized_metadata,
+    const ResourceAllocationStrategy& resource_allocation_strategy) {
   TORCHTRT_CHECK(
       is_supported_on_current_platform(target_platform),
       "This engine was not built to run on this platform (built for: " << target_platform << ", current platform: "
@@ -124,7 +128,12 @@ TRTEngine::TRTEngine(
     cuda_engine->setWeightStreamingBudgetV2(budget_bytes);
   }
 
-  exec_ctx = make_trt(cuda_engine->createExecutionContext());
+  if (this->resource_allocation_strategy == ResourceAllocationStrategy::kDynamic) {
+    this->exec_ctx =
+        make_trt(cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kON_PROFILE_CHANGE));
+  } else {
+    this->exec_ctx = make_trt(cuda_engine->createExecutionContext());
+  }
   TORCHTRT_CHECK((exec_ctx.get() != nullptr), "Unable to create TensorRT execution context");
 
   runtime_states.old_cudagraphs = CUDAGRAPHS_MODE;
@@ -436,7 +445,8 @@ FlattenedState TRTEngine::__obj_flatten__() {
       std::tuple("hardware_compatible", serialized_info[HW_COMPATIBLE_IDX]),
       std::tuple("serialized_metadata", serialized_info[SERIALIZED_METADATA_IDX]),
      std::tuple("requires_output_allocator", serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX]),
-      std::tuple("target_platform", serialized_info[TARGET_PLATFORM_IDX]));
+      std::tuple("target_platform", serialized_info[TARGET_PLATFORM_IDX]),
+      std::tuple("resource_allocation_strategy", serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX]));
 }
 
 std::vector<std::string> TRTEngine::serialize() {
@@ -459,6 +469,8 @@ std::vector<std::string> TRTEngine::serialize() {
   serialized_info[REQUIRES_OUTPUT_ALLOCATOR_IDX] = this->requires_output_allocator ? "1" : "0";
   serialized_info[SERIALIZED_METADATA_IDX] = this->serialized_metadata;
   serialized_info[TARGET_PLATFORM_IDX] = this->target_platform.serialize();
+  serialized_info[RESOURCE_ALLOCATION_STRATEGY_IDX] =
+      resource_allocation_strategy_to_string(this->resource_allocation_strategy);
 
   return serialized_info;
 }
@@ -467,6 +479,19 @@ void TRTEngine::reset_captured_graph() {
   cudagraph.reset();
 }
 
+void TRTEngine::set_resource_allocation_strategy(TRTEngine::ResourceAllocationStrategy new_strategy) {
+  if (new_strategy != this->resource_allocation_strategy) {
+    this->resource_allocation_strategy = new_strategy;
+    if (this->resource_allocation_strategy == TRTEngine::ResourceAllocationStrategy::kDynamic) {
+      std::cout << "Setting resource allocation strategy to dynamic" << std::endl;
+      this->exec_ctx = make_trt(cuda_engine->createExecutionContext());
+    } else {
+      this->exec_ctx = make_trt(
+          cuda_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kON_PROFILE_CHANGE));
+    }
+  }
+}
+
 } // namespace runtime
 } // namespace core
 } // namespace torch_tensorrt
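
For reference, the branch added to the constructor maps the engine-level strategy onto TensorRT's execution-context allocation strategies: the default context preallocates the engine-wide maximum activation memory, while kON_PROFILE_CHANGE sizes it for the active optimization profile. A minimal Python sketch of the same choice, assuming TensorRT 10 bindings that expose ExecutionContextAllocationStrategy:

import tensorrt as trt

def make_context(engine, dynamic):
    # Mirrors the branch added to the TRTEngine constructor above.
    if dynamic:
        # Size activation memory for the active profile rather than the
        # engine-wide maximum (TensorRT's ON_PROFILE_CHANGE strategy).
        return engine.create_execution_context(
            trt.ExecutionContextAllocationStrategy.ON_PROFILE_CHANGE
        )
    # Default strategy: the full device memory is reserved at creation time.
    return engine.create_execution_context()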

core/runtime/TRTEngine.h

Lines changed: 13 additions & 3 deletions
@@ -29,7 +29,8 @@ using FlattenedState = std::tuple<
     std::tuple<std::string, std::string>, // HW compatibility
     std::tuple<std::string, std::string>, // requires_output_allocator
     std::tuple<std::string, std::string>, // serialized metadata
-    std::tuple<std::string, std::string>>; // Platform
+    std::tuple<std::string, std::string>, // Platform
+    std::tuple<std::string, std::string>>; // Resource Allocation Strategy
 
 struct TorchTRTRuntimeStates {
   // Indicates whether CUDAGraphs were enabled in the previous execute_engine
@@ -98,6 +99,8 @@ class DynamicOutputAllocator : public nvinfer1::IOutputAllocator {
 };
 
 struct TRTEngine : torch::CustomClassHolder {
+  // Resource Allocation Strategy
+  enum ResourceAllocationStrategy { kStatic, kDynamic };
   // Each engine needs it's own runtime object
   std::shared_ptr<nvinfer1::IRuntime> rt;
   std::shared_ptr<nvinfer1::ICudaEngine> cuda_engine;
@@ -128,7 +131,9 @@ struct TRTEngine : torch::CustomClassHolder {
       const Platform& target_platform = get_current_platform(),
       bool hardware_compatible = false,
       bool requires_output_allocator = false,
-      const std::string& serialized_metadata = "");
+      const std::string& serialized_metadata = "",
+      const TRTEngine::ResourceAllocationStrategy& resource_allocation_strategy =
+          TRTEngine::ResourceAllocationStrategy::kStatic);
 
   TRTEngine(std::vector<std::string> serialized_info);
 
@@ -141,7 +146,9 @@ struct TRTEngine : torch::CustomClassHolder {
       const Platform& target_platform = get_current_platform(),
       bool hardware_compatible = false,
       bool requires_output_allocator = false,
-      const std::string& serialized_metadata = "");
+      const std::string& serialized_metadata = "",
+      const TRTEngine::ResourceAllocationStrategy& resource_allocation_strategy =
+          TRTEngine::ResourceAllocationStrategy::kStatic);
 
   TRTEngine& operator=(const TRTEngine& other);
   std::string to_str() const;
@@ -200,6 +207,9 @@ struct TRTEngine : torch::CustomClassHolder {
   std::string cuda_graph_debug_path;
   std::mutex mu;
   std::unique_ptr<TRTEngineProfiler> trt_engine_profiler;
+  ResourceAllocationStrategy resource_allocation_strategy = kStatic;
+  void set_resource_allocation_strategy(ResourceAllocationStrategy new_strategy);
+  ResourceAllocationStrategy get_resource_allocation_strategy();
 };
 
 } // namespace runtime
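
The enum's serialized form is handled by string helpers added in register_jit_hooks.cpp below; a Python mirror of that round trip, as a sketch using names from this commit:

from enum import Enum

class ResourceAllocationStrategy(Enum):
    kStatic = 0
    kDynamic = 1

def to_string(strategy):
    # Mirrors resource_allocation_strategy_to_string.
    return "kDynamic" if strategy is ResourceAllocationStrategy.kDynamic else "kStatic"

def from_string(s):
    # Mirrors resource_allocation_strategy_from_string: anything other than
    # "kDynamic" falls back to kStatic.
    return ResourceAllocationStrategy.kDynamic if s == "kDynamic" else ResourceAllocationStrategy.kStatic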

core/runtime/execute_engine.cpp

Lines changed: 6 additions & 0 deletions
@@ -201,6 +201,12 @@ void create_output_allocator(c10::intrusive_ptr<TRTEngine> compiled_engine) {
 }
 
 std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
+  torch::Tensor dynamic_workspace;
+  if (compiled_engine->resource_allocation_strategy == TRTEngine::ResourceAllocationStrategy::kDynamic) {
+    dynamic_workspace = torch::empty(compiled_engine->cuda_engine->getDeviceMemorySizeV2(), {torch::kCUDA});
+    compiled_engine->exec_ctx->setDeviceMemory(dynamic_workspace.data_ptr());
+  }
+
   auto run_standard_execution = [&]() {
     bool cudagraphs_enabled = (CUDAGRAPHS_MODE == SUBGRAPH_CUDAGRAPHS);
     bool shape_changed = _validate_shapes(inputs, compiled_engine);
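
In the kDynamic path above, the workspace tensor is allocated per call and released once it goes out of scope after execution. The same idea in Python, as a sketch assuming TensorRT 10 bindings (a device_memory_size_v2 attribute on the engine and a writable device_memory property on the context):

import torch

def attach_per_call_workspace(cuda_engine, exec_ctx):
    # Size the activation workspace from the engine and lend it to the
    # execution context; the tensor must stay alive until execution finishes.
    workspace = torch.empty(
        cuda_engine.device_memory_size_v2, dtype=torch.uint8, device="cuda"
    )
    exec_ctx.device_memory = workspace.data_ptr()
    return workspace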

core/runtime/register_jit_hooks.cpp

Lines changed: 23 additions & 0 deletions
@@ -22,6 +22,21 @@ std::string serialize_bindings(const std::vector<std::string>& bindings) {
   return serialized_binding_info;
 }
 
+std::string resource_allocation_strategy_to_string(TRTEngine::ResourceAllocationStrategy strategy) {
+  if (strategy == TRTEngine::ResourceAllocationStrategy::kDynamic) {
+    return std::string("kDynamic");
+  } else {
+    return std::string("kStatic");
+  }
+}
+
+TRTEngine::ResourceAllocationStrategy resource_allocation_strategy_from_string(const std::string& str) {
+  if (str == "kDynamic")
+    return TRTEngine::ResourceAllocationStrategy::kDynamic;
+  else
+    return TRTEngine::ResourceAllocationStrategy::kStatic;
+}
+
 static const std::string sym_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; //=
 std::string base64_encode(const std::string& in) {
   std::string out;
@@ -90,6 +105,13 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion =
         .def("get_engine_layer_info", &TRTEngine::get_engine_layer_info)
         .def("infer_outputs", &TRTEngine::infer_outputs)
         .def("reset_captured_graph", &TRTEngine::reset_captured_graph)
+        .def(
+            "_use_dynamically_allocated_resources",
+            [](const c10::intrusive_ptr<TRTEngine>& self, bool dynamic) -> void {
+              self->set_resource_allocation_strategy(
+                  dynamic ? TRTEngine::ResourceAllocationStrategy::kDynamic
+                          : TRTEngine::ResourceAllocationStrategy::kStatic);
+            })
         .def_readwrite("use_pre_allocated_outputs", &TRTEngine::use_pre_allocated_outputs)
         .def_readwrite("use_output_allocator_outputs", &TRTEngine::use_output_allocator_outputs)
         .def_property(
@@ -135,6 +157,7 @@ TORCH_LIBRARY(tensorrt, m) {
   m.def("TARGET_PLATFORM_IDX", []() -> int64_t { return TARGET_PLATFORM_IDX; });
   m.def("REQUIRES_OUTPUT_ALLOCATOR_IDX", []() -> int64_t { return REQUIRES_OUTPUT_ALLOCATOR_IDX; });
   m.def("SERIALIZATION_LEN", []() -> int64_t { return SERIALIZATION_LEN; });
+  m.def("RESOURCE_ALLOCATION_STRATEGY_IDX", []() -> int64_t { return RESOURCE_ALLOCATION_STRATEGY_IDX; });
   m.def("_platform_linux_x86_64", []() -> std::string {
     auto it = get_platform_name_map().find(Platform::PlatformEnum::kLINUX_X86_64);
     return it->second;
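
From Python, the binding registered above is reachable through the engine handle held by a TorchTensorRTModule (see the use_dynamically_allocated_resources wrapper in the _TorchTensorRTModule.py hunk below); a usage sketch, assuming trt_module is a module whose engine has been set up:

trt_module.engine._use_dynamically_allocated_resources(True)   # switch to kDynamic
trt_module.engine._use_dynamically_allocated_resources(False)  # back to kStatic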

core/runtime/runtime.h

Lines changed: 4 additions & 0 deletions
@@ -38,13 +38,17 @@ typedef enum {
   SERIALIZED_METADATA_IDX,
   TARGET_PLATFORM_IDX,
   REQUIRES_OUTPUT_ALLOCATOR_IDX,
+  RESOURCE_ALLOCATION_STRATEGY_IDX,
   SERIALIZATION_LEN, // NEVER USED FOR DATA, USED TO DETERMINE LENGTH OF SERIALIZED INFO
 } SerializedInfoIndex;
 
 std::string base64_encode(const std::string& in);
 std::string base64_decode(const std::string& in);
 std::string serialize_bindings(const std::vector<std::string>& bindings);
 
+std::string resource_allocation_strategy_to_string(TRTEngine::ResourceAllocationStrategy strategy);
+TRTEngine::ResourceAllocationStrategy resource_allocation_strategy_from_string(const std::string& str);
+
 c10::optional<RTDevice> get_most_compatible_device(
     const RTDevice& target_device,
     const RTDevice& curr_device = RTDevice(),
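
Since RESOURCE_ALLOCATION_STRATEGY_IDX is inserted before SERIALIZATION_LEN, the serialized-info vector grows by one slot and the index getters registered in register_jit_hooks.cpp shift to match. A quick consistency check, assuming the torch_tensorrt runtime library is loaded so the ops are registered:

import torch
import torch_tensorrt  # registers the torch.ops.tensorrt index getters

assert torch.ops.tensorrt.RESOURCE_ALLOCATION_STRATEGY_IDX() == 10
assert torch.ops.tensorrt.SERIALIZATION_LEN() == 11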
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+# %%
+import numpy as np
+import torch
+import torch_tensorrt as torch_trt
+import torchvision.models as models
+from diffusers import DiffusionPipeline
+
+np.random.seed(5)
+torch.manual_seed(5)
+inputs = [torch.rand((100, 3, 224, 224)).to("cuda")]
+
+settings = {
+    "ir": "dynamo",
+    "use_python_runtime": False,
+    "enabled_precisions": {torch.float32},
+    "immutable_weights": False,
+}
+
+model = models.resnet152(pretrained=True).eval().to("cuda")
+compiled_module = torch_trt.compile(model, inputs=inputs, **settings)
+print((torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]) / 1024**3)
+compiled_module(*inputs)
+
+breakpoint()
+with torch_trt.dynamo.runtime.ResourceAllocatorContext(compiled_module):
+    print(
+        "Memory used (GB):",
+        (torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]) / 1024**3,
+    )
+    breakpoint()
+    compiled_module(*inputs)
+    print(
+        "Memory used (GB):",
+        (torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]) / 1024**3,
+    )
+breakpoint()
py/torch_tensorrt/dynamo/runtime/_ResourceAllocator.py (new file; path inferred from the import in __init__.py below)

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+from typing import Any
+
+import torch
+
+
+class ResourceAllocatorContext(torch.nn.Module):  # type: ignore[misc]
+    """
+    ResourceAllocatorContext is a context manager module that temporarily enables dynamic resource allocation
+    for all TRT submodules of the given compiled_module. When entering the context,
+    it sets these submodules to use dynamically allocated resources. Upon exiting, it restores them to their
+    original (static) resource allocation mode.
+    """
+
+    def __init__(
+        self,
+        compiled_module: torch.nn.Module,
+    ) -> None:
+        super(ResourceAllocatorContext, self).__init__()
+        self.compiled_module = compiled_module
+
+    def __enter__(self) -> None:
+        print("Entering resource allocator context")
+        for name, submodule in self.compiled_module.named_modules():
+            if "_run_on_acc" in name:
+                submodule.use_dynamically_allocated_resources(dynamic=True)
+
+    def __exit__(self, exc_type: Any, exc_value: Any, exc_tb: Any) -> None:
+        for name, submodule in self.compiled_module.named_modules():
+            if "_run_on_acc" in name:
+                submodule.use_dynamically_allocated_resources(dynamic=False)
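
A minimal usage sketch of the context manager, mirroring the example script above; compiled_module and inputs are assumed to come from torch_trt.compile:

import torch_tensorrt as torch_trt

with torch_trt.dynamo.runtime.ResourceAllocatorContext(compiled_module):
    # Inside the context, every "_run_on_acc" TRT submodule allocates its
    # activation memory per call; on exit they revert to static allocation.
    outputs = compiled_module(*inputs)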

py/torch_tensorrt/dynamo/runtime/_TorchTensorRTModule.py

Lines changed: 11 additions & 1 deletion
@@ -50,7 +50,10 @@
 REQUIRES_OUTPUT_ALLOCATOR_IDX = (
     torch.ops.tensorrt.REQUIRES_OUTPUT_ALLOCATOR_IDX()
 )  # 9
-SERIALIZATION_LEN = torch.ops.tensorrt.SERIALIZATION_LEN()  # 10
+RESOURCE_ALLOCATION_STRATEGY_IDX = (
+    torch.ops.tensorrt.RESOURCE_ALLOCATION_STRATEGY_IDX()
+)  # 10
+SERIALIZATION_LEN = torch.ops.tensorrt.SERIALIZATION_LEN()  # 11
 
 
 @for_all_methods(needs_torch_tensorrt_runtime)
@@ -139,6 +142,7 @@ def __init__(
         self.serialized_engine = serialized_engine
         self.engine = None
         self.requires_output_allocator = requires_output_allocator
+        self.resource_allocation_strategy = 0  # Default to static allocation TODO: Make this configurable with the context manager
 
         if (
             serialized_engine
@@ -184,6 +188,9 @@ def _pack_engine_info(self) -> List[str | bytes]:
         engine_info[REQUIRES_OUTPUT_ALLOCATOR_IDX] = str(
             int(self.requires_output_allocator)
         )
+        engine_info[RESOURCE_ALLOCATION_STRATEGY_IDX] = str(
+            int(self.resource_allocation_strategy)
+        )
 
         return engine_info
 
@@ -212,6 +219,9 @@ def set_device_memory_budget(self, budget_bytes: int) -> int:
     def _reset_captured_graph(self) -> None:
         self.engine.reset_captured_graph()
 
+    def use_dynamically_allocated_resources(self, dynamic: bool = False) -> None:
+        self.engine._use_dynamically_allocated_resources(dynamic)
+
     def setup_engine(self) -> None:
         """
         Setup engine for a module which has deferred engine setup.

py/torch_tensorrt/dynamo/runtime/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -2,6 +2,9 @@
 from torch_tensorrt.dynamo.runtime._PythonTorchTensorRTModule import (  # noqa: F401
     PythonTorchTensorRTModule,
 )
+from torch_tensorrt.dynamo.runtime._ResourceAllocator import (  # noqa: F401
+    ResourceAllocatorContext,
+)
 from torch_tensorrt.dynamo.runtime._TorchTensorRTModule import (  # noqa: F401
     TorchTensorRTModule,
 )
