
Commit 6499977

ishwar-raut1 and gaugarg-nv authored and committed
Python GPU IO Bindings for NVIDIA (microsoft#25776)
### Description

1. A small change to use the shared allocator in the Python binding.
2. Remove FP64 support from the EP.

### Motivation and Context

The Python GPU IO binding is necessary for performance; this change enables the shared allocator for GPU allocations. FP64 was previously run through FP32 inference, so removing it aligns the EP with what TensorRT RTX supports.

---------

Co-authored-by: Gaurav Garg <gaugarg@nvidia.com>
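For reference, a minimal sketch of how the GPU IO binding is typically driven from the Python API; the provider name, model path, tensor names, and the `cuda` device string below are illustrative assumptions rather than part of this change:

```python
# Hedged sketch of ONNX Runtime's Python IOBinding API with a GPU execution provider.
# "NvTensorRTRTXExecutionProvider", "model.onnx", "input", and "output" are placeholders.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession(
    "model.onnx",
    providers=["NvTensorRTRTXExecutionProvider"],  # assumes an ORT build that registers this EP
)

# FP64 is no longer down-cast by the EP, so cast inputs to float32 up front.
x = np.random.rand(1, 3, 224, 224).astype(np.float32)

io_binding = sess.io_binding()

# Copy the input into device memory owned by ORT, then bind it by name.
x_ortvalue = ort.OrtValue.ortvalue_from_numpy(x, "cuda", 0)
io_binding.bind_ortvalue_input("input", x_ortvalue)

# Let ORT allocate the output on the device; this is the path that goes through the shared allocator.
io_binding.bind_output("output", "cuda", 0)

sess.run_with_iobinding(io_binding)
result = io_binding.copy_outputs_to_cpu()[0]
```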
1 parent 48f882f commit 6499977

File tree

4 files changed: +482 -141 lines changed


onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc

Lines changed: 13 additions & 138 deletions
@@ -20,7 +20,6 @@
 #include "onnx_ctx_model_helper.h"
 #include "core/providers/cuda/shared_inc/cuda_call.h"
 #include "core/providers/cuda/cuda_graph.h"
-#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
 #include "core/session/allocator_adapters.h"
 #include "cuda_runtime_api.h"
 #include "core/common/parse_string.h"
@@ -85,40 +84,6 @@ struct ShutdownProtobuf {
 
 namespace onnxruntime {
 
-namespace cuda {
-template <>
-void Impl_Cast(
-    cudaStream_t stream,
-    const int64_t* input_data, int32_t* output_data,
-    size_t count) {
-  return g_host->cuda__Impl_Cast(static_cast<void*>(stream), input_data, output_data, count);
-}
-
-template <>
-void Impl_Cast(
-    cudaStream_t stream,
-    const int32_t* input_data, int64_t* output_data,
-    size_t count) {
-  return g_host->cuda__Impl_Cast(static_cast<void*>(stream), input_data, output_data, count);
-}
-
-template <>
-void Impl_Cast(
-    cudaStream_t stream,
-    const double* input_data, float* output_data,
-    size_t count) {
-  return g_host->cuda__Impl_Cast(static_cast<void*>(stream), input_data, output_data, count);
-}
-
-template <>
-void Impl_Cast(
-    cudaStream_t stream,
-    const float* input_data, double* output_data,
-    size_t count) {
-  return g_host->cuda__Impl_Cast(static_cast<void*>(stream), input_data, output_data, count);
-}
-}  // namespace cuda
-
 void* OutputAllocator::reallocateOutputAsync(char const* /*tensorName*/, void* /*currentMemory*/, uint64_t size,
                                              uint64_t /*alignment*/, cudaStream_t /*stream*/) noexcept {
   // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr
@@ -372,51 +337,19 @@ bool ApplyProfileShapesFromProviderOptions(std::vector<nvinfer1::IOptimizationPr
     break; \
   }
 
-#define CASE_GET_CAST_INPUT_TENSOR(DATA_TYPE, SrcT, DstT) \
-  case DATA_TYPE: { \
-    auto input_tensor_ptr = input_tensor.GetTensorData<SrcT>(); \
-    skip_input_binding_allowed = false; \
-    if (input_tensor_ptr != nullptr && elem_cnt > 0) { \
-      scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, elem_cnt * sizeof(DstT))); \
-      data = scratch_buffers.back().get(); \
-      cuda::Impl_Cast<SrcT, DstT>(stream, input_tensor_ptr, reinterpret_cast<DstT*>(data), elem_cnt); \
-    } else { \
-      scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, 1)); \
-      data = scratch_buffers.back().get(); \
-    } \
-    break; \
-  }
-
 #define CASE_GET_OUTPUT_TENSOR(DATA_TYPE, SrcT) \
   case DATA_TYPE: { \
     auto output_tensor_ptr = output_tensor.GetTensorMutableData<SrcT>(); \
     data_ptr = output_tensor_ptr; \
     if (output_tensor_ptr != nullptr && elem_cnt > 0) { \
-      buffers[output_name] = output_tensor_ptr; \
+      buffer = output_tensor_ptr; \
     } else { \
       scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, 1)); \
-      buffers[output_name] = scratch_buffers.back().get(); \
+      buffer = scratch_buffers.back().get(); \
     } \
     break; \
   }
 
-#define CASE_GET_CAST_OUTPUT_TENSOR(DATA_TYPE, SrcT, DstT) \
-  case DATA_TYPE: { \
-    auto output_tensor_ptr = output_tensor.GetTensorMutableData<SrcT>(); \
-    data_ptr = output_tensor_ptr; \
-    skip_output_binding_allowed = false; \
-    if (output_tensor_ptr != nullptr && elem_cnt > 0) { \
-      scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, elem_cnt * sizeof(DstT))); \
-      buffers[output_name] = scratch_buffers.back().get(); \
-      output_dim_sizes[i] = static_cast<int>(elem_cnt); \
-    } else { \
-      scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, 1)); \
-      buffers[output_name] = scratch_buffers.back().get(); \
-      output_dim_sizes[i] = 1; \
-    } \
-    break; \
-  }
-
 #define CASE_COPY_TENSOR(DATA_TYPE, DstT) \
   case DATA_TYPE: { \
     auto output_tensor_ptr = output_tensor.GetTensorMutableData<DstT>(); \
@@ -426,15 +359,6 @@ bool ApplyProfileShapesFromProviderOptions(std::vector<nvinfer1::IOptimizationPr
     break; \
   }
 
-#define CASE_CAST_TENSOR(DATA_TYPE, SrcT, DstT) \
-  case DATA_TYPE: { \
-    auto output_tensor_ptr = output_tensor.GetTensorMutableData<DstT>(); \
-    if (output_tensor_ptr != nullptr && elem_cnt > 0) { \
-      cuda::Impl_Cast<SrcT, DstT>(stream, reinterpret_cast<SrcT*>(allocator->getBuffer()), reinterpret_cast<DstT*>(output_tensor_ptr), elem_cnt); \
-    } \
-    break; \
-  }
-
 /*
  * Set Nv executio context input.
  *
@@ -557,7 +481,6 @@ Status BindContextInput(Ort::KernelContext& ctx,
     CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t)
     CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t)
     CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t)
-    CASE_GET_CAST_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, double, float)
     default: {
       return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
                              "NvTensorRTRTX EP input onnx tensor data type: " + std::to_string(tensor_type) + " not supported.");
@@ -582,8 +505,6 @@ Status BindContextInput(Ort::KernelContext& ctx,
  * param output_type - Data type of the output
  * param i - Output iteration index
  * param output_tensors - Output iteration index to output's ORT value
- * param output_dim_sizes - Output iteration index to the multiplocation of its shape's dimensions
- * param dds_output_set - DDS output set
  * param dds_output_allocator_map - DDS output to its allocator
  * param scratch_buffer - The allocation buffer created by TRT EP
  * param allocator - ORT allocator
@@ -595,16 +516,11 @@ Status BindContextOutput(Ort::KernelContext& ctx,
                          const char* output_name,
                          size_t output_index,
                          size_t output_type,
-                         size_t i,
-                         std::unordered_map<size_t, Ort::UnownedValue>& output_tensors,
-                         std::unordered_map<size_t, int>& output_dim_sizes,
                          DDSOutputAllocatorMap& dds_output_allocator_map,
                          std::vector<IAllocatorUniquePtr<void>>& scratch_buffers,
                          OrtAllocator* alloc,
-                         std::unordered_map<char const*, void*>& buffers,
                          nvinfer1::Dims& dims,
-                         void*& data_ptr,
-                         bool& skip_output_binding_allowed) {
+                         void*& data_ptr) {
   // Get output shape
   dims = trt_context->getTensorShape(output_name);
   int nb_dims = dims.nbDims;
@@ -634,10 +550,11 @@ Status BindContextOutput(Ort::KernelContext& ctx,
       data_ptr = nullptr;  // Set data_ptr to nullptr for DDS output binding.
     }
   } else {
-    output_tensors[i] = ctx.GetOutput(output_index, dims.d, nb_dims);
-    auto& output_tensor = output_tensors[i];
+    auto output_tensor = ctx.GetOutput(output_index, dims.d, nb_dims);
     const auto elem_cnt = output_tensor.GetTensorTypeAndShapeInfo().GetElementCount();
 
+    void* buffer = nullptr;
+
     switch (output_type) {
       // below macros set data_ptr and skip_output_binding_allowed variables
       CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, float)
@@ -648,13 +565,12 @@ Status BindContextOutput(Ort::KernelContext& ctx,
       CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t)
       CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t)
       CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t)
-      CASE_GET_CAST_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, double, float)
       default: {
         return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
                                "NvTensorRTRTX EP output tensor data type: " + std::to_string(output_type) + " not supported.");
       }
     }
-    trt_context->setTensorAddress(output_name, buffers[output_name]);
+    trt_context->setTensorAddress(output_name, buffer);
   }
 
   return Status::OK();
@@ -711,7 +627,6 @@ Status BindKernelOutput(Ort::KernelContext& ctx,
     CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t)
     CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t)
     CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t)
-    CASE_CAST_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, float, double)
     default: {
       return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
                              "NvTensorRTRTX EP output tensor data type: " + std::to_string(output_type) + " not supported.");
@@ -2837,7 +2752,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
   }
 
   // Save TRT engine, other TRT objects and input/output info to map
-  parsers_.emplace(fused_node.Name(), std::move(trt_parser));
   engines_.emplace(fused_node.Name(), std::move(trt_engine));
   contexts_.emplace(fused_node.Name(), std::move(trt_context));
   networks_.emplace(fused_node.Name(), std::move(trt_network));
@@ -2853,7 +2767,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
   compute_info.create_state_func = [=](ComputeContext* context, FunctionState* state) {
     std::unique_ptr<TensorrtFuncState> p = std::make_unique<TensorrtFuncState>();
     *p = {context->allocate_func, context->release_func, context->allocator_handle, context->node_name, builder_.get(),
-          &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name],
+          &engines_[context->node_name], &contexts_[context->node_name],
           &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name],
           input_shape_ranges_[context->node_name], &tensorrt_mu_,
           engine_cache_enable_, cache_path_,
@@ -2891,7 +2805,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
       auto trt_engine = trt_state->engine->get();
       auto trt_context = trt_state->context->get();
       auto trt_profiles = trt_state->profiles;
-      int num_outputs = static_cast<int>(output_indexes.size());
       std::unordered_set<std::string> input_names;
 
       if (alloc_ == nullptr) {
@@ -2966,16 +2879,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
      /*
       * Set output shapes and bind output buffers
       */
-      std::unordered_map<char const*, void*> buffers;
-      buffers.reserve(num_outputs);
-      using OutputOrtValue = Ort::UnownedValue;
-      std::unordered_map<size_t, OutputOrtValue> output_tensors;
-      output_tensors.reserve(num_outputs);
-      std::unordered_map<size_t, int> output_dim_sizes;
-      output_dim_sizes.reserve(num_outputs);
-
      if (require_io_binding) {
-        bool skip_output_binding_allowed = true;
        for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) {
          char const* output_name = output_binding_names[i];
 
@@ -2993,16 +2897,15 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
 
          nvinfer1::Dims dims;
          void* data_ptr = nullptr;
-          Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes,
-                                            dds_output_allocator_map, scratch_buffers, alloc, buffers, dims, data_ptr, skip_output_binding_allowed);
+
+          Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type,
+                                            dds_output_allocator_map, scratch_buffers, alloc, dims, data_ptr);
          if (status != Status::OK()) {
            return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage());
          }
 
          trt_state->output_tensors[output_index] = TensorParams{data_ptr, dims};
        }
-
-        trt_state->skip_io_binding_allowed = trt_state->skip_io_binding_allowed | skip_output_binding_allowed;
      }
 
      // Set execution context memory
@@ -3082,14 +2985,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
          if (status != Status::OK()) {
            return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, status.ErrorMessage());
          }
-        } else {
-          auto& output_tensor = output_tensors[i];
-          if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) {
-            auto output_tensor_ptr = output_tensor.GetTensorMutableData<double>();
-            if (output_tensor_ptr != nullptr) {
-              cuda::Impl_Cast<float, double>(stream, reinterpret_cast<float*>(buffers[output_name]), output_tensor_ptr, output_dim_sizes[i]);
-            }
-          }
        }
      }
 
@@ -3213,7 +3108,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra
      auto& dds_output_allocator_map = this->dds_output_allocator_maps_[fused_node_name];
      auto trt_engine = trt_state->engine->get();
      auto trt_context = trt_state->context->get();
-      int num_outputs = static_cast<int>(output_indexes.size());
      std::unordered_map<std::string, std::vector<int32_t>> shape_tensor_values;        // This map holds "shape tensor -> shape values" for the shape tensor input across this inference run
      std::unordered_map<std::string, std::vector<int64_t>> shape_tensor_values_int64;  // same as above but for int64 shape tensor input
 
@@ -3283,16 +3177,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra
      /*
       * Set output shapes and bind output buffers
       */
-      std::unordered_map<char const*, void*> buffers;
-      buffers.reserve(num_outputs);
-      using OutputOrtValue = Ort::UnownedValue;
-      std::unordered_map<size_t, OutputOrtValue> output_tensors;
-      output_tensors.reserve(num_outputs);
-      std::unordered_map<size_t, int> output_dim_sizes;
-      output_dim_sizes.reserve(num_outputs);
-
      if (require_io_binding) {
-        bool skip_output_binding_allowed = true;
        for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) {
          char const* output_name = output_binding_names[i];
 
@@ -3311,16 +3196,14 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra
          nvinfer1::Dims dims;
          void* data_ptr = nullptr;
 
-          Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes,
-                                            dds_output_allocator_map, scratch_buffers, alloc, buffers, dims, data_ptr, skip_output_binding_allowed);
+          Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type,
+                                            dds_output_allocator_map, scratch_buffers, alloc, dims, data_ptr);
          if (status != Status::OK()) {
            return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage());
          }
 
          trt_state->output_tensors[output_index] = TensorParams{data_ptr, dims};
        }
-
-        trt_state->skip_io_binding_allowed = trt_state->skip_io_binding_allowed | skip_output_binding_allowed;
      }
 
      // Set execution context memory
@@ -3401,14 +3284,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra
          if (status != Status::OK()) {
            return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, status.ErrorMessage());
          }
-        } else {
-          auto& output_tensor = output_tensors[i];
-          if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) {
-            auto output_tensor_ptr = output_tensor.GetTensorMutableData<double>();
-            if (output_tensor_ptr != nullptr) {
-              cuda::Impl_Cast<float, double>(stream, reinterpret_cast<float*>(buffers[output_name]), output_tensor_ptr, output_dim_sizes[i]);
-            }
-          }
        }
      }
 