 #include "onnx_ctx_model_helper.h"
 #include "core/providers/cuda/shared_inc/cuda_call.h"
 #include "core/providers/cuda/cuda_graph.h"
-#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
 #include "core/session/allocator_adapters.h"
 #include "cuda_runtime_api.h"
 #include "core/common/parse_string.h"
@@ -85,40 +84,6 @@ struct ShutdownProtobuf {

 namespace onnxruntime {

-namespace cuda {
-template <>
-void Impl_Cast(
-    cudaStream_t stream,
-    const int64_t* input_data, int32_t* output_data,
-    size_t count) {
-  return g_host->cuda__Impl_Cast(static_cast<void*>(stream), input_data, output_data, count);
-}
-
-template <>
-void Impl_Cast(
-    cudaStream_t stream,
-    const int32_t* input_data, int64_t* output_data,
-    size_t count) {
-  return g_host->cuda__Impl_Cast(static_cast<void*>(stream), input_data, output_data, count);
-}
-
-template <>
-void Impl_Cast(
-    cudaStream_t stream,
-    const double* input_data, float* output_data,
-    size_t count) {
-  return g_host->cuda__Impl_Cast(static_cast<void*>(stream), input_data, output_data, count);
-}
-
-template <>
-void Impl_Cast(
-    cudaStream_t stream,
-    const float* input_data, double* output_data,
-    size_t count) {
-  return g_host->cuda__Impl_Cast(static_cast<void*>(stream), input_data, output_data, count);
-}
-}  // namespace cuda
-
 void* OutputAllocator::reallocateOutputAsync(char const* /*tensorName*/, void* /*currentMemory*/, uint64_t size,
                                              uint64_t /*alignment*/, cudaStream_t /*stream*/) noexcept {
   // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr
@@ -372,51 +337,19 @@ bool ApplyProfileShapesFromProviderOptions(std::vector<nvinfer1::IOptimizationPr
     break; \
   }

-#define CASE_GET_CAST_INPUT_TENSOR(DATA_TYPE, SrcT, DstT) \
-  case DATA_TYPE: { \
-    auto input_tensor_ptr = input_tensor.GetTensorData<SrcT>(); \
-    skip_input_binding_allowed = false; \
-    if (input_tensor_ptr != nullptr && elem_cnt > 0) { \
-      scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, elem_cnt * sizeof(DstT))); \
-      data = scratch_buffers.back().get(); \
-      cuda::Impl_Cast<SrcT, DstT>(stream, input_tensor_ptr, reinterpret_cast<DstT*>(data), elem_cnt); \
-    } else { \
-      scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, 1)); \
-      data = scratch_buffers.back().get(); \
-    } \
-    break; \
-  }
-
 #define CASE_GET_OUTPUT_TENSOR(DATA_TYPE, SrcT) \
   case DATA_TYPE: { \
     auto output_tensor_ptr = output_tensor.GetTensorMutableData<SrcT>(); \
     data_ptr = output_tensor_ptr; \
     if (output_tensor_ptr != nullptr && elem_cnt > 0) { \
-      buffers[output_name] = output_tensor_ptr; \
+      buffer = output_tensor_ptr; \
     } else { \
       scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, 1)); \
-      buffers[output_name] = scratch_buffers.back().get(); \
+      buffer = scratch_buffers.back().get(); \
     } \
     break; \
   }

-#define CASE_GET_CAST_OUTPUT_TENSOR(DATA_TYPE, SrcT, DstT) \
-  case DATA_TYPE: { \
-    auto output_tensor_ptr = output_tensor.GetTensorMutableData<SrcT>(); \
-    data_ptr = output_tensor_ptr; \
-    skip_output_binding_allowed = false; \
-    if (output_tensor_ptr != nullptr && elem_cnt > 0) { \
-      scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, elem_cnt * sizeof(DstT))); \
-      buffers[output_name] = scratch_buffers.back().get(); \
-      output_dim_sizes[i] = static_cast<int>(elem_cnt); \
-    } else { \
-      scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator<void>(alloc, 1)); \
-      buffers[output_name] = scratch_buffers.back().get(); \
-      output_dim_sizes[i] = 1; \
-    } \
-    break; \
-  }
-
 #define CASE_COPY_TENSOR(DATA_TYPE, DstT) \
   case DATA_TYPE: { \
     auto output_tensor_ptr = output_tensor.GetTensorMutableData<DstT>(); \
@@ -426,15 +359,6 @@ bool ApplyProfileShapesFromProviderOptions(std::vector<nvinfer1::IOptimizationPr
     break; \
   }

-#define CASE_CAST_TENSOR(DATA_TYPE, SrcT, DstT) \
-  case DATA_TYPE: { \
-    auto output_tensor_ptr = output_tensor.GetTensorMutableData<DstT>(); \
-    if (output_tensor_ptr != nullptr && elem_cnt > 0) { \
-      cuda::Impl_Cast<SrcT, DstT>(stream, reinterpret_cast<SrcT*>(allocator->getBuffer()), reinterpret_cast<DstT*>(output_tensor_ptr), elem_cnt); \
-    } \
-    break; \
-  }
-
 /*
  * Set Nv execution context input.
  *
@@ -557,7 +481,7 @@ Status BindContextInput(Ort::KernelContext& ctx,
     CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t)
     CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t)
     CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t)
-    CASE_GET_CAST_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, double, float)
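+    // Double inputs are no longer down-cast to float; ONNX double tensors now fall through to the unsupported-type error below.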
     default: {
       return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
                              "NvTensorRTRTX EP input onnx tensor data type: " + std::to_string(tensor_type) + " not supported.");
@@ -582,8 +505,4 @@ Status BindContextInput(Ort::KernelContext& ctx,
  * param output_type - Data type of the output
- * param i - Output iteration index
- * param output_tensors - Output iteration index to output's ORT value
- * param output_dim_sizes - Output iteration index to the multiplication of its shape's dimensions
- * param dds_output_set - DDS output set
  * param dds_output_allocator_map - DDS output to its allocator
  * param scratch_buffer - The allocation buffer created by TRT EP
  * param allocator - ORT allocator
@@ -595,16 +516,11 @@ Status BindContextOutput(Ort::KernelContext& ctx,
                          const char* output_name,
                          size_t output_index,
                          size_t output_type,
-                         size_t i,
-                         std::unordered_map<size_t, Ort::UnownedValue>& output_tensors,
-                         std::unordered_map<size_t, int>& output_dim_sizes,
                          DDSOutputAllocatorMap& dds_output_allocator_map,
                          std::vector<IAllocatorUniquePtr<void>>& scratch_buffers,
                          OrtAllocator* alloc,
-                         std::unordered_map<char const*, void*>& buffers,
                          nvinfer1::Dims& dims,
-                         void*& data_ptr,
-                         bool& skip_output_binding_allowed) {
+                         void*& data_ptr) {
   // Get output shape
   dims = trt_context->getTensorShape(output_name);
   int nb_dims = dims.nbDims;
@@ -634,10 +550,12 @@ Status BindContextOutput(Ort::KernelContext& ctx,
       data_ptr = nullptr;  // Set data_ptr to nullptr for DDS output binding.
     }
   } else {
-    output_tensors[i] = ctx.GetOutput(output_index, dims.d, nb_dims);
-    auto& output_tensor = output_tensors[i];
+    auto output_tensor = ctx.GetOutput(output_index, dims.d, nb_dims);
     const auto elem_cnt = output_tensor.GetTensorTypeAndShapeInfo().GetElementCount();

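+    // Raw address handed to setTensorAddress below: either the ORT output tensor's own memory or a 1-byte scratch placeholder for an empty output.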
+    void* buffer = nullptr;
+
     switch (output_type) {
-      // below macros set data_ptr and skip_output_binding_allowed variables
+      // below macros set the data_ptr and buffer variables
       CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, float)
@@ -648,13 +565,12 @@ Status BindContextOutput(Ort::KernelContext& ctx,
       CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t)
       CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t)
       CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t)
-      CASE_GET_CAST_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, double, float)
       default: {
         return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
                                "NvTensorRTRTX EP output tensor data type: " + std::to_string(output_type) + " not supported.");
       }
     }
-    trt_context->setTensorAddress(output_name, buffers[output_name]);
+    trt_context->setTensorAddress(output_name, buffer);
   }

   return Status::OK();
@@ -711,7 +627,6 @@ Status BindKernelOutput(Ort::KernelContext& ctx,
       CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t)
       CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t)
       CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t)
-      CASE_CAST_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, float, double)
       default: {
         return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
                                "NvTensorRTRTX EP output tensor data type: " + std::to_string(output_type) + " not supported.");
@@ -2837,7 +2752,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
   }

   // Save TRT engine, other TRT objects and input/output info to map
-  parsers_.emplace(fused_node.Name(), std::move(trt_parser));
   engines_.emplace(fused_node.Name(), std::move(trt_engine));
   contexts_.emplace(fused_node.Name(), std::move(trt_context));
   networks_.emplace(fused_node.Name(), std::move(trt_network));
@@ -2853,7 +2767,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
   compute_info.create_state_func = [=](ComputeContext* context, FunctionState* state) {
     std::unique_ptr<TensorrtFuncState> p = std::make_unique<TensorrtFuncState>();
     *p = {context->allocate_func, context->release_func, context->allocator_handle, context->node_name, builder_.get(),
-          &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name],
+          &engines_[context->node_name], &contexts_[context->node_name],
           &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name],
           input_shape_ranges_[context->node_name], &tensorrt_mu_,
           engine_cache_enable_, cache_path_,
@@ -2891,7 +2805,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
     auto trt_engine = trt_state->engine->get();
     auto trt_context = trt_state->context->get();
     auto trt_profiles = trt_state->profiles;
-    int num_outputs = static_cast<int>(output_indexes.size());
     std::unordered_set<std::string> input_names;

     if (alloc_ == nullptr) {
@@ -2966,16 +2879,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
     /*
      * Set output shapes and bind output buffers
      */
-    std::unordered_map<char const*, void*> buffers;
-    buffers.reserve(num_outputs);
-    using OutputOrtValue = Ort::UnownedValue;
-    std::unordered_map<size_t, OutputOrtValue> output_tensors;
-    output_tensors.reserve(num_outputs);
-    std::unordered_map<size_t, int> output_dim_sizes;
-    output_dim_sizes.reserve(num_outputs);
-
     if (require_io_binding) {
-      bool skip_output_binding_allowed = true;
       for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) {
         char const* output_name = output_binding_names[i];
@@ -2993,16 +2897,16 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr

         nvinfer1::Dims dims;
         void* data_ptr = nullptr;
-        Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes,
-                                          dds_output_allocator_map, scratch_buffers, alloc, buffers, dims, data_ptr, skip_output_binding_allowed);
+
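+        // Simplified call: the per-output maps and skip flag existed only for the removed double<->float casting path.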
+        Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type,
+                                          dds_output_allocator_map, scratch_buffers, alloc, dims, data_ptr);
         if (status != Status::OK()) {
           return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage());
         }

         trt_state->output_tensors[output_index] = TensorParams{data_ptr, dims};
       }
-
-      trt_state->skip_io_binding_allowed = trt_state->skip_io_binding_allowed | skip_output_binding_allowed;
     }

     // Set execution context memory
@@ -3082,14 +2985,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
         if (status != Status::OK()) {
           return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, status.ErrorMessage());
         }
-      } else {
-        auto& output_tensor = output_tensors[i];
-        if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) {
-          auto output_tensor_ptr = output_tensor.GetTensorMutableData<double>();
-          if (output_tensor_ptr != nullptr) {
-            cuda::Impl_Cast<float, double>(stream, reinterpret_cast<float*>(buffers[output_name]), output_tensor_ptr, output_dim_sizes[i]);
-          }
-        }
       }
     }

@@ -3213,7 +3108,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra
     auto& dds_output_allocator_map = this->dds_output_allocator_maps_[fused_node_name];
     auto trt_engine = trt_state->engine->get();
     auto trt_context = trt_state->context->get();
-    int num_outputs = static_cast<int>(output_indexes.size());
     std::unordered_map<std::string, std::vector<int32_t>> shape_tensor_values;        // This map holds "shape tensor -> shape values" for the shape tensor input across this inference run
     std::unordered_map<std::string, std::vector<int64_t>> shape_tensor_values_int64;  // same as above but for int64 shape tensor input
@@ -3283,16 +3177,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra
     /*
      * Set output shapes and bind output buffers
      */
-    std::unordered_map<char const*, void*> buffers;
-    buffers.reserve(num_outputs);
-    using OutputOrtValue = Ort::UnownedValue;
-    std::unordered_map<size_t, OutputOrtValue> output_tensors;
-    output_tensors.reserve(num_outputs);
-    std::unordered_map<size_t, int> output_dim_sizes;
-    output_dim_sizes.reserve(num_outputs);
-
     if (require_io_binding) {
-      bool skip_output_binding_allowed = true;
       for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) {
         char const* output_name = output_binding_names[i];

@@ -3311,16 +3196,14 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra
         nvinfer1::Dims dims;
         void* data_ptr = nullptr;

-        Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes,
-                                          dds_output_allocator_map, scratch_buffers, alloc, buffers, dims, data_ptr, skip_output_binding_allowed);
+        Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type,
+                                          dds_output_allocator_map, scratch_buffers, alloc, dims, data_ptr);
         if (status != Status::OK()) {
           return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage());
         }

         trt_state->output_tensors[output_index] = TensorParams{data_ptr, dims};
       }
-
-      trt_state->skip_io_binding_allowed = trt_state->skip_io_binding_allowed | skip_output_binding_allowed;
     }

     // Set execution context memory
@@ -3401,14 +3284,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra
         if (status != Status::OK()) {
           return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, status.ErrorMessage());
         }
-      } else {
-        auto& output_tensor = output_tensors[i];
-        if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) {
-          auto output_tensor_ptr = output_tensor.GetTensorMutableData<double>();
-          if (output_tensor_ptr != nullptr) {
-            cuda::Impl_Cast<float, double>(stream, reinterpret_cast<float*>(buffers[output_name]), output_tensor_ptr, output_dim_sizes[i]);
-          }
-        }
       }
     }

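A note on the binding rule this change preserves: TensorRT requires a non-null device address for every bound tensor, so an empty ORT output is still backed by a 1-byte scratch allocation rather than a null pointer, as the updated CASE_GET_OUTPUT_TENSOR macro shows. A minimal standalone sketch of that rule follows; the function name and the malloc-backed scratch buffers are illustrative assumptions, not the EP's actual IAllocator machinery.

// Sketch only: hypothetical stand-in for the EP's scratch-buffer handling.
#include <cstddef>
#include <cstdlib>
#include <memory>
#include <vector>

using ScratchBuffer = std::unique_ptr<void, void (*)(void*)>;

// Returns the address to hand to nvinfer1::IExecutionContext::setTensorAddress.
void* OutputAddressForBinding(void* ort_output_ptr, std::size_t elem_cnt,
                              std::vector<ScratchBuffer>& scratch_buffers) {
  if (ort_output_ptr != nullptr && elem_cnt > 0) {
    // Non-empty output: TensorRT writes straight into the ORT-owned buffer.
    return ort_output_ptr;
  }
  // Empty output: bind a non-null 1-byte placeholder and keep it alive in
  // scratch_buffers until the inference run completes.
  scratch_buffers.emplace_back(std::malloc(1), std::free);
  return scratch_buffers.back().get();
}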