@@ -387,6 +387,44 @@ static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) {
   return false;
 }
 
+static bool Is16BitTensor(const onnxruntime::NodeArg* node_arg) {
+  const auto* type_proto = node_arg ? node_arg->TypeAsProto() : nullptr;
+  return type_proto && type_proto->has_tensor_type() &&
+         (type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT16 ||
+          type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_INT16);
+}
+
+// Check to see if the graph has Q/DQ nodes with int16 or uint16 quantization
+static bool IsQDQGraphWithUint16OrInt16(const onnxruntime::GraphViewer& graph_viewer) {
+  std::unordered_set<std::string> qdq_ops = {"QuantizeLinear", "DequantizeLinear"};
+  const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder();
+
+  for (size_t i = 0; i < node_indices.size(); i++) {
+    gsl::not_null<const onnxruntime::Node*> node(graph_viewer.GetNode(node_indices[i]));
+
+    if (qdq_ops.find(node->OpType()) != qdq_ops.end()) {
+      const auto& input_defs = node->InputDefs();
+
+      if (node->OpType() == "DequantizeLinear") {
+        // DequantizeLinear: [quantized_input, scale, zero_point] -> [float_output]
+        // Check the quantized input tensor and the optional zero point
+        if (Is16BitTensor(input_defs.empty() ? nullptr : input_defs[0]) ||
+            (input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) {
+          return true;
+        }
+      } else if (node->OpType() == "QuantizeLinear") {
+        // QuantizeLinear: [float_input, scale, zero_point] -> [quantized_output]
+        // Check the quantized output tensor and the optional zero-point input
+        const auto& output_defs = node->OutputDefs();
+        if (Is16BitTensor(output_defs.empty() ? nullptr : output_defs[0]) ||
+            (input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
 static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name,
                                 [[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto,
                                 [[maybe_unused]] const onnxruntime::Node& fused_node) {
@@ -445,6 +483,10 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
   }
 #endif
 
+  // Check if the graph is QDQ and has int16 or uint16 quantization
+  // If so, we will apply the QDQ scales fix transformation (for GPU device only)
+  bool is_qdq_graph_uint16_or_int16 = IsQDQGraphWithUint16OrInt16(subgraph);
+
   const auto& onnx_model_path_name = subgraph.ModelPath();
   // QDQ stripping enabled only for the NPU and experimentally on the GPU
   if ((session_context_.device_type.find("NPU") != std::string::npos) &&
@@ -458,7 +500,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
     ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
     return model_proto;
   } else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
-             enable_ovep_qdq_optimizer) {
+             is_qdq_graph_uint16_or_int16) {
     // Create a copy of the model
     std::unique_ptr<onnxruntime::Model> model;
     Status status = qdq_scales_fix::Transform(subgraph, logger, model);
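
For reference, both the new helper and the dispatch change key only on the tensor element type recorded in the ONNX TypeProto. The sketch below is not part of this commit; the Is16BitElemType helper is hypothetical, and it uses the plain onnx protobuf namespace where the EP code goes through the ONNX_NAMESPACE macro. It reproduces the same elem_type test against hand-built TypeProtos:

// Minimal standalone sketch, assuming the ONNX protobuf headers are on the include path.
#include <iostream>
#include <onnx/onnx_pb.h>

// The same predicate Is16BitTensor applies to node_arg->TypeAsProto().
static bool Is16BitElemType(const onnx::TypeProto& type_proto) {
  return type_proto.has_tensor_type() &&
         (type_proto.tensor_type().elem_type() == onnx::TensorProto_DataType_UINT16 ||
          type_proto.tensor_type().elem_type() == onnx::TensorProto_DataType_INT16);
}

int main() {
  onnx::TypeProto uint16_tensor;  // e.g. a 16-bit zero point on a Q/DQ node
  uint16_tensor.mutable_tensor_type()->set_elem_type(onnx::TensorProto_DataType_UINT16);

  onnx::TypeProto uint8_tensor;   // common 8-bit quantization, not flagged
  uint8_tensor.mutable_tensor_type()->set_elem_type(onnx::TensorProto_DataType_UINT8);

  std::cout << std::boolalpha
            << Is16BitElemType(uint16_tensor) << "\n"   // true: 16-bit QDQ detected
            << Is16BitElemType(uint8_tensor) << "\n";   // false: regular QDQ handling
}

With this change, the GPU branch of GetModelProtoFromFusedNode applies qdq_scales_fix::Transform whenever such a 16-bit Q/DQ tensor is found in the subgraph, rather than whenever enable_ovep_qdq_optimizer is set.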