Add QDQ scale propagation pass (#713)

javier-intel · web-flow · commit e2ec2b38696f · 2025-07-03T16:02:04.000+05:30
* Add pass to perform QDQ stripping and propagate scales

* Fix disconnected output node

* Fixes to support session.disable_quant_qdq output, remove dangling nodes and duplicate DQ nodes

* Fix missing scale updates and remove stray QDQ nodes in certain models

* Address issues with Linux CI

* Fix for double QDQ issue
diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake
@@ -49,7 +49,7 @@
   endif()
   add_dependencies(onnxruntime_providers_openvino onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
   target_include_directories(onnxruntime_providers_openvino SYSTEM PUBLIC ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${OpenVINO_INCLUDE_DIR} ${OPENVINO_INCLUDE_DIR_LIST} ${PYTHON_INCLUDE_DIRS} $ENV{OPENCL_INCS} $ENV{OPENCL_INCS}/../../cl_headers/)
-  target_link_libraries(onnxruntime_providers_openvino ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 ${OPENVINO_LIB_LIST} ${ABSEIL_LIBS} Eigen3::Eigen)
+  target_link_libraries(onnxruntime_providers_openvino ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 ${OPENVINO_LIB_LIST} ${ABSEIL_LIBS} Eigen3::Eigen onnx_proto)
 
   target_compile_definitions(onnxruntime_providers_openvino PRIVATE FILE_NAME=\"onnxruntime_providers_openvino.dll\")
 
diff --git a/onnxruntime/core/optimizer/double_qdq_pairs_remover.cc b/onnxruntime/core/optimizer/double_qdq_pairs_remover.cc
@@ -52,6 +52,7 @@ static void ApplyNewInputValue(Graph& graph, Node& node, QDQ::InputIndex index,
   input_init.ToProto(new_input_tensor);
   auto new_name = graph.GenerateNodeArgName("DoubleQDQRemoved_" + node.InputDefs()[index]->Name());
   new_input_tensor.set_name(new_name);
+  new_input_tensor.add_dims(1);
   NodeArg& new_input = graph_utils::AddInitializerWithExternalData(graph, new_input_tensor);
   graph_utils::ReplaceNodeInput(node, index, new_input);
 }
diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -20,6 +20,7 @@
 #include "core/providers/openvino/ov_interface.h"
 #include "core/providers/openvino/ov_versions/capability.h"
 #include "core/providers/openvino/qdq_transformations/qdq_stripping.h"
+#include "core/providers/openvino/qdq_transformations/qdq_scales_fix.h"
 
 namespace onnxruntime {
 namespace openvino_ep {
@@ -429,8 +430,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
 
   const auto& onnx_model_path_name = subgraph.ModelPath();
   // QDQ stripping enabled only for the NPU and experimentally on the GPU
-  if ((session_context_.device_type.find("NPU") != std::string::npos ||
-       session_context_.device_type.find("GPU") != std::string::npos) &&
+  if ((session_context_.device_type.find("NPU") != std::string::npos) &&
       (enable_ovep_qdq_optimizer || session_context_.so_share_ep_contexts)) {
     std::unique_ptr<onnxruntime::Model> model;
     Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.so_share_ep_contexts, enable_ovep_qdq_optimizer, model, shared_context_.shared_weights);
@@ -440,6 +440,17 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
     DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node);
     ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
     return model_proto;
+  } else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
+      enable_ovep_qdq_optimizer) {
+    // Create a copy of the model
+    std::unique_ptr<onnxruntime::Model> model;
+    Status status = qdq_scales_fix::Transform(subgraph, logger, model);
+    auto model_proto = model->ToProto();
+    model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION);
+    print_model_proto_duration();
+    DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node);
+    ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
+    return model_proto;
   } else {
     LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP QDQ optimization pass is disabled";
     auto model = subgraph.CreateModel(logger);
diff --git a/onnxruntime/core/providers/openvino/ov_protobuf_utils.cpp b/onnxruntime/core/providers/openvino/ov_protobuf_utils.cpp
@@ -0,0 +1,24 @@
+// Copyright (C) Intel Corporation
+// Licensed under the MIT License
+
+#include "ov_protobuf_utils.h"
+
+#include "core/graph/onnx_protobuf.h"
+#include "core/common/common.h"
+
+namespace onnxruntime {
+namespace openvino_ep {
+float get_float_initializer_data(const void* initializer) {
+  const auto* tp = reinterpret_cast<const ONNX_NAMESPACE::TensorProto*>(initializer);
+  ORT_ENFORCE((tp->has_data_type() && (tp->data_type() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT)));
+  // ORT_ENFORCE(initializer.dims_size() == 1);
+  return tp->float_data(0);
+}
+void set_float_initializer_data(const void* initializer, float data) {
+  auto* tp = (ONNX_NAMESPACE::TensorProto*)(initializer);
+  ORT_ENFORCE((tp->has_data_type() && (tp->data_type() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT)));
+  // ORT_ENFORCE(initializer.dims_size() == 1);
+  tp->set_float_data(0, data);
+}
+}  // namespace openvino_ep
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/openvino/ov_protobuf_utils.h b/onnxruntime/core/providers/openvino/ov_protobuf_utils.h
@@ -0,0 +1,10 @@
+// Copyright (C) Intel Corporation
+// Licensed under the MIT License
+
+#pragma once
+namespace onnxruntime {
+namespace openvino_ep {
+float get_float_initializer_data(const void* initializer);
+void set_float_initializer_data(const void* initializer, float data);
+}
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp
diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h

Original file line number	Diff line number	Diff line change
`@@ -52,6 +52,7 @@ static void ApplyNewInputValue(Graph& graph, Node& node, QDQ::InputIndex index,`
`52`	`52`	`input_init.ToProto(new_input_tensor);`
`53`	`53`	`auto new_name = graph.GenerateNodeArgName("DoubleQDQRemoved_" + node.InputDefs()[index]->Name());`
`54`	`54`	`new_input_tensor.set_name(new_name);`
	`55`	`+ new_input_tensor.add_dims(1);`
`55`	`56`	`NodeArg& new_input = graph_utils::AddInitializerWithExternalData(graph, new_input_tensor);`
`56`	`57`	`graph_utils::ReplaceNodeInput(node, index, new_input);`
`57`	`58`	`}`