[NNAPI EP] Track skipped initializer usage (microsoft#21286)

edgchen1 · web-flow · commit 307b34a820fc · 2024-07-09T13:43:22.000-07:00
Track skipped initializer usage in NNAPI EP to account for usage by other nodes.
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc
@@ -56,7 +56,13 @@ DEFINE_ADD_OPERAND_FROM_SCALAR(float, FLOAT32);
 #undef DEFINE_ADD_OPERAND_FROM_SCALAR
 
 void ModelBuilder::AddInitializerToSkip(const std::string& tensor_name) {
-  skipped_initializers_.insert(tensor_name);
+  // decrement usage count if this is a known initializer.
+  // For simplicity the OpBuilder::AddInitializersToSkip implementations may call this for arbitrary input names
+  // without first checking if the value is an initializer.
+  auto entry = initializer_usage_.find(tensor_name);
+  if (entry != initializer_usage_.end()) {
+    entry->second -= 1;
+  }
 }
 
 Status ModelBuilder::Prepare() {
@@ -87,7 +93,16 @@ static size_t GetPaddedByteSize(size_t size) {
 }
 
 void ModelBuilder::PreprocessInitializers() {
+  const auto& initializers = GetInitializerTensors();
+
   for (const auto& node_unit : node_unit_holder_) {
+    // find all initializers consumed. AddInitializersToSkip will potentially decrement the usage count.
+    for (const auto& input : node_unit->Inputs()) {
+      if (input.node_arg.Exists() && Contains(initializers, input.node_arg.Name())) {
+        initializer_usage_[input.node_arg.Name()]++;
+      }
+    }
+
     if (const auto* op_builder = GetOpBuilder(*node_unit)) {
       op_builder->AddInitializersToSkip(*this, *node_unit);
     }
@@ -208,11 +223,16 @@ Status ModelBuilder::RegisterInitializers() {
   std::vector<std::tuple<uint32_t, size_t, size_t>> initializers(initializer_size);
   size_t sizeAll = 0;
 
+  const auto should_skip_initializer = [this](const std::string& name) -> bool {
+    const auto it = initializer_usage_.find(name);
+    return it == initializer_usage_.end() || it->second == 0;
+  };
+
   int i = 0;
   for (const auto& pair : initializer_tensors) {
     const auto& tensor = *pair.second;
     const auto& name = tensor.name();
-    if (Contains(skipped_initializers_, name))
+    if (should_skip_initializer(name))
       continue;
 
     Shape shape;
@@ -249,7 +269,7 @@ Status ModelBuilder::RegisterInitializers() {
   size_t offset = 0;
   for (const auto& pair : initializer_tensors) {
     const auto& tensor = *pair.second;
-    if (Contains(skipped_initializers_, tensor.name()))
+    if (should_skip_initializer(tensor.name()))
       continue;
 
     auto [index, size, padded_size] = initializers[i++];
@@ -439,10 +459,11 @@ Status ModelBuilder::AddOperandFromPersistMemoryBuffer(
 Status ModelBuilder::AddOperations() {
   const auto& node_indices = graph_viewer_.GetNodesInTopologicalOrder();
   for (const auto node_idx : node_indices) {
-    LOGS_DEFAULT(VERBOSE) << "Adding node [" << node_idx << "]";
     const auto* node(graph_viewer_.GetNode(node_idx));
     const NodeUnit& node_unit = GetNodeUnit(node);
 
+    LOGS_DEFAULT(VERBOSE) << "Adding node [" << node_unit.Name() << "] at index [" << node_unit.Index() << "]";
+
     // Since we may have NodeUnit with multiple nodes, insert NodeUnit with the first occurrence of
     // its node(s) in topological order may cause the incorrect topological order while inserting
     // NodeUNits, for example,
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h
@@ -134,7 +134,7 @@ class ModelBuilder {
   std::unordered_set<std::string> operands_;
   std::unordered_set<std::string> fused_activations_;
 
-  std::unordered_set<std::string> skipped_initializers_;
+  std::unordered_map<std::string, int> initializer_usage_;
 
   // All activation nodes (Relu, Relu1, Relu6) as a map <const NodeUnit*, activation_code>
   std::unordered_map<const NodeUnit*, int32_t> activation_node_units_;
diff --git a/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc b/onnxruntime/test/providers/nnapi/nnapi_basic_test.cc
@@ -14,6 +14,7 @@
 #include "test/common/tensor_op_test_utils.h"
 #include "test/framework/test_utils.h"
 #include "test/util/include/asserts.h"
+#include "test/util/include/current_test_name.h"
 #include "test/util/include/default_providers.h"
 #include "test/util/include/inference_session_wrapper.h"
 #include "test/util/include/test/test_environment.h"
@@ -36,10 +37,6 @@ using namespace ::onnxruntime::logging;
 namespace onnxruntime {
 namespace test {
 
-#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
-
-#endif  // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
-
 #if !defined(ORT_MINIMAL_BUILD)
 
 // Since NNAPI EP handles Reshape and Flatten differently,
@@ -65,7 +62,8 @@ TEST(NnapiExecutionProviderTest, ReshapeFlattenTest) {
   feeds.insert(std::make_pair("X", ml_value_x));
   feeds.insert(std::make_pair("Y", ml_value_y));
 
-  RunAndVerifyOutputsWithEP(model_file_name, "NnapiExecutionProviderTest.ReshapeFlattenTest",
+  RunAndVerifyOutputsWithEP(model_file_name,
+                            CurrentTestName(),
                             std::make_unique<NnapiExecutionProvider>(0),
                             feeds);
 #else
@@ -88,7 +86,8 @@ TEST(NnapiExecutionProviderTest, SigmoidSupportedInputRankTest) {
   NameMLValMap feeds;
   feeds.insert(std::make_pair("X", ml_value_x));
 
-  RunAndVerifyOutputsWithEP(model_file_name, "NnapiExecutionProviderTest.SigmoidSupportedInputRankTest",
+  RunAndVerifyOutputsWithEP(model_file_name,
+                            CurrentTestName(),
                             std::make_unique<NnapiExecutionProvider>(0),
                             feeds, {ExpectedEPNodeAssignment::None} /* params */);
 #else
@@ -115,7 +114,8 @@ TEST(NnapiExecutionProviderTest, DynamicGraphInputTest) {
   NameMLValMap feeds;
   feeds.insert(std::make_pair("X", ml_value_x));
 
-  RunAndVerifyOutputsWithEP(model_file_name, "NnapiExecutionProviderTest.DynamicGraphInputTest",
+  RunAndVerifyOutputsWithEP(model_file_name,
+                            CurrentTestName(),
                             std::make_unique<NnapiExecutionProvider>(0),
                             feeds);
 #else
@@ -144,7 +144,8 @@ TEST(NnapiExecutionProviderTest, InternalUint8SupportTest) {
   NameMLValMap feeds;
   feeds.insert(std::make_pair("X", ml_value_x));
 
-  RunAndVerifyOutputsWithEP(model_file_name, "NnapiExecutionProviderTest.InternalUint8SupportTest",
+  RunAndVerifyOutputsWithEP(model_file_name,
+                            CurrentTestName(),
                             std::make_unique<NnapiExecutionProvider>(0),
                             feeds);
 #else
@@ -208,7 +209,8 @@ TEST(NnapiExecutionProviderTest, FunctionTest) {
   feeds.insert(std::make_pair("Y", ml_value_y));
   feeds.insert(std::make_pair("Z", ml_value_z));
 
-  RunAndVerifyOutputsWithEP(model_file_name, "NnapiExecutionProviderTest.FunctionTest",
+  RunAndVerifyOutputsWithEP(model_file_name,
+                            CurrentTestName(),
                             std::make_unique<NnapiExecutionProvider>(0),
                             feeds);
 #else
@@ -273,7 +275,8 @@ static void RunQDQModelTest(
   const auto model_data_span = AsByteSpan(model_data.data(), model_data.size());
 
 #if defined(__ANDROID__)
-  RunAndVerifyOutputsWithEP(model_data_span, "NnapiExecutionProviderTest.TestQDQModel",
+  RunAndVerifyOutputsWithEP(model_data_span,
+                            CurrentTestName(),
                             std::make_unique<NnapiExecutionProvider>(0),
                             helper.feeds_, params);
 #else
@@ -513,6 +516,31 @@ TEST(NnapiExecutionProviderTest, TestGather) {
                   {ExpectedEPNodeAssignment::All});
 }
 
+TEST(NnapiExecutionProviderTest, SharedInitializersDoNotGetSkipped) {
+  // NNAPI EP's Clip op builder will mark the max initializer as skipped but it is also used by the Div op.
+  // Test that the shared initializer is still present in the NNAPI model for the Div op.
+  constexpr auto* model_file_name = ORT_TSTR("testdata/clip_div_shared_initializer.onnx");
+
+#if defined(__ANDROID__)
+  AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>();
+
+  std::vector<int64_t> x_dims{3, 2};
+  std::vector<float> x_values(3.0f, 3 * 2);
+  OrtValue ml_value_x;
+  CreateMLValue<float>(cpu_allocator, x_dims, x_values, &ml_value_x);
+
+  NameMLValMap feeds{{"input_0", ml_value_x}};
+
+  RunAndVerifyOutputsWithEP(model_file_name,
+                            CurrentTestName(),
+                            std::make_unique<NnapiExecutionProvider>(0),
+                            feeds,
+                            {ExpectedEPNodeAssignment::All});
+#else
+  TestModelLoad(model_file_name, std::make_unique<NnapiExecutionProvider>(0), ExpectedEPNodeAssignment::All);
+#endif
+}
+
 #endif  // !(ORT_MINIMAL_BUILD)
 
 TEST(NnapiExecutionProviderTest, NNAPIFlagsTest) {
@@ -541,7 +569,8 @@ TEST(NnapiExecutionProviderTest, TestOrtFormatModel) {
   NameMLValMap feeds;
   feeds.insert(std::make_pair("Input3", ml_value));
 
-  RunAndVerifyOutputsWithEP(model_file_name, "NnapiExecutionProviderTest.TestOrtFormatModel",
+  RunAndVerifyOutputsWithEP(model_file_name,
+                            CurrentTestName(),
                             std::make_unique<NnapiExecutionProvider>(0),
                             feeds);
 #else
diff --git a/onnxruntime/test/testdata/clip_div_shared_initializer.onnx b/onnxruntime/test/testdata/clip_div_shared_initializer.onnx
diff --git a/onnxruntime/test/testdata/clip_div_shared_initializer.py b/onnxruntime/test/testdata/clip_div_shared_initializer.py
@@ -0,0 +1,33 @@
+from onnx import TensorProto, checker, helper, save
+
+graph_proto = helper.make_graph(
+    [
+        helper.make_node(
+            "Clip",
+            inputs=["input_0", "initializer_0", "initializer_1"],
+            outputs=["clip_output"],
+            name="clip",
+        ),
+        helper.make_node(
+            "Div",
+            inputs=["clip_output", "initializer_1"],
+            outputs=["output_0"],
+            name="div",
+        ),
+    ],
+    "Main_graph",
+    [
+        helper.make_tensor_value_info("input_0", TensorProto.FLOAT, [3, 2]),
+    ],
+    [
+        helper.make_tensor_value_info("output_0", TensorProto.FLOAT, [3, 2]),
+    ],
+    [
+        helper.make_tensor("initializer_0", TensorProto.FLOAT, [], [0.0]),
+        helper.make_tensor("initializer_1", TensorProto.FLOAT, [], [6.0]),
+    ],
+)
+
+model = helper.make_model(graph_proto)
+checker.check_model(model, True)
+save(model, "clip_div_shared_initializer.onnx")