Skip to content

Commit f20df72

Browse files
authored
[CANN] Fix the ACL_ERROR_REPEAT_INITIALIZE error that occurs when coexisting with torch_npu (microsoft#26193)
### Description

This PR fixes the ACL_ERROR_REPEAT_INITIALIZE error that occurs when onnxruntime-cann coexists with torch_npu. When the error occurs, the CANN provider fails to initialize and inference falls back to the CPU.

### Error log

```
2025-08-30 03:30:44.189484484 [E:onnxruntime:Default, provider_bridge_ort.cc:2279 TryGetProviderInfo_CANN] ~/code/onnxruntime/onnxruntime/core/providers/cann/cann_call.cc:143 bool onnxruntime::CannCall(ERRTYPE, const char*, const char*, ERRTYPE, const char*) [with ERRTYPE = int; bool THRW = true] ~/code/onnxruntime/onnxruntime/core/providers/cann/cann_call.cc:137 bool onnxruntime::CannCall(ERRTYPE, const char*, const char*, ERRTYPE, const char*) [with ERRTYPE = int; bool THRW = true] CANN failure 100002: ACL_ERROR_REPEAT_INITIALIZE ; NPU=0 ; hostname=coder-6a95445b-e353-4b0b-970b-41797b02ca23-566969c97-xlf2s ; expr=aclInit(nullptr);
*************** EP Error ***************
EP Error /home/dou/code/onnxruntime/onnxruntime/python/onnxruntime_pybind_state.cc:1231 std::shared_ptr<onnxruntime::IExecutionProviderFactory> onnxruntime::python::CreateExecutionProviderFactoryInstance(const onnxruntime::SessionOptions&, const string&, const ProviderOptionsMap&) create CANN ExecutionProvider fail when using [('CANNExecutionProvider', {'device_id': 0, 'arena_extend_strategy': 'kNextPowerOfTwo', 'npu_mem_limit': 4294967296, 'enable_cann_graph': True}), 'CPUExecutionProvider']
Falling back to ['CPUExecutionProvider'] and retrying.
****************************************
================= before run
[array([[ -8.8664875, -7.903085 , -6.529765 , -6.0811057, -4.148506 , -6.4790154, -5.8431354, -8.300879 , -6.77909 , -8.013498 , -8.759175 , -7.237462 , -6.890937 , -8.367442 , -7.565303 ,
```
1 parent 697283d commit f20df72

File tree

4 files changed

+50
-29
lines changed

4 files changed

+50
-29
lines changed

onnxruntime/core/providers/cann/cann_call.cc

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
// Licensed under the MIT License.
44

55
#include <string.h>
6+
#include <cstdio>
67
#include "core/providers/shared_library/provider_api.h"
78
#include "cann_call.h"
9+
#include "core/providers/cann/cann_utils.h"
810

911
namespace onnxruntime {
1012

@@ -121,28 +123,32 @@ const char* CannErrString<ge::graphStatus>(ge::graphStatus e) {
121123
template <typename ERRTYPE, bool THRW>
122124
bool CannCall(ERRTYPE retCode, const char* exprString, const char* libName, ERRTYPE successCode, const char* msg) {
123125
if (retCode != successCode) {
124-
try {
125-
char hostname[HOST_NAME_MAX];
126-
if (gethostname(hostname, HOST_NAME_MAX) != 0)
127-
snprintf(hostname, HOST_NAME_MAX, "%s", "?");
128-
int currentCannDevice;
129-
(void)aclrtGetDevice(&currentCannDevice);
130-
(void)aclGetRecentErrMsg();
131-
static char str[1024];
132-
snprintf(str, sizeof(str), "%s failure %d: %s ; NPU=%d ; hostname=%s ; expr=%s; %s",
133-
libName, static_cast<int>(retCode), CannErrString(retCode), currentCannDevice,
134-
hostname,
135-
exprString, msg);
136-
if (THRW) {
137-
ORT_THROW(str);
138-
} else {
139-
LOGS_DEFAULT(ERROR) << str;
140-
}
141-
} catch (const std::exception& e) {
142-
if (THRW) {
143-
ORT_THROW(e.what());
144-
} else {
145-
LOGS_DEFAULT(ERROR) << e.what();
126+
if (retCode == ACL_ERROR_REPEAT_INITIALIZE) {
127+
cann::SetRepeatInitFlag(true);
128+
} else {
129+
try {
130+
char hostname[HOST_NAME_MAX];
131+
if (gethostname(hostname, HOST_NAME_MAX) != 0)
132+
snprintf(hostname, HOST_NAME_MAX, "%s", "?");
133+
int currentCannDevice;
134+
(void)aclrtGetDevice(&currentCannDevice);
135+
(void)aclGetRecentErrMsg();
136+
char str[1024];
137+
snprintf(str, sizeof(str), "%s failure %d: %s ; NPU=%d ; hostname=%s ; expr=%s; %s",
138+
libName, static_cast<int>(retCode), CannErrString(retCode), currentCannDevice,
139+
hostname,
140+
exprString, msg);
141+
if (THRW) {
142+
ORT_THROW(str);
143+
} else {
144+
LOGS_DEFAULT(ERROR) << str;
145+
}
146+
} catch (const std::exception& e) {
147+
if (THRW) {
148+
ORT_THROW(e.what());
149+
} else {
150+
LOGS_DEFAULT(ERROR) << e.what();
151+
}
146152
}
147153
}
148154
return false;

onnxruntime/core/providers/cann/cann_execution_provider.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "core/providers/cann/cann_fwd.h"
1919
#include "core/providers/cann/cann_stream_handle.h"
2020
#include "core/providers/cann/npu_data_transfer.h"
21+
#include "core/providers/cann/cann_utils.h"
2122

2223
using onnxruntime::cann::BuildONNXModel;
2324
using onnxruntime::cann::CannModelPreparation;
@@ -1068,7 +1069,9 @@ void DeleteRegistry() {
10681069

10691070
ge::aclgrphBuildFinalize();
10701071

1071-
CANN_CALL_THROW(aclFinalize());
1072+
if (!cann::GetRepeatInitFlag()) {
1073+
CANN_CALL_THROW(aclFinalize());
1074+
}
10721075
}
10731076

10741077
std::shared_ptr<KernelRegistry> CANNExecutionProvider::GetKernelRegistry() const {
@@ -1393,7 +1396,7 @@ Status CANNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fuse
13931396
modelID = modelIDs_[filename];
13941397
} else {
13951398
std::lock_guard<std::mutex> lock(g_mutex);
1396-
auto filename_with_suffix = cann::RegexMatchFile(filename);
1399+
auto filename_with_suffix = cann::MatchFile(filename);
13971400
if (!filename_with_suffix.empty()) {
13981401
CANN_RETURN_IF_ERROR(aclmdlLoadFromFile(filename_with_suffix.c_str(), &modelID));
13991402
} else {

onnxruntime/core/providers/cann/cann_utils.cc

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
#include <unistd.h>
66
#include <algorithm>
7+
#include <string>
78

89
#include "core/providers/cann/cann_utils.h"
910

@@ -229,18 +230,28 @@ bool is_dynamic_shape(const aclmdlIODims& dims) {
229230
}
230231

231232
namespace fs = std::filesystem;
232-
std::string RegexMatchFile(const std::string& file_name) {
233+
std::string MatchFile(const std::string& file_name) {
233234
fs::path current_dir = fs::current_path();
234-
std::regex pattern(file_name);
235+
235236
for (const auto& entry : fs::directory_iterator(current_dir)) {
236237
if (entry.is_regular_file()) {
237238
std::string name = entry.path().filename().string();
238-
if (std::regex_search(name, pattern)) {
239+
if (name.find(file_name) != std::string::npos && entry.path().extension() == ".om") {
239240
return name;
240241
}
241242
}
242243
}
243244
return "";
244245
}
246+
247+
static bool repeat_acl_init_flag = false;
248+
249+
bool GetRepeatInitFlag() {
250+
return repeat_acl_init_flag;
251+
}
252+
253+
void SetRepeatInitFlag(bool val) {
254+
repeat_acl_init_flag = val;
255+
}
245256
} // namespace cann
246257
} // namespace onnxruntime

onnxruntime/core/providers/cann/cann_utils.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,9 @@ Status aclrtblasGemmEx(aclTransType transA,
127127
bool FileExist(const std::string& file_name);
128128
void GenerateHashValue(const std::string string, HashValue& hash_value);
129129
bool is_dynamic_shape(const aclmdlIODims& dims);
130-
std::string RegexMatchFile(const std::string& file_name);
130+
std::string MatchFile(const std::string& file_name);
131131
std::unique_ptr<Model> CreateModel(const GraphViewer& graph_viewer, const logging::Logger& logger);
132-
132+
bool GetRepeatInitFlag();
133+
void SetRepeatInitFlag(bool val);
133134
} // namespace cann
134135
} // namespace onnxruntime

0 commit comments

Comments
 (0)