Skip to content

Commit 81cd6ea

Browse files
[QNN EP] Fix multithread sync bug in ETW callback (microsoft#23156)
### Description Fixes crash in QNN dlls when an ETW callback tries to change the QNN log level. This is caused by a function that does not lock a mutex before modifying the QNN log level. ### Motivation and Context An ETW callback into QNN EP leads to a crash within QNN SDK dlls. It happens approximately 1 out of 3 full QNN unit tests runs. The cause is a multithreading synchronization bug in QNN EP. We're not always locking a mutex when ETW calls QNN EP to notify of ETW config change. There are two branches in the QNN EP callback function that try to update the QNN log handle. One branch correctly locks a mutex, but other does not lock it at all. This causes crashes within QNN dlls. - Does not lock mutex: [onnxruntime/onnxruntime/core/providers/qnn/qnn_execution_provider.cc at main · microsoft/onnxruntime](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/providers/qnn/qnn_execution_provider.cc#L426) - Locks mutex: [onnxruntime/onnxruntime/core/providers/qnn/qnn_execution_provider.cc at main · microsoft/onnxruntime](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/providers/qnn/qnn_execution_provider.cc#L442) The fix is to lock the mutex in both paths.
1 parent c6ba7ed commit 81cd6ea

File tree

3 files changed

+49
-47
lines changed

3 files changed

+49
-47
lines changed

onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,9 @@ void QnnLogging(const char* format,
255255
}
256256
}
257257

258-
Status QnnBackendManager::InitializeQnnLog() {
258+
Status QnnBackendManager::InitializeQnnLog(const logging::Logger& logger) {
259+
logger_ = &logger;
260+
259261
// Set Qnn log level align with Ort log level
260262
auto ort_log_level = logger_->GetSeverity();
261263
QnnLog_Level_t qnn_log_level = MapOrtSeverityToQNNLogLevel(ort_log_level);
@@ -303,23 +305,15 @@ QnnLog_Level_t QnnBackendManager::MapOrtSeverityToQNNLogLevel(logging::Severity
303305
}
304306
}
305307

306-
Status QnnBackendManager::ResetQnnLogLevel() {
308+
Status QnnBackendManager::ResetQnnLogLevel(std::optional<logging::Severity> ort_log_level) {
307309
std::lock_guard<std::mutex> lock(logger_mutex_);
308-
309-
if (backend_setup_completed_ && logger_ != nullptr) {
310-
auto ort_log_level = logger_->GetSeverity();
311-
LOGS(*logger_, INFO) << "Reset Qnn log level to ORT Logger level: " << (unsigned int)ort_log_level;
312-
return UpdateQnnLogLevel(ort_log_level);
310+
if (!backend_setup_completed_ || logger_ == nullptr) {
311+
return Status::OK();
313312
}
314-
return Status::OK();
315-
}
316-
317-
Status QnnBackendManager::UpdateQnnLogLevel(logging::Severity ort_log_level) {
318313
ORT_RETURN_IF(nullptr == log_handle_, "Unable to update QNN Log Level. Invalid QNN log handle.");
319-
ORT_RETURN_IF(false == backend_setup_completed_, "Unable to update QNN Log Level. Backend setup not completed.");
320-
ORT_RETURN_IF(nullptr == logger_, "Unable to update QNN Log Level. Invalid logger.");
321314

322-
QnnLog_Level_t qnn_log_level = MapOrtSeverityToQNNLogLevel(ort_log_level);
315+
logging::Severity actual_log_level = ort_log_level.has_value() ? *ort_log_level : logger_->GetSeverity();
316+
QnnLog_Level_t qnn_log_level = MapOrtSeverityToQNNLogLevel(actual_log_level);
323317

324318
LOGS(*logger_, INFO) << "Updating Qnn log level to: " << qnn_log_level;
325319

@@ -332,7 +326,8 @@ Status QnnBackendManager::UpdateQnnLogLevel(logging::Severity ort_log_level) {
332326
LOGS(*logger_, ERROR) << "Invalid log handle provided to QnnLog_setLogLevel.";
333327
}
334328
}
335-
ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, "Failed to set log level in Qnn backend. Error: ", QnnErrorHandleToString(result));
329+
ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result,
330+
"Failed to set log level in Qnn backend. Error: ", QnnErrorHandleToString(result));
336331
return Status::OK();
337332
}
338333

@@ -823,7 +818,7 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger,
823818
LOGS(logger, VERBOSE) << "Backend build version: "
824819
<< sdk_build_version_;
825820

826-
SetLogger(&logger);
821+
ORT_RETURN_IF_ERROR(InitializeQnnLog(logger));
827822
LOGS(logger, VERBOSE) << "SetLogger succeed.";
828823

829824
ORT_RETURN_IF_ERROR(InitializeBackend());
@@ -1049,6 +1044,24 @@ Status QnnBackendManager::DestroyHTPPowerConfigID(uint32_t htp_power_config_id)
10491044
return Status::OK();
10501045
}
10511046

1047+
Status QnnBackendManager::TerminateQnnLog() {
1048+
std::lock_guard<std::mutex> lock(logger_mutex_);
1049+
if (logger_ == nullptr) {
1050+
return Status::OK();
1051+
}
1052+
1053+
if (nullptr != qnn_interface_.logFree && nullptr != log_handle_) {
1054+
auto ret_val = qnn_interface_.logFree(log_handle_);
1055+
1056+
// Reset QNN log handle to nullptr so other threads that are waiting on logger_mutex_ know it was freed.
1057+
log_handle_ = nullptr;
1058+
ORT_RETURN_IF(QNN_SUCCESS != ret_val,
1059+
"Unable to terminate logging in the backend.");
1060+
}
1061+
1062+
return Status::OK();
1063+
}
1064+
10521065
void QnnBackendManager::ReleaseResources() {
10531066
if (!backend_setup_completed_) {
10541067
return;
@@ -1074,7 +1087,6 @@ void QnnBackendManager::ReleaseResources() {
10741087
ORT_THROW("Failed to ShutdownBackend.");
10751088
}
10761089

1077-
std::lock_guard<std::mutex> lock(logger_mutex_);
10781090
result = TerminateQnnLog();
10791091
if (Status::OK() != result) {
10801092
ORT_THROW("Failed to TerminateQnnLog.");

onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h

Lines changed: 18 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ class QnnBackendManager {
9696
std::unordered_map<std::string, std::unique_ptr<qnn::QnnModel>>& qnn_models,
9797
int64_t max_spill_fill_size);
9898

99+
// Initializes handles to QNN resources (device, logger, etc.).
100+
// NOTE: This function locks the internal `logger_mutex_`.
99101
Status SetupBackend(const logging::Logger& logger, bool load_from_cached_context, bool need_load_system_lib);
100102

101103
Status CreateHtpPowerCfgId(uint32_t deviceId, uint32_t coreId, uint32_t& htp_power_config_id);
@@ -121,34 +123,10 @@ class QnnBackendManager {
121123

122124
const Qnn_ProfileHandle_t& GetQnnProfileHandle() { return profile_backend_handle_; }
123125

124-
void SetLogger(const logging::Logger* logger) {
125-
if (logger_ == nullptr) {
126-
logger_ = logger;
127-
(void)InitializeQnnLog();
128-
}
129-
}
130-
131-
Status InitializeQnnLog();
132-
133-
Status UpdateQnnLogLevel(logging::Severity ort_log_level);
134-
135-
Status ResetQnnLogLevel();
136-
137-
// Terminate logging in the backend
138-
Status TerminateQnnLog() {
139-
if (logger_ == nullptr) {
140-
return Status::OK();
141-
}
142-
143-
if (nullptr != qnn_interface_.logFree && nullptr != log_handle_) {
144-
ORT_RETURN_IF(QNN_SUCCESS != qnn_interface_.logFree(log_handle_),
145-
"Unable to terminate logging in the backend.");
146-
}
147-
148-
return Status::OK();
149-
}
150-
151-
void ReleaseResources();
126+
// Resets the QNN log level to the given ORT log level or to the default log level if the argument is
127+
// std::nullopt.
128+
// NOTE: This function locks the internal `logger_mutex_`.
129+
Status ResetQnnLogLevel(std::optional<logging::Severity> ort_log_level = std::nullopt);
152130

153131
Status ExtractBackendProfilingInfo();
154132
Status ExtractProfilingSubEvents(QnnProfile_EventId_t profile_event_id, std::ofstream& outfile,
@@ -171,6 +149,18 @@ class QnnBackendManager {
171149
uint64_t& max_spill_fill_buffer_size);
172150

173151
private:
152+
// Sets the ORT logger and creates a corresponding QNN logger with the same log level.
153+
// NOTE: caller must lock the `logger_mutex_` before calling this function.
154+
Status InitializeQnnLog(const logging::Logger& logger);
155+
156+
// Terminate logging in the backend
157+
// NOTE: This function locks the internal `logger_mutex_`.
158+
Status TerminateQnnLog();
159+
160+
// Releases all QNN resources. Called in the destructor.
161+
// NOTE: This function indirectly locks the internal `logger_mutex_` via nested function calls.
162+
void ReleaseResources();
163+
174164
void* LoadLib(const char* file_name, int flags, std::string& error_msg);
175165

176166
Status LoadQnnSystemLib();

onnxruntime/core/providers/qnn/qnn_execution_provider.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -423,7 +423,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
423423
if (IsEnabled == EVENT_CONTROL_CODE_ENABLE_PROVIDER) {
424424
if ((MatchAnyKeyword & static_cast<ULONGLONG>(onnxruntime::logging::ORTTraceLoggingKeyword::Logs)) != 0) {
425425
auto ortETWSeverity = etwRegistrationManager.MapLevelToSeverity();
426-
(void)qnn_backend_manager_->UpdateQnnLogLevel(ortETWSeverity);
426+
(void)qnn_backend_manager_->ResetQnnLogLevel(ortETWSeverity);
427427
}
428428
if ((MatchAnyKeyword & static_cast<ULONGLONG>(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)) != 0) {
429429
if (Level != 0) {
@@ -439,7 +439,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
439439

440440
if (IsEnabled == EVENT_CONTROL_CODE_DISABLE_PROVIDER) {
441441
// (void)qnn_backend_manager_->SetProfilingLevelETW(qnn::ProfilingLevel::INVALID);
442-
(void)qnn_backend_manager_->ResetQnnLogLevel();
442+
(void)qnn_backend_manager_->ResetQnnLogLevel(std::nullopt);
443443
}
444444
});
445445
etwRegistrationManager.RegisterInternalCallback(callback_ETWSink_provider_);

0 commit comments

Comments
 (0)