Skip to content

Commit 7334aee

Browse files
fix: Switch copy-engine idle check to try-lock
Related-To: NEO-13325
Replace the blocking obtainUniqueOwnership with the non-blocking tryObtainUniqueOwnership in isCopyEngineOnDeviceIdle. Treat the copy engine as not idle when the lock is contended (conservative; prevents false stops). Avoid deadlock scenarios caused by holding directSubmissionsMutex while waiting on a CSR lock.
Signed-off-by: Slawomir Milczarek <slawomir.milczarek@intel.com>
1 parent e88b4d7 commit 7334aee

File tree

3 files changed

+56
-1
lines changed

3 files changed

+56
-1
lines changed

shared/source/direct_submission/direct_submission_controller.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,13 @@ bool DirectSubmissionController::isCopyEngineOnDeviceIdle(uint32_t rootDeviceInd
253253
return true;
254254
}
255255

256-
auto lock = bcsCsr->obtainUniqueOwnership();
256+
// Non-blocking lock attempt
257+
auto lock = bcsCsr->tryObtainUniqueOwnership();
258+
if (!lock.owns_lock()) {
259+
// Could not acquire -> conservatively declare "not idle"
260+
return false;
261+
}
262+
257263
return (bcsCsr->peekTaskCount() == registeredTaskCount) && isDirectSubmissionIdle(bcsCsr, lock);
258264
}
259265

shared/test/unit_test/direct_submission/direct_submission_controller_mock.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ struct DirectSubmissionControllerMock : public DirectSubmissionController {
1919
using DirectSubmissionController::directSubmissionsMutex;
2020
using DirectSubmissionController::getSleepValue;
2121
using DirectSubmissionController::handlePagingFenceRequests;
22+
using DirectSubmissionController::isCopyEngineOnDeviceIdle;
2223
using DirectSubmissionController::isCsrsContextGroupIdleDetectionEnabled;
2324
using DirectSubmissionController::isDirectSubmissionIdle;
2425
using DirectSubmissionController::keepControlling;

shared/test/unit_test/direct_submission/direct_submission_controller_tests.cpp

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,54 @@ TEST_F(DirectSubmissionIdleDetectionTests, givenSimulatedCsrLockContentionWhenCh
528528
controller->unregisterDirectSubmission(contentionCsr.get());
529529
}
530530

531+
TEST_F(DirectSubmissionIdleDetectionTests, givenTryLockContentionThenIsCopyEngineOnDeviceIdleReturnsFalse) {
532+
struct ContentionSimulatingCsr : public TagUpdateMockCommandStreamReceiver {
533+
using TagUpdateMockCommandStreamReceiver::TagUpdateMockCommandStreamReceiver;
534+
535+
bool simulateContention = false;
536+
537+
std::unique_lock<CommandStreamReceiver::MutexType> tryObtainUniqueOwnership() override {
538+
if (simulateContention) {
539+
return std::unique_lock<CommandStreamReceiver::MutexType>();
540+
}
541+
return TagUpdateMockCommandStreamReceiver::tryObtainUniqueOwnership();
542+
}
543+
};
544+
545+
// Remove original csr from controller
546+
controller->unregisterDirectSubmission(csr.get());
547+
548+
DeviceBitfield deviceBitfield(1);
549+
auto contCsr = std::make_unique<ContentionSimulatingCsr>(executionEnvironment, 0u, deviceBitfield);
550+
auto osCtx = std::unique_ptr<OsContext>(OsContext::create(nullptr, 0, 1,
551+
EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::regular},
552+
PreemptionMode::ThreadGroup, deviceBitfield)));
553+
contCsr->setupContext(*osCtx);
554+
555+
contCsr->taskCount.store(50u);
556+
contCsr->setLatestFlushedTaskCount(50u);
557+
contCsr->isBusyReturnValue = false;
558+
559+
controller->registerDirectSubmission(contCsr.get());
560+
controller->directSubmissions[contCsr.get()].taskCount = 50u;
561+
controller->directSubmissions[contCsr.get()].isStopped = false;
562+
563+
std::optional<TaskCountType> bcsTaskCount(50u);
564+
565+
// No contention -> true
566+
EXPECT_TRUE(controller->isCopyEngineOnDeviceIdle(0u, bcsTaskCount));
567+
568+
// Contention -> false
569+
contCsr->simulateContention = true;
570+
EXPECT_FALSE(controller->isCopyEngineOnDeviceIdle(0u, bcsTaskCount));
571+
572+
// Remove contention -> true again
573+
contCsr->simulateContention = false;
574+
EXPECT_TRUE(controller->isCopyEngineOnDeviceIdle(0u, bcsTaskCount));
575+
576+
controller->unregisterDirectSubmission(contCsr.get());
577+
}
578+
531579
struct DirectSubmissionCheckForCopyEngineIdleTests : public ::testing::Test {
532580
void SetUp() override {
533581
controller = std::make_unique<DirectSubmissionControllerMock>();

0 commit comments

Comments
 (0)