Skip to content

Commit 5739d52

Browse files
Broadcast signal to all threads while handling USM pagefault
Related-To: NEO-4721 Change-Id: I77185f8db2576f626c1b6b5615ab5d8f9b22076f Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
1 parent e3ccf19 commit 5739d52

File tree

8 files changed

+122
-10
lines changed

8 files changed

+122
-10
lines changed

shared/source/page_fault_manager/cpu_page_fault_manager.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ bool PageFaultManager::verifyPageFault(void *ptr) {
6666
auto allocPtr = alloc.first;
6767
auto &pageFaultData = alloc.second;
6868
if (ptr >= allocPtr && ptr < ptrOffset(allocPtr, pageFaultData.size)) {
69+
this->broadcastWaitSignal();
6970
this->allowCPUMemoryAccess(allocPtr, pageFaultData.size);
7071
this->setAubWritable(true, allocPtr, pageFaultData.unifiedMemoryManager);
7172
this->transferToCpu(allocPtr, pageFaultData.size, pageFaultData.cmdQ);
@@ -81,4 +82,9 @@ void PageFaultManager::setAubWritable(bool writable, void *ptr, SVMAllocsManager
8182
auto gpuAlloc = unifiedMemoryManager->getSVMAlloc(ptr)->gpuAllocations.getDefaultGraphicsAllocation();
8283
gpuAlloc->setAubWritable(writable, GraphicsAllocation::allBanks);
8384
}
85+
86+
void PageFaultManager::waitForCopy() {
87+
std::unique_lock<SpinLock> lock{mtx};
88+
}
89+
8490
} // namespace NEO

shared/source/page_fault_manager/cpu_page_fault_manager.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ class PageFaultManager : public NonCopyableOrMovableClass {
3838
virtual void allowCPUMemoryAccess(void *ptr, size_t size) = 0;
3939
virtual void protectCPUMemoryAccess(void *ptr, size_t size) = 0;
4040

41+
virtual void broadcastWaitSignal() = 0;
42+
MOCKABLE_VIRTUAL void waitForCopy();
43+
4144
MOCKABLE_VIRTUAL bool verifyPageFault(void *ptr);
4245
MOCKABLE_VIRTUAL void transferToCpu(void *ptr, size_t size, void *cmdQ);
4346
MOCKABLE_VIRTUAL void transferToGpu(void *ptr, void *cmdQ);

shared/source/page_fault_manager/linux/cpu_page_fault_manager_linux.cpp

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@
99

1010
#include "shared/source/helpers/debug_helpers.h"
1111

12+
#include <dirent.h>
1213
#include <sys/mman.h>
14+
#include <sys/syscall.h>
15+
#include <sys/types.h>
16+
#include <unistd.h>
1317

1418
namespace NEO {
1519
std::unique_ptr<PageFaultManager> PageFaultManager::create() {
@@ -20,21 +24,30 @@ std::function<void(int signal, siginfo_t *info, void *context)> PageFaultManager
2024

2125
/// Registers the manager's signal handler for SIGSEGV and SIGUSR1 and stores
/// the action that was previously installed for each of the two signals.
/// Aborts via UNRECOVERABLE_IF when either sigaction() call fails.
PageFaultManagerLinux::PageFaultManagerLinux() {
    // One dispatcher serves both signals: SIGUSR1 calls waitForCopy(); any other
    // signal is treated as a page fault and is forwarded to the previous handler
    // when verifyPageFault() does not accept the faulting address.
    pageFaultHandler = [this](int signal, siginfo_t *info, void *context) {
        if (signal == SIGUSR1) {
            this->waitForCopy();
            return;
        }
        if (!this->verifyPageFault(info->si_addr)) {
            callPreviousHandler(signal, info, context);
        }
    };

    struct sigaction handlerAction = {};
    handlerAction.sa_sigaction = pageFaultHandlerWrapper;
    handlerAction.sa_flags = SA_SIGINFO; // handler uses the three-argument form

    const auto segvStatus = sigaction(SIGSEGV, &handlerAction, &previousPageFaultHandler);
    UNRECOVERABLE_IF(segvStatus != 0);

    const auto usr1Status = sigaction(SIGUSR1, &handlerAction, &previousUserSignalHandler);
    UNRECOVERABLE_IF(usr1Status != 0);
}
3444

3545
/// Re-installs the signal actions that were active before this manager was
/// constructed. Aborts via UNRECOVERABLE_IF when a sigaction() call fails.
PageFaultManagerLinux::~PageFaultManagerLinux() {
    // previousHandlerRestored is set by callPreviousHandler() only after it has
    // already re-installed the previous SIGSEGV action, so it says nothing about
    // SIGUSR1 and must not gate the SIGUSR1 restore.
    if (!previousHandlerRestored) {
        auto retVal = sigaction(SIGSEGV, &previousPageFaultHandler, nullptr);
        UNRECOVERABLE_IF(retVal != 0);
    }

    // Always undo the SIGUSR1 registration; otherwise the signal would keep
    // being routed to a handler that refers to this destroyed object.
    auto retVal = sigaction(SIGUSR1, &previousUserSignalHandler, nullptr);
    UNRECOVERABLE_IF(retVal != 0);
}
@@ -54,18 +67,50 @@ void PageFaultManagerLinux::protectCPUMemoryAccess(void *ptr, size_t size) {
5467
}
5568

5669
void PageFaultManagerLinux::callPreviousHandler(int signal, siginfo_t *info, void *context) {
57-
if (previousHandler.sa_flags & SA_SIGINFO) {
58-
previousHandler.sa_sigaction(signal, info, context);
70+
if (previousPageFaultHandler.sa_flags & SA_SIGINFO) {
71+
previousPageFaultHandler.sa_sigaction(signal, info, context);
5972
} else {
60-
if (previousHandler.sa_handler == SIG_DFL) {
61-
auto retVal = sigaction(SIGSEGV, &previousHandler, nullptr);
73+
if (previousPageFaultHandler.sa_handler == SIG_DFL) {
74+
auto retVal = sigaction(SIGSEGV, &previousPageFaultHandler, nullptr);
6275
UNRECOVERABLE_IF(retVal != 0);
6376
previousHandlerRestored = true;
64-
} else if (previousHandler.sa_handler == SIG_IGN) {
77+
} else if (previousPageFaultHandler.sa_handler == SIG_IGN) {
6578
return;
6679
} else {
67-
previousHandler.sa_handler(signal);
80+
previousPageFaultHandler.sa_handler(signal);
81+
}
82+
}
83+
}
84+
85+
/* This function is a WA for USM issue in multithreaded environment
86+
While handling page fault, before copy starts, user signal (SIGUSR1)
87+
is broadcasted to ensure that every thread received signal and is
88+
stucked on PageFaultHandler's mutex before copy from GPU to CPU proceeds. */
89+
void PageFaultManagerLinux::broadcastWaitSignal() {
90+
auto selfThreadId = syscall(__NR_gettid);
91+
92+
auto procDir = opendir("/proc/self/task");
93+
UNRECOVERABLE_IF(!procDir);
94+
95+
struct dirent *dirEntry;
96+
while ((dirEntry = readdir(procDir)) != NULL) {
97+
if (dirEntry->d_name[0] == '.') {
98+
continue;
6899
}
100+
101+
int threadId = atoi(dirEntry->d_name);
102+
if (threadId == selfThreadId) {
103+
continue;
104+
}
105+
106+
sendSignalToThread(threadId);
69107
}
108+
109+
closedir(procDir);
110+
}
111+
112+
/// Sends SIGUSR1 to a single thread of the current process.
///
/// @param threadId  kernel thread id (as returned by gettid) of the target
void PageFaultManagerLinux::sendSignalToThread(int threadId) {
    // tgkill instead of the obsolete tkill: scoping delivery to our own thread
    // group means a thread id that was recycled by another process after the
    // target thread exited can never be signalled by mistake.
    // The result is intentionally ignored - a thread that has already exited
    // simply yields an error and needs no signal.
    syscall(SYS_tgkill, getpid(), threadId, SIGUSR1);
}
115+
71116
} // namespace NEO

shared/source/page_fault_manager/linux/cpu_page_fault_manager_linux.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,15 @@ class PageFaultManagerLinux : public PageFaultManager {
2424
void allowCPUMemoryAccess(void *ptr, size_t size) override;
2525
void protectCPUMemoryAccess(void *ptr, size_t size) override;
2626

27+
void broadcastWaitSignal() override;
28+
MOCKABLE_VIRTUAL void sendSignalToThread(int threadId);
29+
2730
void callPreviousHandler(int signal, siginfo_t *info, void *context);
2831
bool previousHandlerRestored = false;
2932

3033
static std::function<void(int signal, siginfo_t *info, void *context)> pageFaultHandler;
31-
struct sigaction previousHandler = {};
34+
35+
struct sigaction previousPageFaultHandler = {};
36+
struct sigaction previousUserSignalHandler = {};
3237
};
3338
} // namespace NEO

shared/source/page_fault_manager/windows/cpu_page_fault_manager_windows.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,4 +50,7 @@ void PageFaultManagerWindows::protectCPUMemoryAccess(void *ptr, size_t size) {
5050
auto retVal = VirtualProtect(ptr, size, PAGE_NOACCESS, &previousState);
5151
UNRECOVERABLE_IF(!retVal);
5252
}
53+
54+
// Windows implementation of the broadcastWaitSignal() hook: intentionally empty,
// no signal is sent to other threads here.
void PageFaultManagerWindows::broadcastWaitSignal() {}
55+
5356
} // namespace NEO

shared/source/page_fault_manager/windows/cpu_page_fault_manager_windows.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ class PageFaultManagerWindows : public PageFaultManager {
2525
void allowCPUMemoryAccess(void *ptr, size_t size) override;
2626
void protectCPUMemoryAccess(void *ptr, size_t size) override;
2727

28+
void broadcastWaitSignal() override;
29+
2830
static std::function<LONG(struct _EXCEPTION_POINTERS *exceptionInfo)> pageFaultHandler;
2931
PVOID previousHandler;
3032
};

shared/test/unit_test/page_fault_manager/linux/cpu_page_fault_manager_linux_tests.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,59 @@
1313

1414
#include <atomic>
#include <csignal>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <thread>
#include <unistd.h>
1619

1720
using namespace NEO;
1821

1922
using PageFaultManagerLinuxTest = PageFaultManagerConfigFixture;
2023
using MockPageFaultManagerLinux = MockPageFaultManagerHandlerInvoke<PageFaultManagerLinux>;
2124

25+
struct UserSignalMockPageFaultManagerLinux : public PageFaultManagerLinux {
26+
using PageFaultManager::verifyPageFault;
27+
28+
UserSignalMockPageFaultManagerLinux() {
29+
ownThread = std::thread([&]() {
30+
while (!waitForCopyCalled) {
31+
if (ownThreadId == -1) {
32+
ownThreadId = static_cast<int>(syscall(__NR_gettid));
33+
}
34+
}
35+
});
36+
while (ownThreadId == -1) {
37+
}
38+
}
39+
40+
void allowCPUMemoryAccess(void *ptr, size_t size) override {}
41+
void transferToCpu(void *ptr, size_t size, void *cmdQ) override {}
42+
void setAubWritable(bool writable, void *ptr, SVMAllocsManager *unifiedMemoryManager) override {}
43+
44+
void sendSignalToThread(int threadId) override {
45+
PageFaultManagerLinux::sendSignalToThread(ownThreadId);
46+
}
47+
48+
void waitForCopy() override {
49+
PageFaultManagerLinux::waitForCopy();
50+
waitForCopyCalled = true;
51+
}
52+
53+
std::thread ownThread;
54+
int ownThreadId = -1;
55+
bool waitForCopyCalled = false;
56+
};
57+
58+
// Verifies that handling a page fault for a tracked allocation results in
// waitForCopy() being executed through the user signal sent to another thread.
TEST_F(PageFaultManagerLinuxTest, whenVeryfyingPageFaultThenUserSignalIsSentToOtherThreads) {
    auto manager = std::make_unique<UserSignalMockPageFaultManagerLinux>();

    // Dummy address: the mock stubs out every hook that would touch the memory.
    void *fakeAllocation = reinterpret_cast<void *>(0x1);
    manager->insertAllocation(fakeAllocation, 10, nullptr, nullptr);

    manager->verifyPageFault(fakeAllocation);
    // The helper thread only finishes after it has seen waitForCopyCalled set.
    manager->ownThread.join();

    EXPECT_TRUE(manager->waitForCopyCalled);
}
68+
2269
TEST_F(PageFaultManagerLinuxTest, whenPageFaultIsRaisedThenHandlerIsInvoked) {
2370
auto pageFaultManager = std::make_unique<MockPageFaultManagerLinux>();
2471
EXPECT_FALSE(pageFaultManager->handlerInvoked);

shared/test/unit_test/page_fault_manager/mock_cpu_page_fault_manager.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ class MockPageFaultManager : public PageFaultManager {
4949
void baseGpuTransfer(void *ptr, void *cmdQ) {
5050
PageFaultManager::transferToGpu(ptr, cmdQ);
5151
}
52+
// No-op override: this mock sends no signal to other threads.
void broadcastWaitSignal() override {}
5253

5354
int allowMemoryAccessCalled = 0;
5455
int protectMemoryCalled = 0;

0 commit comments

Comments
 (0)