Skip to content

Commit 82bc594

Browse files
Add clEnqueueNDRangeKernelINTEL API
Related-To: NEO-2712 Change-Id: If1d16d9d626871a9dc4b19282f9edc5786ffa398 Signed-off-by: Filip Hazubski <filip.hazubski@intel.com>
1 parent 7be937c commit 82bc594

26 files changed

+764
-14
lines changed

core/program/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ set(NEO_CORE_PROGRAM
88
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
99
${CMAKE_CURRENT_SOURCE_DIR}/print_formatter.cpp
1010
${CMAKE_CURRENT_SOURCE_DIR}/print_formatter.h
11+
${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.cpp
12+
${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler.h
1113
)
1214

1315
set_property(GLOBAL PROPERTY NEO_CORE_PROGRAM ${NEO_CORE_PROGRAM})
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/*
2+
* Copyright (C) 2019 Intel Corporation
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
*/
7+
8+
#include "core/program/sync_buffer_handler.h"
9+
10+
#include "core/memory_manager/graphics_allocation.h"
11+
#include "runtime/command_stream/command_stream_receiver.h"
12+
#include "runtime/kernel/kernel.h"
13+
#include "runtime/memory_manager/memory_manager.h"
14+
15+
namespace NEO {
16+
17+
SyncBufferHandler::~SyncBufferHandler() {
18+
memoryManager.checkGpuUsageAndDestroyGraphicsAllocations(graphicsAllocation);
19+
};
20+
SyncBufferHandler::SyncBufferHandler(Device &device)
21+
: device(device), memoryManager(*device.getMemoryManager()) {
22+
23+
allocateNewBuffer();
24+
}
25+
26+
void SyncBufferHandler::prepareForEnqueue(size_t workGroupsCount, Kernel &kernel, CommandStreamReceiver &csr) {
27+
auto requiredSize = workGroupsCount;
28+
std::lock_guard<std::mutex> guard(this->mutex);
29+
30+
bool isCurrentBufferFull = (usedBufferSize + requiredSize > bufferSize);
31+
if (isCurrentBufferFull) {
32+
memoryManager.checkGpuUsageAndDestroyGraphicsAllocations(graphicsAllocation);
33+
allocateNewBuffer();
34+
usedBufferSize = 0;
35+
}
36+
37+
kernel.patchSyncBuffer(device, graphicsAllocation, usedBufferSize);
38+
csr.makeResident(*graphicsAllocation);
39+
40+
usedBufferSize += requiredSize;
41+
}
42+
43+
void SyncBufferHandler::allocateNewBuffer() {
44+
AllocationProperties allocationProperties{device.getRootDeviceIndex(), true, bufferSize,
45+
GraphicsAllocation::AllocationType::LINEAR_STREAM,
46+
false, false, static_cast<uint32_t>(device.getDeviceBitfield().to_ulong())};
47+
graphicsAllocation = memoryManager.allocateGraphicsMemoryWithProperties(allocationProperties);
48+
UNRECOVERABLE_IF(graphicsAllocation == nullptr);
49+
50+
auto cpuPointer = graphicsAllocation->getUnderlyingBuffer();
51+
std::memset(cpuPointer, 0, bufferSize);
52+
}
53+
54+
} // namespace NEO

core/program/sync_buffer_handler.h

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/*
2+
* Copyright (C) 2019 Intel Corporation
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
*/
7+
8+
#pragma once
9+
10+
#include "core/helpers/basic_math.h"
11+
12+
#include <mutex>
13+
14+
namespace NEO {
15+
16+
class CommandStreamReceiver;
17+
class Context;
18+
class Device;
19+
class GraphicsAllocation;
20+
class MemoryManager;
21+
class Kernel;
22+
23+
class SyncBufferHandler {
24+
public:
25+
~SyncBufferHandler();
26+
27+
SyncBufferHandler(Device &device);
28+
29+
void prepareForEnqueue(size_t workGroupsCount, Kernel &kernel, CommandStreamReceiver &csr);
30+
31+
protected:
32+
void allocateNewBuffer();
33+
34+
Device &device;
35+
MemoryManager &memoryManager;
36+
GraphicsAllocation *graphicsAllocation;
37+
const size_t bufferSize = 64 * KB;
38+
size_t usedBufferSize = 0;
39+
std::mutex mutex;
40+
};
41+
42+
} // namespace NEO

runtime/api/api.cpp

Lines changed: 76 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3135,18 +3135,24 @@ cl_int CL_API_CALL clEnqueueNDRangeKernel(cl_command_queue commandQueue,
31353135
"event", DebugManager.getEvents(reinterpret_cast<const uintptr_t *>(event), 1));
31363136

31373137
CommandQueue *pCommandQueue = nullptr;
3138+
Kernel *pKernel = nullptr;
31383139

31393140
retVal = validateObjects(
31403141
WithCastToInternal(commandQueue, &pCommandQueue),
3141-
kernel,
3142+
WithCastToInternal(kernel, &pKernel),
31423143
EventWaitList(numEventsInWaitList, eventWaitList));
31433144

31443145
if (CL_SUCCESS != retVal) {
31453146
TRACING_EXIT(clEnqueueNDRangeKernel, &retVal);
31463147
return retVal;
31473148
}
31483149

3149-
auto pKernel = castToObjectOrAbort<Kernel>(kernel);
3150+
if (pKernel->getKernelInfo().patchInfo.pAllocateSyncBuffer != nullptr) {
3151+
retVal = CL_INVALID_KERNEL;
3152+
TRACING_EXIT(clEnqueueNDRangeKernel, &retVal);
3153+
return retVal;
3154+
}
3155+
31503156
TakeOwnershipWrapper<Kernel> kernelOwnership(*pKernel, gtpinIsGTPinInitialized());
31513157
if (gtpinIsGTPinInitialized()) {
31523158
gtpinNotifyKernelSubmit(kernel, pCommandQueue);
@@ -3947,6 +3953,7 @@ void *CL_API_CALL clGetExtensionFunctionAddress(const char *funcName) {
39473953
RETURN_FUNC_PTR_IF_EXIST(clGetDeviceFunctionPointerINTEL);
39483954
RETURN_FUNC_PTR_IF_EXIST(clGetDeviceGlobalVariablePointerINTEL);
39493955
RETURN_FUNC_PTR_IF_EXIST(clGetExecutionInfoINTEL);
3956+
RETURN_FUNC_PTR_IF_EXIST(clEnqueueNDRangeKernelINTEL);
39503957

39513958
void *ret = sharingFactory.getExtensionFunctionAddress(funcName);
39523959
if (ret != nullptr) {
@@ -5197,3 +5204,70 @@ cl_int CL_API_CALL clGetExecutionInfoINTEL(cl_command_queue commandQueue,
51975204

51985205
return retVal;
51995206
}
5207+
5208+
cl_int CL_API_CALL clEnqueueNDRangeKernelINTEL(cl_command_queue commandQueue,
5209+
cl_kernel kernel,
5210+
cl_uint workDim,
5211+
const size_t *globalWorkOffset,
5212+
const size_t *workgroupCount,
5213+
const size_t *localWorkSize,
5214+
cl_uint numEventsInWaitList,
5215+
const cl_event *eventWaitList,
5216+
cl_event *event) {
5217+
cl_int retVal = CL_SUCCESS;
5218+
API_ENTER(&retVal);
5219+
DBG_LOG_INPUTS("commandQueue", commandQueue, "cl_kernel", kernel,
5220+
"globalWorkOffset[0]", DebugManager.getInput(globalWorkOffset, 0),
5221+
"globalWorkOffset[1]", DebugManager.getInput(globalWorkOffset, 1),
5222+
"globalWorkOffset[2]", DebugManager.getInput(globalWorkOffset, 2),
5223+
"workgroupCount", DebugManager.getSizes(workgroupCount, workDim, false),
5224+
"localWorkSize", DebugManager.getSizes(localWorkSize, workDim, true),
5225+
"numEventsInWaitList", numEventsInWaitList,
5226+
"eventWaitList", DebugManager.getEvents(reinterpret_cast<const uintptr_t *>(eventWaitList), numEventsInWaitList),
5227+
"event", DebugManager.getEvents(reinterpret_cast<const uintptr_t *>(event), 1));
5228+
5229+
CommandQueue *pCommandQueue = nullptr;
5230+
Kernel *pKernel = nullptr;
5231+
5232+
retVal = validateObjects(
5233+
WithCastToInternal(commandQueue, &pCommandQueue),
5234+
WithCastToInternal(kernel, &pKernel),
5235+
EventWaitList(numEventsInWaitList, eventWaitList));
5236+
5237+
if (CL_SUCCESS != retVal) {
5238+
return retVal;
5239+
}
5240+
5241+
size_t globalWorkSize[3];
5242+
size_t requestedNumberOfWorkgroups = 1;
5243+
for (size_t i = 0; i < workDim; i++) {
5244+
globalWorkSize[i] = workgroupCount[i] * localWorkSize[i];
5245+
requestedNumberOfWorkgroups *= workgroupCount[i];
5246+
}
5247+
5248+
size_t maximalNumberOfWorkgroupsAllowed = pKernel->getMaxWorkGroupCount(workDim, localWorkSize);
5249+
if (requestedNumberOfWorkgroups > maximalNumberOfWorkgroupsAllowed) {
5250+
retVal = CL_INVALID_VALUE;
5251+
return retVal;
5252+
}
5253+
5254+
TakeOwnershipWrapper<Kernel> kernelOwnership(*pKernel, gtpinIsGTPinInitialized());
5255+
if (gtpinIsGTPinInitialized()) {
5256+
gtpinNotifyKernelSubmit(kernel, pCommandQueue);
5257+
}
5258+
5259+
pCommandQueue->getDevice().allocateSyncBufferHandler();
5260+
5261+
retVal = pCommandQueue->enqueueKernel(
5262+
kernel,
5263+
workDim,
5264+
globalWorkOffset,
5265+
globalWorkSize,
5266+
localWorkSize,
5267+
numEventsInWaitList,
5268+
eventWaitList,
5269+
event);
5270+
5271+
DBG_LOG_INPUTS("event", DebugManager.getEvents(reinterpret_cast<const uintptr_t *>(event), 1u));
5272+
return retVal;
5273+
}

runtime/api/api.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1036,6 +1036,16 @@ cl_int CL_API_CALL clGetExecutionInfoINTEL(
10361036
void *paramValue,
10371037
size_t *paramValueSizeRet);
10381038

1039+
cl_int CL_API_CALL clEnqueueNDRangeKernelINTEL(cl_command_queue commandQueue,
1040+
cl_kernel kernel,
1041+
cl_uint workDim,
1042+
const size_t *globalWorkOffset,
1043+
const size_t *workgroupCount,
1044+
const size_t *localWorkSize,
1045+
cl_uint numEventsInWaitList,
1046+
const cl_event *eventWaitList,
1047+
cl_event *event);
1048+
10391049
// OpenCL 2.2
10401050

10411051
cl_int CL_API_CALL clSetProgramSpecializationConstant(

runtime/command_queue/enqueue_common.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
*/
77

88
#pragma once
9+
#include "core/program/sync_buffer_handler.h"
910
#include "core/utilities/range.h"
1011
#include "runtime/built_ins/built_ins.h"
1112
#include "runtime/built_ins/builtins_dispatch_builder.h"
@@ -642,6 +643,15 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
642643
blocking = true;
643644
printfHandler->makeResident(getGpgpuCommandStreamReceiver());
644645
}
646+
647+
if (multiDispatchInfo.peekMainKernel()->usesSyncBuffer()) {
648+
auto &gws = multiDispatchInfo.begin()->getGWS();
649+
auto &lws = multiDispatchInfo.begin()->getLocalWorkgroupSize();
650+
size_t workGroupsCount = (gws.x * gws.y * gws.z) /
651+
(lws.x * lws.y * lws.z);
652+
device->syncBufferHandler->prepareForEnqueue(workGroupsCount, *multiDispatchInfo.peekMainKernel(), getGpgpuCommandStreamReceiver());
653+
}
654+
645655
if (timestampPacketContainer) {
646656
timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver());
647657
timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver());

runtime/compiler_interface/patchtokens_decoder.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,10 @@ inline bool decodeToken(const SPatchItemHeader *token, KernelFromPatchtokens &ou
407407
auto tokDataP = reinterpret_cast<const SPatchDataParameterBuffer *>(token);
408408
decodeKernelDataParameterToken(tokDataP, out);
409409
} break;
410+
411+
case PATCH_TOKEN_ALLOCATE_SYNC_BUFFER: {
412+
assignToken(out.tokens.allocateSyncBuffer, token);
413+
} break;
410414
}
411415

412416
return out.decodeStatus != DecoderError::InvalidBinary;

runtime/compiler_interface/patchtokens_decoder.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ struct KernelFromPatchtokens {
129129
const SPatchAllocateStatelessPrintfSurface *allocateStatelessPrintfSurface = nullptr;
130130
const SPatchAllocateStatelessEventPoolSurface *allocateStatelessEventPoolSurface = nullptr;
131131
const SPatchAllocateStatelessDefaultDeviceQueueSurface *allocateStatelessDefaultDeviceQueueSurface = nullptr;
132+
const SPatchAllocateSyncBuffer *allocateSyncBuffer = nullptr;
132133
const SPatchItemHeader *inlineVmeSamplerInfo = nullptr;
133134
const SPatchGtpinFreeGRFInfo *gtpinFreeGrfInfo = nullptr;
134135
const SPatchStateSIP *stateSip = nullptr;

runtime/device/device.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
#include "core/command_stream/preemption.h"
1111
#include "core/helpers/hw_helper.h"
12+
#include "core/program/sync_buffer_handler.h"
1213
#include "runtime/command_stream/command_stream_receiver.h"
1314
#include "runtime/command_stream/experimental_command_buffer.h"
1415
#include "runtime/device/device_vector.h"
@@ -60,6 +61,7 @@ Device::Device(ExecutionEnvironment *executionEnvironment)
6061

6162
Device::~Device() {
6263
DEBUG_BREAK_IF(nullptr == executionEnvironment->memoryManager.get());
64+
syncBufferHandler.reset();
6365
if (performanceCounters) {
6466
performanceCounters->shutdown();
6567
}
@@ -203,6 +205,15 @@ double Device::getPlatformHostTimerResolution() const {
203205
return osTime->getHostTimerResolution();
204206
return 0.0;
205207
}
208+
209+
void Device::allocateSyncBufferHandler() {
210+
TakeOwnershipWrapper<Device> lock(*this);
211+
if (syncBufferHandler.get() == nullptr) {
212+
syncBufferHandler = std::make_unique<SyncBufferHandler>(*this);
213+
UNRECOVERABLE_IF(syncBufferHandler.get() == nullptr);
214+
}
215+
}
216+
206217
GFXCORE_FAMILY Device::getRenderCoreFamily() const {
207218
return this->getHardwareInfo().platform.eRenderCoreFamily;
208219
}

runtime/device/device.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,9 @@
1818
#include "engine_node.h"
1919

2020
namespace NEO {
21-
class OSTime;
2221
class DriverInfo;
22+
class OSTime;
23+
class SyncBufferHandler;
2324

2425
template <>
2526
struct OpenCLObjectMapper<_cl_device_id> {
@@ -72,6 +73,7 @@ class Device : public BaseObject<_cl_device_id> {
7273
double getPlatformHostTimerResolution() const;
7374
bool isSimulation() const;
7475
GFXCORE_FAMILY getRenderCoreFamily() const;
76+
void allocateSyncBufferHandler();
7577
PerformanceCounters *getPerformanceCounters() { return performanceCounters.get(); }
7678
PreemptionMode getPreemptionMode() const { return preemptionMode; }
7779
MOCKABLE_VIRTUAL bool isSourceLevelDebuggerActive() const;
@@ -88,8 +90,10 @@ class Device : public BaseObject<_cl_device_id> {
8890
virtual uint32_t getRootDeviceIndex() const = 0;
8991
virtual uint32_t getNumAvailableDevices() const = 0;
9092
virtual Device *getDeviceById(uint32_t deviceId) const = 0;
93+
virtual DeviceBitfield getDeviceBitfield() const = 0;
9194

9295
static decltype(&PerformanceCounters::create) createPerformanceCountersFunc;
96+
std::unique_ptr<SyncBufferHandler> syncBufferHandler;
9397

9498
protected:
9599
Device() = delete;
@@ -113,8 +117,6 @@ class Device : public BaseObject<_cl_device_id> {
113117
bool createEngine(uint32_t deviceCsrIndex, aub_stream::EngineType engineType);
114118
MOCKABLE_VIRTUAL std::unique_ptr<CommandStreamReceiver> createCommandStreamReceiver() const;
115119

116-
virtual DeviceBitfield getDeviceBitfield() const = 0;
117-
118120
std::vector<unsigned int> simultaneousInterops;
119121
unsigned int enabledClVersion = 0u;
120122
std::string deviceExtensions;

0 commit comments

Comments
 (0)