Skip to content

Commit a67e829

Browse files
Store crossThreadData per root device in Kernel
Related-To: NEO-5001
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
1 parent 556b6cd commit a67e829

39 files changed: +306 -269 lines changed

opencl/source/built_ins/vme_dispatch_builder.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
#include "opencl/source/accelerators/intel_motion_estimation.h"
1414
#include "opencl/source/built_ins/built_in_ops_vme.h"
1515
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
16+
#include "opencl/source/cl_device/cl_device.h"
1617
#include "opencl/source/helpers/dispatch_info_builder.h"
1718
#include "opencl/source/mem_obj/buffer.h"
1819
#include "opencl/source/mem_obj/image.h"
@@ -167,7 +168,7 @@ class VmeBuiltinDispatchInfoBuilder : public BuiltinDispatchInfoBuilder {
167168
DEBUG_BREAK_IF(kai.kernelArgPatchInfoVector.size() != 1);
168169
const KernelArgPatchInfo &patchInfo = kai.kernelArgPatchInfoVector[0];
169170
DEBUG_BREAK_IF(sizeof(RetType) > patchInfo.size);
170-
return *(RetType *)(vmeKernel->getCrossThreadData() + patchInfo.crossthreadOffset);
171+
return *(RetType *)(vmeKernel->getCrossThreadData(clDevice.getRootDeviceIndex()) + patchInfo.crossthreadOffset);
171172
}
172173

173174
cl_int validateImages(Vec3<size_t> inputRegion, Vec3<size_t> offset) const {

opencl/source/command_queue/gpgpu_walker_bdw_plus.inl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -150,7 +150,7 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
150150
&cmdWalker,
151151
nullptr,
152152
true,
153-
devQueueHw.getDevice().getHardwareInfo());
153+
devQueueHw.getDevice());
154154

155155
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
156156
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, true);

opencl/source/command_queue/hardware_interface_bdw_plus.inl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -102,7 +102,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
102102
&walkerCmd,
103103
nullptr,
104104
true,
105-
commandQueue.getDevice().getHardwareInfo());
105+
commandQueue.getDevice());
106106

107107
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, globalOffsets, startWorkGroups,
108108
numWorkGroups, localWorkSizes, simd, dim,

opencl/source/helpers/hardware_commands_helper.h

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,8 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
7676
Kernel &kernel,
7777
bool inlineDataProgrammingRequired,
7878
WALKER_TYPE<GfxFamily> *walkerCmd,
79-
uint32_t &sizeCrossThreadData);
79+
uint32_t &sizeCrossThreadData,
80+
uint32_t rootDeviceIndex);
8081

8182
static size_t sendIndirectState(
8283
LinearStream &commandStream,
@@ -93,7 +94,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
9394
WALKER_TYPE<GfxFamily> *walkerCmd,
9495
INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
9596
bool localIdsGenerationByRuntime,
96-
const HardwareInfo &hardwareInfo);
97+
const Device &device);
9798

9899
static void programPerThreadData(
99100
size_t &sizePerThreadData,
@@ -121,6 +122,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
121122
static size_t getSizeRequiredDSH(
122123
const Kernel &kernel);
123124
static size_t getSizeRequiredIOH(
125+
uint32_t rootDeviceIndex,
124126
const Kernel &kernel,
125127
size_t localWorkSize = 256);
126128
static size_t getSizeRequiredSSH(

opencl/source/helpers/hardware_commands_helper_base.inl

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,7 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredDSH(
5858

5959
template <typename GfxFamily>
6060
size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(
61+
uint32_t rootDeviceIndex,
6162
const Kernel &kernel,
6263
size_t localWorkSize) {
6364
typedef typename GfxFamily::WALKER_TYPE WALKER_TYPE;
@@ -67,7 +68,7 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(
6768

6869
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
6970
uint32_t grfSize = sizeof(typename GfxFamily::GRF);
70-
return alignUp((kernel.getCrossThreadDataSize() +
71+
return alignUp((kernel.getCrossThreadDataSize(rootDeviceIndex) +
7172
getPerThreadDataSizeTotal(kernel.getKernelInfo().getMaxSimdSize(), grfSize, numChannels, localWorkSize)),
7273
WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
7374
}
@@ -102,7 +103,10 @@ size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(
102103
template <typename GfxFamily>
103104
size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(
104105
const MultiDispatchInfo &multiDispatchInfo) {
105-
return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredIOH(*dispatchInfo.getKernel(), Math::computeTotalElementsCount(dispatchInfo.getLocalWorkgroupSize())); });
106+
return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredIOH(
107+
dispatchInfo.getClDevice().getRootDeviceIndex(),
108+
*dispatchInfo.getKernel(),
109+
Math::computeTotalElementsCount(dispatchInfo.getLocalWorkgroupSize())); });
106110
}
107111

108112
template <typename GfxFamily>
@@ -215,10 +219,13 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
215219
WALKER_TYPE<GfxFamily> *walkerCmd,
216220
INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
217221
bool localIdsGenerationByRuntime,
218-
const HardwareInfo &hardwareInfo) {
222+
const Device &device) {
219223

220224
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
221225

226+
auto &hardwareInfo = device.getHardwareInfo();
227+
auto rootDeviceIndex = device.getRootDeviceIndex();
228+
222229
DEBUG_BREAK_IF(simd != 1 && simd != 8 && simd != 16 && simd != 32);
223230
auto inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(kernel);
224231

@@ -227,7 +234,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
227234
const auto &patchInfo = kernelInfo.patchInfo;
228235

229236
ssh.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
230-
kernel.patchBindlessSurfaceStateOffsets(ssh.getUsed());
237+
kernel.patchBindlessSurfaceStateOffsets(device, ssh.getUsed());
231238

232239
auto dstBindingTablePointer = EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(ssh, (kernelInfo.patchInfo.bindingTableState != nullptr) ? kernelInfo.patchInfo.bindingTableState->Count : 0,
233240
kernel.getSurfaceStateHeap(), kernel.getSurfaceStateHeapSize(),
@@ -248,11 +255,11 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
248255
auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems));
249256
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
250257

251-
uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();
258+
uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize(rootDeviceIndex);
252259

253260
size_t offsetCrossThreadData = HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
254261
ioh, kernel, inlineDataProgrammingRequired,
255-
walkerCmd, sizeCrossThreadData);
262+
walkerCmd, sizeCrossThreadData, rootDeviceIndex);
256263

257264
size_t sizePerThreadDataTotal = 0;
258265
size_t sizePerThreadData = 0;

opencl/source/helpers/hardware_commands_helper_bdw_plus.inl

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -125,12 +125,13 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
125125
Kernel &kernel,
126126
bool inlineDataProgrammingRequired,
127127
WALKER_TYPE<GfxFamily> *walkerCmd,
128-
uint32_t &sizeCrossThreadData) {
128+
uint32_t &sizeCrossThreadData,
129+
uint32_t rootDeviceIndex) {
129130
indirectHeap.align(WALKER_TYPE<GfxFamily>::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
130131

131132
auto offsetCrossThreadData = indirectHeap.getUsed();
132133
char *pDest = static_cast<char *>(indirectHeap.getSpace(sizeCrossThreadData));
133-
memcpy_s(pDest, sizeCrossThreadData, kernel.getCrossThreadData(), sizeCrossThreadData);
134+
memcpy_s(pDest, sizeCrossThreadData, kernel.getCrossThreadData(rootDeviceIndex), sizeCrossThreadData);
134135

135136
if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
136137
FlatBatchBufferHelper::fixCrossThreadDataInfo(kernel.getPatchInfoDataList(), offsetCrossThreadData, indirectHeap.getGraphicsAllocation()->getGpuAddress());

0 commit comments

Comments
 (0)