Skip to content

Commit ef73bb8

Browse files
Move Walker-specific code to a dedicated method.
- Move cache flushes after the Walker. Change-Id: I58c5e76bad22ac42da2c466ef008ef5bf96df077
1 parent a24704f commit ef73bb8

File tree

4 files changed

+96
-56
lines changed

4 files changed

+96
-56
lines changed

runtime/command_queue/hardware_interface.h

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2018 Intel Corporation
2+
* Copyright (C) 2018-2019 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -80,6 +80,21 @@ class HardwareInterface {
8080
LinearStream *commandStream,
8181
CommandQueue &commandQueue);
8282

83+
// Emits one walker command for a single dispatch into commandStream.
//   commandStream                  stream the walker command is allocated from
//   kernel                         kernel being dispatched
//   commandQueue                   queue whose command stream receiver is asked whether
//                                  timestamp packet writes are enabled
//   currentTimestampPacketNodes    optional (may be nullptr) timestamp packet container
//   dsh / ioh / ssh                indirect heaps forwarded to the indirect-state helper
//   localWorkSizes                 local work size per dimension (3 entries)
//   preemptionMode                 forwarded to the indirect-state helper
//   currentDispatchIndex           index of the timestamp packet node used for this dispatch
//   interfaceDescriptorIndex       passed by non-const reference to the indirect-state helper
//   dispatchInfo                   source of dim, GWS, offset and work-group counts
//   offsetInterfaceDescriptorTable forwarded to the indirect-state helper
static void programWalker(
84+
LinearStream &commandStream,
85+
Kernel &kernel,
86+
CommandQueue &commandQueue,
87+
TimestampPacketContainer *currentTimestampPacketNodes,
88+
IndirectHeap &dsh,
89+
IndirectHeap &ioh,
90+
IndirectHeap &ssh,
91+
size_t localWorkSizes[3],
92+
PreemptionMode preemptionMode,
93+
size_t currentDispatchIndex,
94+
uint32_t &interfaceDescriptorIndex,
95+
const DispatchInfo &dispatchInfo,
96+
size_t offsetInterfaceDescriptorTable);
97+
8398
static WALKER_TYPE<GfxFamily> *allocateWalkerSpace(LinearStream &commandStream,
8499
const Kernel &kernel);
85100
};

runtime/command_queue/hardware_interface.inl

Lines changed: 13 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2018 Intel Corporation
2+
* Copyright (C) 2018-2019 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -12,6 +12,14 @@
1212

1313
namespace OCLRT {
1414

15+
// Reserves sizeof(WALKER_TYPE<GfxFamily>) bytes in commandStream via getSpace(),
// overwrites that space with the GfxFamily::cmdInitGpgpuWalker template and
// returns a pointer to the freshly initialized walker command.
// The kernel argument is not used by this implementation.
template <typename GfxFamily>
16+
inline WALKER_TYPE<GfxFamily> *HardwareInterface<GfxFamily>::allocateWalkerSpace(LinearStream &commandStream,
17+
const Kernel &kernel) {
18+
auto walkerCmd = static_cast<WALKER_TYPE<GfxFamily> *>(commandStream.getSpace(sizeof(WALKER_TYPE<GfxFamily>)));
19+
*walkerCmd = GfxFamily::cmdInitGpgpuWalker; // start from the family's initial walker value
20+
return walkerCmd;
21+
}
22+
1523
template <typename GfxFamily>
1624
void HardwareInterface<GfxFamily>::dispatchWalker(
1725
CommandQueue &commandQueue,
@@ -126,9 +134,6 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
126134
DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3));
127135
DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));
128136

129-
// Determine SIMD size
130-
uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
131-
132137
// If we don't have a required WGS, compute one opportunistically
133138
auto maxWorkGroupSize = static_cast<uint32_t>(commandQueue.getDevice().getDeviceInfo().maxWorkGroupSize);
134139
if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
@@ -148,7 +153,6 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
148153
// Compute number of work groups
149154
Vec3<size_t> twgs = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups()
150155
: generateWorkgroupsNumber(gws, lws);
151-
Vec3<size_t> nwgs = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : twgs;
152156

153157
// Patch our kernel constants
154158
*kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
@@ -183,7 +187,6 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
183187

184188
// Send our indirect object data
185189
size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};
186-
size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z};
187190

188191
dispatchProfilingPerfStartCommands(dispatchInfo, multiDispatchInfo, hwTimeStamps,
189192
hwPerfCounter, commandStream, commandQueue);
@@ -195,47 +198,8 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
195198
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, nullptr, timestampPacket, TimestampPacket::WriteOperationType::BeforeWalker);
196199
}
197200

198-
// Program the walker. Invokes execution so all state should already be programmed
199-
auto walkerCmd = allocateWalkerSpace(*commandStream, kernel);
200-
201-
KernelCommandsHelper<GfxFamily>::programCacheFlushAfterWalkerCommand(commandStream, &kernel);
202-
203-
if (currentTimestampPacketNodes && commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
204-
auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex)->tag;
205-
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, walkerCmd, timestampPacket, TimestampPacket::WriteOperationType::AfterWalker);
206-
}
207-
208-
auto idd = obtainInterfaceDescriptorData(walkerCmd);
209-
210-
bool localIdsGenerationByRuntime = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(dim, globalWorkSizes, localWorkSizes);
211-
bool inlineDataProgrammingRequired = KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(kernel);
212-
bool kernelUsesLocalIds = KernelCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);
213-
KernelCommandsHelper<GfxFamily>::sendIndirectState(
214-
*commandStream,
215-
*dsh,
216-
*ioh,
217-
*ssh,
218-
kernel,
219-
simd,
220-
localWorkSizes,
221-
offsetInterfaceDescriptorTable,
222-
interfaceDescriptorIndex,
223-
preemptionMode,
224-
walkerCmd,
225-
idd,
226-
localIdsGenerationByRuntime,
227-
kernelUsesLocalIds,
228-
inlineDataProgrammingRequired);
229-
230-
size_t globalOffsets[3] = {offset.x, offset.y, offset.z};
231-
size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z};
232-
size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};
233-
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups,
234-
numWorkGroups, localWorkSizes, simd, dim,
235-
localIdsGenerationByRuntime, inlineDataProgrammingRequired,
236-
*kernel.getKernelInfo().patchInfo.threadPayload);
237-
238-
GpgpuWalkerHelper<GfxFamily>::adjustWalkerData(commandStream, walkerCmd, kernel, dispatchInfo);
201+
programWalker(*commandStream, kernel, commandQueue, currentTimestampPacketNodes, *dsh, *ioh, *ssh,
202+
localWorkSizes, preemptionMode, currentDispatchIndex, interfaceDescriptorIndex, dispatchInfo, offsetInterfaceDescriptorTable);
239203

240204
dispatchWorkarounds(commandStream, commandQueue, kernel, false);
241205
if (dispatchInfo.isPipeControlRequired()) {
@@ -244,6 +208,8 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
244208
*pPipeControlCmd = GfxFamily::cmdInitPipeControl;
245209
pPipeControlCmd->setCommandStreamerStallEnable(true);
246210
}
211+
KernelCommandsHelper<GfxFamily>::programCacheFlushAfterWalkerCommand(commandStream, &kernel);
212+
247213
currentDispatchIndex++;
248214
}
249215
dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);

runtime/command_queue/hardware_interface_base.inl

Lines changed: 65 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2018 Intel Corporation
2+
* Copyright (C) 2018-2019 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -92,11 +92,70 @@ inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(
9292
}
9393

9494
template <typename GfxFamily>
95-
inline WALKER_TYPE<GfxFamily> *HardwareInterface<GfxFamily>::allocateWalkerSpace(LinearStream &commandStream,
96-
const Kernel &kernel) {
97-
auto walkerCmd = static_cast<WALKER_TYPE<GfxFamily> *>(commandStream.getSpace(sizeof(WALKER_TYPE<GfxFamily>)));
98-
*walkerCmd = GfxFamily::cmdInitGpgpuWalker;
99-
return walkerCmd;
95+
// Programs a single walker command for one dispatch:
//   1. allocates and initializes the walker in commandStream,
//   2. if timestamp packets are supplied and enabled on the command stream receiver,
//      sets up the AfterWalker timestamp packet write for node currentDispatchIndex,
//   3. sends the kernel's indirect state (dsh/ioh/ssh) through KernelCommandsHelper,
//   4. fills in the walker thread data (offsets, start/number of work groups, local sizes, SIMD),
//   5. lets GpgpuWalkerHelper::adjustWalkerData apply its adjustments.
// currentTimestampPacketNodes may be nullptr; every other pointer-like argument is a reference.
// NOTE(review): lws is re-read from dispatchInfo here while localWorkSizes comes from the
// caller; presumably both describe the same local work size - confirm against dispatchWalker.
inline void HardwareInterface<GfxFamily>::programWalker(
96+
LinearStream &commandStream,
97+
Kernel &kernel,
98+
CommandQueue &commandQueue,
99+
TimestampPacketContainer *currentTimestampPacketNodes,
100+
IndirectHeap &dsh,
101+
IndirectHeap &ioh,
102+
IndirectHeap &ssh,
103+
size_t localWorkSizes[3],
104+
PreemptionMode preemptionMode,
105+
size_t currentDispatchIndex,
106+
uint32_t &interfaceDescriptorIndex,
107+
const DispatchInfo &dispatchInfo,
108+
size_t offsetInterfaceDescriptorTable) {
109+
110+
auto walkerCmd = allocateWalkerSpace(commandStream, kernel); // walker space is taken from the stream first
111+
uint32_t dim = dispatchInfo.getDim();
112+
Vec3<size_t> lws = dispatchInfo.getLocalWorkgroupSize();
113+
Vec3<size_t> gws = dispatchInfo.getGWS();
114+
Vec3<size_t> swgs = dispatchInfo.getStartOfWorkgroups();
115+
Vec3<size_t> twgs = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups() : generateWorkgroupsNumber(gws, lws); // fall back to a count derived from gws/lws when dispatchInfo carries none
116+
Vec3<size_t> nwgs = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : twgs; // default to the total when no explicit count is set
117+
size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z};
118+
119+
if (currentTimestampPacketNodes && commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
120+
auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex)->tag; // node selected by dispatch index
121+
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, walkerCmd, timestampPacket, TimestampPacket::WriteOperationType::AfterWalker);
122+
}
123+
124+
auto idd = obtainInterfaceDescriptorData(walkerCmd);
125+
126+
bool localIdsGenerationByRuntime = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(dim, globalWorkSizes, localWorkSizes);
127+
bool inlineDataProgrammingRequired = KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(kernel);
128+
bool kernelUsesLocalIds = KernelCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);
129+
uint32_t simd = kernel.getKernelInfo().getMaxSimdSize(); // SIMD size comes from the kernel info
130+
131+
Vec3<size_t> offset = dispatchInfo.getOffset();
132+
133+
KernelCommandsHelper<GfxFamily>::sendIndirectState(
134+
commandStream,
135+
dsh,
136+
ioh,
137+
ssh,
138+
kernel,
139+
simd,
140+
localWorkSizes,
141+
offsetInterfaceDescriptorTable,
142+
interfaceDescriptorIndex,
143+
preemptionMode,
144+
walkerCmd,
145+
idd,
146+
localIdsGenerationByRuntime,
147+
kernelUsesLocalIds,
148+
inlineDataProgrammingRequired);
149+
150+
size_t globalOffsets[3] = {offset.x, offset.y, offset.z};
151+
size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z};
152+
size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};
153+
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups,
154+
numWorkGroups, localWorkSizes, simd, dim,
155+
localIdsGenerationByRuntime, inlineDataProgrammingRequired,
156+
*kernel.getKernelInfo().patchInfo.threadPayload);
157+
158+
GpgpuWalkerHelper<GfxFamily>::adjustWalkerData(&commandStream, walkerCmd, kernel, dispatchInfo);
100159
}
101160

102161
} // namespace OCLRT

unit_tests/command_queue/enqueue_kernel_2_tests.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2018 Intel Corporation
2+
* Copyright (C) 2018-2019 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -878,7 +878,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueKernelTest, givenCacheFlushAfterWalkerEnabled
878878
hwParse.parseCommands<FamilyType>(cmdQ.getCS(0), 0);
879879
auto itorCmd = find<GPGPU_WALKER *>(hwParse.cmdList.begin(), hwParse.cmdList.end());
880880
ASSERT_NE(hwParse.cmdList.end(), itorCmd);
881-
++itorCmd;
881+
itorCmd = find<PIPE_CONTROL *>(itorCmd, hwParse.cmdList.end());
882882
auto pipeControl = genCmdCast<PIPE_CONTROL *>(*itorCmd);
883883
ASSERT_NE(nullptr, pipeControl);
884884
EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());

0 commit comments

Comments
 (0)