Skip to content

Commit 556b6cd

Browse files
Add concurrent kernel execution type
Related-To: NEO-4940
Signed-off-by: Sebastian Luzynski <sebastian.jozef.luzynski@intel.com>
1 parent 85ce7a5 commit 556b6cd

File tree

19 files changed, with 124 additions and 17 deletions.

level_zero/core/source/cmdqueue/cmdqueue_hw.inl

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -369,7 +369,8 @@ void CommandQueueHw<gfxCoreFamily>::programFrontEnd(uint64_t scratchAddress, NEO
369369
scratchAddress,
370370
device->getMaxNumHwThreads(),
371371
csr->getOsContext().getEngineType(),
372-
NEO::AdditionalKernelExecInfo::NotApplicable);
372+
NEO::AdditionalKernelExecInfo::NotApplicable,
373+
NEO::KernelExecutionType::NotApplicable);
373374
frontEndInit = true;
374375
}
375376

opencl/source/command_queue/enqueue_common.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -742,6 +742,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
742742
L3CachingSettings::l3CacheOn, //l3CacheSettings
743743
kernel->getThreadArbitrationPolicy(), //threadArbitrationPolicy
744744
kernel->getAdditionalKernelExecInfo(), //additionalKernelExecInfo
745+
kernel->getExecutionType(), //kernelExecutionType
745746
getSliceCount(), //sliceCount
746747
blocking, //blocking
747748
shouldFlushDC(commandType, printfHandler) || allocNeedsFlushDC, //dcFlush
@@ -955,6 +956,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
955956
L3CachingSettings::NotApplicable, //l3CacheSettings
956957
ThreadArbitrationPolicy::NotPresent, //threadArbitrationPolicy
957958
AdditionalKernelExecInfo::NotApplicable, //additionalKernelExecInfo
959+
KernelExecutionType::NotApplicable, //kernelExecutionType
958960
getSliceCount(), //sliceCount
959961
blocking, //blocking
960962
false, //dcFlush

opencl/source/device_queue/device_queue_hw_bdw_plus.inl

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -130,7 +130,9 @@ void DeviceQueueHw<GfxFamily>::addMediaStateClearCmds() {
130130

131131
addDcFlushToPipeControlWa(pipeControl);
132132

133-
PreambleHelper<GfxFamily>::programVFEState(&slbCS, device->getHardwareInfo(), 0u, 0, device->getSharedDeviceInfo().maxFrontEndThreads, aub_stream::EngineType::ENGINE_RCS, AdditionalKernelExecInfo::NotApplicable);
133+
PreambleHelper<GfxFamily>::programVFEState(&slbCS, device->getHardwareInfo(), 0u, 0, device->getSharedDeviceInfo().maxFrontEndThreads,
134+
aub_stream::EngineType::ENGINE_RCS, AdditionalKernelExecInfo::NotApplicable,
135+
KernelExecutionType::NotApplicable);
134136
}
135137

136138
template <typename GfxFamily>

opencl/source/helpers/task_information.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,7 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
6161
L3CachingSettings::NotApplicable, //l3CacheSettings
6262
ThreadArbitrationPolicy::NotPresent, //threadArbitrationPolicy
6363
AdditionalKernelExecInfo::NotApplicable, //additionalKernelExecInfo
64+
KernelExecutionType::NotApplicable, //kernelExecutionType
6465
commandQueue.getSliceCount(), //sliceCount
6566
true, //blocking
6667
true, //dcFlush
@@ -219,6 +220,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
219220
L3CachingSettings::l3CacheOn, //l3CacheSettings
220221
kernel->getThreadArbitrationPolicy(), //threadArbitrationPolicy
221222
kernel->getAdditionalKernelExecInfo(), //additionalKernelExecInfo
223+
kernel->getExecutionType(), //kernelExecutionType
222224
commandQueue.getSliceCount(), //sliceCount
223225
true, //blocking
224226
flushDC, //dcFlush
@@ -340,6 +342,7 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
340342
L3CachingSettings::NotApplicable, //l3CacheSettings
341343
ThreadArbitrationPolicy::NotPresent, //threadArbitrationPolicy
342344
AdditionalKernelExecInfo::NotApplicable, //additionalKernelExecInfo
345+
KernelExecutionType::NotApplicable, //kernelExecutionType
343346
commandQueue.getSliceCount(), //sliceCount
344347
true, //blocking
345348
false, //dcFlush

opencl/source/kernel/kernel_execution_type.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
namespace NEO {
1111
enum class KernelExecutionType {
1212
Default = 0x0u,
13-
Concurrent = 0x1u
13+
Concurrent = 0x1u,
14+
NotApplicable = 0x2u
1415
};
1516
} // namespace NEO

opencl/test/unit_test/command_queue/command_queue_tests.cpp

Lines changed: 61 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,12 +16,14 @@
1616

1717
#include "opencl/source/command_queue/command_queue_hw.h"
1818
#include "opencl/source/event/event.h"
19+
#include "opencl/source/event/user_event.h"
1920
#include "opencl/source/helpers/hardware_commands_helper.h"
2021
#include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
2122
#include "opencl/test/unit_test/command_stream/command_stream_fixture.h"
2223
#include "opencl/test/unit_test/fixtures/buffer_fixture.h"
2324
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
2425
#include "opencl/test/unit_test/fixtures/context_fixture.h"
26+
#include "opencl/test/unit_test/fixtures/dispatch_flags_fixture.h"
2527
#include "opencl/test/unit_test/fixtures/image_fixture.h"
2628
#include "opencl/test/unit_test/fixtures/memory_management_fixture.h"
2729
#include "opencl/test/unit_test/helpers/unit_test_helper.h"
@@ -1212,3 +1214,62 @@ TEST(CommandQueue, givenCopySizeAndOffsetWhenCallingBlitEnqueueImageAllowedThenR
12121214
EXPECT_EQ(expectedResult, queue.blitEnqueueImageAllowed(origin, region));
12131215
}
12141216
}
1217+
1218+
using KernelExecutionTypesTests = DispatchFlagsTests;
1219+
HWTEST_F(KernelExecutionTypesTests, givenConcurrentKernelWhileDoingNonBlockedEnqueueThenCorrectKernelTypeIsSetInCSR) {
1220+
using CsrType = MockCsrHw2<FamilyType>;
1221+
SetUpImpl<CsrType>();
1222+
auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
1223+
MockKernelWithInternals mockKernelWithInternals(*device.get());
1224+
auto pKernel = mockKernelWithInternals.mockKernel;
1225+
1226+
pKernel->setKernelExecutionType(CL_KERNEL_EXEC_INFO_CONCURRENT_TYPE_INTEL);
1227+
size_t gws[3] = {63, 0, 0};
1228+
1229+
mockCmdQ->enqueueKernel(pKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
1230+
1231+
auto &mockCsr = device->getUltCommandStreamReceiver<FamilyType>();
1232+
EXPECT_EQ(mockCsr.lastKernelExecutionType, KernelExecutionType::Concurrent);
1233+
}
1234+
1235+
HWTEST_F(KernelExecutionTypesTests, givenKernelWithDifferentExecutionTypeWhileDoingNonBlockedEnqueueThenKernelTypeInCSRIsChanging) {
1236+
using CsrType = MockCsrHw2<FamilyType>;
1237+
SetUpImpl<CsrType>();
1238+
auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
1239+
MockKernelWithInternals mockKernelWithInternals(*device.get());
1240+
auto pKernel = mockKernelWithInternals.mockKernel;
1241+
size_t gws[3] = {63, 0, 0};
1242+
auto &mockCsr = device->getUltCommandStreamReceiver<FamilyType>();
1243+
1244+
pKernel->setKernelExecutionType(CL_KERNEL_EXEC_INFO_CONCURRENT_TYPE_INTEL);
1245+
mockCmdQ->enqueueKernel(pKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
1246+
EXPECT_EQ(mockCsr.lastKernelExecutionType, KernelExecutionType::Concurrent);
1247+
1248+
mockCmdQ->enqueueMarkerWithWaitList(0, nullptr, nullptr);
1249+
EXPECT_EQ(mockCsr.lastKernelExecutionType, KernelExecutionType::Concurrent);
1250+
1251+
pKernel->setKernelExecutionType(CL_KERNEL_EXEC_INFO_DEFAULT_TYPE_INTEL);
1252+
mockCmdQ->enqueueKernel(pKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
1253+
1254+
EXPECT_EQ(mockCsr.lastKernelExecutionType, KernelExecutionType::Default);
1255+
}
1256+
1257+
HWTEST_F(KernelExecutionTypesTests, givenConcurrentKernelWhileDoingBlockedEnqueueThenCorrectKernelTypeIsSetInCSR) {
1258+
using CsrType = MockCsrHw2<FamilyType>;
1259+
SetUpImpl<CsrType>();
1260+
auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context.get(), device.get(), nullptr);
1261+
MockKernelWithInternals mockKernelWithInternals(*device.get());
1262+
auto pKernel = mockKernelWithInternals.mockKernel;
1263+
1264+
pKernel->setKernelExecutionType(CL_KERNEL_EXEC_INFO_CONCURRENT_TYPE_INTEL);
1265+
UserEvent userEvent;
1266+
cl_event waitlist[] = {&userEvent};
1267+
size_t gws[3] = {63, 0, 0};
1268+
1269+
mockCmdQ->enqueueKernel(pKernel, 1, nullptr, gws, nullptr, 1, waitlist, nullptr);
1270+
userEvent.setStatus(CL_COMPLETE);
1271+
1272+
auto &mockCsr = device->getUltCommandStreamReceiver<FamilyType>();
1273+
EXPECT_EQ(mockCsr.lastKernelExecutionType, KernelExecutionType::Concurrent);
1274+
mockCmdQ->isQueueBlocked();
1275+
}

opencl/test/unit_test/command_stream/compute_mode_tests.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -81,6 +81,6 @@ struct ComputeModeRequirements : public ::testing::Test {
8181

8282
CommandStreamReceiver *csr = nullptr;
8383
std::unique_ptr<MockDevice> device;
84-
DispatchFlags flags{{}, nullptr, {}, nullptr, QueueThrottle::MEDIUM, PreemptionMode::Disabled, GrfConfig::DefaultGrfNumber, L3CachingSettings::l3CacheOn, ThreadArbitrationPolicy::NotPresent, AdditionalKernelExecInfo::NotApplicable, QueueSliceCount::defaultSliceCount, false, false, false, false, false, false, false, false, false, false, false};
84+
DispatchFlags flags{{}, nullptr, {}, nullptr, QueueThrottle::MEDIUM, PreemptionMode::Disabled, GrfConfig::DefaultGrfNumber, L3CachingSettings::l3CacheOn, ThreadArbitrationPolicy::NotPresent, AdditionalKernelExecInfo::NotApplicable, KernelExecutionType::NotApplicable, QueueSliceCount::defaultSliceCount, false, false, false, false, false, false, false, false, false, false, false};
8585
GraphicsAllocation *alloc = nullptr;
8686
};

opencl/test/unit_test/libult/ult_command_stream_receiver.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
6969
using BaseClass::CommandStreamReceiver::isEnginePrologueSent;
7070
using BaseClass::CommandStreamReceiver::isPreambleSent;
7171
using BaseClass::CommandStreamReceiver::isStateSipSent;
72+
using BaseClass::CommandStreamReceiver::lastKernelExecutionType;
7273
using BaseClass::CommandStreamReceiver::lastMediaSamplerConfig;
7374
using BaseClass::CommandStreamReceiver::lastPreemptionMode;
7475
using BaseClass::CommandStreamReceiver::lastSentCoherencyRequest;

shared/source/command_stream/command_stream_receiver.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -287,6 +287,7 @@ class CommandStreamReceiver {
287287
uint32_t requiredScratchSize = 0;
288288
uint32_t requiredPrivateScratchSize = 0;
289289
uint32_t lastAdditionalKernelExecInfo = AdditionalKernelExecInfo::NotSet;
290+
KernelExecutionType lastKernelExecutionType = KernelExecutionType::Default;
290291

291292
const uint32_t rootDeviceIndex;
292293
const DeviceBitfield deviceBitfield;

shared/source/command_stream/command_stream_receiver_hw_base.inl

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -311,6 +311,10 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
311311
setMediaVFEStateDirty(true);
312312
}
313313

314+
if (dispatchFlags.kernelExecutionType != KernelExecutionType::NotApplicable && lastKernelExecutionType != dispatchFlags.kernelExecutionType) {
315+
setMediaVFEStateDirty(true);
316+
}
317+
314318
auto &commandStreamCSR = this->getCS(getRequiredCmdStreamSizeAligned(dispatchFlags, device));
315319
auto commandStreamStartCSR = commandStreamCSR.getUsed();
316320

@@ -872,7 +876,10 @@ inline void CommandStreamReceiverHw<GfxFamily>::programVFEState(LinearStream &cs
872876
if (dispatchFlags.additionalKernelExecInfo != AdditionalKernelExecInfo::NotApplicable) {
873877
lastAdditionalKernelExecInfo = dispatchFlags.additionalKernelExecInfo;
874878
}
875-
auto commandOffset = PreambleHelper<GfxFamily>::programVFEState(&csr, peekHwInfo(), requiredScratchSize, getScratchPatchAddress(), maxFrontEndThreads, getOsContext().getEngineType(), lastAdditionalKernelExecInfo);
879+
if (dispatchFlags.kernelExecutionType != KernelExecutionType::NotApplicable) {
880+
lastKernelExecutionType = dispatchFlags.kernelExecutionType;
881+
}
882+
auto commandOffset = PreambleHelper<GfxFamily>::programVFEState(&csr, peekHwInfo(), requiredScratchSize, getScratchPatchAddress(), maxFrontEndThreads, getOsContext().getEngineType(), lastAdditionalKernelExecInfo, lastKernelExecutionType);
876883
if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
877884
flatBatchBufferHelper->collectScratchSpacePatchInfo(getScratchPatchAddress(), commandOffset, csr);
878885
}

0 commit comments

Comments
 (0)