Skip to content

Commit 62f89b1

Browse files
Add work_dim patching to l0 kernel
Related-To: NEO-5931 Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
1 parent f6443a2 commit 62f89b1

File tree

11 files changed

+220
-15
lines changed

11 files changed

+220
-15
lines changed

level_zero/core/source/cmdlist/cmdlist_hw_base.inl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
5858
kernel->setGroupCount(pThreadGroupDimensions->groupCountX,
5959
pThreadGroupDimensions->groupCountY,
6060
pThreadGroupDimensions->groupCountZ);
61+
kernel->patchWorkDim(pThreadGroupDimensions->groupCountX,
62+
pThreadGroupDimensions->groupCountY,
63+
pThreadGroupDimensions->groupCountZ);
6164
}
6265

6366
if (isIndirect && pThreadGroupDimensions) {

level_zero/core/source/cmdlist/cmdlist_hw_xehp_plus.inl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,14 +104,16 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
104104
commandListPreemptionMode = std::min(commandListPreemptionMode, functionPreemptionMode);
105105

106106
kernel->patchGlobalOffset();
107-
108107
if (isIndirect && pThreadGroupDimensions) {
109108
prepareIndirectParams(pThreadGroupDimensions);
110109
}
111110
if (!isIndirect) {
112111
kernel->setGroupCount(pThreadGroupDimensions->groupCountX,
113112
pThreadGroupDimensions->groupCountY,
114113
pThreadGroupDimensions->groupCountZ);
114+
kernel->patchWorkDim(pThreadGroupDimensions->groupCountX,
115+
pThreadGroupDimensions->groupCountY,
116+
pThreadGroupDimensions->groupCountZ);
115117
}
116118
NEO::GraphicsAllocation *eventAlloc = nullptr;
117119
uint64_t eventAddress = 0;

level_zero/core/source/kernel/kernel.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI {
116116
virtual ze_result_t setGlobalOffsetExp(uint32_t offsetX, uint32_t offsetY, uint32_t offsetZ) = 0;
117117
virtual uint32_t patchGlobalOffset() = 0;
118118

119+
virtual void patchWorkDim(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) = 0;
120+
119121
virtual ze_result_t suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount) = 0;
120122
virtual ze_result_t setCacheConfig(ze_cache_config_flags_t flags) = 0;
121123

level_zero/core/source/kernel/kernel_imp.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "shared/source/helpers/register_offsets.h"
1515
#include "shared/source/helpers/string.h"
1616
#include "shared/source/helpers/surface_format_info.h"
17+
#include "shared/source/kernel/kernel_arg_descriptor.h"
1718
#include "shared/source/kernel/kernel_descriptor.h"
1819
#include "shared/source/memory_manager/memory_manager.h"
1920
#include "shared/source/memory_manager/memory_operations_handler.h"
@@ -888,6 +889,21 @@ uint32_t KernelImp::patchGlobalOffset() {
888889
return NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.globalWorkOffset, this->globalOffsets);
889890
}
890891

892+
void KernelImp::patchWorkDim(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) {
893+
const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
894+
auto dataOffset = kernelDescriptor.payloadMappings.dispatchTraits.workDim;
895+
if (NEO::isValidOffset(dataOffset)) {
896+
auto destinationBuffer = ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize);
897+
uint32_t workDim = 1;
898+
if (groupCountZ * groupSize[2] > 1) {
899+
workDim = 3;
900+
} else if (groupCountY * groupSize[1] > 1) {
901+
workDim = 2;
902+
}
903+
NEO::patchNonPointer(destinationBuffer, kernelDescriptor.payloadMappings.dispatchTraits.workDim, workDim);
904+
}
905+
}
906+
891907
Kernel *Kernel::create(uint32_t productFamily, Module *module,
892908
const ze_kernel_desc_t *desc, ze_result_t *res) {
893909
UNRECOVERABLE_IF(productFamily >= IGFX_MAX_PRODUCT);

level_zero/core/source/kernel/kernel_imp.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ struct KernelImp : Kernel {
126126
ze_result_t setGlobalOffsetExp(uint32_t offsetX, uint32_t offsetY, uint32_t offsetZ) override;
127127
uint32_t patchGlobalOffset() override;
128128

129+
void patchWorkDim(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) override;
130+
129131
ze_result_t setCacheConfig(ze_cache_config_flags_t flags) override;
130132
bool usesRayTracing() {
131133
return kernelImmData->getDescriptor().hasRTCalls();

level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp

Lines changed: 65 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -591,44 +591,95 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenCommandListWhenAppendLaunchKernelS
591591
EXPECT_EQ(1u, event->getPacketsInUse());
592592
}
593593

594-
HWTEST_F(CommandListAppendLaunchKernel, givenIndirectDispatchWhenAppendingThenWorkGroupCountAndGlobalWorkSizeIsSetInCrossThreadData) {
594+
// Appends the same indirect kernel twice (first with groupSize = {2, ., .},
// then additionally with groupSize[2] = 2) and walks the parsed command
// buffer to check that the expected sequence of register commands was
// programmed for group count, global work size and work dimension.
HWTEST_F(CommandListAppendLaunchKernel, givenIndirectDispatchWhenAppendingThenWorkGroupCountAndGlobalWorkSizeAndWorkDimIsSetInCrossThreadData) {
    using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
    using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
    using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;

    Mock<::L0::Kernel> kernel;
    kernel.groupSize[0] = 2;
    kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = 2;
    kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = 2;
    kernel.descriptor.payloadMappings.dispatchTraits.workDim = 2;
    ze_result_t returnValue;
    std::unique_ptr<L0::CommandList> commandList(L0::CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue));

    void *alloc = nullptr;
    ze_device_mem_alloc_desc_t deviceDesc = {};
    auto result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 16384u, 4096u, &alloc);
    ASSERT_EQ(result, ZE_RESULT_SUCCESS);

    result = commandList->appendLaunchKernelIndirect(kernel.toHandle(),
                                                     static_cast<ze_group_count_t *>(alloc),
                                                     nullptr, 0, nullptr);
    EXPECT_EQ(result, ZE_RESULT_SUCCESS);

    kernel.groupSize[2] = 2;
    result = commandList->appendLaunchKernelIndirect(kernel.toHandle(),
                                                     static_cast<ze_group_count_t *>(alloc),
                                                     nullptr, 0, nullptr);
    EXPECT_EQ(result, ZE_RESULT_SUCCESS);

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0), commandList->commandContainer.getCommandStream()->getUsed()));

    // Every check that is followed by an iterator increment is fatal
    // (ASSERT_NE): advancing past cmdList.end() is undefined behaviour, so the
    // walk must stop at the first command that is missing.
    auto itor = find<MI_STORE_REGISTER_MEM *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(itor, cmdList.end());

    itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
    ASSERT_NE(itor, cmdList.end());
    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
    ASSERT_NE(itor, cmdList.end());
    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(itor, cmdList.end());

    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
    ASSERT_NE(itor, cmdList.end());

    itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
    ASSERT_NE(itor, cmdList.end());
    itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
    ASSERT_NE(itor, cmdList.end());

    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
    ASSERT_NE(itor, cmdList.end());
    ++itor; //MI_MATH_ALU_INST_INLINE doesn't have tagMI_COMMAND_OPCODE, can't find it in cmdList
    ASSERT_NE(itor, cmdList.end());
    ++itor;
    ASSERT_NE(itor, cmdList.end());

    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
    ASSERT_NE(itor, cmdList.end());
    ++itor;
    ASSERT_NE(itor, cmdList.end());
    ++itor;
    ASSERT_NE(itor, cmdList.end());

    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
    ASSERT_NE(itor, cmdList.end());
    ++itor;
    ASSERT_NE(itor, cmdList.end());
    ++itor;
    ASSERT_NE(itor, cmdList.end());

    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(itor, cmdList.end());

    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end()); //kernel with groupSize[2] = 2
    ASSERT_NE(itor, cmdList.end());

    itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
    ASSERT_NE(itor, cmdList.end());
    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
    ASSERT_NE(itor, cmdList.end());
    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
    ASSERT_NE(itor, cmdList.end());

    itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
    ASSERT_NE(itor, cmdList.end());
    // Last lookup: nothing is dereferenced or advanced afterwards, so a
    // non-fatal check is enough and lets the cleanup below still run.
    itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
    EXPECT_NE(itor, cmdList.end());

    context->freeMem(alloc);
}

level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1824,6 +1824,66 @@ HWTEST_F(KernelGlobalWorkOffsetTests, whenSettingGlobalOffsetThenCrossThreadData
18241824
EXPECT_EQ(*(dst.begin() + desc.payloadMappings.dispatchTraits.globalWorkOffset[2]), globalOffsetz);
18251825
}
18261826

1827+
using KernelWorkDimTests = Test<ModuleImmutableDataFixture>;
1828+
1829+
// Verifies that patchWorkDim leaves cross-thread data untouched while the
// descriptor's workDim offset has not been set up, and afterwards writes the
// expected work dimension for a table of group size / group count combinations.
HWTEST_F(KernelWorkDimTests, givenGroupCountsWhenPatchingWorkDimThenCrossThreadDataIsPatched) {
    // Mock kernel that lets the test install a zero-filled cross-thread data buffer.
    struct MockKernelWithMockCrossThreadData : public MockKernel {
      public:
        MockKernelWithMockCrossThreadData(MockModule *mockModule) : MockKernel(mockModule) {}
        void setCrossThreadData(uint32_t sizeInBytes) {
            crossThreadDataSize = sizeInBytes;
            crossThreadData.reset(new uint8_t[sizeInBytes]);
            memset(crossThreadData.get(), 0x00, crossThreadDataSize);
        }
    };

    // One table row: group size per axis, group count per axis, expected result.
    struct WorkDimCase {
        uint32_t groupSize[3];
        uint32_t groupCount[3];
        uint32_t expectedWorkDim;
    };

    uint32_t privateMemorySizePerHwThread = 32u;
    auto mockKernelImmData = std::make_unique<MockImmutableData>(privateMemorySizePerHwThread);

    createModuleFromBinary(privateMemorySizePerHwThread, false, mockKernelImmData.get());
    auto kernel = std::make_unique<MockKernelWithMockCrossThreadData>(module.get());
    createKernel(kernel.get());
    kernel->setCrossThreadData(sizeof(uint32_t));

    // Patch once before the test assigns a workDim offset; the buffer is
    // expected to still hold zero afterwards (checked below).
    kernel->patchWorkDim(1, 1, 1);

    mockKernelImmData->mockKernelDescriptor->payloadMappings.dispatchTraits.workDim = 0x0u;

    auto &kernelDescriptor = mockKernelImmData->getDescriptor();
    auto crossThreadView = ArrayRef<const uint8_t>(kernel->getCrossThreadData(), kernel->getCrossThreadDataSize());
    auto patchedWorkDim = crossThreadView.begin() + kernelDescriptor.payloadMappings.dispatchTraits.workDim;
    EXPECT_EQ(*patchedWorkDim, 0u);

    const WorkDimCase testCases[] = {
        {{2, 1, 1}, {1, 1, 1}, 1},
        {{1, 1, 1}, {1, 1, 1}, 1},
        {{1, 2, 1}, {2, 1, 1}, 2},
        {{1, 2, 1}, {1, 1, 1}, 2},
        {{1, 1, 1}, {1, 2, 1}, 2},
        {{1, 1, 1}, {2, 2, 2}, 3},
        {{1, 1, 2}, {1, 1, 1}, 3},
        {{1, 1, 1}, {1, 1, 2}, 3}};

    for (const auto &testCase : testCases) {
        ze_result_t res = kernel->setGroupSize(testCase.groupSize[0], testCase.groupSize[1], testCase.groupSize[2]);
        EXPECT_EQ(res, ZE_RESULT_SUCCESS);

        kernel->patchWorkDim(testCase.groupCount[0], testCase.groupCount[1], testCase.groupCount[2]);
        EXPECT_EQ(*patchedWorkDim, testCase.expectedWorkDim);
    }
}
1886+
18271887
using KernelPrintHandlerTest = Test<ModuleFixture>;
18281888
struct MyPrintfHandler : public PrintfHandler {
18291889
static uint32_t getPrintfSurfaceInitialDataSize() {

shared/source/command_container/command_encoder.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,10 @@ struct EncodeMath {
126126
AluRegisters firstOperandRegister,
127127
AluRegisters secondOperandRegister,
128128
AluRegisters finalResultRegister);
129+
static void bitwiseOr(CommandContainer &container,
130+
AluRegisters firstOperandRegister,
131+
AluRegisters secondOperandRegister,
132+
AluRegisters finalResultRegister);
129133
};
130134

131135
template <typename GfxFamily>
@@ -169,6 +173,7 @@ struct EncodeIndirectParams {
169173
using MI_MATH = typename GfxFamily::MI_MATH;
170174
using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE;
171175
static void setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress);
176+
static void setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offset, void *crossThreadAddress, const uint32_t *groupSize);
172177
static void setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, const uint32_t *lws);
173178

174179
static size_t getCmdsSizeForIndirectParams();

shared/source/command_container/command_encoder.inl

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,20 @@ void EncodeMath<Family>::bitwiseAnd(CommandContainer &container,
276276
finalResultRegister);
277277
}
278278

279+
template <typename Family>
280+
void EncodeMath<Family>::bitwiseOr(CommandContainer &container,
281+
AluRegisters firstOperandRegister,
282+
AluRegisters secondOperandRegister,
283+
AluRegisters finalResultRegister) {
284+
uint32_t *cmd = EncodeMath<Family>::commandReserve(container);
285+
EncodeMathMMIO<Family>::encodeAlu(reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(cmd),
286+
firstOperandRegister,
287+
secondOperandRegister,
288+
AluRegisters::OPCODE_OR,
289+
finalResultRegister,
290+
AluRegisters::R_ACCU);
291+
}
292+
279293
template <typename Family>
280294
inline void EncodeSetMMIO<Family>::encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap) {
281295
LriHelper<Family>::program(container.getCommandStream(),
@@ -502,6 +516,54 @@ void EncodeIndirectParams<Family>::setGroupCountIndirect(CommandContainer &conta
502516
}
503517
}
504518

519+
template <typename Family>
520+
void EncodeIndirectParams<Family>::setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset workDimOffset, void *crossThreadAddress, const uint32_t *groupSize) {
521+
if (NEO::isValidOffset(workDimOffset)) {
522+
constexpr uint32_t GROUP_SIZE_1_GT_1_REGISTER = CS_GPR_R0;
523+
constexpr AluRegisters GROUP_SIZE_1_GT_1_ALU_REGISTER = AluRegisters::R_0;
524+
525+
constexpr AluRegisters GROUP_DIM_2_GT_1_ALU_REGISTER = AluRegisters::R_1;
526+
527+
constexpr AluRegisters GROUP_DIM_1_GT_1_ALU_REGISTER = AluRegisters::R_2;
528+
529+
constexpr uint32_t SUB_RESULT_REGISTER = CS_GPR_R3;
530+
constexpr AluRegisters SUB_RESULT_ALU_REGISTER = AluRegisters::R_3;
531+
532+
constexpr uint32_t RESULT_REGISTER = CS_GPR_R4;
533+
constexpr AluRegisters RESULT_ALU_REGISTER = AluRegisters::R_4;
534+
535+
constexpr uint32_t CONSTANT_ONE_REGISTER = CS_GPR_R5;
536+
constexpr AluRegisters CONSTANT_ONE_ALU_REGISTER = AluRegisters::R_5;
537+
538+
constexpr uint32_t GROUP_DIM_2_REGISTER = CS_GPR_R6;
539+
constexpr AluRegisters GROUP_DIM_2_ALU_REGISTER = AluRegisters::R_6;
540+
541+
constexpr uint32_t GROUP_DIM_1_REGISTER = CS_GPR_R7;
542+
constexpr AluRegisters GROUP_DIM_1_ALU_REGISTER = AluRegisters::R_7;
543+
544+
if (groupSize[2] > 1) {
545+
EncodeSetMMIO<Family>::encodeIMM(container, RESULT_REGISTER, 3, true);
546+
} else {
547+
EncodeSetMMIO<Family>::encodeIMM(container, GROUP_SIZE_1_GT_1_REGISTER, groupSize[1] > 1, true);
548+
EncodeSetMMIO<Family>::encodeREG(container, GROUP_DIM_2_REGISTER, GPUGPU_DISPATCHDIM[2]);
549+
EncodeSetMMIO<Family>::encodeREG(container, GROUP_DIM_1_REGISTER, GPUGPU_DISPATCHDIM[1]);
550+
551+
EncodeSetMMIO<Family>::encodeIMM(container, CONSTANT_ONE_REGISTER, 1, true);
552+
EncodeMath<Family>::greaterThan(container, GROUP_DIM_2_ALU_REGISTER, CONSTANT_ONE_ALU_REGISTER, GROUP_DIM_2_GT_1_ALU_REGISTER);
553+
EncodeMath<Family>::greaterThan(container, GROUP_DIM_1_ALU_REGISTER, CONSTANT_ONE_ALU_REGISTER, GROUP_DIM_1_GT_1_ALU_REGISTER);
554+
555+
EncodeSetMMIO<Family>::encodeIMM(container, SUB_RESULT_REGISTER, 0, true);
556+
EncodeMath<Family>::bitwiseOr(container, GROUP_DIM_2_GT_1_ALU_REGISTER, GROUP_DIM_1_GT_1_ALU_REGISTER, SUB_RESULT_ALU_REGISTER);
557+
EncodeMath<Family>::bitwiseOr(container, SUB_RESULT_ALU_REGISTER, GROUP_SIZE_1_GT_1_ALU_REGISTER, SUB_RESULT_ALU_REGISTER);
558+
559+
EncodeSetMMIO<Family>::encodeIMM(container, RESULT_REGISTER, 1, true);
560+
EncodeMath<Family>::addition(container, RESULT_ALU_REGISTER, SUB_RESULT_ALU_REGISTER, RESULT_ALU_REGISTER);
561+
EncodeMath<Family>::addition(container, RESULT_ALU_REGISTER, GROUP_DIM_2_GT_1_ALU_REGISTER, RESULT_ALU_REGISTER);
562+
}
563+
EncodeStoreMMIO<Family>::encode(*container.getCommandStream(), RESULT_REGISTER, ptrOffset(reinterpret_cast<uint64_t>(crossThreadAddress), workDimOffset));
564+
}
565+
}
566+
505567
template <typename Family>
506568
void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t samplerCount, uint32_t bindingTableEntryCount) {
507569
auto enablePrefetch = EncodeSurfaceState<Family>::doBindingTablePrefetch();

shared/source/command_container/command_encoder_bdw_plus.inl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
145145
void *gpuPtr = reinterpret_cast<void *>(heapIndirect->getHeapGpuBase() + heapIndirect->getUsed() - sizeThreadData);
146146
EncodeIndirectParams<Family>::setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, gpuPtr);
147147
EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, gpuPtr, dispatchInterface->getGroupSize());
148+
EncodeIndirectParams<Family>::setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, gpuPtr, dispatchInterface->getGroupSize());
148149
}
149150

150151
ptr = ptrOffset(ptr, sizeCrossThreadData);

0 commit comments

Comments
 (0)