@@ -2382,7 +2382,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
23822382
23832383 auto lock = device->getBuiltinFunctionsLib ()->obtainUniqueOwnership ();
23842384
2385- bool useImmediateFill = patternSize == 1 || (patternSize <= 4 && isAligned<sizeof (uint32_t )>(dstAllocation.offset ) && isAligned<sizeof (uint32_t ) * 4 >(size));
2385+ const auto maxWgSize = this ->device ->getDeviceInfo ().maxWorkGroupSize ;
2386+ bool useImmediateFill = patternSize == 1 || (patternSize <= 4 && isAligned<sizeof (uint32_t )>(dstAllocation.offset ) && isAligned<sizeof (uint32_t ) * 4 >(size) && (size <= maxWgSize || isAligned (size, maxWgSize)));
23862387 auto builtin = useImmediateFill
23872388 ? BuiltinTypeHelper::adjustBuiltinType<Builtin::fillBufferImmediate>(isStateless, isHeapless)
23882389 : BuiltinTypeHelper::adjustBuiltinType<Builtin::fillBufferMiddle>(isStateless, isHeapless);
@@ -2415,6 +2416,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
24152416 if (useImmediateFill) {
24162417 launchParams.numKernelsInSplitLaunch ++;
24172418 if (fillArguments.leftRemainingBytes > 0 ) {
2419+ DEBUG_BREAK_IF (useImmediateFill && patternSize > 1u );
24182420 res = appendUnalignedFillKernel (isStateless, fillArguments.leftRemainingBytes , dstAllocation, pattern, signalEvent, launchParams);
24192421 if (res) {
24202422 return res;
@@ -2459,6 +2461,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
24592461 launchParams.numKernelsExecutedInSplitLaunch ++;
24602462
24612463 if (fillArguments.rightRemainingBytes > 0 ) {
2464+ DEBUG_BREAK_IF (useImmediateFill && patternSize > 1u );
24622465 dstAllocation.offset = fillArguments.rightOffset ;
24632466 res = appendUnalignedFillKernel (isStateless, fillArguments.rightRemainingBytes , dstAllocation, pattern, signalEvent, launchParams);
24642467 if (res) {
@@ -4075,7 +4078,8 @@ void CommandListCoreFamily<gfxCoreFamily>::setupFillKernelArguments(size_t baseO
40754078 CmdListFillKernelArguments &outArguments,
40764079 Kernel *kernel) {
40774080 constexpr auto dataTypeSize = sizeof (uint32_t ) * 4 ;
4078- if (patternSize == 1 || (patternSize <= 4 && isAligned<sizeof (uint32_t )>(baseOffset) && isAligned<dataTypeSize>(dstSize))) {
4081+ const auto maxWgSize = this ->device ->getDeviceInfo ().maxWorkGroupSize ;
4082+ if (patternSize == 1 || (patternSize <= 4 && isAligned<sizeof (uint32_t )>(baseOffset) && isAligned<dataTypeSize>(dstSize) && (dstSize <= maxWgSize || isAligned (dstSize, maxWgSize)))) {
40794083 size_t middleSize = dstSize;
40804084 outArguments.mainOffset = baseOffset;
40814085 outArguments.leftRemainingBytes = sizeof (uint32_t ) - (baseOffset % sizeof (uint32_t ));
@@ -4087,7 +4091,7 @@ void CommandListCoreFamily<gfxCoreFamily>::setupFillKernelArguments(size_t baseO
40874091 }
40884092
40894093 size_t adjustedSize = middleSize / dataTypeSize;
4090- outArguments.mainGroupSize = this -> device -> getDeviceInfo (). maxWorkGroupSize ;
4094+ outArguments.mainGroupSize = maxWgSize ;
40914095 if (outArguments.mainGroupSize > adjustedSize && adjustedSize > 0 ) {
40924096 outArguments.mainGroupSize = adjustedSize;
40934097 }
0 commit comments