@@ -99,44 +99,6 @@ void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
9999 pCmd5->setStateCacheInvalidationEnable (true );
100100}
101101
102- template <typename GfxFamily>
103- inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
104- WALKER_TYPE<GfxFamily> *walkerCmd,
105- const size_t globalOffsets[3 ],
106- const size_t startWorkGroups[3 ],
107- const size_t numWorkGroups[3 ],
108- const size_t localWorkSizesIn[3 ],
109- uint32_t simd,
110- uint32_t workDim,
111- bool localIdsGeneration) {
112- auto localWorkSize = localWorkSizesIn[0 ] * localWorkSizesIn[1 ] * localWorkSizesIn[2 ];
113-
114- auto threadsPerWorkGroup = getThreadsPerWG (simd, localWorkSize);
115- walkerCmd->setThreadWidthCounterMaximum (static_cast <uint32_t >(threadsPerWorkGroup));
116-
117- walkerCmd->setThreadGroupIdXDimension (static_cast <uint32_t >(numWorkGroups[0 ]));
118- walkerCmd->setThreadGroupIdYDimension (static_cast <uint32_t >(numWorkGroups[1 ]));
119- walkerCmd->setThreadGroupIdZDimension (static_cast <uint32_t >(numWorkGroups[2 ]));
120-
121- // compute executionMask - to tell which SIMD lines are active within thread
122- auto remainderSimdLanes = localWorkSize & (simd - 1 );
123- uint64_t executionMask = (1ull << remainderSimdLanes) - 1 ;
124- if (!executionMask)
125- executionMask = ~executionMask;
126-
127- using SIMD_SIZE = typename WALKER_TYPE<GfxFamily>::SIMD_SIZE;
128-
129- walkerCmd->setRightExecutionMask (static_cast <uint32_t >(executionMask));
130- walkerCmd->setBottomExecutionMask (static_cast <uint32_t >(0xffffffff ));
131- walkerCmd->setSimdSize (static_cast <SIMD_SIZE>(simd >> 4 ));
132-
133- walkerCmd->setThreadGroupIdStartingX (static_cast <uint32_t >(startWorkGroups[0 ]));
134- walkerCmd->setThreadGroupIdStartingY (static_cast <uint32_t >(startWorkGroups[1 ]));
135- walkerCmd->setThreadGroupIdStartingResumeZ (static_cast <uint32_t >(startWorkGroups[2 ]));
136-
137- return localWorkSize;
138- }
139-
140102template <typename GfxFamily>
141103void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
142104 HwTimeStamps &hwTimeStamps,
@@ -427,144 +389,6 @@ inline void GpgpuWalkerHelper<GfxFamily>::dispatchOnDeviceWaitlistSemaphores(Lin
427389 }
428390}
429391
430- template <typename GfxFamily>
431- void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
432- LinearStream *cmdStream,
433- WALKER_TYPE<GfxFamily> *walkerCmd,
434- TimestampPacket *timestampPacket,
435- TimestampPacket::WriteOperationType writeOperationType) {
436-
437- if (TimestampPacket::WriteOperationType::AfterWalker == writeOperationType) {
438- uint64_t address = timestampPacket->pickAddressForDataWrite (TimestampPacket::DataIndex::ContextEnd);
439- auto pipeControlCmd = cmdStream->getSpaceForCmd <PIPE_CONTROL>();
440- *pipeControlCmd = PIPE_CONTROL::sInit ();
441- pipeControlCmd->setCommandStreamerStallEnable (true );
442- pipeControlCmd->setPostSyncOperation (PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA);
443- pipeControlCmd->setAddress (static_cast <uint32_t >(address & 0x0000FFFFFFFFULL ));
444- pipeControlCmd->setAddressHigh (static_cast <uint32_t >(address >> 32 ));
445- pipeControlCmd->setImmediateData (0 );
446- }
447- }
448-
449- template <typename GfxFamily>
450- void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
451- CommandQueue &commandQueue,
452- DeviceQueueHw<GfxFamily> &devQueueHw,
453- PreemptionMode preemptionMode,
454- SchedulerKernel &scheduler,
455- IndirectHeap *ssh,
456- IndirectHeap *dsh) {
457-
458- using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
459- using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
460- using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
461-
462- OCLRT::LinearStream *commandStream = nullptr ;
463- OCLRT::IndirectHeap *ioh = nullptr ;
464-
465- commandStream = &commandQueue.getCS (0 );
466-
467- bool dcFlush = false ;
468- commandQueue.getDevice ().getCommandStreamReceiver ().addPipeControl (*commandStream, dcFlush);
469-
470- uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex ;
471- const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize ;
472- const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
473- const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof (INTERFACE_DESCRIPTOR_DATA);
474-
475- // Program media interface descriptor load
476- KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad (
477- *commandStream,
478- offsetInterfaceDescriptor,
479- totalInterfaceDescriptorTableSize);
480-
481- DEBUG_BREAK_IF (offsetInterfaceDescriptorTable % 64 != 0 );
482-
483- // Determine SIMD size
484- uint32_t simd = scheduler.getKernelInfo ().getMaxSimdSize ();
485- DEBUG_BREAK_IF (simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
486-
487- // Patch our kernel constants
488- *scheduler.globalWorkOffsetX = 0 ;
489- *scheduler.globalWorkOffsetY = 0 ;
490- *scheduler.globalWorkOffsetZ = 0 ;
491-
492- *scheduler.globalWorkSizeX = (uint32_t )scheduler.getGws ();
493- *scheduler.globalWorkSizeY = 1 ;
494- *scheduler.globalWorkSizeZ = 1 ;
495-
496- *scheduler.localWorkSizeX = (uint32_t )scheduler.getLws ();
497- *scheduler.localWorkSizeY = 1 ;
498- *scheduler.localWorkSizeZ = 1 ;
499-
500- *scheduler.localWorkSizeX2 = (uint32_t )scheduler.getLws ();
501- *scheduler.localWorkSizeY2 = 1 ;
502- *scheduler.localWorkSizeZ2 = 1 ;
503-
504- *scheduler.enqueuedLocalWorkSizeX = (uint32_t )scheduler.getLws ();
505- *scheduler.enqueuedLocalWorkSizeY = 1 ;
506- *scheduler.enqueuedLocalWorkSizeZ = 1 ;
507-
508- *scheduler.numWorkGroupsX = (uint32_t )(scheduler.getGws () / scheduler.getLws ());
509- *scheduler.numWorkGroupsY = 0 ;
510- *scheduler.numWorkGroupsZ = 0 ;
511-
512- *scheduler.workDim = 1 ;
513-
514- // Send our indirect object data
515- size_t localWorkSizes[3 ] = {scheduler.getLws (), 1 , 1 };
516- size_t globalWorkSizes[3 ] = {scheduler.getGws (), 1 , 1 };
517-
518- // Create indirectHeap for IOH that is located at the end of device enqueue DSH
519- size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData (scheduler);
520- IndirectHeap indirectObjectHeap (dsh->getCpuBase (), dsh->getMaxAvailableSpace ());
521- indirectObjectHeap.getSpace (curbeOffset);
522- ioh = &indirectObjectHeap;
523-
524- // Program the walker. Invokes execution so all state should already be programmed
525- auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace (sizeof (GPGPU_WALKER));
526- *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
527-
528- bool localIdsGeneration = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired (1 , globalWorkSizes, localWorkSizes);
529- KernelCommandsHelper<GfxFamily>::sendIndirectState (
530- *commandStream,
531- *dsh,
532- *ioh,
533- *ssh,
534- scheduler,
535- simd,
536- localWorkSizes,
537- offsetInterfaceDescriptorTable,
538- interfaceDescriptorIndex,
539- preemptionMode,
540- pGpGpuWalkerCmd,
541- nullptr ,
542- localIdsGeneration);
543-
544- // Implement enabling special WA DisableLSQCROPERFforOCL if needed
545- GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL (commandStream, scheduler, true );
546-
547- size_t globalOffsets[3 ] = {0 , 0 , 0 };
548- size_t workGroups[3 ] = {(scheduler.getGws () / scheduler.getLws ()), 1 , 1 };
549- GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData (pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd, 1 , localIdsGeneration);
550-
551- // Implement disabling special WA DisableLSQCROPERFforOCL if needed
552- GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL (commandStream, scheduler, false );
553-
554- // Do not put BB_START only when returning in first Scheduler run
555- if (devQueueHw.getSchedulerReturnInstance () != 1 ) {
556-
557- commandQueue.getDevice ().getCommandStreamReceiver ().addPipeControl (*commandStream, true );
558-
559- // Add BB Start Cmd to the SLB in the Primary Batch Buffer
560- auto *bbStart = (MI_BATCH_BUFFER_START *)commandStream->getSpace (sizeof (MI_BATCH_BUFFER_START));
561- *bbStart = MI_BATCH_BUFFER_START::sInit ();
562- bbStart->setSecondLevelBatchBuffer (MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
563- uint64_t slbAddress = devQueueHw.getSlbBuffer ()->getGpuAddress ();
564- bbStart->setBatchBufferStartAddressGraphicsaddress472 (slbAddress);
565- }
566- }
567-
568392template <typename GfxFamily>
569393void GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
570394}
0 commit comments