@@ -320,6 +320,53 @@ __XETLA_API void xetla_prefetch_global(
320320#endif
321321}
322322
323+ // / 2D USM pointer block prefetch.
324+ // / Supported platforms: PVC
325+ // / VISA instruction: lsc_load_block2d.ugm
326+ // /
327+ // / Prefetches elements located at specified address.
328+ // /
329+ // / @tparam T is element type.
330+ // / @tparam BlockWidth is the block width in number of elements.
331+ // / @tparam BlockHeight is the block height in number of elements.
332+ // / @tparam NBlocks is the number of blocks.
333+ // / @tparam L1H is L1 cache hint.
334+ // / @tparam L2H is L2 cache hint.
335+ // / @tparam N is the data size
336+ // / @param Ptr is the surface base address for this operation.
337+ // / @param SurfaceWidth is the surface width minus 1 in bytes
338+ // / @param SurfaceHeight is the surface height minus 1 in rows
339+ // / @param SurfacePitch is the surface pitch minus 1 in bytes
340+ // / @param X is zero based X-coordinate of the left upper rectangle corner in
341+ // / number of elements.
342+ // / @param Y is zero based Y-coordinate of the left upper rectangle corner in
343+ // / rows.
344+ // /
345+ template <
346+ typename T,
347+ int BlockWidth,
348+ int BlockHeight = 1 ,
349+ int NBlocks = 1 ,
350+ cache_hint L1H = cache_hint::none,
351+ cache_hint L2H = cache_hint::none,
352+ int N = __ESIMD_ENS::detail::get_lsc_block_2d_data_size<
353+ T,
354+ NBlocks,
355+ BlockHeight,
356+ BlockWidth,
357+ false ,
358+ false >()>
359+ __XETLA_API void xetla_prefetch_global (
360+ const T* Ptr,
361+ unsigned SurfaceWidth,
362+ unsigned SurfaceHeight,
363+ unsigned SurfacePitch,
364+ int X,
365+ int Y) {
366+ return __ESIMD_ENS::lsc_prefetch_2d (
367+ Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
368+ }
369+
323370// / template <typename T, int VS = 1, typename OffsetT,
324371// / typename PropertyListT = empty_properties_t>
325372// / void prefetch(const T *p, OffsetT byte_offset,
@@ -358,14 +405,71 @@ __XETLA_API void xetla_prefetch_global(T* p, uint64_t byte_offset = 0) {
358405#endif
359406}
360407
408+ // / 2D USM pointer block load.
409+ // / Supported platforms: PVC
410+ // / VISA instruction: lsc_load_block2d.ugm
411+ // /
412+ // / Collects elements located at specified address and returns them
413+ // / as a single \ref simd object.
414+ // /
415+ // / @tparam T is element type.
416+ // / @tparam BlockWidth is the block width in number of elements.
417+ // / @tparam BlockHeight is the block height in number of elements.
418+ // / @tparam NBlocks is the number of blocks.
419+ // / @tparam Transposed is the transposed version or not.
420+ // / @tparam Transformed is apply VNNI transform or not.
421+ // / @tparam L1H is L1 cache hint.
422+ // / @tparam L2H is L2 cache hint.
423+ // / @tparam N is the data size
424+ // / @param Ptr is the surface base address for this operation.
425+ // / @param SurfaceWidth is the surface width minus 1 in bytes
426+ // / @param SurfaceHeight is the surface height minus 1 in rows
427+ // / @param SurfacePitch is the surface pitch minus 1 in bytes
428+ // / @param X is zero based X-coordinate of the left upper rectangle corner in
429+ // / number of elements.
430+ // / @param Y is zero based Y-coordinate of the left upper rectangle corner in
431+ // / rows.
432+ // / @return is a vector of type T and size N, where N is
433+ // / BlockWidth * BlockHeight * NBlocks, if transformed;
434+ // / otherwise,
435+ // / N = roundUpNextMultiple(BlockHeight, 4 / sizeof(T)) *
436+ // / getNextPowerOf2(BlockWidth) * NBlocks
437+ // /
438+ template <
439+ typename T,
440+ int BlockWidth,
441+ int BlockHeight = 1 ,
442+ int NBlocks = 1 ,
443+ bool Transposed = false ,
444+ bool Transformed = false ,
445+ cache_hint L1H = cache_hint::none,
446+ cache_hint L2H = cache_hint::none,
447+ int N = __ESIMD_ENS::detail::get_lsc_block_2d_data_size<
448+ T,
449+ NBlocks,
450+ BlockHeight,
451+ BlockWidth,
452+ Transposed,
453+ Transformed>()>
454+ __XETLA_API xetla_vector<T, N> xetla_load_global (
455+ const T* Ptr,
456+ size_t SurfaceWidth,
457+ size_t SurfaceHeight,
458+ size_t SurfacePitch,
459+ int X,
460+ int Y) {
461+ return __ESIMD_ENS::lsc_load_2d (
462+ Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
463+ }
464+
361465// / simd<T, N> block_load(const T* ptr, size_t byte_offset,
362466// / props={}); // (usm-bl-2)
363467// / This function loads a contiguous memory block from address referenced
364468// / by USM pointer \p ptr and the given \p byte_offset.
365469// /
366470// / There may be temporary restrictions depending on L1, L2 cache hints,
367- // / See details in the 'Restrictions' section below. The restrictions will be
368- // / relaxed in the future.
471+ // / See details in the 'Restrictions' section below. The restrictions will
472+ // / be relaxed in the future.
369473// /
370474// / The parameter \p props specifies the optional compile-time properties
371475// / of the type esimd::properties and may include esimd::cache_hint_L1,
@@ -383,7 +487,8 @@ __XETLA_API void xetla_prefetch_global(T* p, uint64_t byte_offset = 0) {
383487// /
384488// / Restrictions - cache hint imposed - temporary:
385489// / If L1 or L2 cache hint is passed, then:
386- // / R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or
490+ // / R1: The pointer must be at least 4-byte aligned for elements of 4-bytes
491+ // / or
387492// / smaller and 8-byte aligned for 8-byte elements.
388493// / R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64;
389494// / for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64,
@@ -574,6 +679,54 @@ __XETLA_API xetla_vector<T, N> xetla_load_global(
574679#endif
575680}
576681
682+ // / 2D USM pointer block store.
683+ // / Supported platforms: PVC
684+ // / VISA instruction: lsc_store_block2d.ugm
685+ // /
686+ // / Stores elements at specified address.
687+ // /
688+ // / @tparam T is element type.
689+ // / @tparam BlockWidth is the block width in number of elements.
690+ // / @tparam BlockHeight is the block height in number of elements.
691+ // / @tparam L1H is L1 cache hint.
692+ // / @tparam L2H is L2 cache hint.
693+ // / @tparam N is the data size
694+ // / @param Ptr is the surface base address for this operation.
695+ // / @param SurfaceWidth is the surface width minus 1 in bytes
696+ // / @param SurfaceHeight is the surface height minus 1 in rows
697+ // / @param SurfacePitch is the surface pitch minus 1 in bytes
698+ // / @param X is zero based X-coordinate of the left upper rectangle corner in
699+ // / number of elements.
700+ // / @param Y is zero based Y-coordinate of the left upper rectangle corner in
701+ // / rows.
702+ // / @param Vals is a vector to store of type T and size N, where
703+ // / N = roundUpNextMultiple(BlockHeight, 4 / sizeof(T)) *
704+ // / getNextPowerOf2(BlockWidth) * NBlocks
705+ // /
706+ template <
707+ typename T,
708+ int BlockWidth,
709+ int BlockHeight = 1 ,
710+ cache_hint L1H = cache_hint::none,
711+ cache_hint L2H = cache_hint::none,
712+ int N = __ESIMD_ENS::detail::get_lsc_block_2d_data_size<
713+ T,
714+ 1u ,
715+ BlockHeight,
716+ BlockWidth,
717+ false ,
718+ false >()>
719+ __XETLA_API void xetla_store_global (
720+ T* Ptr,
721+ unsigned SurfaceWidth,
722+ unsigned SurfaceHeight,
723+ unsigned SurfacePitch,
724+ int X,
725+ int Y,
726+ xetla_vector<T, N> Vals) {
727+ __ESIMD_ENS::lsc_store_2d (
728+ Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, Vals);
729+ }
577730// / template <typename T, int N, int VS = 1, typename OffsetT,
578731// / typename PropertyListT = empty_properties_t>
579732// / void scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
@@ -951,6 +1104,10 @@ __XETLA_API xetla_vector<Ty, N * NElts> xetla_load_local(
9511104 xetla_vector<uint32_t , N> offsets,
9521105 xetla_mask<N> pred = 1 ) {
9531106 using T = native_type_t <Ty>;
1107+ DEBUG_INVOKE (
1108+ dbg_level::core,
1109+ core::general_1d<gpu_arch::XeHpc, Ty>::
1110+ template check_restriction<NElts, N>(offsets));
9541111
9551112 return __ESIMD_ENS::
9561113 lsc_slm_gather<T, NElts, gpu::xetla::detail::get_data_size (DS), N>(
@@ -975,6 +1132,11 @@ __XETLA_API xetla_vector<Ty, N * NElts> xetla_load_local(
9751132template <typename Ty, int NElts = 1 , data_size DS = data_size::default_size>
9761133__XETLA_API xetla_vector<Ty, NElts> xetla_load_local (uint32_t offset) {
9771134 using T = native_type_t <Ty>;
1135+ // DEBUG_INVOKE(
1136+ // dbg_level::core,
1137+ // core::general_1d<gpu_arch::XeHpc, Ty>::template
1138+ // check_restriction<NElts>(
1139+ // (uint64_t)offset));
9781140
9791141 return __ESIMD_NS::slm_block_load<T, NElts>(offset);
9801142}
@@ -1005,6 +1167,10 @@ __XETLA_API void xetla_store_local(
10051167 xetla_vector<Ty, N * NElts> vals,
10061168 xetla_mask<N> pred = 1 ) {
10071169 using T = native_type_t <Ty>;
1170+ DEBUG_INVOKE (
1171+ dbg_level::core,
1172+ core::general_1d<gpu_arch::XeHpc, Ty>::
1173+ template check_restriction<NElts, N, uint32_t >(offsets));
10081174
10091175 __ESIMD_ENS::
10101176 lsc_slm_scatter<T, NElts, gpu::xetla::detail::get_data_size (DS), N>(
0 commit comments