@@ -28,6 +28,7 @@ namespace gpu::xetla::subgroup {
2828namespace detail {
2929template <typename tile_t , typename payload_t >
3030struct check_load_type {
31+ static constexpr bool is_lsc_gather = true ;
3132 static constexpr bool is_global_block_2d =
3233 (payload_t ::memory_space == mem_space::global &&
3334 (payload_t ::message_type == msg_type::block_2d) &&
@@ -444,6 +445,7 @@ template <
444445 typename payload_t >
445446__XETLA_API typename std::enable_if_t <
446447 detail::check_load_type<tile_t , payload_t >::is_global_block_2d &&
448+ detail::check_load_type<tile_t , payload_t >::is_lsc_gather &&
447449 payload_t ::arch_tag <= gpu_arch::XeHpg>
448450tile_load (tile_t & tile, payload_t & payload) {
449451 using dtype = typename payload_t ::dtype;
@@ -531,6 +533,77 @@ tile_load(tile_t& tile, payload_t& payload) {
531533 }
532534}
533535
536+ // / @brief This function loads data from unaligned-2D memory surface.
537+ // / Loads an array of rectangular regions (X,Y)..(X+W,Y+H) from memory into
538+ // / registers. Each block will be loaded serially by its corresponding payload.
539+ // / @tparam tile_t Is the tile_t struct contains registers.
540+ // / These registers will be the destination of load operation.
541+ // / @tparam payload_t Is the mem_payload_t struct describing the memory
542+ // / information. Payload indicates the source of load operation.
543+ // / @tparam L1 Is the cache hint for L1 cache.
544+ // / @tparam L3 Is the cache hint for L3 cache.
545+ // / @param tile Is the tile object with type tile_t, holds the return data of
546+ // / the loads.
547+ // / @param payload Is the payload object with type payload_t. Contains all the
548+ // / information for loads.
549+ // / @return No return, update in place.
550+ template <
551+ cache_hint L1 = cache_hint::cached,
552+ cache_hint L3 = cache_hint::cached,
553+ typename tile_t ,
554+ typename payload_t >
555+ __XETLA_API typename std::enable_if_t <
556+ detail::check_load_type<tile_t , payload_t >::is_global_block_2d &&
557+ !detail::check_load_type<tile_t , payload_t >::is_lsc_gather &&
558+ !arch_has_2d_load_store(payload_t ::arch_tag)>
559+ tile_load (tile_t & tile, payload_t & payload) {
560+ using dtype = typename payload_t ::dtype;
561+ using tile_desc = typename payload_t ::tile_desc;
562+ using load_dtype = typename payload_t ::mem_dtype;
563+ constexpr uint32_t load_elems = payload_t ::simd_exec_size;
564+ constexpr uint32_t pack_factor = payload_t ::pack_factor;
565+
566+ #pragma unroll
567+ for (uint32_t i = 0 ; i < tile_desc::num_block_y; i++) {
568+ uint32_t offset_y = i * tile_desc::block_size_y;
569+ #pragma unroll
570+ for (uint32_t j = 0 ; j < tile_desc::num_block_x; j++) {
571+ uint32_t offset_x = j * tile_desc::block_size_x;
572+ auto reg_sub = tile.reg .xetla_select <tile_desc::block_elems, 1 >(
573+ (i * tile_desc::num_block_x + j) * tile_desc::block_elems);
574+ #pragma unroll
575+ for (uint32_t sub_block_y = 0 ; sub_block_y < tile_desc::block_size_y;
576+ sub_block_y += 1 ) {
577+ xetla_vector<load_dtype, load_elems> reg_tmp = 0 ;
578+ uint32_t address_offset = payload_t ::trans
579+ ? offset_x * payload.pitch_in_bytes +
580+ (offset_y + sub_block_y) * sizeof (dtype)
581+ : offset_x * sizeof (dtype) +
582+ (offset_y + sub_block_y) * payload.pitch_in_bytes ;
583+ reg_tmp = xetla_load_global<
584+ load_dtype,
585+ payload_t ::simd_exec_size,
586+ data_size::default_size,
587+ L1,
588+ L3>(payload.base_ptr , payload.base_offset + address_offset);
589+
590+ reg_sub
591+ .xetla_select <load_elems * pack_factor, 1 >(
592+ sub_block_y * tile_desc::block_size_x)
593+ .xetla_format <load_dtype>() = reg_tmp;
594+ }
595+ }
596+ }
597+
598+ if constexpr (payload_t ::trans) {
599+ SW_BARRIER ();
600+ tile_transpose (tile);
601+ }
602+ if constexpr (payload_t ::mem_transform) {
603+ SW_BARRIER ();
604+ vnni_convert (tile);
605+ }
606+ }
534607// / @brief This function loads data from unaligned-2D memory surface.
535608// / Loads an array of rectangular regions (X,Y)..(X+W,Y+H) from memory into
536609// / registers. Each block will be loaded serially by its corresponding payload.
0 commit comments