@@ -94,7 +94,8 @@ tile_load(tile_t& tile, payload_t& payload) {
9494 static constexpr gpu_arch arch_tag = payload_t ::arch_tag;
9595
9696 static constexpr reg_layout reg_layout_ = tile_desc::register_layout;
97- static constexpr bool is_vnni_reverse = payload_t ::mem_dword_transpose &&
97+ static constexpr bool is_vnni_reverse =
98+ payload_t ::mem_dword_qword_transpose &&
9899 ((reg_layout_ == reg_layout::tiled) ||
99100 (reg_layout_ == reg_layout::transpose_tiled));
100101 static constexpr bool reg_transpose = tile_desc::reg_transpose;
@@ -121,28 +122,27 @@ tile_load(tile_t& tile, payload_t& payload) {
121122 mem_transpose ? max_trans_block_width : max_load_block_height;
122123 static constexpr uint32_t ld_blk_size_y = reg_transpose
123124 ? block_size_y
124- : (block_size_y > ld_blk_size_y_limit ? ld_blk_size_y_limit
125- : block_size_y);
126-
127- // array len is used to make sure memory load is cache line aligned
128- // disabled while register or memory transpose
129- static constexpr uint8_t arr_len_candidate =
130- (reg_transpose ||
131- mem_transpose
132- // block elements should be integer
133- // times of register bytes
134- || ((block_size_y * block_size_x) % elems_per_reg != 0 )
135- // tail blocks also need to meet above condition
136- ||
137- (((tile_size_y % block_size_y) * block_size_x) % elems_per_reg != 0 )) ||
138- (block_size_y > ld_blk_size_y_limit)
139- ? 1
140- : (((tile_size_x % elems_per_CL) == 0 )
141- ? (((elems_per_CL % block_size_x) == 0 )
142- ? elems_per_CL / block_size_x
143- : 1 )
144- : ((tile_size_x < elems_per_CL) ? (tile_size_x / block_size_x)
145- : 1 ));
125+ : std::min (ld_blk_size_y_limit, block_size_y);
126+
127+ // array len is used to make sure memory load is cache line aligned
128+ // disabled while register or memory transpose
129+ static constexpr uint8_t arr_len_candidate =
130+ (reg_transpose ||
131+ mem_transpose
132+ // block elements should be integer
133+ // times of register bytes
134+ || ((block_size_y * block_size_x) % elems_per_reg != 0 )
135+ // tail blocks also need to meet above condition
136+ || (((tile_size_y % block_size_y) * block_size_x) % elems_per_reg !=
137+ 0 )) ||
138+ (block_size_y > ld_blk_size_y_limit)
139+ ? 1
140+ : (((tile_size_x % elems_per_CL) == 0 )
141+ ? (((elems_per_CL % block_size_x) == 0 )
142+ ? elems_per_CL / block_size_x
143+ : 1 )
144+ : ((tile_size_x < elems_per_CL) ? (tile_size_x / block_size_x)
145+ : 1 ));
146146 static constexpr bool is_valid_arr_len_candidate = (arr_len_candidate == 1 ) ||
147147 (arr_len_candidate == 2 ) || (arr_len_candidate == 4 );
148148
@@ -203,14 +203,31 @@ tile_load(tile_t& tile, payload_t& payload) {
203203 for (uint32_t ii = 0 ; ii < block_size_y / ld_blk_size_y; ++ii) {
204204 constexpr uint32_t load_elems = ld_blk_size_y * block_size_x * arr_len;
205205
206- reg_tmp.xetla_format <native_type_t <load_dtype>>() = xetla_tload_global<
206+ // reg_tmp.xetla_format<native_type_t<load_dtype>>() =
207+ // xetla_tload_global<
208+ // load_dtype,
209+ // ld_blk_height * block_size_x * arr_len / scale_factor,
210+ // L1,
211+ // L2,
212+ // trans,
213+ // mem_transform,
214+ // arch_tag>(tdesc);
215+ reg_tmp.xetla_format <native_type_t <load_dtype>>() = xetla_load_global<
207216 load_dtype,
208- ld_blk_height * block_size_x * arr_len / scale_factor,
209- L1 ,
210- L2 ,
217+ block_size_x / scale_factor,
218+ block_size_y ,
219+ num_block ,
211220 trans,
212221 mem_transform,
213- arch_tag>(tdesc);
222+ L1,
223+ L2>(
224+ (load_dtype*)::gpu::xetla::detail::xetla_get_tensor_base_address (
225+ tdesc),
226+ ::gpu::xetla::detail::xetla_get_tensor_width_x (tdesc),
227+ ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
228+ ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
229+ ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
230+ ::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc));
214231 if constexpr (reg_transpose && trans) {
215232 reg_blk.xetla_select <load_elems, 1 >(ii * load_elems)
216233 .xetla_format <native_type_t <load_dtype>>() =
@@ -256,14 +273,30 @@ tile_load(tile_t& tile, payload_t& payload) {
256273 tdesc.xetla_format <uint32_t >(), block_widthx_widthy_arrlen);
257274
258275 reg_blk.xetla_select <load_elems, 1 >(remained_start)
259- .xetla_format <native_type_t <load_dtype>>() = xetla_tload_global <
276+ .xetla_format <native_type_t <load_dtype>>() = xetla_load_global <
260277 load_dtype,
261- (load_elems / scale_factor) ,
262- L1 ,
263- L2 ,
278+ block_size_x / scale_factor,
279+ block_size_y ,
280+ num_block ,
264281 trans,
265282 mem_transform,
266- arch_tag>(tdesc);
283+ L1,
284+ L2>(
285+ (load_dtype*)::gpu::xetla::detail::xetla_get_tensor_base_address (
286+ tdesc),
287+ ::gpu::xetla::detail::xetla_get_tensor_width_x (tdesc),
288+ ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
289+ ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
290+ ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
291+ ::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc));
292+ // xetla_tload_global<
293+ // load_dtype,
294+ // (load_elems / scale_factor),
295+ // L1,
296+ // L2,
297+ // trans,
298+ // mem_transform,
299+ // arch_tag>(tdesc);
267300 }
268301 }
269302 }
@@ -301,14 +334,30 @@ tile_load(tile_t& tile, payload_t& payload) {
301334 constexpr uint32_t load_elems =
302335 remained_ld_blk_size_y * block_size_x * arr_len;
303336
304- reg_tmp.xetla_format <native_type_t <load_dtype>>() = xetla_tload_global <
337+ reg_tmp.xetla_format <native_type_t <load_dtype>>() = xetla_load_global <
305338 load_dtype,
306- (ld_blk_height * block_size_x * arr_len / scale_factor) ,
307- L1 ,
308- L2 ,
339+ block_size_x / scale_factor,
340+ block_size_y ,
341+ num_block ,
309342 trans,
310343 mem_transform,
311- arch_tag>(tdesc);
344+ L1,
345+ L2>(
346+ (load_dtype*)::gpu::xetla::detail::xetla_get_tensor_base_address (
347+ tdesc),
348+ ::gpu::xetla::detail::xetla_get_tensor_width_x (tdesc),
349+ ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
350+ ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
351+ ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
352+ ::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc));
353+ // xetla_tload_global<
354+ // load_dtype,
355+ // (ld_blk_height * block_size_x * arr_len / scale_factor),
356+ // L1,
357+ // L2,
358+ // trans,
359+ // mem_transform,
360+ // arch_tag>(tdesc);
312361
313362 if constexpr (reg_transpose && trans) {
314363 reg_blk.xetla_select <load_elems, 1 >(ii * load_elems)
@@ -352,14 +401,30 @@ tile_load(tile_t& tile, payload_t& payload) {
352401 gpu::xetla::detail::xetla_set_block_widthx_widthy_arrlen (
353402 tdesc.xetla_format <uint32_t >(), block_widthx_widthy_arrlen);
354403 reg_blk.xetla_select <final_load_elems, 1 >(final_start)
355- .xetla_format <native_type_t <load_dtype>>() = xetla_tload_global <
404+ .xetla_format <native_type_t <load_dtype>>() = xetla_load_global <
356405 load_dtype,
357- final_load_elems / scale_factor,
358- L1 ,
359- L2 ,
406+ block_size_x / scale_factor,
407+ block_size_y ,
408+ num_block ,
360409 trans,
361410 mem_transform,
362- arch_tag>(tdesc);
411+ L1,
412+ L2>(
413+ (load_dtype*)::gpu::xetla::detail::xetla_get_tensor_base_address (
414+ tdesc),
415+ ::gpu::xetla::detail::xetla_get_tensor_width_x (tdesc),
416+ ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
417+ ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
418+ ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
419+ ::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc));
420+ // xetla_tload_global<
421+ // load_dtype,
422+ // final_load_elems / scale_factor,
423+ // L1,
424+ // L2,
425+ // trans,
426+ // mem_transform,
427+ // arch_tag>(tdesc);
363428 }
364429 }
365430 }
0 commit comments