fix GRU & int4gemm

DDEle · DDEle · commit c39cdfc642a9 · 2024-08-21T08:22:18.000Z
diff --git a/include/common/core/arch_config.hpp b/include/common/core/arch_config.hpp
@@ -65,7 +65,7 @@ struct load_store_attr_t<msg_type::block_2d, gpu_arch::XeHpc> {
   // BlockWidth * NBlocks must not exceed 64 for bytes, 32 for words, 16 for
   // dwords, and 8 for qwords.
   static constexpr uint32_t max_load_size_in_bytes = 2048;
-  
+
   // BlockWidth * BlockHeight * sizeof(T) must not exceed 512.
   static constexpr uint32_t max_store_size_in_bytes = 512;
 
diff --git a/include/common/core/memory.hpp b/include/common/core/memory.hpp
@@ -458,16 +458,38 @@ __XETLA_API xetla_vector<T, N> xetla_load_global(
     size_t SurfacePitch,
     int X,
     int Y) {
-  return __ESIMD_ENS::lsc_load_2d<
-      T,
-      BlockWidth,
-      BlockHeight,
-      NBlocks,
-      Transposed,
-      Transformed,
-      gpu::xetla::detail::get_cache_hint(L1H),
-      gpu::xetla::detail::get_cache_hint(L2H),
-      N>(Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
+  if constexpr (BlockWidth * sizeof(T) < sizeof(uint32_t)) {
+    constexpr auto scale_factor = sizeof(uint32_t) / sizeof(T);
+    xetla_vector<uint32_t, N> ret = __ESIMD_ENS::lsc_load_2d<
+        uint32_t,
+        BlockWidth,
+        BlockHeight,
+        NBlocks,
+        Transposed,
+        Transformed,
+        gpu::xetla::detail::get_cache_hint(L1H),
+        gpu::xetla::detail::get_cache_hint(L2H),
+        N>(
+        reinterpret_cast<const uint32_t*>(Ptr),
+        SurfaceWidth,
+        SurfaceHeight,
+        SurfacePitch,
+        X / scale_factor,
+        Y);
+    return ret.xetla_format<T>().xetla_select<N, scale_factor>(
+        X % scale_factor);
+  } else {
+    return __ESIMD_ENS::lsc_load_2d<
+        T,
+        BlockWidth,
+        BlockHeight,
+        NBlocks,
+        Transposed,
+        Transformed,
+        gpu::xetla::detail::get_cache_hint(L1H),
+        gpu::xetla::detail::get_cache_hint(L2H),
+        N>(Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
+  }
 }
 
 /// simd<T, N> block_load(const T* ptr, size_t byte_offset,
diff --git a/include/subgroup/tile/impl/load_xe.hpp b/include/subgroup/tile/impl/load_xe.hpp
@@ -122,27 +122,27 @@ tile_load(tile_t& tile, payload_t& payload) {
       mem_transpose ? max_trans_block_width : max_load_block_height;
   static constexpr uint32_t ld_blk_size_y = reg_transpose
       ? block_size_y
-      : std::min(ld_blk_size_y_limit, block_size_y)
-
-      // array len is used to make sure memory load is cache line aligned
-      // disabled while register or memory transpose
-      static constexpr uint8_t arr_len_candidate =
-          (reg_transpose ||
-           mem_transpose
-           // block elements should be integer
-           // times of register bytes
-           || ((block_size_y * block_size_x) % elems_per_reg != 0)
-           // tail blocks also need to meet above condition
-           || (((tile_size_y % block_size_y) * block_size_x) % elems_per_reg !=
-               0)) ||
-              (block_size_y > ld_blk_size_y_limit)
-          ? 1
-          : (((tile_size_x % elems_per_CL) == 0)
-                 ? (((elems_per_CL % block_size_x) == 0)
-                        ? elems_per_CL / block_size_x
-                        : 1)
-                 : ((tile_size_x < elems_per_CL) ? (tile_size_x / block_size_x)
-                                                 : 1));
+      : std::min(ld_blk_size_y_limit, block_size_y);
+
+  // array len is used to make sure memory load is cache-line aligned;
+  // it is disabled when register or memory transpose is enabled
+  static constexpr uint8_t arr_len_candidate =
+      (reg_transpose ||
+       mem_transpose
+       // the number of elements in a block must be an
+       // integer multiple of the elements per register
+       || ((block_size_y * block_size_x) % elems_per_reg != 0)
+       // tail blocks also need to meet the above condition
+       ||
+       (((tile_size_y % block_size_y) * block_size_x) % elems_per_reg != 0)) ||
+          (block_size_y > ld_blk_size_y_limit)
+      ? 1
+      : (((tile_size_x % elems_per_CL) == 0)
+             ? (((elems_per_CL % block_size_x) == 0)
+                    ? elems_per_CL / block_size_x
+                    : 1)
+             : ((tile_size_x < elems_per_CL) ? (tile_size_x / block_size_x)
+                                             : 1));
   static constexpr bool is_valid_arr_len_candidate = (arr_len_candidate == 1) ||
       (arr_len_candidate == 2) || (arr_len_candidate == 4);
 
@@ -213,16 +213,16 @@ tile_load(tile_t& tile, payload_t& payload) {
         //     mem_transform,
         //     arch_tag>(tdesc);
         reg_tmp.xetla_format<native_type_t<load_dtype>>() = xetla_load_global<
-            load_dtype,
+            native_type_t<load_dtype>,
             block_size_x / scale_factor,
             block_size_y,
-            num_block,
+            arr_len,
             trans,
             mem_transform,
             L1,
             L2>(
-            (load_dtype*)::gpu::xetla::detail::xetla_get_tensor_base_address(
-                tdesc),
+            (native_type_t<load_dtype>*)::gpu::xetla::detail::
+                xetla_get_tensor_base_address(tdesc),
             ::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc),
             ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
             ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
diff --git a/include/subgroup/tile/impl/payload_xe.hpp b/include/subgroup/tile/impl/payload_xe.hpp
@@ -129,7 +129,7 @@ struct mem_payload_t<
       uint32_t surface_pitch,
       int32_t surface_offset_x = 0,
       int32_t surface_offset_y = 0) {
-    this->base_ptr = (mem_dtype)p;
+    this->base_ptr = (mem_dtype*)p;
     this->surface_width = surface_width;
     this->surface_height = surface_height;
     this->surface_pitch = surface_pitch;
diff --git a/include/subgroup/tile/impl/store_xe.hpp b/include/subgroup/tile/impl/store_xe.hpp
@@ -161,18 +161,17 @@ tile_store(tile_t& tile, payload_t& payload) {
       for (uint32_t ii = 0; ii < block_size_y / st_block_size_y; ++ii) {
         constexpr uint32_t store_elems =
             st_block_size_y * block_size_x * arr_len;
-        auto st_blk =
+        xetla_vector<dtype, store_elems> st_blk =
             combine_blk.xetla_select<store_elems, 1>(ii * store_elems);
         // xetla_tstore_global<dtype, store_elems, L1, L2, payload_t::arch_tag>(
         //     tdesc, st_blk);
         xetla_store_global<
             dtype,
-            block_size_x,
-            block_size_y,
-            num_block,
+            block_size_x * arr_len,
+            st_block_size_y,
             L1,
             L2>(
-            ::gpu::xetla::detail::xetla_get_tensor_base_address(tdesc),
+            (dtype*)::gpu::xetla::detail::xetla_get_tensor_base_address(tdesc),
             ::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc),
             ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
             ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),