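Update tile load/store: pass native_type_t<load_dtype>, actual block heights and arr_len to xetla_load_global; drop power-of-2 cap on 1D max_load_vec_len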

sunjiweiswift · sunjiweiswift · commit f6a11b56ca0e · 2024-08-20T07:30:24.000Z
diff --git a/include/subgroup/tile/impl/load_xe.hpp b/include/subgroup/tile/impl/load_xe.hpp
@@ -274,16 +274,16 @@ tile_load(tile_t& tile, payload_t& payload) {
 
         reg_blk.xetla_select<load_elems, 1>(remained_start)
             .xetla_format<native_type_t<load_dtype>>() = xetla_load_global<
-            load_dtype,
+            native_type_t<load_dtype>,
             block_size_x / scale_factor,
-            block_size_y,
-            num_block,
+            remained_blk_size_y,
+            arr_len,
             trans,
             mem_transform,
             L1,
             L2>(
-            (load_dtype*)::gpu::xetla::detail::xetla_get_tensor_base_address(
-                tdesc),
+            (native_type_t<load_dtype>*)::gpu::xetla::detail::
+                xetla_get_tensor_base_address(tdesc),
             ::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc),
             ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
             ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
@@ -335,16 +335,16 @@ tile_load(tile_t& tile, payload_t& payload) {
             remained_ld_blk_size_y * block_size_x * arr_len;
 
         reg_tmp.xetla_format<native_type_t<load_dtype>>() = xetla_load_global<
-            load_dtype,
+            native_type_t<load_dtype>,
             block_size_x / scale_factor,
             block_size_y,
-            num_block,
+            arr_len,
             trans,
             mem_transform,
             L1,
             L2>(
-            (load_dtype*)::gpu::xetla::detail::xetla_get_tensor_base_address(
-                tdesc),
+            (native_type_t<load_dtype>*)::gpu::xetla::detail::
+                xetla_get_tensor_base_address(tdesc),
             ::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc),
             ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
             ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
@@ -402,16 +402,16 @@ tile_load(tile_t& tile, payload_t& payload) {
             tdesc.xetla_format<uint32_t>(), block_widthx_widthy_arrlen);
         reg_blk.xetla_select<final_load_elems, 1>(final_start)
             .xetla_format<native_type_t<load_dtype>>() = xetla_load_global<
-            load_dtype,
+            native_type_t<load_dtype>,
             block_size_x / scale_factor,
-            block_size_y,
-            num_block,
+            final_ld_blk_size_y,
+            arr_len,
             trans,
             mem_transform,
             L1,
             L2>(
-            (load_dtype*)::gpu::xetla::detail::xetla_get_tensor_base_address(
-                tdesc),
+            (native_type_t<load_dtype>*)::gpu::xetla::detail::
+                xetla_get_tensor_base_address(tdesc),
             ::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc),
             ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
             ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
@@ -459,12 +459,10 @@ tile_load(tile_t& tile, payload_t& payload) {
   using dtype = typename payload_t::dtype;
   static constexpr uint32_t load_len = tile_t::tile_elems;
   static constexpr gpu_arch arch_tag = payload_t::arch_tag;
-  static constexpr uint32_t power2_block_elems =
-      detail::getNextPowerOf2<uint32_t(tile_t::block_elems * sizeof(dtype))>();
 
   using load_store_attr = load_store_attr_t<msg_type::block_1d, arch_tag>;
   static constexpr uint32_t max_load_vec_len =
-      std::min(power2_block_elems, load_store_attr::max_aligned_load_vec_len);
+      load_store_attr::max_aligned_load_vec_len;
 
   static constexpr uint32_t max_load_vec_elems =
       max_load_vec_len / sizeof(dtype);