@@ -274,16 +274,16 @@ tile_load(tile_t& tile, payload_t& payload) {
274274
275275 reg_blk.xetla_select <load_elems, 1 >(remained_start)
276276 .xetla_format <native_type_t <load_dtype>>() = xetla_load_global<
277- load_dtype,
277+ native_type_t < load_dtype> ,
278278 block_size_x / scale_factor,
279- block_size_y ,
280- num_block ,
279+ remained_blk_size_y ,
280+ arr_len ,
281281 trans,
282282 mem_transform,
283283 L1,
284284 L2>(
285- (load_dtype*)::gpu::xetla::detail::xetla_get_tensor_base_address (
286- tdesc),
285+ (native_type_t < load_dtype> *)::gpu::xetla::detail::
286+ xetla_get_tensor_base_address ( tdesc),
287287 ::gpu::xetla::detail::xetla_get_tensor_width_x (tdesc),
288288 ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
289289 ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
@@ -335,16 +335,16 @@ tile_load(tile_t& tile, payload_t& payload) {
335335 remained_ld_blk_size_y * block_size_x * arr_len;
336336
337337 reg_tmp.xetla_format <native_type_t <load_dtype>>() = xetla_load_global<
338- load_dtype,
338+ native_type_t < load_dtype> ,
339339 block_size_x / scale_factor,
340340 block_size_y,
341- num_block ,
341+ arr_len ,
342342 trans,
343343 mem_transform,
344344 L1,
345345 L2>(
346- (load_dtype*)::gpu::xetla::detail::xetla_get_tensor_base_address (
347- tdesc),
346+ (native_type_t < load_dtype> *)::gpu::xetla::detail::
347+ xetla_get_tensor_base_address ( tdesc),
348348 ::gpu::xetla::detail::xetla_get_tensor_width_x (tdesc),
349349 ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
350350 ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
@@ -402,16 +402,16 @@ tile_load(tile_t& tile, payload_t& payload) {
402402 tdesc.xetla_format <uint32_t >(), block_widthx_widthy_arrlen);
403403 reg_blk.xetla_select <final_load_elems, 1 >(final_start)
404404 .xetla_format <native_type_t <load_dtype>>() = xetla_load_global<
405- load_dtype,
405+ native_type_t < load_dtype> ,
406406 block_size_x / scale_factor,
407- block_size_y ,
408- num_block ,
407+ final_ld_blk_size_y ,
408+ arr_len ,
409409 trans,
410410 mem_transform,
411411 L1,
412412 L2>(
413- (load_dtype*)::gpu::xetla::detail::xetla_get_tensor_base_address (
414- tdesc),
413+ (native_type_t < load_dtype> *)::gpu::xetla::detail::
414+ xetla_get_tensor_base_address ( tdesc),
415415 ::gpu::xetla::detail::xetla_get_tensor_width_x (tdesc),
416416 ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
417417 ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
@@ -459,12 +459,10 @@ tile_load(tile_t& tile, payload_t& payload) {
459459 using dtype = typename payload_t ::dtype;
460460 static constexpr uint32_t load_len = tile_t ::tile_elems;
461461 static constexpr gpu_arch arch_tag = payload_t ::arch_tag;
462- static constexpr uint32_t power2_block_elems =
463- detail::getNextPowerOf2<uint32_t (tile_t ::block_elems * sizeof (dtype))>();
464462
465463 using load_store_attr = load_store_attr_t <msg_type::block_1d, arch_tag>;
466464 static constexpr uint32_t max_load_vec_len =
467- std::min (power2_block_elems, load_store_attr::max_aligned_load_vec_len) ;
465+ load_store_attr::max_aligned_load_vec_len;
468466
469467 static constexpr uint32_t max_load_vec_elems =
470468 max_load_vec_len / sizeof (dtype);
0 commit comments