Skip to content
This repository was archived by the owner on Aug 30, 2024. It is now read-only.

Commit a0f3194

Browse files
committed
replace base_ptr/width/height/surface_pitch
1 parent b259fba commit a0f3194

File tree

4 files changed

+111
-73
lines changed

4 files changed

+111
-73
lines changed

include/common/core/memory.hpp

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -477,16 +477,15 @@ __XETLA_API xetla_vector<T, N> xetla_load_global(
477477
return ret.xetla_format<T>();
478478
} else if constexpr (BlockWidth * sizeof(T) < sizeof(uint32_t)) {
479479
constexpr auto scale_factor = sizeof(uint32_t) / sizeof(T);
480-
xetla_vector<uint32_t, N> ret = __ESIMD_ENS::lsc_load_2d<
480+
xetla_vector<uint32_t, N> ret = xetla_load_global<
481481
uint32_t,
482482
BlockWidth,
483483
BlockHeight,
484484
NBlocks,
485485
Transposed,
486486
Transformed,
487-
gpu::xetla::detail::get_cache_hint(L1H),
488-
gpu::xetla::detail::get_cache_hint(L2H),
489-
N>(
487+
L1H,
488+
L2H>(
490489
reinterpret_cast<const uint32_t*>(Ptr),
491490
SurfaceWidth,
492491
SurfaceHeight,
@@ -505,7 +504,7 @@ __XETLA_API xetla_vector<T, N> xetla_load_global(
505504
Transformed,
506505
gpu::xetla::detail::get_cache_hint(L1H),
507506
gpu::xetla::detail::get_cache_hint(L2H),
508-
N>(Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y);
507+
N>(Ptr, SurfaceWidth - 1, SurfaceHeight - 1, SurfacePitch - 1, X, Y);
509508
}
510509
}
511510

@@ -788,7 +787,7 @@ __XETLA_API void xetla_store_global(
788787
BlockHeight,
789788
gpu::xetla::detail::get_cache_hint(L1H),
790789
gpu::xetla::detail::get_cache_hint(L2H)>(
791-
Ptr, SurfaceWidth, SurfaceHeight, SurfacePitch, X, Y, Vals);
790+
Ptr, SurfaceWidth - 1, SurfaceHeight - 1, SurfacePitch - 1, X, Y, Vals);
792791
}
793792
}
794793
/// template <typename T, int N, int VS = 1, typename OffsetT,

include/subgroup/tile/impl/load_xe.hpp

Lines changed: 27 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ tile_load(tile_t& tile, payload_t& payload) {
9595

9696
static constexpr reg_layout reg_layout_ = tile_desc::register_layout;
9797
static constexpr bool is_vnni_reverse =
98-
payload_t::mem_dword_qword_transpose &&
98+
payload_t::mem_transpose_dtype_less4bytes &&
9999
((reg_layout_ == reg_layout::tiled) ||
100100
(reg_layout_ == reg_layout::transpose_tiled));
101101
static constexpr bool reg_transpose = tile_desc::reg_transpose;
@@ -202,32 +202,29 @@ tile_load(tile_t& tile, payload_t& payload) {
202202
#pragma unroll
203203
for (uint32_t ii = 0; ii < block_size_y / ld_blk_size_y; ++ii) {
204204
constexpr uint32_t load_elems = ld_blk_size_y * block_size_x * arr_len;
205-
206-
// reg_tmp.xetla_format<native_type_t<load_dtype>>() =
207-
// xetla_tload_global<
208-
// load_dtype,
209-
// ld_blk_height * block_size_x * arr_len / scale_factor,
210-
// L1,
211-
// L2,
212-
// trans,
213-
// mem_transform,
214-
// arch_tag>(tdesc);
215205
reg_tmp.xetla_format<native_type_t<load_dtype>>() = xetla_load_global<
216206
native_type_t<load_dtype>,
217-
(mem_transpose ? ld_blk_size_y : block_size_x) / scale_factor,
218-
(mem_transpose ? block_size_x : ld_blk_size_y),
207+
(trans ? ld_blk_size_y : block_size_x) / scale_factor,
208+
(trans ? block_size_x : ld_blk_size_y),
219209
// block_size_x / scale_factor,
220210
// ld_blk_size_y,
221211
arr_len,
222212
trans,
223213
mem_transform,
224214
L1,
225215
L2>(
226-
(native_type_t<load_dtype>*)::gpu::xetla::detail::
227-
xetla_get_tensor_base_address(tdesc),
228-
::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc),
229-
::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
230-
::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
216+
payload.base_ptr,
217+
payload.surface_width,
218+
payload.surface_height,
219+
payload.surface_pitch,
220+
// payload.offset_x,
221+
// payload.offset_y);
222+
223+
// (native_type_t<load_dtype>*)::gpu::xetla::detail::
224+
// xetla_get_tensor_base_address(tdesc),
225+
// ::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc),
226+
// ::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
227+
// ::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
231228
::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
232229
::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc));
233230
if constexpr (reg_transpose && trans) {
@@ -284,11 +281,10 @@ tile_load(tile_t& tile, payload_t& payload) {
284281
mem_transform,
285282
L1,
286283
L2>(
287-
(native_type_t<load_dtype>*)::gpu::xetla::detail::
288-
xetla_get_tensor_base_address(tdesc),
289-
::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc),
290-
::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
291-
::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
284+
payload.base_ptr,
285+
payload.surface_width,
286+
payload.surface_height,
287+
payload.surface_pitch,
292288
::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
293289
::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc));
294290
// xetla_tload_global<
@@ -345,11 +341,10 @@ tile_load(tile_t& tile, payload_t& payload) {
345341
mem_transform,
346342
L1,
347343
L2>(
348-
(native_type_t<load_dtype>*)::gpu::xetla::detail::
349-
xetla_get_tensor_base_address(tdesc),
350-
::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc),
351-
::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
352-
::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
344+
payload.base_ptr,
345+
payload.surface_width,
346+
payload.surface_height,
347+
payload.surface_pitch,
353348
::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
354349
::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc));
355350
// xetla_tload_global<
@@ -412,11 +407,10 @@ tile_load(tile_t& tile, payload_t& payload) {
412407
mem_transform,
413408
L1,
414409
L2>(
415-
(native_type_t<load_dtype>*)::gpu::xetla::detail::
416-
xetla_get_tensor_base_address(tdesc),
417-
::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc),
418-
::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
419-
::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
410+
payload.base_ptr,
411+
payload.surface_width,
412+
payload.surface_height,
413+
payload.surface_pitch,
420414
::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
421415
::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc));
422416
// xetla_tload_global<

include/subgroup/tile/impl/payload_xe.hpp

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -78,11 +78,11 @@ struct mem_payload_t<
7878
static constexpr bool mem_transform = (sizeof(dtype) <= 2) && !trans &&
7979
(register_layout == reg_layout::vnni_tiled ||
8080
register_layout == reg_layout::vnni_tiled_col_major);
81-
static constexpr bool mem_dword_qword_transpose =
81+
static constexpr bool mem_transpose_dtype_less4bytes =
8282
(sizeof(dtype) < 4) && trans;
8383

8484
using mem_dtype = typename std::
85-
conditional<mem_dword_qword_transpose, uint32_t, dtype>::type;
85+
conditional_t<mem_transpose_dtype_less4bytes, uint32_t, dtype>;
8686
static constexpr uint32_t scale_factor = sizeof(mem_dtype) / sizeof(dtype);
8787
mem_dtype* base_ptr;
8888
uint32_t surface_width;
@@ -111,8 +111,11 @@ struct mem_payload_t<
111111
this->surface_height =
112112
(mem_transpose ? mem_desc.shape.x : mem_desc.shape.y);
113113
this->surface_pitch = mem_desc.shape.stride * sizeof(dtype);
114-
this->offset_x = mem_desc.coord.x;
115-
this->offset_y = mem_desc.coord.y;
114+
// this->offset_x = mem_desc.coord.x;
115+
// this->offset_y = mem_desc.coord.y;
116+
this->offset_x = mem_transpose ? mem_desc.coord.y : mem_desc.coord.x;
117+
this->offset_x = this->offset_x / scale_factor;
118+
this->offset_y = mem_transpose ? mem_desc.coord.x : mem_desc.coord.y;
116119

117120
xetla_tdescriptor base_tdesc = mem_desc.get_tdesc();
118121
int32_t offset = gpu::xetla::detail::xetla_get_tensor_offset_x(base_tdesc) /
@@ -130,13 +133,14 @@ struct mem_payload_t<
130133
int32_t surface_offset_x = 0,
131134
int32_t surface_offset_y = 0) {
132135
this->base_ptr = (mem_dtype*)p;
133-
this->surface_width = surface_width;
136+
this->surface_width = surface_width * sizeof(dtype);
134137
this->surface_height = surface_height;
135-
this->surface_pitch = surface_pitch;
136-
this->offset_x = surface_offset_x;
138+
this->surface_pitch = surface_pitch * sizeof(dtype);
139+
this->offset_x = surface_offset_x / scale_factor;
137140
this->offset_y = surface_offset_y;
138141

139142
xetla_tdescriptor base_tdesc;
143+
140144
xetla_fill_tdesc(
141145
base_tdesc.xetla_format<uint32_t>(),
142146
p,
@@ -155,7 +159,7 @@ struct mem_payload_t<
155159
this->surface_height =
156160
(mem_transpose ? mem_desc.shape.x : mem_desc.shape.y);
157161
this->surface_pitch = mem_desc.shape.stride * sizeof(dtype);
158-
this->offset_x = mem_desc.coord.x;
162+
this->offset_x = mem_desc.coord.x / scale_factor;
159163
this->offset_y = mem_desc.coord.y;
160164

161165
xetla_tdescriptor base_tdesc = mem_desc.get_tdesc();
@@ -181,11 +185,11 @@ struct mem_payload_t<
181185
uint32_t surface_pitch,
182186
int32_t surface_offset_x = 0,
183187
int32_t surface_offset_y = 0) {
184-
this->base_ptr = (mem_dtype)p;
185-
this->surface_width = surface_width;
188+
this->base_ptr = (mem_dtype*)p;
189+
this->surface_width = surface_width * sizeof(dtype);
186190
this->surface_height = surface_height;
187-
this->surface_pitch = surface_pitch;
188-
this->offset_x = surface_offset_x;
191+
this->surface_pitch = surface_pitch * sizeof(dtype);
192+
this->offset_x = surface_offset_x / scale_factor;
189193
this->offset_y = surface_offset_y;
190194

191195
xetla_tdescriptor base_tdesc;
@@ -1708,11 +1712,12 @@ struct prefetch_payload_t<
17081712
reg_layout_>,
17091713
num_coop_sg_,
17101714
arch_tag_,
1711-
std::enable_if_t<(!arch_has_2d_load_store<arch_tag_>)&&(
1712-
((block_size_y_ != 1 || tile_size_y_ != 1) &&
1713-
mem_layout_ == mem_layout::row_major) ||
1714-
((block_size_x_ != 1 || tile_size_x_ != 1) &&
1715-
mem_layout_ == mem_layout::col_major))>> {
1715+
std::enable_if_t<
1716+
(!arch_has_2d_load_store<arch_tag_>) &&
1717+
(((block_size_y_ != 1 || tile_size_y_ != 1) &&
1718+
mem_layout_ == mem_layout::row_major) ||
1719+
((block_size_x_ != 1 || tile_size_x_ != 1) &&
1720+
mem_layout_ == mem_layout::col_major))>> {
17161721
using dtype = native_type_t<dtype_>;
17171722
using mem_desc_t =
17181723
mem_desc_t<dtype_, mem_layout_, mem_space::global, alignment_>;
@@ -1968,9 +1973,10 @@ struct prefetch_payload_t<
19681973
reg_layout_>,
19691974
num_coop_sg_,
19701975
arch_tag_,
1971-
std::enable_if_t<(arch_has_2d_load_store<arch_tag_>)&&(
1972-
((tile_size_y_ != 1) && mem_layout_ == mem_layout::row_major) ||
1973-
((tile_size_x_ != 1) && mem_layout_ == mem_layout::col_major))>> {
1976+
std::enable_if_t<
1977+
(arch_has_2d_load_store<arch_tag_>) &&
1978+
(((tile_size_y_ != 1) && mem_layout_ == mem_layout::row_major) ||
1979+
((tile_size_x_ != 1) && mem_layout_ == mem_layout::col_major))>> {
19741980
using dtype = dtype_;
19751981
using mem_desc_t =
19761982
mem_desc_t<dtype_, mem_layout_, mem_space::global, alignment_>;

include/subgroup/tile/impl/store_xe.hpp

Lines changed: 53 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -171,10 +171,10 @@ tile_store(tile_t& tile, payload_t& payload) {
171171
st_block_size_y,
172172
L1,
173173
L2>(
174-
(dtype*)::gpu::xetla::detail::xetla_get_tensor_base_address(tdesc),
175-
::gpu::xetla::detail::xetla_get_tensor_width_x(tdesc),
176-
::gpu::xetla::detail::xetla_get_tensor_width_y(tdesc),
177-
::gpu::xetla::detail::xetla_get_tensor_pitch_x(tdesc),
174+
payload.base_ptr,
175+
payload.surface_width,
176+
payload.surface_height,
177+
payload.surface_pitch,
178178
::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
179179
::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc),
180180
st_blk);
@@ -194,12 +194,25 @@ tile_store(tile_t& tile, payload_t& payload) {
194194
(block_size_x * arr_len - 1) | ((blk_remained_y - 1) << 8);
195195
gpu::xetla::detail::xetla_set_block_widthx_widthy_arrlen(
196196
tdesc.xetla_format<uint32_t>(), block_widthx_widthy_arrlen);
197-
xetla_tstore_global<
197+
// xetla_tstore_global<
198+
// dtype,
199+
// blk_remained_elems,
200+
// L1,
201+
// L2,
202+
// payload_t::arch_tag>(tdesc, st_blk);
203+
xetla_store_global<
198204
dtype,
199-
blk_remained_elems,
205+
block_size_x * arr_len,
206+
blk_remained_y,
200207
L1,
201-
L2,
202-
payload_t::arch_tag>(tdesc, st_blk);
208+
L2>(
209+
payload.base_ptr,
210+
payload.surface_width,
211+
payload.surface_height,
212+
payload.surface_pitch,
213+
::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
214+
::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc),
215+
st_blk);
203216
}
204217
}
205218
}
@@ -244,8 +257,21 @@ tile_store(tile_t& tile, payload_t& payload) {
244257
remained_st_blk_size_y * block_size_x * arr_len;
245258
auto st_blk =
246259
combine_blk.xetla_select<store_elems, 1>(ii * store_elems);
247-
xetla_tstore_global<dtype, store_elems, L1, L2, payload_t::arch_tag>(
248-
tdesc, st_blk);
260+
// xetla_tstore_global<dtype, store_elems, L1, L2, payload_t::arch_tag>(
261+
// tdesc, st_blk);
262+
xetla_store_global<
263+
dtype,
264+
block_size_x * arr_len,
265+
remained_st_blk_size_y,
266+
L1,
267+
L2>(
268+
payload.base_ptr,
269+
payload.surface_width,
270+
payload.surface_height,
271+
payload.surface_pitch,
272+
::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
273+
::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc),
274+
st_blk);
249275
xetla_update_tdesc_offsety(
250276
tdesc.xetla_format<uint32_t>(), remained_st_blk_size_y);
251277
}
@@ -263,12 +289,25 @@ tile_store(tile_t& tile, payload_t& payload) {
263289
(block_size_x * arr_len - 1) | ((final_st_blk_size_y - 1) << 8);
264290
gpu::xetla::detail::xetla_set_block_widthx_widthy_arrlen(
265291
tdesc.xetla_format<uint32_t>(), block_widthx_widthy_arrlen);
266-
xetla_tstore_global<
292+
// xetla_tstore_global<
293+
// dtype,
294+
// final_store_elems,
295+
// L1,
296+
// L2,
297+
// payload_t::arch_tag>(tdesc, st_blk);
298+
xetla_store_global<
267299
dtype,
268-
final_store_elems,
300+
block_size_x * arr_len,
301+
final_st_blk_size_y,
269302
L1,
270-
L2,
271-
payload_t::arch_tag>(tdesc, st_blk);
303+
L2>(
304+
payload.base_ptr,
305+
payload.surface_width,
306+
payload.surface_height,
307+
payload.surface_pitch,
308+
::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
309+
::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc),
310+
st_blk);
272311
}
273312
}
274313
}

0 commit comments

Comments
 (0)