Skip to content
This repository was archived by the owner on Aug 30, 2024. It is now read-only.

Commit 5d86a6b

Browse files
Remove hardcoded XeHpc and enhance multi target support on barrier (#255)
1 parent c3d779a commit 5d86a6b

File tree

90 files changed

+3273
-1587
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

90 files changed

+3273
-1587
lines changed

examples/02_basic_gemm/basic_gemm.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,10 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
114114
// wrap the nd_range to XeTLA range
115115

116116
// Performance tuning setting based on different shapes
117-
static constexpr uint32_t periodic_sync_interval = 0;
118-
static constexpr uint32_t prefetch_distance = 1;
117+
static constexpr uint32_t periodic_sync_interval =
118+
(arch_tag == gpu_arch::XeHpc) ? 8 : 0;
119+
static constexpr uint32_t prefetch_distance =
120+
(arch_tag == gpu_arch::XeHpc) ? 3 : 1;
119121
// should larger than 8
120122
static constexpr uint32_t k_stride = 32;
121123

examples/06_gemm_softmax/gemm_softmax.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,8 @@ void gemm_softmax_run(uint32_t iter) {
200200
static constexpr uint32_t prefetch_distance = 3;
201201
// should larger than 8
202202
static constexpr uint32_t k_iter_num = 16;
203+
static constexpr gpu_arch arch_tag = gpu_arch::XeHpc;
204+
//static constexpr gpu_arch arch_tag = gpu_arch::XeHpg;
203205

204206
// Step 1: define Micro-kernel's configuration
205207
using wg_shape = shape<wg_tile_n, wg_tile_m>;
@@ -227,7 +229,7 @@ void gemm_softmax_run(uint32_t iter) {
227229
data_type_sfx, // accumulator data type for intermediate results
228230
wg_shape, // computation tile shape
229231
k_iter_num, // elements in each iteration
230-
gpu_arch::XeHpc, // GPU arch
232+
arch_tag, // GPU arch
231233
tune_option>;
232234

233235
using gemm_args_t = gemm_op_t::arguments_t;
@@ -239,14 +241,14 @@ void gemm_softmax_run(uint32_t iter) {
239241
mem_space::global, // memory writing to global mem for C
240242
wg_shape, // computation tile shape
241243
k_iter_num, // elements in each iteration
242-
gpu_arch::XeHpc, // GPU arch
244+
arch_tag, // GPU arch
243245
tune_option>;
244246

245247
// using experimental::group::softmax
246248
// define softmax forward op
247249
using tile_shape = typename gemm_op_t::tile_shape;
248250
using softmax_fwd_t = softmax_t<
249-
softmax_policy_fwd<data_type_sfx, gpu_arch::XeHpc>,
251+
softmax_policy_fwd<data_type_sfx, arch_tag>,
250252
tile_shape>;
251253
using softmax_fwd_args_t = typename softmax_fwd_t::arguments_t;
252254

examples/08_scaled_dot_product_attention/softmax.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ struct xetla_softmax_fwd_t {
6060
using softmax_tile_desc_t = subgroup::
6161
tile_desc_t<SIMD, block_height, SIMD, block_height, reg_layout::tiled>;
6262
using softmax_load_t = subgroup::tile_t<dtype_in, softmax_tile_desc_t>;
63-
using mem_desc_in_t = mem_desc_t<dtype_in, mem_layout::row_major, mem_space_in>;
63+
using mem_desc_in_t =
64+
mem_desc_t<dtype_in, mem_layout::row_major, mem_space_in>;
6465
using softmax_load_payload_t = subgroup::mem_payload_t<
6566
mem_desc_in_t,
6667
softmax_tile_desc_t,

include/common/core/arch_config.hpp

Lines changed: 61 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ struct load_store_attr_t<msg_type::block_2d, gpu_arch::XeHpc> {
5050
static constexpr uint32_t special_prefetch_width_in_bytes = 64;
5151

5252
static constexpr uint32_t cache_line_size_in_bytes = 64;
53-
static constexpr uint32_t alignment_in_bytes = 8;
53+
static constexpr uint32_t alignment_in_bytes = 16;
5454
};
5555

5656
template <msg_type message_type, gpu_arch arg_tag>
@@ -72,7 +72,7 @@ struct client_load_store_attr_base_t {
7272
static constexpr uint32_t special_prefetch_width_in_bytes = 64;
7373

7474
static constexpr uint32_t cache_line_size_in_bytes = 64;
75-
static constexpr uint32_t alignment_in_bytes = 8;
75+
static constexpr uint32_t alignment_in_bytes = 4;
7676
};
7777

7878
template <>
@@ -94,15 +94,21 @@ inline constexpr bool arch_has_2d_load_store =
9494
template <gpu_arch arch_tag>
9595
struct load_store_attr_t<msg_type::block_1d, arch_tag> {
9696
static constexpr uint32_t max_load_vec_len = 256;
97+
static constexpr uint32_t max_aligned_load_vec_len = 256;
9798
static constexpr uint32_t max_store_vec_len = 256;
99+
static constexpr uint32_t max_aligned_store_vec_len = 256;
98100
static constexpr uint32_t max_prefetch_vec_len = 32;
101+
static constexpr uint32_t max_channel_num = 16;
99102
};
100103

101104
template <>
102105
struct load_store_attr_t<msg_type::block_1d, gpu_arch::XeHpc> {
103-
static constexpr uint32_t max_load_vec_len = 512;
104-
static constexpr uint32_t max_store_vec_len = 512;
106+
static constexpr uint32_t max_load_vec_len = 256;
107+
static constexpr uint32_t max_aligned_load_vec_len = 512;
108+
static constexpr uint32_t max_store_vec_len = 256;
109+
static constexpr uint32_t max_aligned_store_vec_len = 512;
105110
static constexpr uint32_t max_prefetch_vec_len = 64;
111+
static constexpr uint32_t max_channel_num = 32;
106112
};
107113

108114
struct dpas_attr_base_t {
@@ -112,6 +118,7 @@ struct dpas_attr_base_t {
112118
static constexpr uint32_t op_per_channel_bits = 32;
113119
static constexpr uint32_t op_per_channel_bytes = (op_per_channel_bits >> 3);
114120
static constexpr uint32_t op_per_channel_max = 8;
121+
static constexpr uint32_t k_in_bytes = systolic_depth * op_per_channel_bytes;
115122
};
116123

117124
template <gpu_arch arch_tag>
@@ -121,12 +128,12 @@ struct dpas_attr_t {
121128

122129
template <>
123130
struct dpas_attr_t<gpu_arch::XeHpc> : public dpas_attr_base_t {
124-
static constexpr uint32_t n_fixed_limit = 16;
131+
static constexpr uint32_t n_in_elem = 16;
125132
};
126133

127134
template <>
128135
struct dpas_attr_t<gpu_arch::XeHpg> : public dpas_attr_base_t {
129-
static constexpr uint32_t n_fixed_limit = 8;
136+
static constexpr uint32_t n_in_elem = 8;
130137
};
131138

132139
template <gpu_arch arch_tag>
@@ -140,9 +147,10 @@ struct fpu_attr_t {
140147
template <gpu_arch arch_tag>
141148
inline constexpr bool arch_has_fpu = fpu_attr_t<arch_tag>::has_fpu;
142149

143-
#define GRF grf_mode::double_grf
144150
#ifdef NORMAL_GRF
145151
#define GRF grf_mode::normal_grf
152+
#else
153+
#define GRF grf_mode::double_grf
146154
#endif
147155

148156
template <grf_mode grf_num_mode>
@@ -155,6 +163,7 @@ struct register_nums_t {
155163

156164
template <gpu_arch arch_tag>
157165
struct register_bytes_t;
166+
158167
template <>
159168
struct register_bytes_t<gpu_arch::XeHpc> {
160169
static constexpr uint32_t reg_in_bytes = 64;
@@ -180,24 +189,49 @@ struct register_attr_t {
180189
static constexpr uint32_t grf_in_bytes = register_nums * reg_in_bytes;
181190
};
182191

183-
template <gpu_arch arch_tag, uint32_t m, class enable = void>
192+
template <
193+
gpu_arch arch_tag,
194+
mma_engine engine_type,
195+
uint32_t m,
196+
class enable = void>
184197
struct mma_attr_t {};
185198

186199
template <gpu_arch arch_tag, uint32_t m>
187-
struct mma_attr_t<arch_tag, m, std::enable_if_t<arch_has_xmx<arch_tag>>> {
200+
struct mma_attr_t<
201+
arch_tag,
202+
mma_engine::xmx,
203+
m,
204+
std::enable_if_t<arch_has_xmx<arch_tag>>> {
188205
using dpas_attr = dpas_attr_t<arch_tag>;
206+
using load_store_attr = load_store_attr_t<msg_type::block_2d, arch_tag>;
189207
static constexpr uint32_t mma_m_in_elem =
190208
(m > dpas_attr::rcount_max) ? dpas_attr::rcount_max : m;
191-
static constexpr uint32_t mma_n_in_elem = dpas_attr::n_fixed_limit;
192-
static constexpr uint32_t mma_k_in_bytes =
193-
dpas_attr::systolic_depth * dpas_attr::op_per_channel_bytes;
209+
static constexpr uint32_t blk_m_in_elem = 16;
210+
211+
static constexpr uint32_t mma_n_in_elem = dpas_attr::n_in_elem;
212+
[[maybe_unused]] static constexpr uint32_t blk_n_in_bytes =
213+
load_store_attr::max_trans_load_width_in_bytes;
214+
215+
static constexpr uint32_t mma_k_in_bytes = dpas_attr::k_in_bytes;
216+
static constexpr uint32_t blk_k_in_bytes = mma_k_in_bytes;
194217
};
195218

196219
template <gpu_arch arch_tag, uint32_t m>
197-
struct mma_attr_t<arch_tag, m, std::enable_if_t<!arch_has_xmx<arch_tag>>> {
220+
struct mma_attr_t<
221+
arch_tag,
222+
mma_engine::fpu,
223+
m,
224+
std::enable_if_t<arch_has_fpu<arch_tag>>> {
225+
using load_store_attr = load_store_attr_t<msg_type::block_2d, arch_tag>;
198226
static constexpr uint32_t mma_m_in_elem = (m > 8) ? 8 : m;
199-
static constexpr uint32_t mma_n_in_elem = 16;
227+
static constexpr uint32_t blk_m_in_elem = 16;
228+
200229
static constexpr uint32_t mma_k_in_bytes = 32;
230+
static constexpr uint32_t blk_k_in_bytes = mma_k_in_bytes;
231+
232+
[[maybe_unused]] static constexpr uint32_t mma_n_in_elem = 16;
233+
static constexpr uint32_t blk_n_in_bytes =
234+
register_bytes_t<arch_tag>::reg_in_bytes;
201235
};
202236

203237
template <gpu_arch arch_tag>
@@ -208,43 +242,51 @@ struct arch_attr_t<gpu_arch::XeHpc> {
208242
template <msg_type message_type = msg_type::block_2d>
209243
using load_store_attr = load_store_attr_t<message_type, gpu_arch::XeHpc>;
210244

211-
template <grf_mode grf_num_mode = grf_mode::double_grf>
245+
template <grf_mode grf_num_mode = GRF>
212246
using register_attr = register_attr_t<grf_num_mode, gpu_arch::XeHpc>;
213247

214248
using dpas_attr = dpas_attr_t<gpu_arch::XeHpc>;
215249

216250
static constexpr uint32_t max_wg_num = 64;
217251
static constexpr uint32_t local_mem_size = 128 * 1024;
252+
static constexpr bool has_named_barrier = true;
218253
};
219254

220255
template <>
221256
struct arch_attr_t<gpu_arch::XeHpg> {
222257
template <msg_type message_type = msg_type::block_2d>
223258
using load_store_attr = load_store_attr_t<message_type, gpu_arch::XeHpg>;
224259

225-
template <grf_mode grf_num_mode = grf_mode::double_grf>
260+
template <grf_mode grf_num_mode = GRF>
226261
using register_attr = register_attr_t<grf_num_mode, gpu_arch::XeHpg>;
227262

228263
using dpas_attr = dpas_attr_t<gpu_arch::XeHpg>;
229264

230-
static constexpr uint32_t max_wg_num = 64;
265+
static constexpr uint32_t max_wg_num = 32;
231266
static constexpr uint32_t local_mem_size = 64 * 1024;
267+
268+
static constexpr bool has_named_barrier = false;
232269
};
233270

234271
template <>
235272
struct arch_attr_t<gpu_arch::XeLpg> {
236273
template <msg_type message_type = msg_type::block_2d>
237274
using load_store_attr = load_store_attr_t<message_type, gpu_arch::XeLpg>;
238275

239-
template <grf_mode grf_num_mode = grf_mode::double_grf>
276+
template <grf_mode grf_num_mode = GRF>
240277
using register_attr = register_attr_t<grf_num_mode, gpu_arch::XeLpg>;
241278

242279
using dpas_attr = dpas_attr_t<gpu_arch::XeLpg>;
243280

244-
static constexpr uint32_t max_wg_num = 64;
281+
static constexpr uint32_t max_wg_num = 32;
245282
static constexpr uint32_t local_mem_size = 64 * 1024;
283+
static constexpr bool has_named_barrier = false;
246284
};
247285

286+
template <gpu_arch arch_tag>
287+
inline constexpr bool arch_has_named_barrier =
288+
arch_attr_t<arch_tag>::has_named_barrier;
289+
248290
/// @} xetla_core_arch_config
249291

250292
} // namespace gpu::xetla

include/common/core/common_types.hpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,12 @@
2121
#include <cstdint>
2222

2323
namespace gpu::xetla {
24-
enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2 };
24+
enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2, XeLast };
25+
26+
template <gpu_arch arch_tag>
27+
inline constexpr bool valid_xe_arch_tag = (arch_tag < gpu_arch::XeLast);
28+
29+
enum class mma_engine : uint8_t { xmx = 0, fpu = 1 };
2530

2631
enum class grf_mode : uint8_t { normal = 0, double_grf = 1 };
2732

include/common/core/math_mma.hpp

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,7 @@ constexpr gpu::xetla::argument_type mma_argument_type<fp16>() {
7272
template <gpu::xetla::argument_type arg_type>
7373
constexpr __ESIMD_NS::xmx::dpas_argument_type get_argument_type() {
7474
static_assert(
75-
arg_type == gpu::xetla::argument_type::U1 ||
76-
arg_type == gpu::xetla::argument_type::S1 ||
77-
arg_type == gpu::xetla::argument_type::U2 ||
75+
arg_type == gpu::xetla::argument_type::U2 ||
7876
arg_type == gpu::xetla::argument_type::S2 ||
7977
arg_type == gpu::xetla::argument_type::U4 ||
8078
arg_type == gpu::xetla::argument_type::S4 ||
@@ -85,10 +83,6 @@ constexpr __ESIMD_NS::xmx::dpas_argument_type get_argument_type() {
8583
arg_type == gpu::xetla::argument_type::TF32,
8684
"Unsupported argument type");
8785
switch (arg_type) {
88-
case gpu::xetla::argument_type::U1:
89-
return __ESIMD_NS::xmx::dpas_argument_type::u1;
90-
case gpu::xetla::argument_type::S1:
91-
return __ESIMD_NS::xmx::dpas_argument_type::s1;
9286
case gpu::xetla::argument_type::U2:
9387
return __ESIMD_NS::xmx::dpas_argument_type::u2;
9488
case gpu::xetla::argument_type::S2:

include/common/core/memory.hpp

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -400,10 +400,6 @@ __XETLA_API xetla_vector<Ty, N * NElts> xetla_load_global(
400400
xetla_vector<Toffset, N> offsets,
401401
xetla_mask<N> pred = 1) {
402402
using T = native_type_t<Ty>;
403-
DEBUG_INVOKE(
404-
dbg_level::core,
405-
core::general_1d<gpu_arch::XeHpc, Ty>::
406-
template check_restriction<NElts, N>(offsets, (uint64_t)p));
407403

408404
return __ESIMD_ENS::lsc_gather<
409405
T,
@@ -666,10 +662,6 @@ __XETLA_API xetla_vector<Ty, N * NElts> xetla_load_local(
666662
xetla_vector<uint32_t, N> offsets,
667663
xetla_mask<N> pred = 1) {
668664
using T = native_type_t<Ty>;
669-
DEBUG_INVOKE(
670-
dbg_level::core,
671-
core::general_1d<gpu_arch::XeHpc, Ty>::
672-
template check_restriction<NElts, N>(offsets));
673665

674666
return __ESIMD_ENS::
675667
lsc_slm_gather<T, NElts, gpu::xetla::detail::get_data_size(DS), N>(
@@ -694,11 +686,6 @@ __XETLA_API xetla_vector<Ty, N * NElts> xetla_load_local(
694686
template <typename Ty, int NElts = 1, data_size DS = data_size::default_size>
695687
__XETLA_API xetla_vector<Ty, NElts> xetla_load_local(uint32_t offset) {
696688
using T = native_type_t<Ty>;
697-
// DEBUG_INVOKE(
698-
// dbg_level::core,
699-
// core::general_1d<gpu_arch::XeHpc, Ty>::template
700-
// check_restriction<NElts>(
701-
// (uint64_t)offset));
702689

703690
return __ESIMD_NS::slm_block_load<T, NElts>(offset);
704691
}
@@ -729,10 +716,6 @@ __XETLA_API void xetla_store_local(
729716
xetla_vector<Ty, N * NElts> vals,
730717
xetla_mask<N> pred = 1) {
731718
using T = native_type_t<Ty>;
732-
DEBUG_INVOKE(
733-
dbg_level::core,
734-
core::general_1d<gpu_arch::XeHpc, Ty>::
735-
template check_restriction<NElts, N, uint32_t>(offsets));
736719

737720
__ESIMD_ENS::
738721
lsc_slm_scatter<T, NElts, gpu::xetla::detail::get_data_size(DS), N>(

include/common/utils/common.hpp

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ constexpr uint32_t get_element_size_code() {
5151
enum class lsc_action : uint8_t { prefetch, load, store, atomic };
5252

5353
template <lsc_action Action, cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
54-
constexpr std::enable_if_t<arch_tag <= gpu_arch::XeHpc, void>
54+
constexpr std::enable_if_t<valid_xe_arch_tag<arch_tag>, void>
5555
check_lsc_cache_hint() {
5656
if constexpr (Action == lsc_action::prefetch) {
5757
// https://gfxspecs.intel.com/Predator/Home/Index/53560
@@ -94,7 +94,7 @@ check_lsc_cache_hint() {
9494
}
9595

9696
template <cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
97-
constexpr std::enable_if_t<arch_tag == gpu_arch::XeHpc, uint32_t>
97+
constexpr std::enable_if_t<arch_has_2d_load_store<arch_tag>, uint32_t>
9898
get_load_cache_hint_code() {
9999
check_lsc_cache_hint<lsc_action::load, L1H, L2H, arch_tag>();
100100
if (L1H == cache_hint::none && L2H == cache_hint::none) {
@@ -126,7 +126,7 @@ get_load_cache_hint_code() {
126126
}
127127

128128
template <cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
129-
constexpr std::enable_if_t<arch_tag == gpu_arch::XeHpc, uint32_t>
129+
constexpr std::enable_if_t<arch_has_2d_load_store<arch_tag>, uint32_t>
130130
get_prefetch_cache_hint_code() {
131131
check_lsc_cache_hint<lsc_action::prefetch, L1H, L2H, arch_tag>();
132132
if (L2H == cache_hint::uncached) {
@@ -153,7 +153,7 @@ get_prefetch_cache_hint_code() {
153153
}
154154

155155
template <cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
156-
constexpr std::enable_if_t<arch_tag <= gpu_arch::XeHpc, uint32_t>
156+
constexpr std::enable_if_t<arch_has_2d_load_store<arch_tag>, uint32_t>
157157
get_store_cache_hint_code() {
158158
check_lsc_cache_hint<lsc_action::store, L1H, L2H, arch_tag>();
159159
if (L1H == cache_hint::none && L2H == cache_hint::none) {
@@ -185,7 +185,7 @@ get_store_cache_hint_code() {
185185
}
186186

187187
template <cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
188-
constexpr std::enable_if_t<arch_tag == gpu_arch::XeHpc, uint32_t>
188+
constexpr std::enable_if_t<arch_has_2d_load_store<arch_tag>, uint32_t>
189189
get_atomic_cache_hint_code() {
190190
check_lsc_cache_hint<lsc_action::atomic, L1H, L2H, arch_tag>();
191191
if (L1H == cache_hint::none && L2H == cache_hint::none) {
@@ -286,7 +286,6 @@ enum class store_op : uint8_t {
286286
scattered_transpose = 3,
287287
block_1d = 4
288288
};
289-
enum class mma_engine : uint8_t { xmx = 0, fpu = 1 };
290289
// enum class trans_mode : uint8_t { none = 0, transpose = 1 };
291290
enum class memory_op : uint8_t { load = 0, store = 1 };
292291
enum class tdesc_update_dir : uint8_t { x_dir = 0, y_dir = 1 };

0 commit comments

Comments
 (0)