Skip to content
This repository was archived by the owner on Aug 30, 2024. It is now read-only.

Commit f738dc7

Browse files
committed
fix sw_barrier
1 parent 5ad47ef commit f738dc7

File tree

8 files changed

+25
-15
lines changed

8 files changed

+25
-15
lines changed

examples/09_gate_recurrent_unit/kernel_func.hpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ struct fused_config_t {
8686
{start_x_b, start_y_b}); \
8787
gemm_args.init(mem_desc_a, mem_desc_b, inner_loop_count_##id); \
8888
op(g, matAcc_##acc_id, gemm_args); \
89-
SW_BARRIER();
89+
sw_barrier();
9090

9191
#define MATC_STORE(ptr_c) \
9292
mem_desc_c.init( \
@@ -229,7 +229,7 @@ struct gru_layer {
229229
int start_n = (j)*wg_tile_n;
230230
CONFIG_SETTING(batch_size, -1, hidden_size);
231231
matAcc_0.init(0);
232-
SW_BARRIER();
232+
sw_barrier();
233233

234234
// calculate reset gate: r_t = \sigmoid(X_t x W_ir + h_{t - 1} x W_hr)
235235
// acc0 = X_t x W_ir
@@ -278,19 +278,19 @@ struct gru_layer {
278278
matAcc_0.reg = matAcc_0.reg * (1 - matAcc_1.reg) +
279279
matAcc_1.reg *
280280
xetla_cvt<Act_T, T, matAcc_t::tile_elems>(mat_hidden.reg);
281-
SW_BARRIER();
281+
sw_barrier();
282282

283283
if (seq_id == seq_len - 1) {
284284
MATC_STORE(args->layer_output);
285-
SW_BARRIER();
285+
sw_barrier();
286286
__esimd_barrier();
287287
}
288288
MATC_STORE(args->cell_out_ptr + seq_id * io_size);
289-
SW_BARRIER();
289+
sw_barrier();
290290
__esimd_barrier();
291291

292292
MATC_STORE(args->one_cell_ptr + (seq_id % 2) * io_size);
293-
SW_BARRIER();
293+
sw_barrier();
294294
__esimd_barrier();
295295
}
296296
args->hx_ptr = args->one_cell_ptr + (seq_id % 2) * io_size;
@@ -386,7 +386,7 @@ struct kernel_xcoder_gru_fusion {
386386
args.W_hz_ptr = (W_hz_ptr);
387387
args.W_in_ptr = (W_in_ptr);
388388
args.W_hn_ptr = (W_hn_ptr);
389-
SW_BARRIER();
389+
sw_barrier();
390390
fused_op::call(item, &args);
391391
ping = (ping + 1) % 2;
392392
pong = (pong + 1) % 2;
@@ -411,7 +411,7 @@ struct kernel_xcoder_gru_fusion {
411411
? hidden_out_ptr
412412
: (ping_pong_buffer + ping * one_layer_size);
413413
args.layer_ptr = ((ping_pong_buffer + pong * one_layer_size));
414-
SW_BARRIER();
414+
sw_barrier();
415415
fused_op::call(item, &args);
416416
ping = (ping + 1) % 2;
417417
pong = (pong + 1) % 2;

include/common/core/barrier.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,16 @@ namespace gpu::xetla {
2626
/// @addtogroup xetla_core_barrier
2727
/// @{
2828

29+
/// sw_barrier, insert software scheduling barrier, for better code control
30+
///
31+
32+
void sw_barrier() {
33+
#if __INTEL_LLVM_COMPILER >= 20250000
34+
#else
35+
__ESIMD_NS::fence<__ESIMD_NS::fence_mask::sw_barrier>();
36+
#endif
37+
}
38+
2939
/// @brief Initialize the number of named barrier index for a kernel.
3040
/// Available only on PVC. Only need to initialize once at the beginning.
3141
///

tests/integration/mlp/int4/int4_mlp_gate_mul_up_fwd.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ class global_sum_reduce_two_mat_t {
188188
mat_zero, matAcc1_payload);
189189
subgroup::tile_store<cache_hint::uncached, cache_hint::write_back>(
190190
mat_zero, matAcc2_payload);
191-
SW_BARRIER();
191+
sw_barrier();
192192
}
193193
}
194194
};

tests/integration/vector_add/int32_1d/kernel_func.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ KERNEL_FUNC inline void vector_add_func(
3636
/// use block prefetch for b
3737
xetla_prefetch_global<dtype, SIMD, cache_hint::cached, cache_hint::cached>(
3838
b, offset);
39-
SW_BARRIER();
39+
sw_barrier();
4040
/// use scattered load for a
4141
xetla_vector<dtype, SIMD> ivector1 = xetla_load_global<
4242
dtype,

tests/integration/vector_add/tf32_1d/kernel_func.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ KERNEL_FUNC inline void vector_add_func(
3636
/// use block prefetch for b
3737
xetla_prefetch_global<dtype, SIMD, cache_hint::cached, cache_hint::cached>(
3838
b, offset);
39-
SW_BARRIER();
39+
sw_barrier();
4040
/// use scattered load for a
4141
xetla_vector<dtype, SIMD> ivector1 = xetla_load_global<
4242
dtype,

tests/unit/block_load_store/kernel_func.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ struct block_load_store_func {
4444
cache_hint::cached,
4545
cache_hint::cached,
4646
arch_tag>(src_tdesc);
47-
SW_BARRIER();
47+
sw_barrier();
4848
xetla_vector<dtype, bwidth* bheight> A_load_vec = xetla_tload_global<
4949
dtype,
5050
bwidth * bheight,

tests/unit/tile_load_store/kernel_func.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ struct tile_load_store_atomic_func {
242242
matBias.reg = matA.reg;
243243
matA.reg = 0;
244244
tile_store(matA, payload_store);
245-
SW_BARRIER();
245+
sw_barrier();
246246
tile_store(matBias, payload_store_add, check_tag);
247247
}
248248
};

tests/unit/tile_mma/kernel_func.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,9 @@ struct tile_mma_func {
103103
matA, matA_payload);
104104
subgroup::tile_load<cache_hint::cached, cache_hint::cached>(
105105
matB, matB_payload);
106-
SW_BARRIER();
106+
sw_barrier();
107107
tile_mma::mma(matAcc, matAcc, matB, matA);
108-
SW_BARRIER();
108+
sw_barrier();
109109
matC.reg = xetla_cvt<dtypeC, dtypeAcc, matAcc_t::tile_desc::tile_elems>(
110110
matAcc.reg);
111111
matC_payload.init(c, n, m, n, 0, 0);

0 commit comments

Comments
 (0)