Skip to content
This repository was archived by the owner on Aug 30, 2024. It is now read-only.

Commit b259fba

Browse files
committed
update Test execution
1 parent 08a0253 commit b259fba

File tree

2 files changed

+86
-64
lines changed

2 files changed

+86
-64
lines changed

tests/utils/common.hpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ inline auto getTypeName<gpu::xetla::tf32>() {
9090
}
9191

9292
enum class test_result : uint8_t { complete = 0, skip = 1, fail = 2 };
93+
enum Direction { FWD = 0, BWDD = 1, BWDW = 2 };
9394

9495
template <typename result_type>
9596
inline result_type generate_real_random(
@@ -129,6 +130,28 @@ inline data_type* alloc_host(size_t size) {
129130
return host_ptr;
130131
}
131132

133+
/// Per-element initializer callback used by the alloc-and-init helpers:
/// invoked with the buffer pointer and a single element index.
/// NOTE: second parameter renamed from the misleading `elements` to `idx` —
/// every initializer defined below treats it as an element index
/// (`data[idx] = …`), not an element count. Parameter names inside a
/// function type are purely documentary, so the alias type is unchanged.
template <typename data_type>
using init_func_t = std::function<void(data_type* data, size_t idx)>;
135+
136+
/// Initializer that stores each element's own index as its value,
/// giving the buffer a deterministic 0, 1, 2, … ramp pattern.
template <typename data_type>
void index_init_func(data_type* data, size_t idx) {
  const auto value = static_cast<data_type>(idx);
  data[idx] = value;
}
140+
/// Initializer that intentionally does nothing — used when a buffer
/// (e.g. an output buffer) should be left untouched before the kernel
/// runs. Unnamed parameters make the no-op explicit without attributes.
template <typename data_type>
void no_init_func(data_type* /*data*/, size_t /*idx*/) {}
144+
145+
template <typename data_type>
146+
void rand_init_func(data_type* data, size_t idx) {
147+
data[idx] = static_cast<data_type>(random_float() - 0.5f);
148+
}
149+
150+
/// Initializer that zero-fills the buffer one element at a time.
template <typename data_type>
void zero_init_func(data_type* data, size_t idx) {
  data[idx] = static_cast<data_type>(0);
}
154+
132155
template <typename data_type>
133156
inline data_type* alloc_host_and_init(
134157
size_t size,

tests/utils/execution.hpp

Lines changed: 63 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*******************************************************************************
2-
* Copyright (c) 2022-2023 Intel Corporation
2+
* Copyright (c) 2022-2024 Intel Corporation
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -20,22 +20,26 @@
2020
#include <stdexcept>
2121
#include "common.hpp"
2222
#include "profiling.hpp"
23+
#ifdef _WIN32
24+
#include "windows_functions.hpp"
25+
#endif
2326
#include "xetla.hpp"
2427

2528
using namespace cl::sycl;
2629
using namespace gpu;
2730
using namespace gpu::xetla;
2831

29-
template <
30-
class Test,
31-
typename validate_func,
32-
typename KERNEL,
33-
int SLMSIZE = arch_attr_t<Test::gpu_arch>::local_mem_size,
34-
int BARNUM = 32>
35-
void gemm_exec(const std::string& compile_str, size_t batch = 1) {
32+
template <typename Test, typename validate_func, typename kernel_t>
33+
void gemm_exec(
34+
const std::string& compile_str,
35+
size_t batch = 1,
36+
size_t scaling = 1) {
3637
test_result result = test_result::complete;
3738

38-
using gemm_op_t = typename KERNEL::gemm_op_t;
39+
using gemm_op_t = typename kernel_t::gemm_op_t;
40+
41+
constexpr uint32_t slm_size = kernel_t::slm_size;
42+
constexpr uint32_t barrier_num = kernel_t::barrier_count;
3943

4044
using data_type_a = typename Test::data_type_a;
4145
using data_type_b = typename Test::data_type_b;
@@ -46,6 +50,12 @@ void gemm_exec(const std::string& compile_str, size_t batch = 1) {
4650
constexpr size_t matrix_n = Test::mat_n;
4751
constexpr size_t matrix_k = Test::mat_k;
4852

53+
[[maybe_unused]] constexpr size_t wg_tile_m = Test::wg_m;
54+
[[maybe_unused]] constexpr size_t wg_tile_n = Test::wg_n;
55+
[[maybe_unused]] constexpr size_t sg_tile_m = Test::sg_m;
56+
[[maybe_unused]] constexpr size_t sg_tile_n = Test::sg_n;
57+
[[maybe_unused]] constexpr size_t sg_tile_k = Test::sg_k;
58+
4959
size_t size_a = matrix_m * matrix_k;
5060
size_t size_b = matrix_k * matrix_n;
5161
size_t size_c = matrix_m * matrix_n;
@@ -59,16 +69,16 @@ void gemm_exec(const std::string& compile_str, size_t batch = 1) {
5969

6070
auto A = alloc_device_and_init<data_type_a>(
6171
batch * size_a,
62-
[](data_type_a* data, size_t idx) {
63-
data[idx] = static_cast<data_type_a>(random_float());
72+
[&scaling](data_type_a* data, size_t idx) {
73+
data[idx] = static_cast<data_type_a>(scaling * (random_float() - 0.5f));
6474
},
6575
queue,
6676
device,
6777
context);
6878
auto B = alloc_device_and_init<data_type_b>(
6979
batch * size_b,
70-
[](data_type_b* data, size_t idx) {
71-
data[idx] = static_cast<data_type_b>(random_float());
80+
[&scaling](data_type_b* data, size_t idx) {
81+
data[idx] = static_cast<data_type_b>(scaling * (random_float() - 0.5f));
7282
},
7383
queue,
7484
device,
@@ -81,6 +91,7 @@ void gemm_exec(const std::string& compile_str, size_t batch = 1) {
8191
queue,
8292
device,
8393
context);
94+
8495
size_t size_acc = gemm_op_t::get_acc_buf_size(matrix_m, matrix_n);
8596
size_t size_cnt = gemm_op_t::get_cnt_buf_size(matrix_m, matrix_n);
8697
auto Acc = alloc_device_and_init<data_type_acc>(
@@ -97,20 +108,20 @@ void gemm_exec(const std::string& compile_str, size_t batch = 1) {
97108
queue,
98109
device,
99110
context);
100-
101-
size_t ops = 2 * matrix_m * matrix_n * matrix_k;
111+
long ops = 2 * static_cast<long>(matrix_m) * matrix_n * matrix_k;
102112
profiling_helper prof("gemm", ops, "gflops");
103-
104113
try {
105114
std::vector<kernel_id> kernelId = {get_kernel_id<Test>()};
106115
auto inputBundle =
107116
get_kernel_bundle<bundle_state::input>(context, kernelId);
108-
static const std::string env_set_str =
109-
"SYCL_PROGRAM_COMPILE_OPTIONS=" + compile_str;
110-
putenv(const_cast<char*>(env_set_str.c_str()));
117+
char* value = getenv("GOGRITS");
118+
if (value == NULL || strcmp(value, "on") != 0) {
119+
setenv("SYCL_PROGRAM_COMPILE_OPTIONS", compile_str.c_str(), 1);
120+
}
111121
kernel_bundle<bundle_state::executable> exeBundle = build(inputBundle);
112-
static const std::string env_unset_str = "SYCL_PROGRAM_COMPILE_OPTIONS=";
113-
putenv(const_cast<char*>(env_unset_str.c_str()));
122+
if (value == NULL || strcmp(value, "on") != 0) {
123+
unsetenv("SYCL_PROGRAM_COMPILE_OPTIONS");
124+
}
114125

115126
using namespace gpu::xetla::group;
116127
using namespace gpu::xetla::kernel;
@@ -130,10 +141,10 @@ void gemm_exec(const std::string& compile_str, size_t batch = 1) {
130141
nullptr);
131142

132143
cl::sycl::nd_range<3> nd_range = gemm_op_t::get_nd_range(arg);
133-
134144
int constexpr warm_up = 10;
135145
int constexpr iters = 100;
136146
for (size_t i = 0; i < batch; i++) {
147+
prof.cpu_start();
137148
auto A_ptr = A + i * size_a;
138149
auto B_ptr = B + i * size_b;
139150
auto C_ptr = C + i * size_c;
@@ -157,11 +168,13 @@ void gemm_exec(const std::string& compile_str, size_t batch = 1) {
157168
prof.cpu_start();
158169
}
159170
auto e_esimd = queue.submit([&](handler& cgh) {
160-
cgh.use_kernel_bundle(exeBundle);
171+
if (value == NULL || strcmp(value, "on") != 0) {
172+
cgh.use_kernel_bundle(exeBundle);
173+
}
161174
cgh.parallel_for<Test>(nd_range, [=](nd_item<3> item) KERNEL_MAIN {
162-
gpu::xetla::xetla_local_init<SLMSIZE>();
163-
gpu::xetla::xetla_nbarrier_init<BARNUM>();
164-
KERNEL::run(
175+
gpu::xetla::xetla_local_init<slm_size>();
176+
gpu::xetla::xetla_nbarrier_init<barrier_num>();
177+
kernel_t::run(
165178
item,
166179
A_ptr,
167180
B_ptr,
@@ -184,9 +197,7 @@ void gemm_exec(const std::string& compile_str, size_t batch = 1) {
184197
std::cout << "SYCL exception caught: " << e.what() << '\n';
185198
result = test_result::fail;
186199
}
187-
188-
// performance
189-
prof.print_profiling_result(profiling_selector::GPU);
200+
unsetenv("SYCL_PROGRAM_COMPILE_OPTIONS");
190201
// validation
191202
if (result == test_result::complete) {
192203
validate_func vfunc;
@@ -204,61 +215,49 @@ void gemm_exec(const std::string& compile_str, size_t batch = 1) {
204215
} else if (result != test_result::complete) {
205216
FAIL();
206217
}
218+
prof.print_profiling_result(profiling_selector::GPU);
207219
}
208220

209221
/// @brief The template function to execute kernel in esimd way for unit test
210222
/// framework
211223
///
212224
/// @tparam data_type data_type The data type of buffer used in kernel and
213225
/// buffer allocation
214-
/// @tparam KERNEL the kernel function struct
226+
/// @tparam kernel_t the kernel function struct
215227
/// @param nd_range the range of workitems
216-
/// @param validate_result validation function, taking 3 parameters buffer A, B
217-
/// as input C as output
228+
/// @param validate_result validation function, taking 3 parameters buffer A,
229+
/// B as input C as output
218230
///
219231
template <
220232
typename data_type,
221-
class KERNEL,
222-
size_t SLMSIZE = 8 * 1024,
223-
size_t BARNUM = 32,
224-
size_t Size = 4096>
225-
void kernel_run(auto nd_range, auto validate_result) {
233+
typename kernel_t,
234+
size_t slm_size = 8 * 1024,
235+
size_t barrier_num = 32,
236+
size_t size = 4096>
237+
void kernel_run(
238+
auto nd_range,
239+
auto validate_result,
240+
init_func_t<data_type> init_func_a = index_init_func<data_type>,
241+
init_func_t<data_type> init_func_b = index_init_func<data_type>,
242+
init_func_t<data_type> init_func_c = no_init_func<data_type>) {
226243
queue queue{};
227244
auto context = queue.get_info<info::queue::context>();
228245
auto device = queue.get_info<info::queue::device>();
229246
std::cout << "Running on " << device.get_info<info::device::name>() << "\n";
230247

231248
auto A = alloc_device_and_init<data_type>(
232-
Size,
233-
[](data_type* data, size_t idx) {
234-
data[idx] = static_cast<data_type>(idx);
235-
},
236-
queue,
237-
device,
238-
context);
249+
size, init_func_a, queue, device, context);
239250
auto B = alloc_device_and_init<data_type>(
240-
Size,
241-
[](data_type* data, size_t idx) {
242-
data[idx] = static_cast<data_type>(idx);
243-
},
244-
queue,
245-
device,
246-
context);
251+
size, init_func_b, queue, device, context);
247252
auto C = alloc_device_and_init<data_type>(
248-
Size,
249-
[](data_type* data, size_t idx) {
250-
data[idx] = static_cast<data_type>(idx);
251-
},
252-
queue,
253-
device,
254-
context);
253+
size, init_func_c, queue, device, context);
255254

256255
try {
257256
auto e_esimd = queue.submit([&](handler& cgh) {
258257
cgh.parallel_for<>(nd_range, [=](nd_item<1> ndi) KERNEL_MAIN {
259-
gpu::xetla::xetla_local_init<SLMSIZE>();
260-
gpu::xetla::xetla_nbarrier_init<BARNUM>();
261-
KERNEL::run(&ndi, A, B, C);
258+
gpu::xetla::xetla_local_init<slm_size>();
259+
gpu::xetla::xetla_nbarrier_init<barrier_num>();
260+
kernel_t::run(&ndi, A, B, C);
262261
});
263262
});
264263
e_esimd.wait();
@@ -267,9 +266,9 @@ void kernel_run(auto nd_range, auto validate_result) {
267266
FAIL();
268267
}
269268

270-
auto A_host = alloc_host_and_copy<data_type>(A, Size, queue);
271-
auto B_host = alloc_host_and_copy<data_type>(B, Size, queue);
272-
auto C_host = alloc_host_and_copy<data_type>(C, Size, queue);
269+
auto A_host = alloc_host_and_copy<data_type>(A, size, queue);
270+
auto B_host = alloc_host_and_copy<data_type>(B, size, queue);
271+
auto C_host = alloc_host_and_copy<data_type>(C, size, queue);
273272

274273
ASSERT_EQ(0, validate_result(A_host, B_host, C_host));
275274

0 commit comments

Comments
 (0)