@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright (c) 2022-2023 Intel Corporation
+ * Copyright (c) 2022-2024 Intel Corporation
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,22 +20,26 @@
 #include <stdexcept>
 #include "common.hpp"
 #include "profiling.hpp"
+#ifdef _WIN32
+#include "windows_functions.hpp"
+#endif
 #include "xetla.hpp"
 
 using namespace cl::sycl;
 using namespace gpu;
 using namespace gpu::xetla;
 
-template <
-    class Test,
-    typename validate_func,
-    typename KERNEL,
-    int SLMSIZE = arch_attr_t<Test::gpu_arch>::local_mem_size,
-    int BARNUM = 32>
-void gemm_exec(const std::string& compile_str, size_t batch = 1) {
+template <typename Test, typename validate_func, typename kernel_t>
+void gemm_exec(
+    const std::string& compile_str,
+    size_t batch = 1,
+    size_t scaling = 1) {
   test_result result = test_result::complete;
 
-  using gemm_op_t = typename KERNEL::gemm_op_t;
+  using gemm_op_t = typename kernel_t::gemm_op_t;
+
+  constexpr uint32_t slm_size = kernel_t::slm_size;
+  constexpr uint32_t barrier_num = kernel_t::barrier_count;
 
   using data_type_a = typename Test::data_type_a;
   using data_type_b = typename Test::data_type_b;
@@ -46,6 +50,12 @@ void gemm_exec(const std::string& compile_str, size_t batch = 1) {
   constexpr size_t matrix_n = Test::mat_n;
   constexpr size_t matrix_k = Test::mat_k;
 
+  [[maybe_unused]] constexpr size_t wg_tile_m = Test::wg_m;
+  [[maybe_unused]] constexpr size_t wg_tile_n = Test::wg_n;
+  [[maybe_unused]] constexpr size_t sg_tile_m = Test::sg_m;
+  [[maybe_unused]] constexpr size_t sg_tile_n = Test::sg_n;
+  [[maybe_unused]] constexpr size_t sg_tile_k = Test::sg_k;
+
   size_t size_a = matrix_m * matrix_k;
   size_t size_b = matrix_k * matrix_n;
   size_t size_c = matrix_m * matrix_n;
@@ -59,16 +69,16 @@ void gemm_exec(const std::string& compile_str, size_t batch = 1) {
 
   auto A = alloc_device_and_init<data_type_a>(
       batch * size_a,
-      [](data_type_a* data, size_t idx) {
-        data[idx] = static_cast<data_type_a>(random_float());
+      [&scaling](data_type_a* data, size_t idx) {
+        data[idx] = static_cast<data_type_a>(scaling * (random_float() - 0.5f));
       },
       queue,
       device,
       context);
   auto B = alloc_device_and_init<data_type_b>(
       batch * size_b,
-      [](data_type_b* data, size_t idx) {
-        data[idx] = static_cast<data_type_b>(random_float());
+      [&scaling](data_type_b* data, size_t idx) {
+        data[idx] = static_cast<data_type_b>(scaling * (random_float() - 0.5f));
       },
       queue,
       device,
@@ -81,6 +91,7 @@ void gemm_exec(const std::string& compile_str, size_t batch = 1) {
       queue,
       device,
       context);
+
   size_t size_acc = gemm_op_t::get_acc_buf_size(matrix_m, matrix_n);
   size_t size_cnt = gemm_op_t::get_cnt_buf_size(matrix_m, matrix_n);
   auto Acc = alloc_device_and_init<data_type_acc>(
@@ -97,20 +108,20 @@ void gemm_exec(const std::string& compile_str, size_t batch = 1) {
       queue,
       device,
       context);
-
-  size_t ops = 2 * matrix_m * matrix_n * matrix_k;
+  long ops = 2 * static_cast<long>(matrix_m) * matrix_n * matrix_k;
   profiling_helper prof("gemm", ops, "gflops");
-
   try {
     std::vector<kernel_id> kernelId = {get_kernel_id<Test>()};
     auto inputBundle =
         get_kernel_bundle<bundle_state::input>(context, kernelId);
-    static const std::string env_set_str =
-        "SYCL_PROGRAM_COMPILE_OPTIONS=" + compile_str;
-    putenv(const_cast<char*>(env_set_str.c_str()));
+    char* value = getenv("GOGRITS");
+    if (value == NULL || strcmp(value, "on") != 0) {
+      setenv("SYCL_PROGRAM_COMPILE_OPTIONS", compile_str.c_str(), 1);
+    }
     kernel_bundle<bundle_state::executable> exeBundle = build(inputBundle);
-    static const std::string env_unset_str = "SYCL_PROGRAM_COMPILE_OPTIONS=";
-    putenv(const_cast<char*>(env_unset_str.c_str()));
+    if (value == NULL || strcmp(value, "on") != 0) {
+      unsetenv("SYCL_PROGRAM_COMPILE_OPTIONS");
+    }
 
     using namespace gpu::xetla::group;
     using namespace gpu::xetla::kernel;
@@ -130,10 +141,10 @@ void gemm_exec(const std::string& compile_str, size_t batch = 1) {
         nullptr);
 
     cl::sycl::nd_range<3> nd_range = gemm_op_t::get_nd_range(arg);
-
     int constexpr warm_up = 10;
     int constexpr iters = 100;
     for (size_t i = 0; i < batch; i++) {
+      prof.cpu_start();
       auto A_ptr = A + i * size_a;
       auto B_ptr = B + i * size_b;
       auto C_ptr = C + i * size_c;
@@ -157,11 +168,13 @@ void gemm_exec(const std::string& compile_str, size_t batch = 1) {
           prof.cpu_start();
         }
         auto e_esimd = queue.submit([&](handler& cgh) {
-          cgh.use_kernel_bundle(exeBundle);
+          if (value == NULL || strcmp(value, "on") != 0) {
+            cgh.use_kernel_bundle(exeBundle);
+          }
           cgh.parallel_for<Test>(nd_range, [=](nd_item<3> item) KERNEL_MAIN {
-            gpu::xetla::xetla_local_init<SLMSIZE>();
-            gpu::xetla::xetla_nbarrier_init<BARNUM>();
-            KERNEL::run(
+            gpu::xetla::xetla_local_init<slm_size>();
+            gpu::xetla::xetla_nbarrier_init<barrier_num>();
+            kernel_t::run(
                 item,
                 A_ptr,
                 B_ptr,
@@ -184,9 +197,7 @@ void gemm_exec(const std::string& compile_str, size_t batch = 1) {
     std::cout << "SYCL exception caught: " << e.what() << '\n';
     result = test_result::fail;
   }
-
-  // performance
-  prof.print_profiling_result(profiling_selector::GPU);
+  unsetenv("SYCL_PROGRAM_COMPILE_OPTIONS");
   // validation
   if (result == test_result::complete) {
     validate_func vfunc;
@@ -204,61 +215,49 @@ void gemm_exec(const std::string& compile_str, size_t batch = 1) {
   } else if (result != test_result::complete) {
     FAIL();
   }
+  prof.print_profiling_result(profiling_selector::GPU);
 }
 
 /// @brief The template function to execute kernel in esimd way for unit test
 /// framework
 ///
 /// @tparam data_type data_type The data type of buffer used in kernel and
 /// buffer allocation
-/// @tparam KERNEL the kernel function struct
+/// @tparam kernel_t the kernel function struct
 /// @param nd_range the range of workitems
-/// @param validate_result validation function, taking 3 parameters buffer A, B
-/// as input C as output
+/// @param validate_result validation function, taking 3 parameters buffer A,
+/// B as input C as output
 ///
 template <
     typename data_type,
-    class KERNEL,
-    size_t SLMSIZE = 8 * 1024,
-    size_t BARNUM = 32,
-    size_t Size = 4096>
-void kernel_run(auto nd_range, auto validate_result) {
+    typename kernel_t,
+    size_t slm_size = 8 * 1024,
+    size_t barrier_num = 32,
+    size_t size = 4096>
+void kernel_run(
+    auto nd_range,
+    auto validate_result,
+    init_func_t<data_type> init_func_a = index_init_func<data_type>,
+    init_func_t<data_type> init_func_b = index_init_func<data_type>,
+    init_func_t<data_type> init_func_c = no_init_func<data_type>) {
   queue queue{};
   auto context = queue.get_info<info::queue::context>();
   auto device = queue.get_info<info::queue::device>();
   std::cout << "Running on " << device.get_info<info::device::name>() << "\n";
 
   auto A = alloc_device_and_init<data_type>(
-      Size,
-      [](data_type* data, size_t idx) {
-        data[idx] = static_cast<data_type>(idx);
-      },
-      queue,
-      device,
-      context);
+      size, init_func_a, queue, device, context);
   auto B = alloc_device_and_init<data_type>(
-      Size,
-      [](data_type* data, size_t idx) {
-        data[idx] = static_cast<data_type>(idx);
-      },
-      queue,
-      device,
-      context);
+      size, init_func_b, queue, device, context);
   auto C = alloc_device_and_init<data_type>(
-      Size,
-      [](data_type* data, size_t idx) {
-        data[idx] = static_cast<data_type>(idx);
-      },
-      queue,
-      device,
-      context);
+      size, init_func_c, queue, device, context);
 
   try {
     auto e_esimd = queue.submit([&](handler& cgh) {
       cgh.parallel_for<>(nd_range, [=](nd_item<1> ndi) KERNEL_MAIN {
-        gpu::xetla::xetla_local_init<SLMSIZE>();
-        gpu::xetla::xetla_nbarrier_init<BARNUM>();
-        KERNEL::run(&ndi, A, B, C);
+        gpu::xetla::xetla_local_init<slm_size>();
+        gpu::xetla::xetla_nbarrier_init<barrier_num>();
+        kernel_t::run(&ndi, A, B, C);
       });
     });
     e_esimd.wait();
@@ -267,9 +266,9 @@ void kernel_run(auto nd_range, auto validate_result) {
     FAIL();
   }
 
-  auto A_host = alloc_host_and_copy<data_type>(A, Size, queue);
-  auto B_host = alloc_host_and_copy<data_type>(B, Size, queue);
-  auto C_host = alloc_host_and_copy<data_type>(C, Size, queue);
+  auto A_host = alloc_host_and_copy<data_type>(A, size, queue);
+  auto B_host = alloc_host_and_copy<data_type>(B, size, queue);
+  auto C_host = alloc_host_and_copy<data_type>(C, size, queue);
 
   ASSERT_EQ(0, validate_result(A_host, B_host, C_host));
 
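
For reference, a call site for the updated gemm_exec interface might look like the sketch below. The test, validator, and kernel type names (gemm_basic_cfg_t, result_validate_t, gemm_kernel_t) and the compile string are illustrative assumptions, not identifiers from this change; only the <Test, validate_func, kernel_t> template order and the (compile_str, batch, scaling) parameter list are taken from the diff.

// Hypothetical usage sketch (not part of this commit): invoking the new
// gemm_exec signature from a gtest case. kernel_t is assumed to expose
// gemm_op_t, slm_size, and barrier_count, as the updated template requires.
TEST(gemm_exec_usage, basic) {
  // Assumed ESIMD build options; each test passes whatever string its kernel needs.
  std::string compile_str = "-vc-codegen";
  // batch = 1 matrix per buffer; scaling = 1 keeps inputs in (-0.5f, 0.5f),
  // matching the new scaling * (random_float() - 0.5f) initialization.
  gemm_exec<gemm_basic_cfg_t, result_validate_t, gemm_kernel_t>(
      compile_str, /*batch=*/1, /*scaling=*/1);
}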