3636#include < helper_cuda.h>
3737#include < vector>
3838#include < chrono>
39+ #include < chrono>
3940#include < taskflow/sycl/syclflow.hpp>
4041
4142using Time = std::chrono::steady_clock;
@@ -55,7 +56,7 @@ void reduce(float *inputVec, double *outputVec, size_t inputSize,
5556 size_t outputSize, const sycl::nd_item<3 > &item_ct1,
5657 double *tmp) {
5758
58- auto cta = item_ct1.get_group ();
59+ sycl::group< 3 > cta = item_ct1.get_group ();
5960 size_t globaltid = item_ct1.get_group (2 ) * item_ct1.get_local_range (2 ) +
6061 item_ct1.get_local_id (2 );
6162
@@ -68,29 +69,27 @@ void reduce(float *inputVec, double *outputVec, size_t inputSize,
6869
6970 item_ct1.barrier ();
7071
71- sycl::sub_group tile_sg = item_ct1.get_sub_group ();
72+ sycl::sub_group tile32 = item_ct1.get_sub_group ();
7273
7374 double beta = temp_sum;
7475 double temp;
7576
76- for (int i = tile_sg .get_local_linear_range () / 2 ; i > 0 ;
77+ for (int i = tile32 .get_local_linear_range () / 2 ; i > 0 ;
7778 i >>= 1 ) {
78- if (tile_sg .get_local_linear_id () < i) {
79+ if (tile32 .get_local_linear_id () < i) {
7980 temp = tmp[item_ct1.get_local_linear_id () + i];
8081 beta += temp;
8182 tmp[item_ct1.get_local_linear_id ()] = beta;
8283 }
83- tile_sg. barrier ();
84- }
84+ }
85+
8586 item_ct1.barrier ();
8687
8788 if (item_ct1.get_local_linear_id () == 0 &&
8889 item_ct1.get_group (2 ) < outputSize) {
8990 beta = 0.0 ;
90- int cta_size = cta.get_local_linear_range ();
91-
92- for (int i = 0 ; i < cta_size;
93- i += tile_sg.get_local_linear_range ()) {
91+ for (int i = 0 ; i < item_ct1.get_group ().get_local_linear_range ();
92+ i += tile32.get_local_linear_range ()) {
9493 beta += tmp[i];
9594 }
9695 outputVec[item_ct1.get_group (2 )] = beta;
@@ -101,6 +100,7 @@ void reduceFinal(double *inputVec, double *result,
101100 size_t inputSize, const sycl::nd_item<3 > &item_ct1,
102101 double *tmp) {
103102
103+ sycl::group<3 > cta = item_ct1.get_group ();
104104 size_t globaltid = item_ct1.get_group (2 ) * item_ct1.get_local_range (2 ) +
105105 item_ct1.get_local_id (2 );
106106
@@ -113,7 +113,7 @@ void reduceFinal(double *inputVec, double *result,
113113
114114 item_ct1.barrier ();
115115
116- sycl::sub_group tile_sg = item_ct1.get_sub_group ();
116+ sycl::sub_group tile32 = item_ct1.get_sub_group ();
117117
118118 // do reduction in shared mem
119119 if ((item_ct1.get_local_range (2 ) >= 512 ) &&
@@ -145,11 +145,11 @@ void reduceFinal(double *inputVec, double *result,
145145 if (item_ct1.get_local_range (2 ) >= 64 ) temp_sum +=
146146 tmp[item_ct1.get_local_linear_id () + 32 ];
147147 // Reduce final warp using shuffle
148- for (int offset = tile_sg .get_local_linear_range () / 2 ;
148+ for (int offset =tile32 .get_local_linear_range () / 2 ;
149149 offset > 0 ; offset /= 2 ) {
150150 temp_sum +=
151- sycl::shift_group_left (tile_sg , temp_sum, offset);
152- }
151+ sycl::shift_group_left (tile32 , temp_sum, offset);
152+ }
153153 }
154154 // write result for this block to global mem
155155 if (item_ct1.get_local_linear_id () == 0 ) result[0 ] = temp_sum;
@@ -169,9 +169,8 @@ void myHostNodeCallback(void *data) {
169169 *result = 0.0 ; // reset the result
170170}
171171
172- void syclTaskFlowManual (float *inputVec_h, float *inputVec_d,
173- double *outputVec_d, double *result_d, size_t inputSize,
174- size_t numOfBlocks, sycl::queue q_ct1) {
172+ void syclTaskFlowManual (float *inputVec_h, float *inputVec_d, double *outputVec_d,
173+ double *result_d, size_t inputSize, size_t numOfBlocks, sycl::queue q_ct1) {
175174 tf::Taskflow tflow;
176175 tf::Executor exe;
177176
@@ -202,7 +201,9 @@ void syclTaskFlowManual(float *inputVec_h, float *inputVec_d,
202201 [[intel::reqd_sub_group_size (SUB_GRP_SIZE)]] {
203202 reduce (inputVec_d, outputVec_d, inputSize,
204203 numOfBlocks, item_ct1,
205- tmp.get_pointer ());
204+
205+ tmp.get_multi_ptr <sycl::access::decorated::no>()
206+ .get ());
206207 });
207208 }).name (" reduce_kernel" );
208209
@@ -222,7 +223,8 @@ void syclTaskFlowManual(float *inputVec_h, float *inputVec_d,
222223 [[intel::reqd_sub_group_size (SUB_GRP_SIZE)]] {
223224 reduceFinal (outputVec_d, result_d,
224225 numOfBlocks, item_ct1,
225- tmp.get_pointer ());
226+ tmp.get_multi_ptr <sycl::access::decorated::no>()
227+ .get ());
226228 });
227229 }).name (" reduceFinal_kernel" );
228230
@@ -259,7 +261,7 @@ void syclTaskFlowManual(float *inputVec_h, float *inputVec_d,
259261 " %zu\n " ,
260262 sf_Task + tf_Task);
261263
262- printf (" Cloned Graph Output.. \n " );
264+ printf (" Cloned Graph Output.. \n " );
263265 tf::Taskflow tflow_clone (std::move (tflow));
264266 exe.run_n (tflow_clone, GRAPH_LAUNCH_ITERATIONS).wait ();
265267}
@@ -293,11 +295,11 @@ int main(int argc, char **argv) {
293295
294296 auto startTimer1 = Time::now ();
295297 syclTaskFlowManual (inputVec_h, inputVec_d, outputVec_d, result_d, size,
296- maxBlocks, q_ct1);
298+ maxBlocks, q_ct1);
297299 auto stopTimer1 = Time::now ();
298300 auto Timer_duration1 =
299301 std::chrono::duration_cast<float_ms>(stopTimer1 - startTimer1).count ();
300- printf (" Elapsed Time of SYCL TaskFlow Manual : %f (ms)\n " , Timer_duration1);
302+ printf (" Elapsed Time of SYCL Taskflow Manual : %f (ms)\n " , Timer_duration1);
301303
302304 DPCT_CHECK_ERROR (sycl::free (inputVec_d, q_ct1));
303305 DPCT_CHECK_ERROR (sycl::free (outputVec_d, q_ct1));
0 commit comments