@@ -29,11 +29,10 @@ static const auto pi = 3.1415926535897932384626433832795;
2929static const auto seed = 7777 ;
3030
3131// Default Number of 2D points
32- static const auto n_samples = 120000000 ;
32+ static const auto n_samples = 120'000'000 ;
3333
3434double estimate_pi (sycl::queue& q, size_t n_points) {
3535 double estimated_pi; // Estimated value of Pi
36- size_t n_under_curve = 0 ; // Number of points fallen under the curve
3736
3837 // Step 1. Generate n_points * 2 random numbers
3938 // 1.1. Generator initialization
@@ -42,47 +41,35 @@ double estimate_pi(sycl::queue& q, size_t n_points) {
4241 // Create an object of distribution (by default float, a = 0.0f, b = 1.0f)
4342 mkl::rng::uniform distr;
4443
45- float * rng_ptr = sycl::malloc_device <float >(n_points * 2 , q);
44+ float * rng_ptr = sycl::malloc_shared <float >(n_points * 2 , q);
4645
4746 // 1.2. Random number generation
4847 auto event = mkl::rng::generate (distr, engine, n_points * 2 , rng_ptr);
4948
5049 // Step 2. Count points under curve (x ^ 2 + y ^ 2 < 1.0f)
51- size_t wg_size = std::min (q.get_device ().get_info <sycl::info::device::max_work_group_size>(), n_points);
52- size_t max_compute_units = q.get_device ().get_info <sycl::info::device::max_compute_units>();
53- size_t wg_num = (n_points > wg_size * max_compute_units) ? max_compute_units : 1 ;
54-
55- size_t count_per_thread = n_points / (wg_size * wg_num);
56-
57- size_t * count_ptr = sycl::malloc_shared<size_t >(wg_num, q);
58-
59- // Make sure, that generation is finished
60- event.wait_and_throw ();
61-
62- event = q.submit ([&] (sycl::handler& h) {
63- h.parallel_for (sycl::nd_range<1 >(wg_size * wg_num, wg_size),
64- [=](sycl::nd_item<1 > item) {
65- sycl::vec<float , 2 > r;
66- size_t count = 0 ;
67- for (int i = 0 ; i < count_per_thread; i++) {
68- r.load (i + item.get_global_linear_id () * count_per_thread, sycl::global_ptr<float >(rng_ptr));
69- if (sycl::length (r) <= 1 .0f ) {
70- count += 1 ;
71- }
72- }
73- count_ptr[item.get_group_linear_id ()] = sycl::reduce_over_group (item.get_group (), count, std::plus<size_t >());
74- });
75- });
76-
77- event.wait_and_throw ();
78-
79- n_under_curve = std::accumulate (count_ptr, count_ptr + wg_num, 0 );
50+ constexpr size_t count_per_thread = 32 ;
51+ size_t *n_under_curve = sycl::malloc_host<size_t >(1 , q); // Number of points fallen under the curve
52+ *n_under_curve = 0 ;
53+ auto reductor = sycl::reduction (n_under_curve, size_t (0 ), std::plus<size_t >{});
54+
55+ q.parallel_for (sycl::range<1 >(n_points / count_per_thread), event, reductor,
56+ [=](sycl::item<1 > item, auto & sum) {
57+ sycl::vec<float , 2 > r;
58+ size_t count = 0 ;
59+ for (int i = 0 ; i < count_per_thread; i++) {
60+ r.load (i + item.get_id (0 ) * count_per_thread, sycl::global_ptr<float >(rng_ptr));
61+ if (sycl::length (r) <= 1 .0f ) {
62+ count++;
63+ }
64+ }
65+ sum += count;
66+ }).wait_and_throw ();
8067
8168 // Step 3. Calculate approximated value of Pi
82- estimated_pi = n_under_curve / ((double )n_points) * 4.0 ;
69+ estimated_pi = * n_under_curve / ((double )n_points) * 4.0 ;
8370
8471 sycl::free (rng_ptr, q);
85- sycl::free (count_ptr , q);
72+ sycl::free (n_under_curve , q);
8673
8774 return estimated_pi;
8875
@@ -135,7 +122,7 @@ int main(int argc, char ** argv) {
135122 std::cout << " Absolute error = " << abs_error << std::endl;
136123 std::cout << std::endl;
137124
138- if (abs_error > 1.0e-3 ) {
125+ if (abs_error > 1.0e-4 ) {
139126 std::cout << " TEST FAILED" << std::endl;
140127 return 1 ;
141128 }
0 commit comments