@@ -29,7 +29,7 @@ static const auto pi = 3.1415926535897932384626433832795;
2929static const auto seed = 7777 ;
3030
3131// Default Number of 2D points
32- static const auto n_samples = 120000000 ;
32+ static const auto n_samples = 120'000'000 ;
3333
3434double estimate_pi (sycl::queue& q, size_t n_points) {
3535 double estimated_pi; // Estimated value of Pi
@@ -48,37 +48,30 @@ double estimate_pi(sycl::queue& q, size_t n_points) {
4848 mkl::rng::generate (distr, engine, n_points * 2 , rng_buf);
4949
5050 // Step 2. Count points under curve (x * x + y * y <= 1.0f, i.e. sycl::length(r) <= 1.0f)
51- size_t wg_size = std::min (q.get_device ().get_info <sycl::info::device::max_work_group_size>(), n_points);
52- size_t max_compute_units = q.get_device ().get_info <sycl::info::device::max_compute_units>();
53- size_t wg_num = (n_points > wg_size * max_compute_units) ? max_compute_units : 1 ;
54-
55- size_t count_per_thread = n_points / (wg_size * wg_num);
56-
57- std::vector<size_t > count (wg_num);
51+ size_t count_per_thread = 32 ;
5852
5953 {
60- sycl::buffer<size_t , 1 > count_buf (count) ;
54+ sycl::buffer<size_t > count_buf{ &n_under_curve , 1 } ;
6155
6256 q.submit ([&] (sycl::handler& h) {
6357 auto rng_acc = rng_buf.template get_access <sycl::access::mode::read>(h);
64- auto count_acc = count_buf.template get_access <sycl::access::mode::write>(h);
65- h.parallel_for (sycl::nd_range<1 >(wg_size * wg_num, wg_size),
66- [=](sycl::nd_item<1 > item) {
67- sycl::vec<float , 2 > r;
68- size_t count = 0 ;
69- for (int i = 0 ; i < count_per_thread; i++) {
70- r.load (i + item.get_global_linear_id () * count_per_thread, rng_acc.template get_multi_ptr <sycl::access::decorated::yes>());
71- if (sycl::length (r) <= 1 .0f ) {
72- count += 1 ;
58+ auto reductor = sycl::reduction (count_buf, h, size_t (0 ), std::plus<size_t >());
59+
60+ h.parallel_for (sycl::range<1 >(n_points / count_per_thread), reductor,
61+ [=](sycl::item<1 > item, auto & sum) {
62+ sycl::vec<float , 2 > r;
63+ size_t count = 0 ;
64+ for (int i = 0 ; i < count_per_thread; i++) {
65+ r.load (i + item.get_id (0 ) * count_per_thread, rng_acc.template get_multi_ptr <sycl::access::decorated::yes>());
66+ if (sycl::length (r) <= 1 .0f ) {
67+ count++;
68+ }
7369 }
74- }
75- count_acc[item.get_group_linear_id ()] = sycl::reduce_over_group (item.get_group (), count, std::plus<size_t >());
70+ sum += count;
7671 });
7772 });
7873 }
7974
80- n_under_curve = std::accumulate (count.begin (), count.end (), 0 );
81-
8275 // Step 3. Calculate approximated value of Pi
8376 estimated_pi = n_under_curve / ((double )n_points) * 4.0 ;
8477 return estimated_pi;
@@ -132,7 +125,7 @@ int main(int argc, char ** argv) {
132125 std::cout << " Absolute error = " << abs_error << std::endl;
133126 std::cout << std::endl;
134127
135- if (abs_error > 1.0e-3 ) {
128+ if (abs_error > 1.0e-4 ) {
136129 std::cout << " TEST FAILED" << std::endl;
137130 return 1 ;
138131 }
0 commit comments