@@ -52,26 +52,31 @@ inline void writePPM(const std::string &fileName,
5252
5353namespace xsimd {
5454
55- template <std:: size_t N >
56- inline batch<int , N > mandel (const batch_bool<float , N > &_active,
57- const batch<float , N > &c_re,
58- const batch<float , N > &c_im,
55+ template <class arch >
56+ inline batch<int , arch > mandel (const batch_bool<float , arch > &_active,
57+ const batch<float , arch > &c_re,
58+ const batch<float , arch > &c_im,
5959 int maxIters)
6060 {
61- batch<float , N> z_re = c_re;
62- batch<float , N> z_im = c_im;
63- batch<int , N> vi (0 );
61+ using float_batch_type = batch<float , arch>;
62+ using int_batch_type = batch<int , arch>;
63+
64+ constexpr std::size_t N = float_batch_type::size;
65+
66+ float_batch_type z_re = c_re;
67+ float_batch_type z_im = c_im;
68+ int_batch_type vi (0 );
6469
6570 for (int i = 0 ; i < maxIters; ++i)
6671 {
67- auto active = _active & ((z_re * z_re + z_im * z_im) <= batch< float , N> (4 .f ));
72+ auto active = _active & ((z_re * z_re + z_im * z_im) <= float_batch_type (4 .f ));
6873 if (!xsimd::any (active))
6974 {
7075 break ;
7176 }
7277
73- batch< float , N> new_re = z_re * z_re - z_im * z_im;
74- batch< float , N> new_im = 2 .f * z_re * z_im;
78+ float_batch_type new_re = z_re * z_re - z_im * z_im;
79+ float_batch_type new_im = 2 .f * z_re * z_im;
7580
7681 z_re = c_re + new_re;
7782 z_im = c_im + new_im;
@@ -82,7 +87,7 @@ namespace xsimd {
8287 return vi;
8388 }
8489
85- template <std:: size_t N >
90+ template <class arch >
8691 void mandelbrot (float x0,
8792 float y0,
8893 float x1,
@@ -92,29 +97,35 @@ namespace xsimd {
9297 int maxIters,
9398 int output[])
9499 {
100+ using float_batch_type = batch<float , arch>;
101+ using int_batch_type = batch<int , arch>;
102+
103+ constexpr std::size_t N = float_batch_type::size;
95104 float dx = (x1 - x0) / width;
96105 float dy = (y1 - y0) / height;
97106
98107 float arange[N];
99108 std::iota (&arange[0 ], &arange[N], 0 .f );
100- batch<float , N> programIndex (&arange[0 ], xsimd::aligned_mode ());
109+ // float_batch_type programIndex(&arange[0], xsimd::aligned_mode());
110+
111+ auto programIndex = float_batch_type::load (&arange[0 ], xsimd::aligned_mode ());
101112 // std::iota(programIndex.begin(), programIndex.end(), 0.f);
102113
103114 for (int j = 0 ; j < height; j++)
104115 {
105116 for (int i = 0 ; i < width; i += N)
106117 {
107- batch< float , N> x (x0 + (i + programIndex) * dx);
108- batch< float , N> y (y0 + j * dy);
118+ float_batch_type x (x0 + (i + programIndex) * dx);
119+ float_batch_type y (y0 + j * dy);
109120
110- auto active = x < batch< float , N> (width);
121+ auto active = x < float_batch_type (width);
111122
112123 int base_index = (j * width + i);
113- auto result = mandel (active, x, y, maxIters);
124+ auto result = mandel<arch> (active, x, y, maxIters);
114125
115126 // implement masked store!
116127 // xsimd::store_aligned(result, output + base_index, active);
117- batch< int , N> prev_data (output + base_index);
128+ int_batch_type prev_data (output + base_index);
118129 select (bool_cast (active), result, prev_data)
119130 .store_aligned (output + base_index);
120131 }
@@ -217,6 +228,60 @@ namespace scalar {
217228
218229} // namespace scalar
219230
231+
232+
233+ // run simd version of mandelbrot benchmark for a specific arch
234+ template <class arch , class bencher_t >
235+ void run_arch (
236+ bencher_t & bencher,
237+ float x0,
238+ float y0,
239+ float x1,
240+ float y1,
241+ int width,
242+ int height,
243+ int maxIters,
244+ std::vector<int , xsimd::aligned_allocator<int >> & buffer)
245+ {
246+ std::fill (buffer.begin (), buffer.end (), 0 );
247+ auto stats = bencher ([&]() {
248+ xsimd::mandelbrot<arch>(x0, y0, x1, y1, width, height, maxIters, buffer.data ());
249+ });
250+
251+ const float scalar_min = stats.min ().count ();
252+
253+ std::cout << ' \n ' << arch::name () <<" " << stats << ' \n ' ;
254+ auto filename = std::string (" mandelbrot_" ) + std::string (arch::name ()) + std::string (" .ppm" );
255+ writePPM (filename.c_str (), width, height, buffer.data ());
256+
257+ }
258+
259+ template <class T >
260+ struct run_archlist ;
261+
262+ // run simd version of mandelbrot benchmark for a list
263+ // of archs
264+ template <class ... Arch>
265+ struct run_archlist <xsimd::arch_list<Arch ...>>
266+ {
267+ template <class bencher_t >
268+ static void run (
269+ bencher_t & bencher,
270+ float x0,
271+ float y0,
272+ float x1,
273+ float y1,
274+ int width,
275+ int height,
276+ int maxIters,
277+ std::vector<int , xsimd::aligned_allocator<int >> & buffer)
278+ {
279+ using expand_type = int [];
280+ expand_type{(run_arch<Arch>(bencher, x0, y0,x1,x1,width,height, maxIters, buffer),0 )...};
281+ }
282+ };
283+
284+
220285int main ()
221286{
222287 using namespace std ::chrono;
@@ -263,199 +328,7 @@ int main()
263328
264329 writePPM (" mandelbrot_omp.ppm" , width, height, buf.data ());
265330
266- // xsimd_1 run //////////////////////////////////////////////////////////////
267-
268- std::fill (buf.begin (), buf.end (), 0 );
269-
270- auto stats_1 = bencher ([&]() {
271- xsimd::mandelbrot<1 >(x0, y0, x1, y1, width, height, maxIters, buf.data ());
272- });
273-
274- const float xsimd1_min = stats_1.min ().count ();
275-
276- std::cout << ' \n ' << " xsimd_1 " << stats_1 << ' \n ' ;
277-
278- writePPM (" mandelbrot_xsimd1.ppm" , width, height, buf.data ());
279-
280- // xsimd_4 run //////////////////////////////////////////////////////////////
281-
282- std::fill (buf.begin (), buf.end (), 0 );
283-
284- auto stats_4 = bencher ([&]() {
285- xsimd::mandelbrot<4 >(x0, y0, x1, y1, width, height, maxIters, buf.data ());
286- });
287-
288- const float xsimd4_min = stats_4.min ().count ();
289-
290- std::cout << ' \n ' << " xsimd_4 " << stats_4 << ' \n ' ;
291-
292- writePPM (" mandelbrot_xsimd4.ppm" , width, height, buf.data ());
293-
294- // xsimd_8 run //////////////////////////////////////////////////////////////
295-
296- std::fill (buf.begin (), buf.end (), 0 );
297-
298- auto stats_8 = bencher ([&]() {
299- xsimd::mandelbrot<8 >(x0, y0, x1, y1, width, height, maxIters, buf.data ());
300- });
301-
302- const float xsimd8_min = stats_8.min ().count ();
303-
304- std::cout << ' \n ' << " xsimd_8 " << stats_8 << ' \n ' ;
305-
306- writePPM (" mandelbrot_xsimd8.ppm" , width, height, buf.data ());
307-
308- // xsimd_16 run /////////////////////////////////////////////////////////////
309-
310- std::fill (buf.begin (), buf.end (), 0 );
311-
312- auto stats_16 = bencher ([&]() {
313- xsimd::mandelbrot<16 >(x0, y0, x1, y1, width, height, maxIters, buf.data ());
314- });
315-
316- const float xsimd16_min = stats_16.min ().count ();
317-
318- std::cout << ' \n ' << " xsimd_16 " << stats_16 << ' \n ' ;
319-
320- writePPM (" mandelbrot_xsimd16.ppm" , width, height, buf.data ());
321-
322- // conclusions //////////////////////////////////////////////////////////////
323-
324- std::cout << ' \n ' << " Conclusions: " << ' \n ' ;
325-
326- // scalar //
327-
328- std::cout << ' \n '
329- << " --> scalar was " << omp_min / scalar_min
330- << " x the speed of omp" ;
331-
332- std::cout << ' \n '
333- << " --> scalar was " << xsimd1_min / scalar_min
334- << " x the speed of xsimd_1" ;
335-
336- std::cout << ' \n '
337- << " --> scalar was " << xsimd4_min / scalar_min
338- << " x the speed of xsimd_4" ;
339-
340- std::cout << ' \n '
341- << " --> scalar was " << xsimd8_min / scalar_min
342- << " x the speed of xsimd_8" ;
343-
344- std::cout << ' \n '
345- << " --> scalar was " << xsimd16_min / scalar_min
346- << " x the speed of xsimd_16" << ' \n ' ;
347-
348- // omp //
349-
350- std::cout << ' \n '
351- << " --> omp was " << scalar_min / omp_min
352- << " x the speed of scalar" ;
353-
354- std::cout << ' \n '
355- << " --> omp was " << xsimd1_min / omp_min
356- << " x the speed of xsimd_1" ;
357-
358- std::cout << ' \n '
359- << " --> omp was " << xsimd4_min / omp_min
360- << " x the speed of xsimd_4" ;
361-
362- std::cout << ' \n '
363- << " --> omp was " << xsimd8_min / omp_min
364- << " x the speed of xsimd_8" ;
365-
366- std::cout << ' \n '
367- << " --> omp was " << xsimd16_min / omp_min
368- << " x the speed of xsimd_16" << ' \n ' ;
369-
370- // xsimd1 //
371-
372- std::cout << ' \n '
373- << " --> xsimd1 was " << scalar_min / xsimd1_min
374- << " x the speed of scalar" ;
375-
376- std::cout << ' \n '
377- << " --> xsimd1 was " << omp_min / xsimd1_min
378- << " x the speed of omp" ;
379-
380- std::cout << ' \n '
381- << " --> xsimd1 was " << xsimd4_min / xsimd1_min
382- << " x the speed of xsimd_4" ;
383-
384- std::cout << ' \n '
385- << " --> xsimd1 was " << xsimd8_min / xsimd1_min
386- << " x the speed of xsimd_8" ;
387-
388- std::cout << ' \n '
389- << " --> xsimd1 was " << xsimd16_min / xsimd1_min
390- << " x the speed of xsimd_16" << ' \n ' ;
391-
392- // xsimd4 //
393-
394- std::cout << ' \n '
395- << " --> xsimd4 was " << scalar_min / xsimd4_min
396- << " x the speed of scalar" ;
397-
398- std::cout << ' \n '
399- << " --> xsimd4 was " << omp_min / xsimd4_min
400- << " x the speed of omp" ;
401-
402- std::cout << ' \n '
403- << " --> xsimd4 was " << xsimd1_min / xsimd4_min
404- << " x the speed of xsimd_1" ;
405-
406- std::cout << ' \n '
407- << " --> xsimd4 was " << xsimd8_min / xsimd4_min
408- << " x the speed of xsimd_8" ;
409-
410- std::cout << ' \n '
411- << " --> xsimd4 was " << xsimd16_min / xsimd4_min
412- << " x the speed of xsimd_16" << ' \n ' ;
413-
414- // xsimd8 //
415-
416- std::cout << ' \n '
417- << " --> xsimd8 was " << scalar_min / xsimd8_min
418- << " x the speed of scalar" ;
419-
420- std::cout << ' \n '
421- << " --> xsimd8 was " << omp_min / xsimd8_min
422- << " x the speed of omp" ;
423-
424- std::cout << ' \n '
425- << " --> xsimd8 was " << xsimd1_min / xsimd8_min
426- << " x the speed of xsimd_1" ;
427-
428- std::cout << ' \n '
429- << " --> xsimd8 was " << xsimd4_min / xsimd8_min
430- << " x the speed of xsimd_4" ;
431-
432- std::cout << ' \n '
433- << " --> xsimd8 was " << xsimd16_min / xsimd8_min
434- << " x the speed of xsimd_16" << ' \n ' ;
435-
436- // xsimd16 //
437-
438- std::cout << ' \n '
439- << " --> xsimd16 was " << scalar_min / xsimd16_min
440- << " x the speed of scalar" ;
441-
442- std::cout << ' \n '
443- << " --> xsimd16 was " << omp_min / xsimd16_min
444- << " x the speed of omp" ;
445-
446- std::cout << ' \n '
447- << " --> xsimd16 was " << xsimd1_min / xsimd16_min
448- << " x the speed of xsimd_1" ;
449-
450- std::cout << ' \n '
451- << " --> xsimd16 was " << xsimd4_min / xsimd16_min
452- << " x the speed of xsimd_4" ;
453-
454- std::cout << ' \n '
455- << " --> xsimd16 was " << xsimd8_min / xsimd16_min
456- << " x the speed of xsimd_8" << ' \n ' ;
457-
458- std::cout << ' \n ' << " wrote output images to 'mandelbrot_[type].ppm'" << ' \n ' ;
331+ run_archlist<xsimd::supported_architectures>::run (bencher, x0, y0, x1, y1, width, height, maxIters, buf);
459332
460333 return 0 ;
461334}
0 commit comments