@@ -234,6 +234,45 @@ namespace xsimd
234234 return _mm512_sub_epi64 (lhs, rhs);
235235 }
236236
237+ static batch_type sadd (const batch_type& lhs, const batch_type& rhs)
238+ {
239+ /* origin: /nsimd/include/nsimd/x86/avx512_knl/adds.h */
240+ /*
241+ * ====================================================
242+ * Copyright (c) 2019 Agenium Scale
243+ *
244+ * MIT License see https://github.com/agenium-scale/nsimd/blob/master/LICENSE
245+ * ====================================================
246+ */
247+ // todo bench againt unrroled loop
248+ // todo factorize int32_t uint32_t
249+ using ubatch_type = batch<uint64_t , 8 >;
250+ ubatch_type ux = (ubatch_type)(lhs);
251+ const ubatch_type uy = (ubatch_type)(rhs);
252+ const ubatch_type res = _mm512_add_epi64 (ux, uy);
253+
254+ const ubatch_type vmax = _mm512_set1_epi64 (std::numeric_limits<int64_t >::max ());
255+ const ubatch_type shr = _mm512_srl_epi64 (ux, _mm_set1_epi32 (sizeof (int64_t ) * std::numeric_limits<unsigned char >::digits));
256+ ux = _mm512_add_epi64 (shr, vmax);
257+
258+ const ubatch_type xor_ux_uy = _mm512_xor_si512 (ux, uy);
259+ const ubatch_type xor_uy_res = _mm512_xor_si512 (uy, res);
260+ const ubatch_type not_xor_uy_res = _mm512_andnot_si512 (xor_uy_res, _mm512_set1_epi8 (-1 ));
261+
262+ const ubatch_type u_orb = _mm512_or_si512 (xor_ux_uy, not_xor_uy_res);
263+ const batch_type i_orb = (batch_type)u_orb;
264+
265+ const batch_type zeros = _mm512_set1_epi64 (0 );
266+ __mmask8 gteq_to_zero = _mm512_cmp_epi64_mask (zeros, i_orb, _MM_CMPINT_NLT);
267+
268+ return _mm512_mask_blend_epi64 (gteq_to_zero, ux, res);
269+ }
270+
271+ static batch_type ssub (const batch_type& lhs, const batch_type& rhs)
272+ {
273+ return sadd (lhs, neg (rhs));
274+ }
275+
237276 static batch_type mul (const batch_type& lhs, const batch_type& rhs)
238277 {
239278 return _mm512_mullo_epi64 (lhs, rhs);
@@ -419,6 +458,64 @@ namespace xsimd
419458 {
420459 return rhs;
421460 }
461+
462+ static batch_type sadd (const batch_type& lhs, const batch_type& rhs)
463+ {
464+ /* origin: /nsimd/include/nsimd/x86/avx512_knl/adds.h */
465+ /*
466+ * ====================================================
467+ * Copyright (c) 2019 Agenium Scale
468+ *
469+ * MIT License see https://github.com/agenium-scale/nsimd/blob/master/LICENSE
470+ * ====================================================
471+ */
472+ // todo bench againt unrroled loop
473+ // todo factorize int32_t uint32_t
474+ using ubatch_type = batch<uint64_t , 8 >;
475+ ubatch_type ux = (ubatch_type)(lhs);
476+ const ubatch_type uy = (ubatch_type)(rhs);
477+ const ubatch_type res = _mm512_add_epi64 (ux, uy);
478+
479+ const ubatch_type vmax = _mm512_set1_epi64 (std::numeric_limits<uint64_t >::max ());
480+ const ubatch_type shr = _mm512_srl_epi64 (ux, _mm_set1_epi32 (sizeof (int64_t ) * std::numeric_limits<unsigned char >::digits));
481+ ux = _mm512_add_epi64 (shr, vmax);
482+
483+ const ubatch_type xor_ux_uy = _mm512_xor_si512 (ux, uy);
484+ const ubatch_type xor_uy_res = _mm512_xor_si512 (uy, res);
485+ const ubatch_type not_xor_uy_res = _mm512_andnot_si512 (xor_uy_res, _mm512_set1_epi8 (-1 ));
486+
487+ const ubatch_type u_orb = _mm512_or_si512 (xor_ux_uy, not_xor_uy_res);
488+ const batch_type i_orb = (batch_type)u_orb;
489+
490+ const batch_type zeros = _mm512_set1_epi64 (0 );
491+ __mmask8 gteq_to_zero = _mm512_cmp_epi64_mask (zeros, i_orb, _MM_CMPINT_NLT);
492+
493+ return _mm512_mask_blend_epi64 (gteq_to_zero, ux, res);
494+ }
495+
496+ static batch_type ssub (const batch_type& lhs, const batch_type& rhs)
497+ {
498+ using ubatch_type = batch<uint64_t , 8 >;
499+ ubatch_type ux = (ubatch_type)(lhs);
500+ const ubatch_type uy = (ubatch_type)(rhs);
501+ const ubatch_type res = _mm512_sub_epi64 (ux, uy);
502+
503+ const ubatch_type vmin = _mm512_set1_epi64 (std::numeric_limits<uint64_t >::lowest ());
504+ const ubatch_type shr = _mm512_srl_epi64 (ux, _mm_set1_epi32 (sizeof (int64_t ) * std::numeric_limits<unsigned char >::digits));
505+ ux = _mm512_sub_epi64 (shr, vmin);
506+
507+ const ubatch_type xor_ux_uy = _mm512_xor_si512 (ux, uy);
508+ const ubatch_type xor_uy_res = _mm512_xor_si512 (uy, res);
509+ const ubatch_type not_xor_uy_res = _mm512_andnot_si512 (xor_uy_res, _mm512_set1_epi8 (-1 ));
510+
511+ const ubatch_type u_orb = _mm512_or_si512 (xor_ux_uy, not_xor_uy_res);
512+ const batch_type i_orb = (batch_type)u_orb;
513+
514+ const batch_type zeros = _mm512_set1_epi64 (0 );
515+ __mmask8 gteq_to_zero = _mm512_cmp_epi64_mask (zeros, i_orb, _MM_CMPINT_NLT);
516+
517+ return _mm512_mask_blend_epi64 (gteq_to_zero, ux, res);
518+ }
422519 };
423520 }
424521
0 commit comments