@@ -178,36 +178,10 @@ namespace xsimd
178178
179179 static batch_type sadd (const batch_type& lhs, const batch_type& rhs)
180180 {
181- /* origin: /nsimd/include/nsimd/x86/avx512_knl/adds.h */
182- /*
183- * ====================================================
184- * Copyright (c) 2019 Agenium Scale
185- *
186- * MIT License see https://github.com/agenium-scale/nsimd/blob/master/LICENSE
187- * ====================================================
188- */
189- // todo bench againt unrroled loop
190- // todo factorize int32_t uint32_t
191- using ubatch_type = batch<uint32_t , 16 >;
192- ubatch_type ux = (ubatch_type)(lhs);
193- const ubatch_type uy = (ubatch_type)(rhs);
194- const ubatch_type res = _mm512_add_epi32 (ux, uy);
195-
196- const ubatch_type vmax = _mm512_set1_epi32 (std::numeric_limits<int32_t >::max ());
197- const ubatch_type shr = _mm512_srl_epi32 (ux, _mm_set1_epi32 (sizeof (int32_t ) * std::numeric_limits<unsigned char >::digits));
198- ux = _mm512_add_epi32 (shr, vmax);
199-
200- const ubatch_type xor_ux_uy = _mm512_xor_si512 (ux, uy);
201- const ubatch_type xor_uy_res = _mm512_xor_si512 (uy, res);
202- const ubatch_type not_xor_uy_res = _mm512_andnot_si512 (xor_uy_res, _mm512_set1_epi8 (-1 ));
203-
204- const ubatch_type u_orb = _mm512_or_si512 (xor_ux_uy, not_xor_uy_res);
205- const batch_type i_orb = (batch_type)u_orb;
206-
207- const batch_type zeros = _mm512_set1_epi32 (0 );
208- __mmask16 gteq_to_zero = _mm512_cmp_epi32_mask (zeros, i_orb, _MM_CMPINT_NLT);
209-
210- return _mm512_mask_blend_epi32 (gteq_to_zero, ux, res);
181+ batch_bool_type mask = _mm512_movepi32_mask (rhs);
182+ batch_type lhs_pos_branch = min (std::numeric_limits<value_type>::max () - rhs, lhs);
183+ batch_type lhs_neg_branch = max (std::numeric_limits<value_type>::min () - rhs, lhs);
184+ return rhs + select (mask, lhs_neg_branch, lhs_pos_branch);
211185 }
212186
213187 static batch_type ssub (const batch_type& lhs, const batch_type& rhs)
@@ -392,39 +366,15 @@ namespace xsimd
392366
393367 static batch_type sadd (const batch_type& lhs, const batch_type& rhs)
394368 {
395- /* origin: /nsimd/include/nsimd/x86/avx512_skylake/adds.h */
396- /*
397- * ====================================================
398- * Copyright (c) 2019 Agenium Scale
399- *
400- * MIT License see https://github.com/agenium-scale/nsimd/blob/master/LICENSE
401- * ====================================================
402- */
403- // todo bench againt unrroled loop
404- // todo factorize int32_t uint32_t
405- const auto ures = _mm512_add_epi32 (lhs, rhs);
406- const auto umax = _mm512_set1_epi32 (std::numeric_limits<uint32_t >::max ());
407- const auto is_overflow = _mm512_cmp_epu32_mask (_mm512_add_epi32 (lhs, umax), _mm512_add_epi32 (ures, umax), _MM_CMPINT_NLE);
408- return _mm512_mask_blend_epi32 (is_overflow, ures, umax);
369+ const auto diffmax = batch_type (std::numeric_limits<value_type>::max ()) - lhs;
370+ const auto mindiff = min (diffmax, rhs);
371+ return lhs + mindiff;
409372 }
410373
411374 static batch_type ssub (const batch_type& lhs, const batch_type& rhs)
412375 {
413- /* origin: /nsimd/include/nsimd/x86/avx512_skylake/subs.h */
414- /*
415- * ====================================================
416- * Copyright (c) 2019 Agenium Scale
417- *
418- * MIT License see https://github.com/agenium-scale/nsimd/blob/master/LICENSE
419- * ====================================================
420- */
421- // todo bench againt unrroled loop
422- // todo factorize int32_t uint32_t
423- const auto ures = _mm512_sub_epi32 (lhs, rhs);
424- const auto cte = _mm512_set1_epi32 (std::numeric_limits<uint32_t >::max ());
425- const auto is_underflow = _mm512_cmp_epu32_mask (_mm512_add_epi32 (rhs, cte), _mm512_add_epi32 (lhs, cte), _MM_CMPINT_NLE);
426- const auto umin = _mm512_set1_epi32 (std::numeric_limits<uint32_t >::lowest ());
427- return _mm512_mask_blend_epi32 (is_underflow, ures, umin);
376+ const auto diff = min (lhs, rhs);
377+ return lhs - diff;
428378 }
429379 };
430380 }
0 commit comments