Skip to content

Commit 60a8f27

Browse files
ThomasRetornaz authored and serge-sans-paille committed
Add missing saturated sadd ssub
* int8,uint8,int16,uint16,int32,uint32,int64,uint64,float,double * sse2/sse4 * avx/avx2 * avx512 * fallback * neon
1 parent e367a4b commit 60a8f27

31 files changed

+1044
-0
lines changed

include/xsimd/types/xsimd_avx512_double.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,16 @@ namespace xsimd
372372
return _mm512_sub_pd(lhs, rhs);
373373
}
374374

375+
// Saturated addition for the AVX-512 double kernel.
// No clamping is performed here: the call is forwarded to add() unchanged.
static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
{
    // Open question carried over from the original: do something for inf ?
    const batch_type sum = add(lhs, rhs);
    return sum;
}
379+
380+
// Saturated subtraction for the AVX-512 double kernel.
// No clamping is performed here: the call is forwarded to sub() unchanged.
static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
{
    // Open question carried over from the original: do something for inf ?
    const batch_type difference = sub(lhs, rhs);
    return difference;
}
384+
375385
static batch_type mul(const batch_type& lhs, const batch_type& rhs)
376386
{
377387
return _mm512_mul_pd(lhs, rhs);

include/xsimd/types/xsimd_avx512_float.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,16 @@ namespace xsimd
395395
return _mm512_sub_ps(lhs, rhs);
396396
}
397397

398+
// Saturated addition for the AVX-512 float kernel.
// No clamping is performed here: the call is forwarded to add() unchanged.
static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
{
    // Open question carried over from the original: do something for inf ?
    const batch_type sum = add(lhs, rhs);
    return sum;
}
402+
403+
// Saturated subtraction for the AVX-512 float kernel.
// No clamping is performed here: the call is forwarded to sub() unchanged.
static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
{
    // Open question carried over from the original: do something for inf ?
    const batch_type difference = sub(lhs, rhs);
    return difference;
}
407+
398408
static batch_type mul(const batch_type& lhs, const batch_type& rhs)
399409
{
400410
return _mm512_mul_ps(lhs, rhs);

include/xsimd/types/xsimd_avx512_int16.hpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,24 @@ namespace xsimd
243243
#endif
244244
}
245245

246+
// Lane-wise saturated addition of signed 16-bit integers.
// With AVX512BW the native saturating instruction is used; otherwise the
// work is handed to XSIMD_APPLY_AVX2_FUNCTION_INT16 (macro defined outside
// this view -- presumably it splits the batch and returns the AVX2 result).
static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
{
#if !defined(XSIMD_AVX512BW_AVAILABLE)
    XSIMD_APPLY_AVX2_FUNCTION_INT16(sadd, lhs, rhs);
#else
    return _mm512_adds_epi16(lhs, rhs);
#endif
}
254+
255+
// Lane-wise saturated subtraction of signed 16-bit integers.
// With AVX512BW the native saturating instruction is used; otherwise the
// work is handed to XSIMD_APPLY_AVX2_FUNCTION_INT16 (macro defined outside
// this view -- presumably it splits the batch and returns the AVX2 result).
static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
{
#if !defined(XSIMD_AVX512BW_AVAILABLE)
    XSIMD_APPLY_AVX2_FUNCTION_INT16(ssub, lhs, rhs);
#else
    return _mm512_subs_epi16(lhs, rhs);
#endif
}
263+
246264
static batch_type mul(const batch_type& lhs, const batch_type& rhs)
247265
{
248266
#if defined(XSIMD_AVX512BW_AVAILABLE)
@@ -466,6 +484,24 @@ namespace xsimd
466484
XSIMD_APPLY_AVX2_FUNCTION_INT16(lte, lhs, rhs);
467485
#endif
468486
}
487+
488+
// Lane-wise saturated addition of unsigned 16-bit integers.
// With AVX512BW the native saturating instruction is used; otherwise the
// work is handed to XSIMD_APPLY_AVX2_FUNCTION_UINT16 (macro defined outside
// this view -- presumably it splits the batch and returns the AVX2 result).
static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
{
#if !defined(XSIMD_AVX512BW_AVAILABLE)
    XSIMD_APPLY_AVX2_FUNCTION_UINT16(sadd, lhs, rhs);
#else
    return _mm512_adds_epu16(lhs, rhs);
#endif
}
496+
497+
// Lane-wise saturated subtraction of unsigned 16-bit integers.
// With AVX512BW the native saturating instruction is used; otherwise the
// work is handed to XSIMD_APPLY_AVX2_FUNCTION_UINT16 (macro defined outside
// this view -- presumably it splits the batch and returns the AVX2 result).
static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
{
#if !defined(XSIMD_AVX512BW_AVAILABLE)
    XSIMD_APPLY_AVX2_FUNCTION_UINT16(ssub, lhs, rhs);
#else
    return _mm512_subs_epu16(lhs, rhs);
#endif
}
469505
};
470506
}
471507

include/xsimd/types/xsimd_avx512_int32.hpp

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,45 @@ namespace xsimd
176176
return _mm512_sub_epi32(lhs, rhs);
177177
}
178178

179+
// Lane-wise saturated addition of 16 signed 32-bit integers.
// Each lane yields lhs + rhs, clamped to INT32_MAX on positive overflow and
// to INT32_MIN on negative overflow.
static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
{
    /* origin: /nsimd/include/nsimd/x86/avx512_knl/adds.h */
    /*
     * ====================================================
     * Copyright (c) 2019 Agenium Scale
     *
     * MIT License see https://github.com/agenium-scale/nsimd/blob/master/LICENSE
     * ====================================================
     */
    //todo bench against unrolled loop
    //todo factorize int32_t uint32_t

    // Wrapping sum: already the right answer for every lane that does not overflow.
    const auto wrapped = _mm512_add_epi32(lhs, rhs);

    // Clamp value per lane: (sign bit of lhs) + INT32_MAX, i.e. INT32_MAX for
    // non-negative lhs and INT32_MIN (by wrap-around) for negative lhs.
    // The immediate-count shift is required: _mm512_srl_epi32 reads a 64-bit
    // count, so a count built with _mm_set1_epi32 is always out of range and
    // zeroes every lane. The count must also be 31 (bits - 1), not 32.
    const auto sign_bit = _mm512_srli_epi32(lhs, 31);
    const auto clamp = _mm512_add_epi32(sign_bit, _mm512_set1_epi32(std::numeric_limits<int32_t>::max()));

    // Overflow happened iff lhs and rhs share a sign and the sum's sign
    // differs from it. That is exactly the case where the sign bit of
    // (clamp ^ rhs) | ~(rhs ^ wrapped) is clear (clamp carries lhs's sign).
    const auto signs_differ = _mm512_xor_si512(clamp, rhs);
    const auto rhs_xor_sum = _mm512_xor_si512(rhs, wrapped);
    const auto sum_keeps_sign = _mm512_andnot_si512(rhs_xor_sum, _mm512_set1_epi8(-1));
    const auto probe = _mm512_or_si512(signs_differ, sum_keeps_sign);

    // probe < 0 (sign bit set) means "no overflow". A strict compare is
    // needed so that probe == 0 (e.g. 1 + INT32_MAX) is treated as overflow.
    const __mmask16 no_overflow = _mm512_cmp_epi32_mask(probe, _mm512_set1_epi32(0), _MM_CMPINT_LT);

    // mask_blend picks the second operand where the mask bit is set.
    return _mm512_mask_blend_epi32(no_overflow, clamp, wrapped);
}
212+
213+
// Lane-wise saturated subtraction of 16 signed 32-bit integers.
// Each lane yields lhs - rhs, clamped to INT32_MAX / INT32_MIN on overflow.
// Implemented directly rather than as sadd(lhs, neg(rhs)): negating
// INT32_MIN wraps back to INT32_MIN, which made e.g. 0 - INT32_MIN saturate
// to the wrong bound.
static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
{
    // Wrapping difference: correct for every lane that does not overflow.
    const auto wrapped = _mm512_sub_epi32(lhs, rhs);

    // Clamp value per lane: INT32_MAX for non-negative lhs, INT32_MIN
    // (sign bit 1 + INT32_MAX wraps) for negative lhs.
    const auto sign_bit = _mm512_srli_epi32(lhs, 31);
    const auto clamp = _mm512_add_epi32(sign_bit, _mm512_set1_epi32(std::numeric_limits<int32_t>::max()));

    // Subtraction overflows iff the operands have different signs and the
    // result's sign differs from lhs: sign bit of (lhs ^ rhs) & (lhs ^ wrapped).
    const auto operands_differ = _mm512_xor_si512(lhs, rhs);
    const auto result_flipped = _mm512_xor_si512(lhs, wrapped);
    const auto probe = _mm512_and_si512(operands_differ, result_flipped);
    const __mmask16 overflow = _mm512_cmp_epi32_mask(probe, _mm512_setzero_si512(), _MM_CMPINT_LT);

    // mask_blend picks the second operand where the mask bit is set.
    return _mm512_mask_blend_epi32(overflow, wrapped, clamp);
}
217+
179218
static batch_type mul(const batch_type& lhs, const batch_type& rhs)
180219
{
181220
return _mm512_mullo_epi32(lhs, rhs);
@@ -350,6 +389,43 @@ namespace xsimd
350389
{
351390
return rhs;
352391
}
392+
393+
// Lane-wise saturated addition of 16 unsigned 32-bit integers.
// Lanes whose exact sum exceeds UINT32_MAX yield UINT32_MAX.
static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
{
    /* originally adapted from: /nsimd/include/nsimd/x86/avx512_skylake/adds.h */
    /*
     * ====================================================
     * Copyright (c) 2019 Agenium Scale
     *
     * MIT License see https://github.com/agenium-scale/nsimd/blob/master/LICENSE
     * ====================================================
     */
    //todo bench against unrolled loop
    //todo factorize int32_t uint32_t
    const auto ures = _mm512_add_epi32(lhs, rhs);

    // All bits set is UINT32_MAX in every lane.
    const auto umax = _mm512_set1_epi32(-1);

    // An unsigned sum wrapped around iff it is smaller than an operand.
    // The operands are compared as-is: _mm512_cmp_epu32_mask is already an
    // unsigned compare, and biasing both sides (as the previous version did)
    // misclassified lanes with lhs == 0 and lanes whose sum wrapped to 0.
    const auto is_overflow = _mm512_cmp_epu32_mask(ures, lhs, _MM_CMPINT_LT);

    // mask_blend picks the second operand where the mask bit is set.
    return _mm512_mask_blend_epi32(is_overflow, ures, umax);
}
410+
411+
// Lane-wise saturated subtraction of 16 unsigned 32-bit integers.
// Lanes where rhs > lhs yield 0 instead of wrapping around.
static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
{
    /* originally adapted from: /nsimd/include/nsimd/x86/avx512_skylake/subs.h */
    /*
     * ====================================================
     * Copyright (c) 2019 Agenium Scale
     *
     * MIT License see https://github.com/agenium-scale/nsimd/blob/master/LICENSE
     * ====================================================
     */
    //todo bench against unrolled loop
    //todo factorize int32_t uint32_t
    const auto ures = _mm512_sub_epi32(lhs, rhs);

    // Underflow iff rhs > lhs (NLE == "greater than"). The operands are
    // compared as-is: the compare is already unsigned, and biasing both
    // sides (as the previous version did) flagged x - 0 as an underflow and
    // missed 0 - x.
    const auto is_underflow = _mm512_cmp_epu32_mask(rhs, lhs, _MM_CMPINT_NLE);

    const auto umin = _mm512_set1_epi32(0);

    // mask_blend picks the second operand where the mask bit is set.
    return _mm512_mask_blend_epi32(is_underflow, ures, umin);
}
353429
};
354430
}
355431

include/xsimd/types/xsimd_avx512_int64.hpp

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,45 @@ namespace xsimd
234234
return _mm512_sub_epi64(lhs, rhs);
235235
}
236236

237+
// Lane-wise saturated addition of 8 signed 64-bit integers.
// Each lane yields lhs + rhs, clamped to INT64_MAX on positive overflow and
// to INT64_MIN on negative overflow.
static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
{
    /* origin: /nsimd/include/nsimd/x86/avx512_knl/adds.h */
    /*
     * ====================================================
     * Copyright (c) 2019 Agenium Scale
     *
     * MIT License see https://github.com/agenium-scale/nsimd/blob/master/LICENSE
     * ====================================================
     */
    //todo bench against unrolled loop
    //todo factorize int64_t uint64_t

    // Wrapping sum: already the right answer for every lane that does not overflow.
    const auto wrapped = _mm512_add_epi64(lhs, rhs);

    // Clamp value per lane: (sign bit of lhs) + INT64_MAX, i.e. INT64_MAX for
    // non-negative lhs and INT64_MIN (by wrap-around) for negative lhs.
    // The immediate-count shift is required: _mm512_srl_epi64 reads a 64-bit
    // count, so a count built with _mm_set1_epi32 is always out of range and
    // zeroes every lane. The count must also be 63 (bits - 1), not 64.
    const auto sign_bit = _mm512_srli_epi64(lhs, 63);
    const auto clamp = _mm512_add_epi64(sign_bit, _mm512_set1_epi64(std::numeric_limits<int64_t>::max()));

    // Overflow happened iff lhs and rhs share a sign and the sum's sign
    // differs from it. That is exactly the case where the sign bit of
    // (clamp ^ rhs) | ~(rhs ^ wrapped) is clear (clamp carries lhs's sign).
    const auto signs_differ = _mm512_xor_si512(clamp, rhs);
    const auto rhs_xor_sum = _mm512_xor_si512(rhs, wrapped);
    const auto sum_keeps_sign = _mm512_andnot_si512(rhs_xor_sum, _mm512_set1_epi8(-1));
    const auto probe = _mm512_or_si512(signs_differ, sum_keeps_sign);

    // probe < 0 (sign bit set) means "no overflow". A strict compare is
    // needed so that probe == 0 (e.g. 1 + INT64_MAX) is treated as overflow.
    const __mmask8 no_overflow = _mm512_cmp_epi64_mask(probe, _mm512_set1_epi64(0), _MM_CMPINT_LT);

    // mask_blend picks the second operand where the mask bit is set.
    return _mm512_mask_blend_epi64(no_overflow, clamp, wrapped);
}
270+
271+
// Lane-wise saturated subtraction of 8 signed 64-bit integers.
// Each lane yields lhs - rhs, clamped to INT64_MAX / INT64_MIN on overflow.
// Implemented directly rather than as sadd(lhs, neg(rhs)): negating
// INT64_MIN wraps back to INT64_MIN, which made e.g. 0 - INT64_MIN saturate
// to the wrong bound.
static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
{
    // Wrapping difference: correct for every lane that does not overflow.
    const auto wrapped = _mm512_sub_epi64(lhs, rhs);

    // Clamp value per lane: INT64_MAX for non-negative lhs, INT64_MIN
    // (sign bit 1 + INT64_MAX wraps) for negative lhs.
    const auto sign_bit = _mm512_srli_epi64(lhs, 63);
    const auto clamp = _mm512_add_epi64(sign_bit, _mm512_set1_epi64(std::numeric_limits<int64_t>::max()));

    // Subtraction overflows iff the operands have different signs and the
    // result's sign differs from lhs: sign bit of (lhs ^ rhs) & (lhs ^ wrapped).
    const auto operands_differ = _mm512_xor_si512(lhs, rhs);
    const auto result_flipped = _mm512_xor_si512(lhs, wrapped);
    const auto probe = _mm512_and_si512(operands_differ, result_flipped);
    const __mmask8 overflow = _mm512_cmp_epi64_mask(probe, _mm512_setzero_si512(), _MM_CMPINT_LT);

    // mask_blend picks the second operand where the mask bit is set.
    return _mm512_mask_blend_epi64(overflow, wrapped, clamp);
}
275+
237276
static batch_type mul(const batch_type& lhs, const batch_type& rhs)
238277
{
239278
return _mm512_mullo_epi64(lhs, rhs);
@@ -419,6 +458,64 @@ namespace xsimd
419458
{
420459
return rhs;
421460
}
461+
462+
// Lane-wise saturated addition of 8 unsigned 64-bit integers.
// Lanes whose exact sum exceeds UINT64_MAX yield UINT64_MAX.
static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
{
    /* originally adapted from: /nsimd/include/nsimd/x86/avx512_knl/adds.h */
    /*
     * ====================================================
     * Copyright (c) 2019 Agenium Scale
     *
     * MIT License see https://github.com/agenium-scale/nsimd/blob/master/LICENSE
     * ====================================================
     */
    //todo bench against unrolled loop
    //todo factorize int64_t uint64_t

    // Wrapping sum: correct for every lane that does not overflow.
    const auto wrapped = _mm512_add_epi64(lhs, rhs);

    // All bits set is UINT64_MAX in every lane.
    const auto umax = _mm512_set1_epi64(-1);

    // An unsigned sum wrapped around iff it is smaller than an operand.
    // The previous version reused the signed sign-bit test, which misses
    // unsigned overflow such as UINT64_MAX + 1.
    const auto is_overflow = _mm512_cmp_epu64_mask(wrapped, lhs, _MM_CMPINT_LT);

    // mask_blend picks the second operand where the mask bit is set.
    return _mm512_mask_blend_epi64(is_overflow, wrapped, umax);
}
495+
496+
// Lane-wise saturated subtraction of 8 unsigned 64-bit integers.
// Lanes where rhs > lhs yield 0 instead of wrapping around.
static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
{
    // Wrapping difference: correct for every lane that does not underflow.
    const auto wrapped = _mm512_sub_epi64(lhs, rhs);

    // Underflow iff rhs > lhs (NLE == "greater than") as unsigned values.
    // The previous version reused the signed sign-bit test, which clamped
    // valid results such as UINT64_MAX - 1 to zero.
    const auto is_underflow = _mm512_cmp_epu64_mask(rhs, lhs, _MM_CMPINT_NLE);

    const auto umin = _mm512_set1_epi64(0);

    // mask_blend picks the second operand where the mask bit is set.
    return _mm512_mask_blend_epi64(is_underflow, wrapped, umin);
}
422519
};
423520
}
424521

include/xsimd/types/xsimd_avx512_int8.hpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,24 @@ namespace xsimd
243243
#endif
244244
}
245245

246+
// Lane-wise saturated addition of signed 8-bit integers.
// With AVX512BW the native saturating instruction is used; otherwise the
// work is handed to XSIMD_APPLY_AVX2_FUNCTION_INT8 (macro defined outside
// this view -- presumably it splits the batch and returns the AVX2 result).
static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
{
#if !defined(XSIMD_AVX512BW_AVAILABLE)
    XSIMD_APPLY_AVX2_FUNCTION_INT8(sadd, lhs, rhs);
#else
    return _mm512_adds_epi8(lhs, rhs);
#endif
}
254+
255+
// Lane-wise saturated subtraction of signed 8-bit integers.
// With AVX512BW the native saturating instruction is used; otherwise the
// work is handed to XSIMD_APPLY_AVX2_FUNCTION_INT8 (macro defined outside
// this view -- presumably it splits the batch and returns the AVX2 result).
static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
{
#if !defined(XSIMD_AVX512BW_AVAILABLE)
    XSIMD_APPLY_AVX2_FUNCTION_INT8(ssub, lhs, rhs);
#else
    return _mm512_subs_epi8(lhs, rhs);
#endif
}
263+
246264
static batch_type mul(const batch_type& lhs, const batch_type& rhs)
247265
{
248266
#if defined(XSIMD_AVX512BW_AVAILABLE)
@@ -470,6 +488,24 @@ namespace xsimd
470488
XSIMD_APPLY_AVX2_FUNCTION_INT8(lte, lhs, rhs);
471489
#endif
472490
}
491+
492+
// Lane-wise saturated addition of unsigned 8-bit integers.
// With AVX512BW the native saturating instruction is used; otherwise the
// work is handed to XSIMD_APPLY_AVX2_FUNCTION_UINT8 (macro defined outside
// this view -- presumably it splits the batch and returns the AVX2 result).
static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
{
#if !defined(XSIMD_AVX512BW_AVAILABLE)
    XSIMD_APPLY_AVX2_FUNCTION_UINT8(sadd, lhs, rhs);
#else
    return _mm512_adds_epu8(lhs, rhs);
#endif
}
500+
501+
// Lane-wise saturated subtraction of unsigned 8-bit integers.
// With AVX512BW the native saturating instruction is used; otherwise the
// work is handed to XSIMD_APPLY_AVX2_FUNCTION_UINT8 (macro defined outside
// this view -- presumably it splits the batch and returns the AVX2 result).
static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
{
#if !defined(XSIMD_AVX512BW_AVAILABLE)
    XSIMD_APPLY_AVX2_FUNCTION_UINT8(ssub, lhs, rhs);
#else
    return _mm512_subs_epu8(lhs, rhs);
#endif
}
473509
};
474510
}
475511

include/xsimd/types/xsimd_avx_double.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,16 @@ namespace xsimd
492492
return _mm256_sub_pd(lhs, rhs);
493493
}
494494

495+
// Saturated addition for the AVX double kernel.
// No clamping is performed here: the call is forwarded to add() unchanged.
static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
{
    // FIXME something special for inf ?
    const batch_type sum = add(lhs, rhs);
    return sum;
}
499+
500+
// Saturated subtraction for the AVX double kernel.
// No clamping is performed here: the call is forwarded to sub() unchanged.
static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
{
    // FIXME something special for inf ?
    const batch_type difference = sub(lhs, rhs);
    return difference;
}
504+
495505
static batch_type mul(const batch_type& lhs, const batch_type& rhs)
496506
{
497507
return _mm256_mul_pd(lhs, rhs);

include/xsimd/types/xsimd_avx_float.hpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,16 @@ namespace xsimd
521521
{
522522
return _mm256_sub_ps(lhs, rhs);
523523
}
524+
525+
// Saturated addition for the AVX float kernel.
// No clamping is performed here: the call is forwarded to add() unchanged.
static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
{
    // FIXME something special for inf ?
    const batch_type sum = add(lhs, rhs);
    return sum;
}
529+
530+
// Saturated subtraction for the AVX float kernel.
// No clamping is performed here: the call is forwarded to sub() unchanged.
static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
{
    // FIXME something special for inf ?
    const batch_type difference = sub(lhs, rhs);
    return difference;
}
524534

525535
static batch_type mul(const batch_type& lhs, const batch_type& rhs)
526536
{

0 commit comments

Comments
 (0)