Skip to content

Commit 30ff9c4

Browse files
Improve, fix and simplify sadd and ssub operations
faster and simpler saturated add / sub for unsigned types when the builtin doesn't exist. It uses a min/max instead of an explicit comparison, for this instruction has a nice latency of 1 on sse and avx2 and avx512. Also add a doc entry.
1 parent 60a8f27 commit 30ff9c4

21 files changed

+305
-531
lines changed

docs/source/api/basic_functions.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,14 @@ Basic functions
5959
.. doxygenfunction:: fdim(const batch<T, N>&, const batch<T, N>&)
6060
:project: xsimd
6161

62+
.. _sadd-function-reference:
63+
.. doxygenfunction:: sadd(const simd_base<B>&, const simd_base<B>&)
64+
:project: xsimd
65+
66+
.. _ssub-function-reference:
67+
.. doxygenfunction:: ssub(const simd_base<B>&, const simd_base<B>&)
68+
:project: xsimd
69+
6270
.. _clip-function-reference:
6371
.. doxygenfunction:: clip(const simd_base<B>&, const simd_base<B>&, const simd_base<B>&)
6472
:project: xsimd

docs/source/api/math_index.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ Mathematical functions
6060
+---------------------------------------+----------------------------------------------------+
6161
| :ref:`fdim <fdim-function-reference>` | positive difference |
6262
+---------------------------------------+----------------------------------------------------+
63+
| :ref:`sadd <sadd-function-reference>` | saturated addition |
64+
+---------------------------------------+----------------------------------------------------+
65+
| :ref:`ssub <ssub-function-reference>` | saturated subtraction |
66+
+---------------------------------------+----------------------------------------------------+
6367
| :ref:`clip <clip-function-reference>` | clipping operation |
6468
+---------------------------------------+----------------------------------------------------+
6569

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/***************************************************************************
2+
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
3+
* Martin Renou *
4+
* Copyright (c) QuantStack *
5+
* *
6+
* Distributed under the terms of the BSD 3-Clause License. *
7+
* *
8+
* The full license is in the file LICENSE, distributed with this software. *
9+
****************************************************************************/
10+
11+
#ifndef XSIMD_MATH_UTILS_HPP
12+
#define XSIMD_MATH_UTILS_HPP
13+
14+
#include <limits>
15+
#include <type_traits>
16+
17+
namespace xsimd
18+
{
19+
/*********************************************
20+
* Some utility math operations shared *
21+
* across scalar versio and fallback *
22+
* versions *
23+
*********************************************/
24+
namespace detail
25+
{
26+
template <class T0, class T1>
27+
inline T0
28+
ipow(const T0& t0, const T1& t1)
29+
{
30+
static_assert(std::is_integral<T1>::value, "second argument must be an integer");
31+
T0 a = t0;
32+
T1 b = t1;
33+
bool const recip = b < 0;
34+
T0 r{static_cast<T0>(1)};
35+
while (1)
36+
{
37+
if (b & 1)
38+
{
39+
r *= a;
40+
}
41+
b /= 2;
42+
if (b == 0)
43+
{
44+
break;
45+
}
46+
a *= a;
47+
}
48+
return recip ? 1 / r : r;
49+
}
50+
template<typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
51+
T sadd(const T& lhs, const T& rhs)
52+
{
53+
if (std::numeric_limits<T>::is_signed)
54+
{
55+
if ((lhs > 0) && (rhs > std::numeric_limits<T>::max() - lhs))
56+
{
57+
return std::numeric_limits<T>::max();
58+
}
59+
else if ((lhs < 0) && (rhs < std::numeric_limits<T>::lowest() - lhs))
60+
{
61+
return std::numeric_limits<T>::lowest();
62+
}
63+
else {
64+
return lhs + rhs;
65+
}
66+
}
67+
else
68+
{
69+
if (rhs > std::numeric_limits<T>::max() - lhs)
70+
{
71+
return std::numeric_limits<T>::max();
72+
}
73+
else
74+
{
75+
return lhs + rhs;
76+
}
77+
78+
}
79+
}
80+
81+
template<typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
82+
T ssub(const T& lhs, const T& rhs)
83+
{
84+
if (std::numeric_limits<T>::is_signed)
85+
{
86+
return sadd(lhs, (T)-rhs);
87+
}
88+
else
89+
{
90+
if (lhs < rhs)
91+
{
92+
return std::numeric_limits<T>::lowest();
93+
}
94+
else
95+
{
96+
return lhs - rhs;
97+
}
98+
99+
}
100+
}
101+
}
102+
}
103+
104+
#endif

include/xsimd/math/xsimd_power.hpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,9 @@
1717
#include "xsimd_fp_sign.hpp"
1818
#include "xsimd_horner.hpp"
1919
#include "xsimd_logarithm.hpp"
20+
#include "xsimd_math_utils.hpp"
2021
#include "xsimd_numerical_constant.hpp"
2122

22-
#include "xsimd/math/xsimd_scalar.hpp"
23-
2423
namespace xsimd
2524
{
2625

include/xsimd/math/xsimd_scalar.hpp

Lines changed: 55 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
#define XSIMD_SCALAR_HPP
1313

1414
#include <cmath>
15+
#include <limits>
16+
17+
#include "xsimd_math_utils.hpp"
1518

1619
namespace xsimd
1720
{
@@ -208,30 +211,6 @@ namespace xsimd
208211
#endif
209212

210213
namespace detail {
211-
template <class T0, class T1>
212-
inline T0
213-
ipow(const T0& t0, const T1& t1)
214-
{
215-
static_assert(std::is_integral<T1>::value, "second argument must be an integer");
216-
T0 a = t0;
217-
T1 b = t1;
218-
bool const recip = b < 0;
219-
T0 r{static_cast<T0>(1)};
220-
while (1)
221-
{
222-
if (b & 1)
223-
{
224-
r *= a;
225-
}
226-
b /= 2;
227-
if (b == 0)
228-
{
229-
break;
230-
}
231-
a *= a;
232-
}
233-
return recip ? 1 / r : r;
234-
}
235214
}
236215

237216
template <class T0, class T1>
@@ -480,6 +459,58 @@ namespace xsimd
480459
return tmp * tmp;
481460
}
482461

462+
template<typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
463+
T sadd(const T& lhs, const T& rhs)
464+
{
465+
if (std::numeric_limits<T>::is_signed)
466+
{
467+
if ((lhs > 0) && (rhs > std::numeric_limits<T>::max() - lhs))
468+
{
469+
return std::numeric_limits<T>::max();
470+
}
471+
else if ((lhs < 0) && (rhs < std::numeric_limits<T>::lowest() - lhs))
472+
{
473+
return std::numeric_limits<T>::lowest();
474+
}
475+
else {
476+
return lhs + rhs;
477+
}
478+
}
479+
else
480+
{
481+
if (rhs > std::numeric_limits<T>::max() - lhs)
482+
{
483+
return std::numeric_limits<T>::max();
484+
}
485+
else
486+
{
487+
return lhs + rhs;
488+
}
489+
490+
}
491+
}
492+
493+
template<typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
494+
T ssub(const T& lhs, const T& rhs)
495+
{
496+
if (std::numeric_limits<T>::is_signed)
497+
{
498+
return sadd(lhs, (T)-rhs);
499+
}
500+
else
501+
{
502+
if (lhs < rhs)
503+
{
504+
return std::numeric_limits<T>::lowest();
505+
}
506+
else
507+
{
508+
return lhs - rhs;
509+
}
510+
511+
}
512+
}
513+
483514
}
484515

485516
#endif

include/xsimd/types/xsimd_avx512_float.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -404,7 +404,7 @@ namespace xsimd
404404
{
405405
return sub(lhs, rhs); //do something for inf ?
406406
}
407-
407+
408408
static batch_type mul(const batch_type& lhs, const batch_type& rhs)
409409
{
410410
return _mm512_mul_ps(lhs, rhs);

include/xsimd/types/xsimd_avx512_int32.hpp

Lines changed: 9 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -178,36 +178,10 @@ namespace xsimd
178178

179179
static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
180180
{
181-
/* origin: /nsimd/include/nsimd/x86/avx512_knl/adds.h */
182-
/*
183-
* ====================================================
184-
* Copyright (c) 2019 Agenium Scale
185-
*
186-
* MIT License see https://github.com/agenium-scale/nsimd/blob/master/LICENSE
187-
* ====================================================
188-
*/
189-
//todo bench againt unrroled loop
190-
//todo factorize int32_t uint32_t
191-
using ubatch_type = batch<uint32_t, 16>;
192-
ubatch_type ux = (ubatch_type)(lhs);
193-
const ubatch_type uy = (ubatch_type)(rhs);
194-
const ubatch_type res = _mm512_add_epi32(ux, uy);
195-
196-
const ubatch_type vmax = _mm512_set1_epi32(std::numeric_limits<int32_t>::max());
197-
const ubatch_type shr = _mm512_srl_epi32(ux, _mm_set1_epi32(sizeof(int32_t) * std::numeric_limits<unsigned char>::digits));
198-
ux = _mm512_add_epi32(shr, vmax);
199-
200-
const ubatch_type xor_ux_uy = _mm512_xor_si512(ux, uy);
201-
const ubatch_type xor_uy_res = _mm512_xor_si512(uy, res);
202-
const ubatch_type not_xor_uy_res = _mm512_andnot_si512(xor_uy_res, _mm512_set1_epi8(-1));
203-
204-
const ubatch_type u_orb = _mm512_or_si512(xor_ux_uy, not_xor_uy_res);
205-
const batch_type i_orb = (batch_type)u_orb;
206-
207-
const batch_type zeros = _mm512_set1_epi32(0);
208-
__mmask16 gteq_to_zero = _mm512_cmp_epi32_mask(zeros, i_orb, _MM_CMPINT_NLT);
209-
210-
return _mm512_mask_blend_epi32(gteq_to_zero, ux, res);
181+
batch_bool_type mask = _mm512_movepi32_mask(rhs);
182+
batch_type lhs_pos_branch = min(std::numeric_limits<value_type>::max() - rhs, lhs);
183+
batch_type lhs_neg_branch = max(std::numeric_limits<value_type>::min() - rhs, lhs);
184+
return rhs + select(mask, lhs_neg_branch, lhs_pos_branch);
211185
}
212186

213187
static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
@@ -392,39 +366,15 @@ namespace xsimd
392366

393367
static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
394368
{
395-
/* origin: /nsimd/include/nsimd/x86/avx512_skylake/adds.h */
396-
/*
397-
* ====================================================
398-
* Copyright (c) 2019 Agenium Scale
399-
*
400-
* MIT License see https://github.com/agenium-scale/nsimd/blob/master/LICENSE
401-
* ====================================================
402-
*/
403-
//todo bench againt unrroled loop
404-
//todo factorize int32_t uint32_t
405-
const auto ures = _mm512_add_epi32(lhs, rhs);
406-
const auto umax = _mm512_set1_epi32(std::numeric_limits<uint32_t>::max());
407-
const auto is_overflow = _mm512_cmp_epu32_mask(_mm512_add_epi32(lhs, umax), _mm512_add_epi32(ures, umax), _MM_CMPINT_NLE);
408-
return _mm512_mask_blend_epi32(is_overflow, ures, umax);
369+
const auto diffmax = batch_type(std::numeric_limits<value_type>::max()) - lhs;
370+
const auto mindiff = min(diffmax, rhs);
371+
return lhs + mindiff;
409372
}
410373

411374
static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
412375
{
413-
/* origin: /nsimd/include/nsimd/x86/avx512_skylake/subs.h */
414-
/*
415-
* ====================================================
416-
* Copyright (c) 2019 Agenium Scale
417-
*
418-
* MIT License see https://github.com/agenium-scale/nsimd/blob/master/LICENSE
419-
* ====================================================
420-
*/
421-
//todo bench againt unrroled loop
422-
//todo factorize int32_t uint32_t
423-
const auto ures = _mm512_sub_epi32(lhs, rhs);
424-
const auto cte = _mm512_set1_epi32(std::numeric_limits<uint32_t>::max());
425-
const auto is_underflow = _mm512_cmp_epu32_mask(_mm512_add_epi32(rhs, cte), _mm512_add_epi32(lhs, cte), _MM_CMPINT_NLE);
426-
const auto umin = _mm512_set1_epi32(std::numeric_limits<uint32_t>::lowest());
427-
return _mm512_mask_blend_epi32(is_underflow, ures, umin);
376+
const auto diff = min(lhs, rhs);
377+
return lhs - diff;
428378
}
429379
};
430380
}

0 commit comments

Comments
 (0)