xtensor-stack
diff --git a/‎docs/source/api/basic_functions.rst‎
Lines changed: 8 additions & 0 deletions b/‎docs/source/api/basic_functions.rst‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎docs/source/api/math_index.rst‎
Lines changed: 4 additions & 0 deletions b/‎docs/source/api/math_index.rst‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎include/xsimd/math/xsimd_math_utils.hpp‎
Lines changed: 104 additions & 0 deletions b/‎include/xsimd/math/xsimd_math_utils.hpp‎
Lines changed: 104 additions & 0 deletions
diff --git a/‎include/xsimd/math/xsimd_power.hpp‎
Lines changed: 1 addition & 2 deletions b/‎include/xsimd/math/xsimd_power.hpp‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎include/xsimd/math/xsimd_scalar.hpp‎
Lines changed: 55 additions & 24 deletions b/‎include/xsimd/math/xsimd_scalar.hpp‎
Lines changed: 55 additions & 24 deletions
diff --git a/‎include/xsimd/types/xsimd_avx512_float.hpp‎
Lines changed: 1 addition & 1 deletion b/‎include/xsimd/types/xsimd_avx512_float.hpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/xsimd/types/xsimd_avx512_int32.hpp‎
Lines changed: 9 additions & 59 deletions b/‎include/xsimd/types/xsimd_avx512_int32.hpp‎
Lines changed: 9 additions & 59 deletions
@@ -59,6 +59,14 @@ Basic functions
 .. doxygenfunction:: fdim(const batch<T, N>&, const batch<T, N>&)
    :project: xsimd
 
+.. _sadd-function-reference:
+.. doxygenfunction:: sadd(const simd_base<B>&, const simd_base<B>&)
+   :project: xsimd
+
+.. _ssub-function-reference:
+.. doxygenfunction:: ssub(const simd_base<B>&, const simd_base<B>&)
+   :project: xsimd
+
 .. _clip-function-reference:
 .. doxygenfunction:: clip(const simd_base<B>&, const simd_base<B>&, const simd_base<B>&)
    :project: xsimd
 
@@ -60,6 +60,10 @@ Mathematical functions
 +---------------------------------------+----------------------------------------------------+
 | :ref:`fdim <fdim-function-reference>` | positive difference                                |
 +---------------------------------------+----------------------------------------------------+
+| :ref:`sadd <sadd-function-reference>` | saturated addition                                 |
++---------------------------------------+----------------------------------------------------+
+| :ref:`ssub <ssub-function-reference>` | saturated subtraction                              |
++---------------------------------------+----------------------------------------------------+
 | :ref:`clip <clip-function-reference>` | clipping operation                                 |
 +---------------------------------------+----------------------------------------------------+
 
 
@@ -0,0 +1,104 @@
+/***************************************************************************
+* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+* Martin Renou                                                             *
+* Copyright (c) QuantStack                                                 *
+*                                                                          *
+* Distributed under the terms of the BSD 3-Clause License.                 *
+*                                                                          *
+* The full license is in the file LICENSE, distributed with this software. *
+****************************************************************************/
+
+#ifndef XSIMD_MATH_UTILS_HPP
+#define XSIMD_MATH_UTILS_HPP
+
+#include <limits>
+#include <type_traits>
+
+namespace xsimd
+{
+    /*********************************************
+     * Some utility math operations shared       *
+     * across scalar version and fallback        *
+     * versions                                  *
+     *********************************************/
+    namespace detail
+    {
+        /**
+         * Raises t0 to the integral power t1 by repeated squaring.
+         *
+         * A negative exponent yields the reciprocal of the positive power,
+         * evaluated as 1 / r in the arithmetic of T0.
+         * @param t0 base value.
+         * @param t1 exponent; T1 must be an integral type.
+         * @return t0 raised to the power t1.
+         */
+        template <class T0, class T1>
+        inline T0
+        ipow(const T0& t0, const T1& t1)
+        {
+            static_assert(std::is_integral<T1>::value, "second argument must be an integer");
+            T0 base = t0;
+            T1 exponent = t1;
+            const bool invert = exponent < 0;
+            T0 result{static_cast<T0>(1)};
+            for (;;)
+            {
+                // Fold the current power of the base in when the low bit is set.
+                if (exponent & 1)
+                {
+                    result *= base;
+                }
+                // Integer division truncates towards zero, so negative
+                // exponents also reach 0 without ever being negated.
+                exponent /= 2;
+                if (exponent == 0)
+                {
+                    break;
+                }
+                base *= base;
+            }
+            return invert ? 1 / result : result;
+        }
+        /**
+         * Saturated addition of two scalars: computes lhs + rhs, clamped to
+         * the range representable by T instead of overflowing.
+         * @param lhs first operand.
+         * @param rhs second operand.
+         * @return the sum, or std::numeric_limits<T>::max() / lowest() when the
+         *         exact sum would exceed the upper / lower bound.
+         */
+        template<typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+        T sadd(const T& lhs, const T& rhs)
+        {
+            if (!std::numeric_limits<T>::is_signed)
+            {
+                // Unsigned: only the upper bound can be crossed.
+                if (rhs > std::numeric_limits<T>::max() - lhs)
+                {
+                    return std::numeric_limits<T>::max();
+                }
+                return lhs + rhs;
+            }
+            // Signed: each bound is tested by subtraction, evaluated only for
+            // the sign of lhs for which that subtraction cannot overflow.
+            if ((lhs > 0) && (rhs > std::numeric_limits<T>::max() - lhs))
+            {
+                return std::numeric_limits<T>::max();
+            }
+            if ((lhs < 0) && (rhs < std::numeric_limits<T>::lowest() - lhs))
+            {
+                return std::numeric_limits<T>::lowest();
+            }
+            return lhs + rhs;
+        }
+
+        /**
+         * Saturated subtraction of two scalars: computes lhs - rhs, clamped to
+         * the range representable by T instead of overflowing.
+         * @param lhs value to subtract from.
+         * @param rhs value to subtract.
+         * @return the difference, or std::numeric_limits<T>::max() / lowest()
+         *         when the exact difference would exceed the upper / lower bound.
+         */
+        template<typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+        T ssub(const T& lhs, const T& rhs)
+        {
+            if (std::numeric_limits<T>::is_signed)
+            {
+                // rhs is deliberately not negated: -lowest() is not representable
+                // for signed integers, so sadd(lhs, -rhs) would overflow (or wrap
+                // back to lowest() for types narrower than int). The bounds are
+                // compared against lhs - max() and lhs - lowest() instead; each
+                // is evaluated only for the sign of lhs for which it cannot overflow.
+                if ((lhs >= 0) && (rhs < lhs - std::numeric_limits<T>::max()))
+                {
+                    return std::numeric_limits<T>::max();
+                }
+                else if ((lhs < 0) && (rhs > lhs - std::numeric_limits<T>::lowest()))
+                {
+                    return std::numeric_limits<T>::lowest();
+                }
+                else
+                {
+                    return lhs - rhs;
+                }
+            }
+            else
+            {
+                // Unsigned: the difference saturates at lowest(), i.e. zero.
+                if (lhs < rhs)
+                {
+                    return std::numeric_limits<T>::lowest();
+                }
+                else
+                {
+                    return lhs - rhs;
+                }
+            }
+        }
+    }
+}
+
+#endif
@@ -17,10 +17,9 @@
 #include "xsimd_fp_sign.hpp"
 #include "xsimd_horner.hpp"
 #include "xsimd_logarithm.hpp"
+#include "xsimd_math_utils.hpp"
 #include "xsimd_numerical_constant.hpp"
 
-#include "xsimd/math/xsimd_scalar.hpp"
-
 namespace xsimd
 {
 
 
@@ -12,6 +12,9 @@
 #define XSIMD_SCALAR_HPP
 
 #include <cmath>
+#include <limits>
+
+#include "xsimd_math_utils.hpp"
 
 namespace xsimd
 {
@@ -208,30 +211,6 @@ namespace xsimd
 #endif
 
     namespace detail {
-      template <class T0, class T1>
-      inline T0
-      ipow(const T0& t0, const T1& t1)
-      {
-          static_assert(std::is_integral<T1>::value, "second argument must be an integer");
-          T0 a = t0;
-          T1 b = t1;
-          bool const recip = b < 0;
-          T0 r{static_cast<T0>(1)};
-          while (1)
-          {
-              if (b & 1)
-              {
-                  r *= a;
-              }
-              b /= 2;
-              if (b == 0)
-              {
-                  break;
-              }
-              a *= a;
-          }
-          return recip ? 1 / r : r;
-      }
     }
 
     template <class T0, class T1>
@@ -480,6 +459,58 @@ namespace xsimd
         return tmp * tmp;
     }
 
+    /**
+     * Saturated addition of two scalars: computes lhs + rhs, clamped to
+     * the range representable by T instead of overflowing.
+     * @param lhs first operand.
+     * @param rhs second operand.
+     * @return the sum, or std::numeric_limits<T>::max() / lowest() when the
+     *         exact sum would exceed the upper / lower bound.
+     */
+    template<typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    T sadd(const T& lhs, const T& rhs)
+    {
+        if (!std::numeric_limits<T>::is_signed)
+        {
+            // Unsigned: only the upper bound can be crossed.
+            if (rhs > std::numeric_limits<T>::max() - lhs)
+            {
+                return std::numeric_limits<T>::max();
+            }
+            return lhs + rhs;
+        }
+        // Signed: each bound is tested by subtraction, evaluated only for
+        // the sign of lhs for which that subtraction cannot overflow.
+        if ((lhs > 0) && (rhs > std::numeric_limits<T>::max() - lhs))
+        {
+            return std::numeric_limits<T>::max();
+        }
+        if ((lhs < 0) && (rhs < std::numeric_limits<T>::lowest() - lhs))
+        {
+            return std::numeric_limits<T>::lowest();
+        }
+        return lhs + rhs;
+    }
+
+    /**
+     * Saturated subtraction of two scalars: computes lhs - rhs, clamped to
+     * the range representable by T instead of overflowing.
+     * @param lhs value to subtract from.
+     * @param rhs value to subtract.
+     * @return the difference, or std::numeric_limits<T>::max() / lowest()
+     *         when the exact difference would exceed the upper / lower bound.
+     */
+    template<typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+    T ssub(const T& lhs, const T& rhs)
+    {
+        if (std::numeric_limits<T>::is_signed)
+        {
+            // rhs is deliberately not negated: -lowest() is not representable
+            // for signed integers, so sadd(lhs, -rhs) would overflow (or wrap
+            // back to lowest() for types narrower than int). The bounds are
+            // compared against lhs - max() and lhs - lowest() instead; each
+            // is evaluated only for the sign of lhs for which it cannot overflow.
+            if ((lhs >= 0) && (rhs < lhs - std::numeric_limits<T>::max()))
+            {
+                return std::numeric_limits<T>::max();
+            }
+            else if ((lhs < 0) && (rhs > lhs - std::numeric_limits<T>::lowest()))
+            {
+                return std::numeric_limits<T>::lowest();
+            }
+            else
+            {
+                return lhs - rhs;
+            }
+        }
+        else
+        {
+            // Unsigned: the difference saturates at lowest(), i.e. zero.
+            if (lhs < rhs)
+            {
+                return std::numeric_limits<T>::lowest();
+            }
+            else
+            {
+                return lhs - rhs;
+            }
+        }
+    }
+
 }
 
 #endif
@@ -404,7 +404,7 @@ namespace xsimd
             {
                 return sub(lhs, rhs); //do something for inf ?
             }
-            
+
             static batch_type mul(const batch_type& lhs, const batch_type& rhs)
             {
                 return _mm512_mul_ps(lhs, rhs);
 
@@ -178,36 +178,10 @@ namespace xsimd
 
             static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
             {
-                /* origin: /nsimd/include/nsimd/x86/avx512_knl/adds.h */
-                /*
-                * ====================================================
-                * Copyright (c) 2019 Agenium Scale
-                *
-                * MIT License see https://github.com/agenium-scale/nsimd/blob/master/LICENSE
-                * ====================================================
-                */
-                //todo bench againt unrroled loop
-                //todo factorize int32_t uint32_t
-                using ubatch_type = batch<uint32_t, 16>;
-                ubatch_type ux = (ubatch_type)(lhs);
-                const ubatch_type uy = (ubatch_type)(rhs);
-                const ubatch_type res = _mm512_add_epi32(ux, uy);
-
-                const ubatch_type vmax = _mm512_set1_epi32(std::numeric_limits<int32_t>::max());
-                const ubatch_type shr = _mm512_srl_epi32(ux, _mm_set1_epi32(sizeof(int32_t) * std::numeric_limits<unsigned char>::digits));
-                ux = _mm512_add_epi32(shr, vmax);
-
-                const ubatch_type xor_ux_uy = _mm512_xor_si512(ux, uy);
-                const ubatch_type xor_uy_res = _mm512_xor_si512(uy, res);
-                const ubatch_type not_xor_uy_res = _mm512_andnot_si512(xor_uy_res, _mm512_set1_epi8(-1));
-
-                const ubatch_type u_orb = _mm512_or_si512(xor_ux_uy, not_xor_uy_res);
-                const batch_type i_orb = (batch_type)u_orb;
-
-                const batch_type zeros = _mm512_set1_epi32(0);
-                __mmask16 gteq_to_zero = _mm512_cmp_epi32_mask(zeros, i_orb, _MM_CMPINT_NLT);
-
-                return _mm512_mask_blend_epi32(gteq_to_zero, ux, res);
+                // Saturated addition: lhs is clamped so that rhs + lhs stays within
+                // [min(), max()] of value_type, then the addition is performed.
+                // mask takes the per-lane sign bit of rhs (set where rhs is negative).
+                // NOTE(review): _mm512_movepi32_mask is an AVX512DQ intrinsic - confirm
+                // this kernel is only compiled when AVX512DQ is available.
+                batch_bool_type mask = _mm512_movepi32_mask(rhs);
+                // rhs >= 0: only the upper bound can be crossed, so cap lhs at max() - rhs.
+                batch_type lhs_pos_branch = min(std::numeric_limits<value_type>::max() - rhs, lhs);
+                // rhs < 0: only the lower bound can be crossed, so raise lhs to at least min() - rhs.
+                batch_type lhs_neg_branch = max(std::numeric_limits<value_type>::min() - rhs, lhs);
+                // presumably select(cond, a, b) yields a in the lanes where cond is set - TODO confirm.
+                return rhs + select(mask, lhs_neg_branch, lhs_pos_branch);
             }
 
             static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
@@ -392,39 +366,15 @@ namespace xsimd
 
             static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
             {
-                /* origin: /nsimd/include/nsimd/x86/avx512_skylake/adds.h */
-                /*
-                * ====================================================
-                * Copyright (c) 2019 Agenium Scale
-                *
-                * MIT License see https://github.com/agenium-scale/nsimd/blob/master/LICENSE
-                * ====================================================
-                */
-                //todo bench againt unrroled loop
-                //todo factorize int32_t uint32_t
-                const auto ures = _mm512_add_epi32(lhs, rhs);
-                const auto umax = _mm512_set1_epi32(std::numeric_limits<uint32_t>::max());
-                const auto is_overflow = _mm512_cmp_epu32_mask(_mm512_add_epi32(lhs, umax), _mm512_add_epi32(ures, umax), _MM_CMPINT_NLE);
-                return _mm512_mask_blend_epi32(is_overflow, ures, umax);
+                // Saturated addition: rhs is capped at the remaining headroom
+                // max() - lhs, so lhs + min(headroom, rhs) never exceeds max().
+                // min() is taken to be the lane-wise minimum.
+                // NOTE(review): relies on max() - lhs not wrapping, i.e. on value_type
+                // being unsigned (the removed code used uint32_t limits) - confirm.
+                const auto diffmax = batch_type(std::numeric_limits<value_type>::max()) - lhs;
+                const auto mindiff = min(diffmax, rhs);
+                return lhs + mindiff;
             }
 
             static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
             {
-                /* origin: /nsimd/include/nsimd/x86/avx512_skylake/subs.h */
-                /*
-                * ====================================================
-                * Copyright (c) 2019 Agenium Scale
-                *
-                * MIT License see https://github.com/agenium-scale/nsimd/blob/master/LICENSE
-                * ====================================================
-                */
-                //todo bench againt unrroled loop
-                //todo factorize int32_t uint32_t
-                const auto ures = _mm512_sub_epi32(lhs, rhs);
-                const auto cte = _mm512_set1_epi32(std::numeric_limits<uint32_t>::max());
-                const auto is_underflow = _mm512_cmp_epu32_mask(_mm512_add_epi32(rhs, cte), _mm512_add_epi32(lhs, cte), _MM_CMPINT_NLE);
-                const auto umin = _mm512_set1_epi32(std::numeric_limits<uint32_t>::lowest());
-                return _mm512_mask_blend_epi32(is_underflow, ures, umin);
+                // Saturated subtraction: at most lhs itself is subtracted, so the
+                // result bottoms out at zero instead of wrapping around.
+                // min() is taken to be the lane-wise minimum.
+                // NOTE(review): correct for an unsigned value_type (the removed code
+                // used uint32_t limits) - confirm against the enclosing kernel.
+                const auto diff = min(lhs, rhs);
+                return lhs - diff;
             }
         };
     }
Original file line number	Diff line number	Diff line change
`@@ -404,7 +404,7 @@ namespace xsimd`
`404`	`404`	`{`
`405`	`405`	`return sub(lhs, rhs); //do something for inf ?`
`406`	`406`	`}`
`407`		`-`
	`407`	`+`
`408`	`408`	`static batch_type mul(const batch_type& lhs, const batch_type& rhs)`
`409`	`409`	`{`
`410`	`410`	`return _mm512_mul_ps(lhs, rhs);`