Merge pull request #513 from yumeyao/master

JohanMabille · web-flow · commit fa55bf1438b0 · 2021-07-29T22:41:10.000+02:00
optimize logarithm exponent int-to-float conversion
diff --git a/include/xsimd/math/xsimd_fp_manipulation.hpp b/include/xsimd/math/xsimd_fp_manipulation.hpp
@@ -22,6 +22,9 @@ namespace xsimd
     template <class T, std::size_t N>
     batch<T, N> frexp(const batch<T, N>& arg, batch<as_integer_t<T>, N>& exp);
 
+    template <typename T>
+    as_float_t<T> fexp_to_float(const T& x);
+
     /********************************************************
      * Floating point manipulation functions implementation *
      ********************************************************/
@@ -66,6 +69,27 @@ namespace xsimd
         exp = select(bool_cast(arg != b_type(0.)), exp, zero<i_type>());
         return select((arg != b_type(0.)), x | bitwise_cast<b_type>(mask2frexp<b_type>()), b_type(0.));
     }
+
+    template <typename T>
+    inline as_float_t<T> fexp_to_float(const T& x)
+    {
+        return to_float(x);
+    }
+
+    // fexps are 8/11-bit numbers. so we can down-cast to int16_t/int32_t then do the job
+#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION && XSIMD_X86_INSTR_SET < XSIMD_X86_AVX512_VERSION
+    inline batch<double, 4> fexp_to_float(const batch<int64_t, 4>& x)
+    {
+        return _mm256_cvtepi32_pd(detail::xsimd_cvtepi64_epi32(x));
+    }
+#endif
+
+#if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION && XSIMD_X86_INSTR_SET < XSIMD_X86_AVX512_VERSION
+    inline batch<double, 2> fexp_to_float(const batch<int64_t, 2>& x)
+    {
+        return _mm_cvtepi32_pd(_mm_shuffle_epi32(x, _MM_SHUFFLE(0, 0, 2, 0)));
+    }
+#endif
 }
 
 #endif
diff --git a/include/xsimd/math/xsimd_logarithm.hpp b/include/xsimd/math/xsimd_logarithm.hpp
@@ -95,7 +95,7 @@ namespace xsimd
                 B t2 = z * horner<B, 0x3f2aaaaa, 0x3e91e9ee>(w);
                 B R = t2 + t1;
                 B hfsq = B(0.5) * f * f;
-                B dk = to_float(k);
+                B dk = fexp_to_float(k);
                 B r = fma(dk, log_2hi<B>(), fma(s, (hfsq + R), dk * log_2lo<B>()) - hfsq + f);
 #ifndef XSIMD_NO_INFINITIES
                 B zz = select(isnez, select(a == infinity<B>(), infinity<B>(), r), minusinfinity<B>());
@@ -136,7 +136,7 @@ namespace xsimd
 #endif
                 hx += 0x3ff00000 - 0x3fe6a09e;
                 k += (hx >> 20) - 0x3ff;
-                B dk = to_float(k);
+                B dk = fexp_to_float(k);
                 hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
                 x = bitwise_cast<B>(hx << 32 | (i_type(0xffffffff) & bitwise_cast<i_type>(x)));
 
@@ -223,7 +223,7 @@ namespace xsimd
                 B t2 = z * horner<B, 0x3f2aaaaa, 0x3e91e9ee>(w);
                 B R = t1 + t2;
                 B hfsq = B(0.5) * f * f;
-                B dk = to_float(k);
+                B dk = fexp_to_float(k);
                 B r = fma(fms(s, hfsq + R, hfsq) + f, invlog_2<B>(), dk);
 #ifndef XSIMD_NO_INFINITIES
                 B zz = select(isnez, select(a == infinity<B>(), infinity<B>(), r), minusinfinity<B>());
@@ -287,7 +287,7 @@ namespace xsimd
                 B lo = fma(s, hfsq + R, f - hi - hfsq);
                 B val_hi = hi * invlog_2hi<B>();
                 B val_lo = fma(lo + hi, invlog_2lo<B>(), lo * invlog_2hi<B>());
-                B dk = to_float(k);
+                B dk = fexp_to_float(k);
                 B w1 = dk + val_hi;
                 val_lo += (dk - w1) + val_hi;
                 val_hi = w1;
@@ -362,7 +362,7 @@ namespace xsimd
                 B t1 = w * horner<B, 0x3eccce13, 0x3e789e26>(w);
                 B t2 = z * horner<B, 0x3f2aaaaa, 0x3e91e9ee>(w);
                 B R = t2 + t1;
-                B dk = to_float(k);
+                B dk = fexp_to_float(k);
                 B hfsq = B(0.5) * f * f;
                 B hibits = f - hfsq;
                 hibits &= bitwise_cast<B>(i_type(0xfffff000));
@@ -419,7 +419,7 @@ namespace xsimd
                 hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
                 x = bitwise_cast<B>(hx << 32 | (i_type(0xffffffff) & bitwise_cast<i_type>(x)));
                 B f = --x;
-                B dk = to_float(k);
+                B dk = fexp_to_float(k);
                 B s = f / (B(2.) + f);
                 B z = s * s;
                 B w = z * z;
@@ -498,7 +498,7 @@ namespace xsimd
                 B t2 = z * horner<B, 0x3f2aaaaa, 0x3e91e9ee>(w);
                 B R = t2 + t1;
                 B hfsq = B(0.5) * f * f;
-                B dk = to_float(k);
+                B dk = fexp_to_float(k);
                 /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */
                 B c = select(bool_cast(k >= i_type(2)), B(1.) - (uf - a), a - (uf - B(1.))) / uf;
                 B r = fma(dk, log_2hi<B>(), fma(s, (hfsq + R), dk * log_2lo<B>() + c) - hfsq + f);
@@ -550,7 +550,7 @@ namespace xsimd
                                   0x3fc7466496cb03dell,
                                   0x3fc2f112df3e5244ll>(w);
                 B R = t2 + t1;
-                B dk = to_float(k);
+                B dk = fexp_to_float(k);
                 B r = fma(dk, log_2hi<B>(), fma(s, hfsq + R, dk * log_2lo<B>() + c) - hfsq + f);
 #ifndef XSIMD_NO_INFINITIES
                 B zz = select(isnez, select(a == infinity<B>(), infinity<B>(), r), minusinfinity<B>());
diff --git a/include/xsimd/types/xsimd_int_conversion.hpp b/include/xsimd/types/xsimd_int_conversion.hpp
@@ -37,6 +37,8 @@ namespace xsimd
         __m128i xsimd_cvtepi32_epi16(__m256i a);
         __m128i xsimd_cvtepi32_epu16(__m256i a);
 
+        __m128i xsimd_cvtepi64_epi32(__m256i a);
+
         /******************
          * Implementation *
          ******************/
@@ -165,6 +167,13 @@ namespace xsimd
 #endif
             return res;
         }
+
+        inline __m128i xsimd_cvtepi64_epi32(__m256i a)
+        {
+            __m128 hi = _mm_castsi128_ps(_mm256_extractf128_si256(a, 1));
+            __m128 lo = _mm_castsi128_ps(_mm256_castsi256_si128(a));
+            return _mm_castps_si128(_mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2, 0, 2, 0)));
+        }
     }
 }