Skip to content

Commit fa55bf1

Browse files
authored
Merge pull request #513 from yumeyao/master
optimize logarithm exponent int-to-float conversion
2 parents 6288fb2 + 844c60c commit fa55bf1

File tree

3 files changed

+41
-8
lines changed

3 files changed

+41
-8
lines changed

include/xsimd/math/xsimd_fp_manipulation.hpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ namespace xsimd
2222
template <class T, std::size_t N>
2323
batch<T, N> frexp(const batch<T, N>& arg, batch<as_integer_t<T>, N>& exp);
2424

25+
template <typename T>
26+
as_float_t<T> fexp_to_float(const T& x);
27+
2528
/********************************************************
2629
* Floating point manipulation functions implementation *
2730
********************************************************/
@@ -66,6 +69,27 @@ namespace xsimd
6669
exp = select(bool_cast(arg != b_type(0.)), exp, zero<i_type>());
6770
return select((arg != b_type(0.)), x | bitwise_cast<b_type>(mask2frexp<b_type>()), b_type(0.));
6871
}
72+
73+
template <typename T>
74+
inline as_float_t<T> fexp_to_float(const T& x)
75+
{
76+
return to_float(x);
77+
}
78+
79+
// fexps are 8/11-bit numbers. so we can down-cast to int16_t/int32_t then do the job
80+
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION && XSIMD_X86_INSTR_SET < XSIMD_X86_AVX512_VERSION
81+
inline batch<double, 4> fexp_to_float(const batch<int64_t, 4>& x)
82+
{
83+
return _mm256_cvtepi32_pd(detail::xsimd_cvtepi64_epi32(x));
84+
}
85+
#endif
86+
87+
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION && XSIMD_X86_INSTR_SET < XSIMD_X86_AVX512_VERSION
88+
inline batch<double, 2> fexp_to_float(const batch<int64_t, 2>& x)
89+
{
90+
return _mm_cvtepi32_pd(_mm_shuffle_epi32(x, _MM_SHUFFLE(0, 0, 2, 0)));
91+
}
92+
#endif
6993
}
7094

7195
#endif

include/xsimd/math/xsimd_logarithm.hpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ namespace xsimd
9595
B t2 = z * horner<B, 0x3f2aaaaa, 0x3e91e9ee>(w);
9696
B R = t2 + t1;
9797
B hfsq = B(0.5) * f * f;
98-
B dk = to_float(k);
98+
B dk = fexp_to_float(k);
9999
B r = fma(dk, log_2hi<B>(), fma(s, (hfsq + R), dk * log_2lo<B>()) - hfsq + f);
100100
#ifndef XSIMD_NO_INFINITIES
101101
B zz = select(isnez, select(a == infinity<B>(), infinity<B>(), r), minusinfinity<B>());
@@ -136,7 +136,7 @@ namespace xsimd
136136
#endif
137137
hx += 0x3ff00000 - 0x3fe6a09e;
138138
k += (hx >> 20) - 0x3ff;
139-
B dk = to_float(k);
139+
B dk = fexp_to_float(k);
140140
hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
141141
x = bitwise_cast<B>(hx << 32 | (i_type(0xffffffff) & bitwise_cast<i_type>(x)));
142142

@@ -223,7 +223,7 @@ namespace xsimd
223223
B t2 = z * horner<B, 0x3f2aaaaa, 0x3e91e9ee>(w);
224224
B R = t1 + t2;
225225
B hfsq = B(0.5) * f * f;
226-
B dk = to_float(k);
226+
B dk = fexp_to_float(k);
227227
B r = fma(fms(s, hfsq + R, hfsq) + f, invlog_2<B>(), dk);
228228
#ifndef XSIMD_NO_INFINITIES
229229
B zz = select(isnez, select(a == infinity<B>(), infinity<B>(), r), minusinfinity<B>());
@@ -287,7 +287,7 @@ namespace xsimd
287287
B lo = fma(s, hfsq + R, f - hi - hfsq);
288288
B val_hi = hi * invlog_2hi<B>();
289289
B val_lo = fma(lo + hi, invlog_2lo<B>(), lo * invlog_2hi<B>());
290-
B dk = to_float(k);
290+
B dk = fexp_to_float(k);
291291
B w1 = dk + val_hi;
292292
val_lo += (dk - w1) + val_hi;
293293
val_hi = w1;
@@ -362,7 +362,7 @@ namespace xsimd
362362
B t1 = w * horner<B, 0x3eccce13, 0x3e789e26>(w);
363363
B t2 = z * horner<B, 0x3f2aaaaa, 0x3e91e9ee>(w);
364364
B R = t2 + t1;
365-
B dk = to_float(k);
365+
B dk = fexp_to_float(k);
366366
B hfsq = B(0.5) * f * f;
367367
B hibits = f - hfsq;
368368
hibits &= bitwise_cast<B>(i_type(0xfffff000));
@@ -419,7 +419,7 @@ namespace xsimd
419419
hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
420420
x = bitwise_cast<B>(hx << 32 | (i_type(0xffffffff) & bitwise_cast<i_type>(x)));
421421
B f = --x;
422-
B dk = to_float(k);
422+
B dk = fexp_to_float(k);
423423
B s = f / (B(2.) + f);
424424
B z = s * s;
425425
B w = z * z;
@@ -498,7 +498,7 @@ namespace xsimd
498498
B t2 = z * horner<B, 0x3f2aaaaa, 0x3e91e9ee>(w);
499499
B R = t2 + t1;
500500
B hfsq = B(0.5) * f * f;
501-
B dk = to_float(k);
501+
B dk = fexp_to_float(k);
502502
/* correction term ~ log(1+x)-log(u), avoid underflow in c/u */
503503
B c = select(bool_cast(k >= i_type(2)), B(1.) - (uf - a), a - (uf - B(1.))) / uf;
504504
B r = fma(dk, log_2hi<B>(), fma(s, (hfsq + R), dk * log_2lo<B>() + c) - hfsq + f);
@@ -550,7 +550,7 @@ namespace xsimd
550550
0x3fc7466496cb03dell,
551551
0x3fc2f112df3e5244ll>(w);
552552
B R = t2 + t1;
553-
B dk = to_float(k);
553+
B dk = fexp_to_float(k);
554554
B r = fma(dk, log_2hi<B>(), fma(s, hfsq + R, dk * log_2lo<B>() + c) - hfsq + f);
555555
#ifndef XSIMD_NO_INFINITIES
556556
B zz = select(isnez, select(a == infinity<B>(), infinity<B>(), r), minusinfinity<B>());

include/xsimd/types/xsimd_int_conversion.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ namespace xsimd
3737
__m128i xsimd_cvtepi32_epi16(__m256i a);
3838
__m128i xsimd_cvtepi32_epu16(__m256i a);
3939

40+
__m128i xsimd_cvtepi64_epi32(__m256i a);
41+
4042
/******************
4143
* Implementation *
4244
******************/
@@ -165,6 +167,13 @@ namespace xsimd
165167
#endif
166168
return res;
167169
}
170+
171+
inline __m128i xsimd_cvtepi64_epi32(__m256i a)
172+
{
173+
__m128 hi = _mm_castsi128_ps(_mm256_extractf128_si256(a, 1));
174+
__m128 lo = _mm_castsi128_ps(_mm256_castsi256_si128(a));
175+
return _mm_castps_si128(_mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2, 0, 2, 0)));
176+
}
168177
}
169178
}
170179

0 commit comments

Comments
 (0)