Skip to content

Commit 9c7c6de

Browse files
New API: widen to widen a batch to a batch twice as big
Intel + common implementation + test + doc Fix #1179
1 parent f3ad5f1 commit 9c7c6de

File tree

12 files changed

+315
-1
lines changed

12 files changed

+315
-1
lines changed

docs/source/api/cast_index.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ Conversion:
4747
+---------------------------------------+----------------------------------------------------+
4848
| :cpp:func:`to_int` | per slot conversion to integer |
4949
+---------------------------------------+----------------------------------------------------+
50+
| :cpp:func:`widen` | per slot conversion to twice as big type |
51+
+---------------------------------------+----------------------------------------------------+
5052

5153
----
5254

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/***************************************************************************
2+
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
3+
* Martin Renou *
4+
* Copyright (c) QuantStack *
5+
* Copyright (c) Serge Guelton *
6+
* *
7+
* Distributed under the terms of the BSD 3-Clause License. *
8+
* *
9+
* The full license is in the file LICENSE, distributed with this software. *
10+
****************************************************************************/
11+
12+
#ifndef XSIMD_COMMON_CAST_HPP
13+
#define XSIMD_COMMON_CAST_HPP
14+
15+
#include "../../types/xsimd_traits.hpp"
16+
17+
namespace xsimd
18+
{
19+
namespace kernel
20+
{
21+
template <class A, class T>
22+
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<common>) noexcept
23+
{
24+
alignas(A::alignment()) T buffer[batch<T, A>::size];
25+
x.store_aligned(&buffer[0]);
26+
27+
using T_out = widen_t<T>;
28+
alignas(A::alignment()) T_out out_buffer[batch<T, A>::size];
29+
for (size_t i = 0; i < batch<T, A>::size; ++i)
30+
out_buffer[i] = static_cast<T_out>(buffer[i]);
31+
32+
return { batch<T_out, A>::load_aligned(&out_buffer[0]),
33+
batch<T_out, A>::load_aligned(&out_buffer[batch<T_out, A>::size]) };
34+
}
35+
36+
}
37+
38+
}
39+
40+
#endif

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1918,6 +1918,23 @@ namespace xsimd
19181918
return {};
19191919
}
19201920
}
1921+
1922+
// widen
1923+
template <class A, class T>
1924+
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx>) noexcept
1925+
{
1926+
auto pair_lo = widen(batch<T, sse4_2>(detail::lower_half(x)), sse4_2 {});
1927+
auto pair_hi = widen(batch<T, sse4_2>(detail::upper_half(x)), sse4_2 {});
1928+
return { detail::merge_sse(pair_lo[0], pair_lo[1]), detail::merge_sse(pair_hi[0], pair_hi[1]) };
1929+
}
1930+
template <class A>
1931+
XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<avx>) noexcept
1932+
{
1933+
__m256d lo = _mm256_cvtps_pd(detail::lower_half(x));
1934+
__m256d hi = _mm256_cvtps_pd(detail::upper_half(x));
1935+
return { lo, hi };
1936+
}
1937+
19211938
}
19221939
}
19231940

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1248,6 +1248,56 @@ namespace xsimd
12481248
return {};
12491249
}
12501250
}
1251+
1252+
// widen
1253+
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value>::type>
1254+
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx2>) noexcept
1255+
{
1256+
__m128i x_lo = detail::lower_half(x);
1257+
__m128i x_hi = detail::upper_half(x);
1258+
__m256i lo, hi;
1259+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1260+
{
1261+
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
1262+
{
1263+
lo = _mm256_cvtepi32_epi64(x_lo);
1264+
hi = _mm256_cvtepi32_epi64(x_hi);
1265+
}
1266+
else
1267+
{
1268+
lo = _mm256_cvtepu32_epi64(x_lo);
1269+
hi = _mm256_cvtepu32_epi64(x_hi);
1270+
}
1271+
}
1272+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1273+
{
1274+
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
1275+
{
1276+
lo = _mm256_cvtepi16_epi32(x_lo);
1277+
hi = _mm256_cvtepi16_epi32(x_hi);
1278+
}
1279+
else
1280+
{
1281+
lo = _mm256_cvtepu16_epi32(x_lo);
1282+
hi = _mm256_cvtepu16_epi32(x_hi);
1283+
}
1284+
}
1285+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1286+
{
1287+
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
1288+
{
1289+
lo = _mm256_cvtepi8_epi16(x_lo);
1290+
hi = _mm256_cvtepi8_epi16(x_hi);
1291+
}
1292+
else
1293+
{
1294+
lo = _mm256_cvtepu8_epi16(x_lo);
1295+
hi = _mm256_cvtepu8_epi16(x_hi);
1296+
}
1297+
}
1298+
return { lo, hi };
1299+
}
1300+
12511301
}
12521302
}
12531303

include/xsimd/arch/xsimd_avx512bw.hpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -643,6 +643,26 @@ namespace xsimd
643643
return swizzle(self, mask.as_batch(), avx512bw {});
644644
}
645645

646+
// widen
647+
template <class A>
648+
XSIMD_INLINE std::array<batch<widen_t<uint8_t>, A>, 2> widen(batch<uint8_t, A> const& x, requires_arch<avx512bw>) noexcept
649+
{
650+
__m256i x_lo = _mm512_extracti64x4_epi64(x, 0);
651+
__m256i x_hi = _mm512_extracti64x4_epi64(x, 1);
652+
__m512i lo = _mm512_cvtepu8_epi16(x_lo);
653+
__m512i hi = _mm512_cvtepu8_epi16(x_hi);
654+
return { lo, hi };
655+
}
656+
template <class A>
657+
XSIMD_INLINE std::array<batch<widen_t<int8_t>, A>, 2> widen(batch<int8_t, A> const& x, requires_arch<avx512bw>) noexcept
658+
{
659+
__m256i x_lo = _mm512_extracti64x4_epi64(x, 0);
660+
__m256i x_hi = _mm512_extracti64x4_epi64(x, 1);
661+
__m512i lo = _mm512_cvtepi8_epi16(x_lo);
662+
__m512i hi = _mm512_cvtepi8_epi16(x_hi);
663+
return { lo, hi };
664+
}
665+
646666
// zip_hi
647667
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value>::type>
648668
XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2642,6 +2642,55 @@ namespace xsimd
26422642
}
26432643
}
26442644

2645+
// widen
2646+
template <class A, class T>
2647+
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx512f>) noexcept
2648+
{
2649+
__m256i x_lo = detail::lower_half(x);
2650+
__m256i x_hi = detail::upper_half(x);
2651+
__m512i lo, hi;
2652+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
2653+
{
2654+
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
2655+
{
2656+
lo = _mm512_cvtepi32_epi64(x_lo);
2657+
hi = _mm512_cvtepi32_epi64(x_hi);
2658+
}
2659+
else
2660+
{
2661+
lo = _mm512_cvtepu32_epi64(x_lo);
2662+
hi = _mm512_cvtepu32_epi64(x_hi);
2663+
}
2664+
}
2665+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
2666+
{
2667+
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
2668+
{
2669+
lo = _mm512_cvtepi16_epi32(x_lo);
2670+
hi = _mm512_cvtepi16_epi32(x_hi);
2671+
}
2672+
else
2673+
{
2674+
lo = _mm512_cvtepu16_epi32(x_lo);
2675+
hi = _mm512_cvtepu16_epi32(x_hi);
2676+
}
2677+
}
2678+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
2679+
{
2680+
auto pair_lo = widen(batch<T, avx2>(x_lo), avx2 {});
2681+
auto pair_hi = widen(batch<T, avx2>(x_hi), avx2 {});
2682+
return { detail::merge_avx(pair_lo[0], pair_lo[1]), detail::merge_avx(pair_hi[0], pair_hi[1]) };
2683+
}
2684+
return { lo, hi };
2685+
}
2686+
template <class A>
2687+
XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<avx512f>) noexcept
2688+
{
2689+
__m512d lo = _mm512_cvtps_pd(detail::lower_half(x));
2690+
__m512d hi = _mm512_cvtps_pd(detail::upper_half(x));
2691+
return { lo, hi };
2692+
}
2693+
26452694
}
26462695
}
26472696

include/xsimd/arch/xsimd_common.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#define XSIMD_COMMON_HPP
1414

1515
#include "./common/xsimd_common_arithmetic.hpp"
16+
#include "./common/xsimd_common_cast.hpp"
1617
#include "./common/xsimd_common_complex.hpp"
1718
#include "./common/xsimd_common_logical.hpp"
1819
#include "./common/xsimd_common_math.hpp"

include/xsimd/arch/xsimd_scalar.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1235,7 +1235,6 @@ namespace xsimd
12351235
{
12361236
return cond ? true_br : false_br;
12371237
}
1238-
12391238
}
12401239

12411240
#endif

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <type_traits>
1616

1717
#include "../types/xsimd_sse4_1_register.hpp"
18+
#include "./common/xsimd_common_cast.hpp"
1819

1920
namespace xsimd
2021
{
@@ -382,6 +383,63 @@ namespace xsimd
382383
return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
383384
}
384385

386+
// widen
387+
template <class A, class T>
388+
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<sse4_1>) noexcept
389+
{
390+
__m128i x_lo = x;
391+
__m128i x_hi = _mm_unpackhi_epi64(x, x);
392+
__m128i lo, hi;
393+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
394+
{
395+
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
396+
{
397+
lo = _mm_cvtepi32_epi64(x_lo);
398+
hi = _mm_cvtepi32_epi64(x_hi);
399+
}
400+
else
401+
{
402+
lo = _mm_cvtepu32_epi64(x_lo);
403+
hi = _mm_cvtepu32_epi64(x_hi);
404+
}
405+
}
406+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
407+
{
408+
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
409+
{
410+
lo = _mm_cvtepi16_epi32(x_lo);
411+
hi = _mm_cvtepi16_epi32(x_hi);
412+
}
413+
else
414+
{
415+
lo = _mm_cvtepu16_epi32(x_lo);
416+
hi = _mm_cvtepu16_epi32(x_hi);
417+
}
418+
}
419+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
420+
{
421+
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
422+
{
423+
lo = _mm_cvtepi8_epi16(x_lo);
424+
hi = _mm_cvtepi8_epi16(x_hi);
425+
}
426+
else
427+
{
428+
lo = _mm_cvtepu8_epi16(x_lo);
429+
hi = _mm_cvtepu8_epi16(x_hi);
430+
}
431+
}
432+
return { lo, hi };
433+
}
434+
template <class A>
435+
XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<sse4_1>) noexcept
436+
{
437+
__m128 x_shuf = _mm_unpackhi_ps(x, x);
438+
__m128d lo = _mm_cvtps_pd(x);
439+
__m128d hi = _mm_cvtps_pd(x_shuf);
440+
return { lo, hi };
441+
}
442+
385443
}
386444

387445
}

include/xsimd/types/xsimd_api.hpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2719,6 +2719,21 @@ namespace xsimd
27192719
return !xsimd::any(x);
27202720
}
27212721

2722+
/**
2723+
* @ingroup batch_conversion
2724+
*
2725+
* Widen batch \c x from type \c T to a type with twice as many bytes and
2726+
* the same sign (for integers) or from float to double.
2727+
* @param x batch of \c T
2728+
* @return two batches of \c widen_t<T>
2729+
*/
2730+
template <class T, class A>
2731+
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x) noexcept
2732+
{
2733+
detail::static_check_supported_config<T, A>();
2734+
return kernel::widen<A>(x, A {});
2735+
}
2736+
27222737
/**
27232738
* @ingroup batch_miscellaneous
27242739
*

0 commit comments

Comments
 (0)