Subject: [PATCH 2/2] New API: widen — convert a batch to a batch twice as big
b/docs/source/api/cast_index.rst @@ -47,6 +47,8 @@ Conversion: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`to_int` | per slot conversion to integer | +---------------------------------------+----------------------------------------------------+ +| :cpp:func:`widen` | per slot conversion to twice as big type | ++---------------------------------------+----------------------------------------------------+ ---- diff --git a/include/xsimd/arch/common/xsimd_common_cast.hpp b/include/xsimd/arch/common/xsimd_common_cast.hpp new file mode 100644 index 000000000..1226c887c --- /dev/null +++ b/include/xsimd/arch/common/xsimd_common_cast.hpp @@ -0,0 +1,40 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_COMMON_CAST_HPP +#define XSIMD_COMMON_CAST_HPP + +#include "../../types/xsimd_traits.hpp" + +namespace xsimd +{ + namespace kernel + { + template + XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept + { + alignas(A::alignment()) T buffer[batch::size]; + x.store_aligned(&buffer[0]); + + using T_out = widen_t; + alignas(A::alignment()) T_out out_buffer[batch::size]; + for (size_t i = 0; i < batch::size; ++i) + out_buffer[i] = static_cast(buffer[i]); + + return { batch::load_aligned(&out_buffer[0]), + batch::load_aligned(&out_buffer[batch::size]) }; + } + + } + +} + +#endif diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index b57d5c187..c7dc9de5e 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -1918,6 +1918,23 @@ namespace xsimd return {}; } } + + // widen + template + XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept + { + auto pair_lo = widen(batch(detail::lower_half(x)), sse4_2 {}); + auto pair_hi = widen(batch(detail::upper_half(x)), sse4_2 {}); + return { detail::merge_sse(pair_lo[0], pair_lo[1]), detail::merge_sse(pair_hi[0], pair_hi[1]) }; + } + template + XSIMD_INLINE std::array, 2> widen(batch const& x, requires_arch) noexcept + { + __m256d lo = _mm256_cvtps_pd(detail::lower_half(x)); + __m256d hi = _mm256_cvtps_pd(detail::upper_half(x)); + return { lo, hi }; + } + } } diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index aa974f9ab..87e5dafae 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -1248,6 +1248,56 @@ namespace xsimd return {}; } } + + // widen + template ::value>::type> + XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept + { + __m128i x_lo = detail::lower_half(x); + __m128i x_hi = detail::upper_half(x); + __m256i lo, hi; + 
XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + lo = _mm256_cvtepi32_epi64(x_lo); + hi = _mm256_cvtepi32_epi64(x_hi); + } + else + { + lo = _mm256_cvtepu32_epi64(x_lo); + hi = _mm256_cvtepu32_epi64(x_hi); + } + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + lo = _mm256_cvtepi16_epi32(x_lo); + hi = _mm256_cvtepi16_epi32(x_hi); + } + else + { + lo = _mm256_cvtepu16_epi32(x_lo); + hi = _mm256_cvtepu16_epi32(x_hi); + } + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + lo = _mm256_cvtepi8_epi16(x_lo); + hi = _mm256_cvtepi8_epi16(x_hi); + } + else + { + lo = _mm256_cvtepu8_epi16(x_lo); + hi = _mm256_cvtepu8_epi16(x_hi); + } + } + return { lo, hi }; + } + } } diff --git a/include/xsimd/arch/xsimd_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512bw.hpp index 27c537dcb..767c5960b 100644 --- a/include/xsimd/arch/xsimd_avx512bw.hpp +++ b/include/xsimd/arch/xsimd_avx512bw.hpp @@ -643,6 +643,26 @@ namespace xsimd return swizzle(self, mask.as_batch(), avx512bw {}); } + // widen + template + XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept + { + __m256i x_lo = _mm512_extracti64x4_epi64(x, 0); + __m256i x_hi = _mm512_extracti64x4_epi64(x, 1); + __m512i lo = _mm512_cvtepu8_epi16(x_lo); + __m512i hi = _mm512_cvtepu8_epi16(x_hi); + return { lo, hi }; + } + template + XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept + { + __m256i x_lo = _mm512_extracti64x4_epi64(x, 0); + __m256i x_hi = _mm512_extracti64x4_epi64(x, 1); + __m512i lo = _mm512_cvtepi8_epi16(x_lo); + __m512i hi = _mm512_cvtepi8_epi16(x_hi); + return { lo, hi }; + } + // zip_hi template ::value>::type> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 05f2ff0a5..329c8b78c 100644 --- 
a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -2642,6 +2642,55 @@ namespace xsimd } } + // widen + template + XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept + { + __m256i x_lo = detail::lower_half(x); + __m256i x_hi = detail::upper_half(x); + __m512i lo, hi; + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + lo = _mm512_cvtepi32_epi64(x_lo); + hi = _mm512_cvtepi32_epi64(x_hi); + } + else + { + lo = _mm512_cvtepu32_epi64(x_lo); + hi = _mm512_cvtepu32_epi64(x_hi); + } + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + lo = _mm512_cvtepi16_epi32(x_lo); + hi = _mm512_cvtepi16_epi32(x_hi); + } + else + { + lo = _mm512_cvtepu16_epi32(x_lo); + hi = _mm512_cvtepu16_epi32(x_hi); + } + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto pair_lo = widen(batch(x_lo), avx2 {}); + auto pair_hi = widen(batch(x_hi), avx2 {}); + return { detail::merge_avx(pair_lo[0], pair_lo[1]), detail::merge_avx(pair_hi[0], pair_hi[1]) }; + } + return { lo, hi }; + } + template + XSIMD_INLINE std::array, 2> widen(batch const& x, requires_arch) noexcept + { + __m512d lo = _mm512_cvtps_pd(detail::lower_half(x)); + __m512d hi = _mm512_cvtps_pd(detail::upper_half(x)); + return { lo, hi }; + } + } } diff --git a/include/xsimd/arch/xsimd_common.hpp b/include/xsimd/arch/xsimd_common.hpp index ccd4b47c6..11f21bd82 100644 --- a/include/xsimd/arch/xsimd_common.hpp +++ b/include/xsimd/arch/xsimd_common.hpp @@ -13,6 +13,7 @@ #define XSIMD_COMMON_HPP #include "./common/xsimd_common_arithmetic.hpp" +#include "./common/xsimd_common_cast.hpp" #include "./common/xsimd_common_complex.hpp" #include "./common/xsimd_common_logical.hpp" #include "./common/xsimd_common_math.hpp" diff --git a/include/xsimd/arch/xsimd_scalar.hpp b/include/xsimd/arch/xsimd_scalar.hpp index 762eac684..9d24ed01f 100644 --- a/include/xsimd/arch/xsimd_scalar.hpp +++ 
b/include/xsimd/arch/xsimd_scalar.hpp @@ -1235,7 +1235,6 @@ namespace xsimd { return cond ? true_br : false_br; } - } #endif diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp index 92330ef3c..130d50476 100644 --- a/include/xsimd/arch/xsimd_sse4_1.hpp +++ b/include/xsimd/arch/xsimd_sse4_1.hpp @@ -15,6 +15,7 @@ #include #include "../types/xsimd_sse4_1_register.hpp" +#include "./common/xsimd_common_cast.hpp" namespace xsimd { @@ -382,6 +383,63 @@ namespace xsimd return _mm_round_pd(self, _MM_FROUND_TO_ZERO); } + // widen + template + XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept + { + __m128i x_lo = x; + __m128i x_hi = _mm_unpackhi_epi64(x, x); + __m128i lo, hi; + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + lo = _mm_cvtepi32_epi64(x_lo); + hi = _mm_cvtepi32_epi64(x_hi); + } + else + { + lo = _mm_cvtepu32_epi64(x_lo); + hi = _mm_cvtepu32_epi64(x_hi); + } + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + lo = _mm_cvtepi16_epi32(x_lo); + hi = _mm_cvtepi16_epi32(x_hi); + } + else + { + lo = _mm_cvtepu16_epi32(x_lo); + hi = _mm_cvtepu16_epi32(x_hi); + } + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + lo = _mm_cvtepi8_epi16(x_lo); + hi = _mm_cvtepi8_epi16(x_hi); + } + else + { + lo = _mm_cvtepu8_epi16(x_lo); + hi = _mm_cvtepu8_epi16(x_hi); + } + } + return { lo, hi }; + } + template + XSIMD_INLINE std::array, 2> widen(batch const& x, requires_arch) noexcept + { + __m128 x_shuf = _mm_unpackhi_ps(x, x); + __m128d lo = _mm_cvtps_pd(x); + __m128d hi = _mm_cvtps_pd(x_shuf); + return { lo, hi }; + } + } } diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index efa50c28e..12bd9d95e 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -2719,6 +2719,21 @@ namespace xsimd return !xsimd::any(x); } + /** 
+ * @ingroup batch_conversion + * + * Widen batch \c x from type \c T to a type with twice as many bytes and + * the same sign (for integers) or from float to double. + * @param x batch of \c T + * @return two batches of \c widen_t + */ + template + XSIMD_INLINE std::array, A>, 2> widen(batch const& x) noexcept + { + detail::static_check_supported_config(); + return kernel::widen(x, A {}); + } + /** * @ingroup batch_miscellaneous * diff --git a/include/xsimd/types/xsimd_traits.hpp b/include/xsimd/types/xsimd_traits.hpp index 1abcbdbe6..1e1e6bc77 100644 --- a/include/xsimd/types/xsimd_traits.hpp +++ b/include/xsimd/types/xsimd_traits.hpp @@ -332,6 +332,38 @@ namespace xsimd template using mask_type_t = typename mask_type::type; + + namespace detail + { + template + struct widen : widen::type> + { + }; + + template <> + struct widen + { + using type = uint64_t; + }; + template <> + struct widen + { + using type = uint32_t; + }; + template <> + struct widen + { + using type = uint16_t; + }; + template <> + struct widen + { + using type = double; + }; + } + template + using widen_t = typename detail::widen::type; + } #endif diff --git a/test/test_conversion.cpp b/test/test_conversion.cpp index 153920ac0..0861cd7ea 100644 --- a/test/test_conversion.cpp +++ b/test/test_conversion.cpp @@ -244,5 +244,36 @@ TEST_CASE_TEMPLATE("[conversion]", T, uint8_t, uint16_t, uint32_t, uint64_t) } } +template +struct widening_test +{ + + void test_widen(T value) + { + xsimd::batch bvalue(value); + xsimd::batch> wvalue(value); + + auto widened_batch = xsimd::widen(bvalue); + CHECK_BATCH_EQ(widened_batch[0], wvalue); + CHECK_BATCH_EQ(widened_batch[1], wvalue); + } +}; + +TEST_CASE_TEMPLATE("[widening]", T, int8_t, int16_t, int32_t, uint8_t, uint16_t, uint32_t, float) +{ + widening_test Test; + + SUBCASE("widen") + { + Test.test_widen(1); + } + + SUBCASE("limits") + { + Test.test_widen(std::numeric_limits::max()); + Test.test_widen(std::numeric_limits::min()); + } +} + #endif #endif