New API: widen, to widen a batch to a pair of batches with slots twice as big

serge-sans-paille · serge-sans-paille · commit 9c7c6dec94a5 · 2025-10-31T19:00:46.000Z
Intel + common implementation + test + doc. Fix #1179
diff --git a/docs/source/api/cast_index.rst b/docs/source/api/cast_index.rst
@@ -47,6 +47,8 @@ Conversion:
 +---------------------------------------+----------------------------------------------------+
 | :cpp:func:`to_int`                    | per slot conversion to integer                     |
 +---------------------------------------+----------------------------------------------------+
+| :cpp:func:`widen`                     | per slot conversion to a type twice as big         |
++---------------------------------------+----------------------------------------------------+
 
 ----
 
diff --git a/include/xsimd/arch/common/xsimd_common_cast.hpp b/include/xsimd/arch/common/xsimd_common_cast.hpp
@@ -0,0 +1,40 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_COMMON_CAST_HPP
+#define XSIMD_COMMON_CAST_HPP
+
+#include "../../types/xsimd_traits.hpp"
+
+namespace xsimd
+{
+    namespace kernel
+    {
+        // Generic fallback: spill the batch to memory, convert every slot with
+        // a scalar cast, then reload the converted slots as two batches.
+        template <class A, class T>
+        XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<common>) noexcept
+        {
+            using narrow_batch = batch<T, A>;
+            using wide_type = widen_t<T>;
+            using wide_batch = batch<wide_type, A>;
+
+            alignas(A::alignment()) T narrow[narrow_batch::size];
+            alignas(A::alignment()) wide_type wide[narrow_batch::size];
+
+            x.store_aligned(&narrow[0]);
+            for (size_t slot = 0; slot < narrow_batch::size; ++slot)
+            {
+                wide[slot] = static_cast<wide_type>(narrow[slot]);
+            }
+
+            // the first wide_batch::size converted slots make the first batch,
+            // the slots that follow make the second one
+            return { wide_batch::load_aligned(&wide[0]),
+                     wide_batch::load_aligned(&wide[wide_batch::size]) };
+        }
+
+    }
+
+}
+
+#endif
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
@@ -1918,6 +1918,23 @@ namespace xsimd
                 return {};
             }
         }
+
+        // widen
+        template <class A, class T>
+        XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx>) noexcept
+        {
+            // widen each half of x through the sse4_2 kernel, then combine the
+            // two results obtained for each half with detail::merge_sse
+            using half_batch = batch<T, sse4_2>;
+            const half_batch lower(detail::lower_half(x));
+            const half_batch upper(detail::upper_half(x));
+            const auto wide_lower = widen(lower, sse4_2 {});
+            const auto wide_upper = widen(upper, sse4_2 {});
+            return { detail::merge_sse(wide_lower[0], wide_lower[1]),
+                     detail::merge_sse(wide_upper[0], wide_upper[1]) };
+        }
+        template <class A>
+        XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<avx>) noexcept
+        {
+            // float to double: convert each half of x on its own
+            const __m256d widened_lower = _mm256_cvtps_pd(detail::lower_half(x));
+            const __m256d widened_upper = _mm256_cvtps_pd(detail::upper_half(x));
+            return { widened_lower, widened_upper };
+        }
+
     }
 }
 
diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
@@ -1248,6 +1248,56 @@ namespace xsimd
                 return {};
             }
         }
+
+        // widen: integers only; each 128-bit half of x is extended to a 256-bit register
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value>::type>
+        XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx2>) noexcept
+        {
+            __m128i x_lo = detail::lower_half(x);
+            __m128i x_hi = detail::upper_half(x);
+            __m256i lo, hi; // only assigned when sizeof(T) is 4, 2 or 1
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
+                {
+                    lo = _mm256_cvtepi32_epi64(x_lo); // signed: sign extension
+                    hi = _mm256_cvtepi32_epi64(x_hi);
+                }
+                else
+                {
+                    lo = _mm256_cvtepu32_epi64(x_lo); // unsigned: zero extension
+                    hi = _mm256_cvtepu32_epi64(x_hi);
+                }
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
+                {
+                    lo = _mm256_cvtepi16_epi32(x_lo); // signed: sign extension
+                    hi = _mm256_cvtepi16_epi32(x_hi);
+                }
+                else
+                {
+                    lo = _mm256_cvtepu16_epi32(x_lo); // unsigned: zero extension
+                    hi = _mm256_cvtepu16_epi32(x_hi);
+                }
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
+                {
+                    lo = _mm256_cvtepi8_epi16(x_lo); // signed: sign extension
+                    hi = _mm256_cvtepi8_epi16(x_hi);
+                }
+                else
+                {
+                    lo = _mm256_cvtepu8_epi16(x_lo); // unsigned: zero extension
+                    hi = _mm256_cvtepu8_epi16(x_hi);
+                }
+            }
+            return { lo, hi }; // lo widens the lower half of x, hi the upper half
+        }
+
     }
 }
 
diff --git a/include/xsimd/arch/xsimd_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512bw.hpp
@@ -643,6 +643,26 @@ namespace xsimd
             return swizzle(self, mask.as_batch(), avx512bw {});
         }
 
+        // widen
+        template <class A>
+        XSIMD_INLINE std::array<batch<widen_t<uint8_t>, A>, 2> widen(batch<uint8_t, A> const& x, requires_arch<avx512bw>) noexcept
+        {
+            // unsigned bytes: zero-extend each 256-bit half of x to 16-bit slots
+            const __m256i lower_bytes = _mm512_extracti64x4_epi64(x, 0);
+            const __m512i lower_words = _mm512_cvtepu8_epi16(lower_bytes);
+            const __m256i upper_bytes = _mm512_extracti64x4_epi64(x, 1);
+            const __m512i upper_words = _mm512_cvtepu8_epi16(upper_bytes);
+            return { lower_words, upper_words };
+        }
+        template <class A>
+        XSIMD_INLINE std::array<batch<widen_t<int8_t>, A>, 2> widen(batch<int8_t, A> const& x, requires_arch<avx512bw>) noexcept
+        {
+            // signed bytes: sign-extend each 256-bit half of x to 16-bit slots
+            const __m256i lower_bytes = _mm512_extracti64x4_epi64(x, 0);
+            const __m512i lower_words = _mm512_cvtepi8_epi16(lower_bytes);
+            const __m256i upper_bytes = _mm512_extracti64x4_epi64(x, 1);
+            const __m512i upper_words = _mm512_cvtepi8_epi16(upper_bytes);
+            return { lower_words, upper_words };
+        }
+
         // zip_hi
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value>::type>
         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -2642,6 +2642,55 @@ namespace xsimd
             }
         }
 
+        // widen: integer slots; each 256-bit half of x is extended to a 512-bit register
+        template <class A, class T>
+        XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx512f>) noexcept
+        {
+            __m256i x_lo = detail::lower_half(x);
+            __m256i x_hi = detail::upper_half(x);
+            __m512i lo, hi; // only assigned when sizeof(T) is 4 or 2
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
+                {
+                    lo = _mm512_cvtepi32_epi64(x_lo); // signed: sign extension
+                    hi = _mm512_cvtepi32_epi64(x_hi);
+                }
+                else
+                {
+                    lo = _mm512_cvtepu32_epi64(x_lo); // unsigned: zero extension
+                    hi = _mm512_cvtepu32_epi64(x_hi);
+                }
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
+                {
+                    lo = _mm512_cvtepi16_epi32(x_lo); // signed: sign extension
+                    hi = _mm512_cvtepi16_epi32(x_hi);
+                }
+                else
+                {
+                    lo = _mm512_cvtepu16_epi32(x_lo); // unsigned: zero extension
+                    hi = _mm512_cvtepu16_epi32(x_hi);
+                }
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                auto pair_lo = widen(batch<T, avx2>(x_lo), avx2 {}); // 8 to 16 bit: delegate each half to the avx2 kernel
+                auto pair_hi = widen(batch<T, avx2>(x_hi), avx2 {});
+                return { detail::merge_avx(pair_lo[0], pair_lo[1]), detail::merge_avx(pair_hi[0], pair_hi[1]) }; // merge each pair of avx2 results into one register
+            }
+            return { lo, hi }; // lo widens the lower half of x, hi the upper half
+        }
+        template <class A>
+        XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<avx512f>) noexcept
+        {
+            // float to double: convert each half of x on its own
+            const __m512d widened_lower = _mm512_cvtps_pd(detail::lower_half(x));
+            const __m512d widened_upper = _mm512_cvtps_pd(detail::upper_half(x));
+            return { widened_lower, widened_upper };
+        }
+
     }
 }
 
diff --git a/include/xsimd/arch/xsimd_common.hpp b/include/xsimd/arch/xsimd_common.hpp
@@ -13,6 +13,7 @@
 #define XSIMD_COMMON_HPP
 
 #include "./common/xsimd_common_arithmetic.hpp"
+#include "./common/xsimd_common_cast.hpp"
 #include "./common/xsimd_common_complex.hpp"
 #include "./common/xsimd_common_logical.hpp"
 #include "./common/xsimd_common_math.hpp"
diff --git a/include/xsimd/arch/xsimd_scalar.hpp b/include/xsimd/arch/xsimd_scalar.hpp
@@ -1235,7 +1235,6 @@ namespace xsimd
     {
         return cond ? true_br : false_br;
     }
-
 }
 
 #endif
diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp
@@ -15,6 +15,7 @@
 #include <type_traits>
 
 #include "../types/xsimd_sse4_1_register.hpp"
+#include "./common/xsimd_common_cast.hpp"
 
 namespace xsimd
 {
@@ -382,6 +383,63 @@ namespace xsimd
             return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
         }
 
+        // widen: integer slots; each 64-bit half of x is extended to a 128-bit register
+        template <class A, class T>
+        XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<sse4_1>) noexcept
+        {
+            __m128i x_lo = x; // the conversions below only read the low 64 bits of their operand
+            __m128i x_hi = _mm_unpackhi_epi64(x, x); // bring the high 64 bits of x down to the low 64 bits
+            __m128i lo, hi; // only assigned when sizeof(T) is 4, 2 or 1
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
+                {
+                    lo = _mm_cvtepi32_epi64(x_lo); // signed: sign extension
+                    hi = _mm_cvtepi32_epi64(x_hi);
+                }
+                else
+                {
+                    lo = _mm_cvtepu32_epi64(x_lo); // unsigned: zero extension
+                    hi = _mm_cvtepu32_epi64(x_hi);
+                }
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
+                {
+                    lo = _mm_cvtepi16_epi32(x_lo); // signed: sign extension
+                    hi = _mm_cvtepi16_epi32(x_hi);
+                }
+                else
+                {
+                    lo = _mm_cvtepu16_epi32(x_lo); // unsigned: zero extension
+                    hi = _mm_cvtepu16_epi32(x_hi);
+                }
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
+                {
+                    lo = _mm_cvtepi8_epi16(x_lo); // signed: sign extension
+                    hi = _mm_cvtepi8_epi16(x_hi);
+                }
+                else
+                {
+                    lo = _mm_cvtepu8_epi16(x_lo); // unsigned: zero extension
+                    hi = _mm_cvtepu8_epi16(x_hi);
+                }
+            }
+            return { lo, hi }; // lo widens the lower half of x, hi the upper half
+        }
+        // float to double: the four floats (x0, x1, x2, x3) of x become the
+        // two batches (x0, x1) and (x2, x3).
+        template <class A>
+        XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<sse4_1>) noexcept
+        {
+            // _mm_cvtps_pd only converts the two lowest floats of its operand,
+            // so x2 and x3 must first be moved to the two lowest slots.
+            // _mm_movehl_ps(x, x) yields (x2, x3, x2, x3), whereas
+            // _mm_unpackhi_ps(x, x) yields (x2, x2, x3, x3) and would drop x3.
+            __m128 x_hi = _mm_movehl_ps(x, x);
+            __m128d lo = _mm_cvtps_pd(x);
+            __m128d hi = _mm_cvtps_pd(x_hi);
+            return { lo, hi };
+        }
+
     }
 
 }
diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
@@ -2719,6 +2719,21 @@ namespace xsimd
         return !xsimd::any(x);
     }
 
+    /**
+     * @ingroup batch_conversion
+     *
+     * Widen batch \c x from type \c T to a type with twice as many bytes and
+     * the same sign (for integers) or from float to double.
+     * @param x batch of \c T
+     * @return two batches of \c widen_t<T>: the widened lower half of \c x, then its widened upper half
+     */
+    template <class T, class A>
+    XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::widen<A>(x, A {}); // dispatch to the kernel overload selected by the architecture tag
+    }
+
     /**
      * @ingroup batch_miscellaneous
      *
diff --git a/include/xsimd/types/xsimd_traits.hpp b/include/xsimd/types/xsimd_traits.hpp
@@ -332,6 +332,38 @@ namespace xsimd
 
     template <class T>
     using mask_type_t = typename mask_type<T>::type;
+
+    namespace detail
+    {
+        // Maps a type to the type twice as big.  The primary template serves
+        // the types that have no dedicated specialization below, i.e. the
+        // signed integers: it widens the unsigned counterpart of \c T, then
+        // restores the signedness of \c T, so that e.g. int8_t widens to
+        // int16_t and not to uint16_t.
+        template <typename T>
+        struct widen
+        {
+        private:
+            using unsigned_wide_type = typename widen<typename std::make_unsigned<T>::type>::type;
+
+        public:
+            using type = typename std::conditional<std::is_signed<T>::value,
+                                                   typename std::make_signed<unsigned_wide_type>::type,
+                                                   unsigned_wide_type>::type;
+        };
+
+        template <>
+        struct widen<uint32_t>
+        {
+            using type = uint64_t;
+        };
+        template <>
+        struct widen<uint16_t>
+        {
+            using type = uint32_t;
+        };
+        template <>
+        struct widen<uint8_t>
+        {
+            using type = uint16_t;
+        };
+        template <>
+        struct widen<float>
+        {
+            using type = double;
+        };
+    }
+    // Type twice as big as \c T, with the same signedness for integers;
+    // double for float.
+    template <typename T>
+    using widen_t = typename detail::widen<T>::type;
+
 }
 
 #endif
diff --git a/test/test_conversion.cpp b/test/test_conversion.cpp
@@ -244,5 +244,36 @@ TEST_CASE_TEMPLATE("[conversion]", T, uint8_t, uint16_t, uint32_t, uint64_t)
     }
 }
 
+template <class T>
+struct widening_test
+{
+    using batch_type = xsimd::batch<T>;
+    using wide_type = xsimd::widen_t<T>;
+    using wide_batch_type = xsimd::batch<wide_type>;
+
+    // Broadcast value to every slot and check that every slot of both
+    // returned batches holds the converted value.
+    void test_widen(T value)
+    {
+        batch_type bvalue(value);
+        wide_batch_type wvalue(value);
+
+        auto widened_batch = xsimd::widen(bvalue);
+        CHECK_BATCH_EQ(widened_batch[0], wvalue);
+        CHECK_BATCH_EQ(widened_batch[1], wvalue);
+    }
+
+    // Use a different value in every slot: a broadcast input cannot detect
+    // slots that are duplicated, dropped or returned in the wrong order.
+    void test_widen_slots()
+    {
+        T input[batch_type::size];
+        wide_type expected[batch_type::size];
+        for (std::size_t i = 0; i < batch_type::size; ++i)
+        {
+            // i + 1 is at most the slot count, which every tested T can hold
+            input[i] = static_cast<T>(i + 1);
+            expected[i] = static_cast<wide_type>(input[i]);
+        }
+
+        auto widened_batch = xsimd::widen(batch_type::load_unaligned(&input[0]));
+        wide_batch_type expected_lo = wide_batch_type::load_unaligned(&expected[0]);
+        wide_batch_type expected_hi = wide_batch_type::load_unaligned(&expected[wide_batch_type::size]);
+        CHECK_BATCH_EQ(widened_batch[0], expected_lo);
+        CHECK_BATCH_EQ(widened_batch[1], expected_hi);
+    }
+};
+
+TEST_CASE_TEMPLATE("[widening]", T, int8_t, int16_t, int32_t, uint8_t, uint16_t, uint32_t, float)
+{
+    widening_test<T> Test;
+
+    SUBCASE("widen")
+    {
+        Test.test_widen(1);
+    }
+
+    SUBCASE("limits")
+    {
+        Test.test_widen(std::numeric_limits<T>::max());
+        Test.test_widen(std::numeric_limits<T>::min());
+        // for float, min() is the smallest positive value: also cover the most negative one
+        Test.test_widen(std::numeric_limits<T>::lowest());
+    }
+
+    SUBCASE("distinct slots")
+    {
+        Test.test_widen_slots();
+    }
+}
+
 #endif
 #endif

Original file line number	Diff line number	Diff line change
`@@ -1918,6 +1918,23 @@ namespace xsimd`
`1918`	`1918`	`return {};`
`1919`	`1919`	`}`
`1920`	`1920`	`}`
	`1921`	`+`
	`1922`	`+ // widen`
	`1923`	`+ template <class A, class T>`
	`1924`	`+ XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx>) noexcept`
	`1925`	`+ {`
	`1926`	`+ auto pair_lo = widen(batch<T, sse4_2>(detail::lower_half(x)), sse4_2 {});`
	`1927`	`+ auto pair_hi = widen(batch<T, sse4_2>(detail::upper_half(x)), sse4_2 {});`
	`1928`	`+ return { detail::merge_sse(pair_lo[0], pair_lo[1]), detail::merge_sse(pair_hi[0], pair_hi[1]) };`
	`1929`	`+ }`
	`1930`	`+ template <class A>`
	`1931`	`+ XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<avx>) noexcept`
	`1932`	`+ {`
	`1933`	`+ __m256d lo = _mm256_cvtps_pd(detail::lower_half(x));`
	`1934`	`+ __m256d hi = _mm256_cvtps_pd(detail::upper_half(x));`
	`1935`	`+ return { lo, hi };`
	`1936`	`+ }`
	`1937`	`+`
`1921`	`1938`	`}`
`1922`	`1939`	`}`
`1923`	`1940`
Original file line number	Diff line number	Diff line change
`@@ -1235,7 +1235,6 @@ namespace xsimd`
`1235`	`1235`	`{`
`1236`	`1236`	`return cond ? true_br : false_br;`
`1237`	`1237`	`}`
`1238`		`-`
`1239`	`1238`	`}`
`1240`	`1239`
`1241`	`1240`	`#endif`