2 changes: 2 additions & 0 deletions docs/source/api/cast_index.rst
@@ -47,6 +47,8 @@ Conversion:
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`to_int` | per slot conversion to integer |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`widen` | per slot conversion to a type twice as wide |
+---------------------------------------+----------------------------------------------------+

----

40 changes: 40 additions & 0 deletions include/xsimd/arch/common/xsimd_common_cast.hpp
@@ -0,0 +1,40 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_COMMON_CAST_HPP
#define XSIMD_COMMON_CAST_HPP

#include "../../types/xsimd_traits.hpp"

namespace xsimd
{
namespace kernel
{
template <class A, class T>
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<common>) noexcept
{
// Spill the input batch to an aligned scalar buffer.
alignas(A::alignment()) T buffer[batch<T, A>::size];
x.store_aligned(&buffer[0]);

// Convert element-wise into a buffer of the widened type; it holds
// batch<T, A>::size elements, i.e. exactly two batches of T_out.
using T_out = widen_t<T>;
alignas(A::alignment()) T_out out_buffer[batch<T, A>::size];
for (size_t i = 0; i < batch<T, A>::size; ++i)
out_buffer[i] = static_cast<T_out>(buffer[i]);

// Reload the two halves as widened batches.
return { batch<T_out, A>::load_aligned(&out_buffer[0]),
batch<T_out, A>::load_aligned(&out_buffer[batch<T_out, A>::size]) };
Comment on lines +32 to +33
Contributor: out_buffer and out_buffer + batch<T_out, A>::size seem clearer to me.
Contributor: (I mean doing pointer arithmetic directly instead of using the [] and & operators.)
}

}

}

#endif
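For reference, a minimal sketch of the rewrite suggested in the review comments above (hypothetical; the merged code keeps the &out_buffer[i] form):

// Hypothetical variant of the return statement using plain pointer
// arithmetic, as suggested in review. Behavior is unchanged.
return { batch<T_out, A>::load_aligned(out_buffer),
batch<T_out, A>::load_aligned(out_buffer + batch<T_out, A>::size) };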
17 changes: 17 additions & 0 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -1918,6 +1918,23 @@ namespace xsimd
return {};
}
}

// widen
template <class A, class T>
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx>) noexcept
{
// Widen each 128-bit half with the sse4_2 implementation, then merge
// the resulting sse pairs back into two 256-bit batches.
auto pair_lo = widen(batch<T, sse4_2>(detail::lower_half(x)), sse4_2 {});
auto pair_hi = widen(batch<T, sse4_2>(detail::upper_half(x)), sse4_2 {});
return { detail::merge_sse(pair_lo[0], pair_lo[1]), detail::merge_sse(pair_hi[0], pair_hi[1]) };
}
template <class A>
XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<avx>) noexcept
{
// float -> double has a dedicated AVX conversion for each half.
__m256d lo = _mm256_cvtps_pd(detail::lower_half(x));
__m256d hi = _mm256_cvtps_pd(detail::upper_half(x));
return { lo, hi };
}

}
}

50 changes: 50 additions & 0 deletions include/xsimd/arch/xsimd_avx2.hpp
@@ -1248,6 +1248,56 @@ namespace xsimd
return {};
}
}

// widen
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value>::type>
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx2>) noexcept
{
// Sign- or zero-extend each 128-bit half of the input into a full
// 256-bit batch of the widened type.
__m128i x_lo = detail::lower_half(x);
__m128i x_hi = detail::upper_half(x);
__m256i lo, hi;
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
{
lo = _mm256_cvtepi32_epi64(x_lo);
hi = _mm256_cvtepi32_epi64(x_hi);
}
else
{
lo = _mm256_cvtepu32_epi64(x_lo);
hi = _mm256_cvtepu32_epi64(x_hi);
}
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
{
lo = _mm256_cvtepi16_epi32(x_lo);
hi = _mm256_cvtepi16_epi32(x_hi);
}
else
{
lo = _mm256_cvtepu16_epi32(x_lo);
hi = _mm256_cvtepu16_epi32(x_hi);
}
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
{
lo = _mm256_cvtepi8_epi16(x_lo);
hi = _mm256_cvtepi8_epi16(x_hi);
}
else
{
lo = _mm256_cvtepu8_epi16(x_lo);
hi = _mm256_cvtepu8_epi16(x_hi);
}
}
return { lo, hi };
}

}
}

20 changes: 20 additions & 0 deletions include/xsimd/arch/xsimd_avx512bw.hpp
@@ -643,6 +643,26 @@ namespace xsimd
return swizzle(self, mask.as_batch(), avx512bw {});
}

// widen
// avx512bw provides the 8-bit conversions; wider element types go
// through the avx512f implementation.
template <class A>
XSIMD_INLINE std::array<batch<widen_t<uint8_t>, A>, 2> widen(batch<uint8_t, A> const& x, requires_arch<avx512bw>) noexcept
{
__m256i x_lo = _mm512_extracti64x4_epi64(x, 0);
__m256i x_hi = _mm512_extracti64x4_epi64(x, 1);
__m512i lo = _mm512_cvtepu8_epi16(x_lo);
__m512i hi = _mm512_cvtepu8_epi16(x_hi);
return { lo, hi };
}
template <class A>
XSIMD_INLINE std::array<batch<widen_t<int8_t>, A>, 2> widen(batch<int8_t, A> const& x, requires_arch<avx512bw>) noexcept
{
__m256i x_lo = _mm512_extracti64x4_epi64(x, 0);
__m256i x_hi = _mm512_extracti64x4_epi64(x, 1);
__m512i lo = _mm512_cvtepi8_epi16(x_lo);
__m512i hi = _mm512_cvtepi8_epi16(x_hi);
return { lo, hi };
}

// zip_hi
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value>::type>
XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
51 changes: 50 additions & 1 deletion include/xsimd/arch/xsimd_avx512f.hpp
@@ -55,7 +55,7 @@ namespace xsimd
}
XSIMD_INLINE __m256 upper_half(__m512 self) noexcept
{
- return _mm512_extractf32x8_ps(self, 1);
+ // _mm512_extractf32x8_ps requires avx512dq; use avx512f-only intrinsics.
+ return _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(self), 1));
}
XSIMD_INLINE __m256d upper_half(__m512d self) noexcept
{
@@ -2642,6 +2642,55 @@ namespace xsimd
}
}

// widen
template <class A, class T>
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx512f>) noexcept
{
// Sign- or zero-extend each 256-bit half of the input into a full
// 512-bit batch of the widened type.
__m256i x_lo = detail::lower_half(x);
__m256i x_hi = detail::upper_half(x);
__m512i lo, hi;
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
{
lo = _mm512_cvtepi32_epi64(x_lo);
hi = _mm512_cvtepi32_epi64(x_hi);
}
else
{
lo = _mm512_cvtepu32_epi64(x_lo);
hi = _mm512_cvtepu32_epi64(x_hi);
}
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
{
lo = _mm512_cvtepi16_epi32(x_lo);
hi = _mm512_cvtepi16_epi32(x_hi);
}
else
{
lo = _mm512_cvtepu16_epi32(x_lo);
hi = _mm512_cvtepu16_epi32(x_hi);
}
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
// avx512f has no 8-bit conversions: widen each half with avx2 and
// merge the pairs (avx512bw provides a native override).
auto pair_lo = widen(batch<T, avx2>(x_lo), avx2 {});
auto pair_hi = widen(batch<T, avx2>(x_hi), avx2 {});
return { detail::merge_avx(pair_lo[0], pair_lo[1]), detail::merge_avx(pair_hi[0], pair_hi[1]) };
}
return { lo, hi };
}
template <class A>
XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<avx512f>) noexcept
{
__m512d lo = _mm512_cvtps_pd(detail::lower_half(x));
__m512d hi = _mm512_cvtps_pd(detail::upper_half(x));
return { lo, hi };
}

}
}

1 change: 1 addition & 0 deletions include/xsimd/arch/xsimd_common.hpp
@@ -13,6 +13,7 @@
#define XSIMD_COMMON_HPP

#include "./common/xsimd_common_arithmetic.hpp"
#include "./common/xsimd_common_cast.hpp"
#include "./common/xsimd_common_complex.hpp"
#include "./common/xsimd_common_logical.hpp"
#include "./common/xsimd_common_math.hpp"
1 change: 0 additions & 1 deletion include/xsimd/arch/xsimd_scalar.hpp
@@ -1235,7 +1235,6 @@
{
return cond ? true_br : false_br;
}

}

#endif
58 changes: 58 additions & 0 deletions include/xsimd/arch/xsimd_sse4_1.hpp
@@ -15,6 +15,7 @@
#include <type_traits>

#include "../types/xsimd_sse4_1_register.hpp"
#include "./common/xsimd_common_cast.hpp"

namespace xsimd
{
@@ -382,6 +383,63 @@ namespace xsimd
return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
}

// widen
template <class A, class T>
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<sse4_1>) noexcept
{
// The sse4.1 conversion intrinsics read only the low 64 bits of their
// operand, so duplicate the high half of x into the low lanes first.
__m128i x_lo = x;
__m128i x_hi = _mm_unpackhi_epi64(x, x);
__m128i lo, hi;
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
{
lo = _mm_cvtepi32_epi64(x_lo);
hi = _mm_cvtepi32_epi64(x_hi);
}
else
{
lo = _mm_cvtepu32_epi64(x_lo);
hi = _mm_cvtepu32_epi64(x_hi);
}
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
{
lo = _mm_cvtepi16_epi32(x_lo);
hi = _mm_cvtepi16_epi32(x_hi);
}
else
{
lo = _mm_cvtepu16_epi32(x_lo);
hi = _mm_cvtepu16_epi32(x_hi);
}
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
{
lo = _mm_cvtepi8_epi16(x_lo);
hi = _mm_cvtepi8_epi16(x_hi);
}
else
{
lo = _mm_cvtepu8_epi16(x_lo);
hi = _mm_cvtepu8_epi16(x_hi);
}
}
return { lo, hi };
}
template <class A>
XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<sse4_1>) noexcept
{
// _mm_cvtps_pd converts the two low floats, so move the upper pair
// {x2, x3} into the low lanes for the second conversion.
// (_mm_unpackhi_ps(x, x) would yield {x2, x2, x3, x3} and drop x3.)
__m128 x_shuf = _mm_movehl_ps(x, x);
__m128d lo = _mm_cvtps_pd(x);
__m128d hi = _mm_cvtps_pd(x_shuf);
return { lo, hi };
}

}

}
15 changes: 15 additions & 0 deletions include/xsimd/types/xsimd_api.hpp
@@ -2719,6 +2719,21 @@ namespace xsimd
return !xsimd::any(x);
}

/**
* @ingroup batch_conversion
*
* Widen batch \c x from type \c T to a type with twice as many bytes and
* the same signedness (for integers), or from float to double.
* @param x batch of \c T
* @return an array of the two widened batches of \c widen_t<T>
*/
template <class T, class A>
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::widen<A>(x, A {});
}
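For illustration, a hedged usage sketch (hypothetical values; batch sizes depend on the target architecture):

// Illustrative only: widen a batch of uint8_t into two batches of uint16_t.
// With 128-bit registers, 16 uint8_t lanes become 2 x 8 uint16_t lanes.
#include <cstdint>
#include "xsimd/xsimd.hpp"

void widen_demo()
{
xsimd::batch<uint8_t> x(200); // broadcast 200 to every lane
auto pair = xsimd::widen(x); // std::array of two batch<uint16_t>
// pair[0] holds the widened low half, pair[1] the widened high half.
}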

/**
* @ingroup batch_miscellaneous
*
32 changes: 32 additions & 0 deletions include/xsimd/types/xsimd_traits.hpp
@@ -332,6 +332,38 @@ namespace xsimd

template <class T>
using mask_type_t = typename mask_type<T>::type;

namespace detail
{
template <typename T>
struct widen
{
// Widen through the unsigned counterpart, then restore signedness so
// that e.g. widen<int8_t>::type is int16_t, matching the documented
// same-signedness behavior.
using unsigned_type = typename widen<typename std::make_unsigned<T>::type>::type;
using type = typename std::conditional<std::is_signed<T>::value,
typename std::make_signed<unsigned_type>::type,
unsigned_type>::type;
};

template <>
struct widen<uint32_t>
{
using type = uint64_t;
};
template <>
struct widen<uint16_t>
{
using type = uint32_t;
};
template <>
struct widen<uint8_t>
{
using type = uint16_t;
};
template <>
struct widen<float>
{
using type = double;
};
Comment on lines +338 to +362
Contributor: I think a trait that converts a byte size to a uint/float type could be more generic:

template <>
struct sized_uint<4>
{
using type = uint32_t;
};

...

And then use it with batch<sized_uint_t<2 * sizeof(T)>> (or an alias for that).

DiamonDinoia (Oct 30, 2025): I am not in favor of this. Then we would need both sized_int and sized_uint, and the API would need std::conditional<std::is_signed<T>::value, sized_int_t<2 * sizeof(T)>, sized_uint_t<2 * sizeof(T)>>.
}
template <typename T>
using widen_t = typename detail::widen<T>::type;

}

#endif
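For completeness, a minimal sketch of the sized_uint alternative floated in the review thread above (hypothetical; not part of the merged PR, and as DiamonDinoia notes it would still need a signedness dispatch):

// Hypothetical sized_uint trait from the review discussion; a sized_int
// counterpart plus std::conditional would be needed to preserve signedness.
#include <cstddef>
#include <cstdint>

template <std::size_t N>
struct sized_uint;
template <>
struct sized_uint<2>
{
using type = uint16_t;
};
template <>
struct sized_uint<4>
{
using type = uint32_t;
};
template <>
struct sized_uint<8>
{
using type = uint64_t;
};
template <std::size_t N>
using sized_uint_t = typename sized_uint<N>::type;

// Suggested usage: batch<sized_uint_t<2 * sizeof(T)>, A>.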