Subject: [PATCH 2/2] New API: widen — convert a batch to a batch twice as big
b/docs/source/api/cast_index.rst @@ -47,6 +47,8 @@ Conversion: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`to_int` | per slot conversion to integer | +---------------------------------------+----------------------------------------------------+ +| :cpp:func:`widen` | per slot conversion to twice as big type | ++---------------------------------------+----------------------------------------------------+ ---- diff --git a/include/xsimd/arch/common/xsimd_common_cast.hpp b/include/xsimd/arch/common/xsimd_common_cast.hpp new file mode 100644 index 000000000..1226c887c --- /dev/null +++ b/include/xsimd/arch/common/xsimd_common_cast.hpp @@ -0,0 +1,40 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_COMMON_CAST_HPP +#define XSIMD_COMMON_CAST_HPP + +#include "../../types/xsimd_traits.hpp" + +namespace xsimd +{ + namespace kernel + { + template + XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept + { + alignas(A::alignment()) T buffer[batch::size]; + x.store_aligned(&buffer[0]); + + using T_out = widen_t; + alignas(A::alignment()) T_out out_buffer[batch::size]; + for (size_t i = 0; i < batch::size; ++i) + out_buffer[i] = static_cast(buffer[i]); + + return { batch::load_aligned(&out_buffer[0]), + batch::load_aligned(&out_buffer[batch::size]) }; + } + + } + +} + +#endif diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index b57d5c187..c7dc9de5e 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -1918,6 +1918,23 @@ namespace xsimd return {}; } } + + // widen + template + XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept + { + auto pair_lo = widen(batch(detail::lower_half(x)), sse4_2 {}); + auto pair_hi = widen(batch(detail::upper_half(x)), sse4_2 {}); + return { detail::merge_sse(pair_lo[0], pair_lo[1]), detail::merge_sse(pair_hi[0], pair_hi[1]) }; + } + template + XSIMD_INLINE std::array, 2> widen(batch const& x, requires_arch) noexcept + { + __m256d lo = _mm256_cvtps_pd(detail::lower_half(x)); + __m256d hi = _mm256_cvtps_pd(detail::upper_half(x)); + return { lo, hi }; + } + } } diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index aa974f9ab..87e5dafae 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -1248,6 +1248,56 @@ namespace xsimd return {}; } } + + // widen + template ::value>::type> + XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept + { + __m128i x_lo = detail::lower_half(x); + __m128i x_hi = detail::upper_half(x); + __m256i lo, hi; + 
XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + lo = _mm256_cvtepi32_epi64(x_lo); + hi = _mm256_cvtepi32_epi64(x_hi); + } + else + { + lo = _mm256_cvtepu32_epi64(x_lo); + hi = _mm256_cvtepu32_epi64(x_hi); + } + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + lo = _mm256_cvtepi16_epi32(x_lo); + hi = _mm256_cvtepi16_epi32(x_hi); + } + else + { + lo = _mm256_cvtepu16_epi32(x_lo); + hi = _mm256_cvtepu16_epi32(x_hi); + } + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + lo = _mm256_cvtepi8_epi16(x_lo); + hi = _mm256_cvtepi8_epi16(x_hi); + } + else + { + lo = _mm256_cvtepu8_epi16(x_lo); + hi = _mm256_cvtepu8_epi16(x_hi); + } + } + return { lo, hi }; + } + } } diff --git a/include/xsimd/arch/xsimd_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512bw.hpp index 27c537dcb..767c5960b 100644 --- a/include/xsimd/arch/xsimd_avx512bw.hpp +++ b/include/xsimd/arch/xsimd_avx512bw.hpp @@ -643,6 +643,26 @@ namespace xsimd return swizzle(self, mask.as_batch(), avx512bw {}); } + // widen + template + XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept + { + __m256i x_lo = _mm512_extracti64x4_epi64(x, 0); + __m256i x_hi = _mm512_extracti64x4_epi64(x, 1); + __m512i lo = _mm512_cvtepu8_epi16(x_lo); + __m512i hi = _mm512_cvtepu8_epi16(x_hi); + return { lo, hi }; + } + template + XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept + { + __m256i x_lo = _mm512_extracti64x4_epi64(x, 0); + __m256i x_hi = _mm512_extracti64x4_epi64(x, 1); + __m512i lo = _mm512_cvtepi8_epi16(x_lo); + __m512i hi = _mm512_cvtepi8_epi16(x_hi); + return { lo, hi }; + } + // zip_hi template ::value>::type> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 05f2ff0a5..329c8b78c 100644 --- 
a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -2642,6 +2642,55 @@ namespace xsimd } } + // widen + template + XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept + { + __m256i x_lo = detail::lower_half(x); + __m256i x_hi = detail::upper_half(x); + __m512i lo, hi; + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + lo = _mm512_cvtepi32_epi64(x_lo); + hi = _mm512_cvtepi32_epi64(x_hi); + } + else + { + lo = _mm512_cvtepu32_epi64(x_lo); + hi = _mm512_cvtepu32_epi64(x_hi); + } + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + lo = _mm512_cvtepi16_epi32(x_lo); + hi = _mm512_cvtepi16_epi32(x_hi); + } + else + { + lo = _mm512_cvtepu16_epi32(x_lo); + hi = _mm512_cvtepu16_epi32(x_hi); + } + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto pair_lo = widen(batch(x_lo), avx2 {}); + auto pair_hi = widen(batch(x_hi), avx2 {}); + return { detail::merge_avx(pair_lo[0], pair_lo[1]), detail::merge_avx(pair_hi[0], pair_hi[1]) }; + } + return { lo, hi }; + } + template + XSIMD_INLINE std::array, 2> widen(batch const& x, requires_arch) noexcept + { + __m512d lo = _mm512_cvtps_pd(detail::lower_half(x)); + __m512d hi = _mm512_cvtps_pd(detail::upper_half(x)); + return { lo, hi }; + } + } } diff --git a/include/xsimd/arch/xsimd_common.hpp b/include/xsimd/arch/xsimd_common.hpp index ccd4b47c6..11f21bd82 100644 --- a/include/xsimd/arch/xsimd_common.hpp +++ b/include/xsimd/arch/xsimd_common.hpp @@ -13,6 +13,7 @@ #define XSIMD_COMMON_HPP #include "./common/xsimd_common_arithmetic.hpp" +#include "./common/xsimd_common_cast.hpp" #include "./common/xsimd_common_complex.hpp" #include "./common/xsimd_common_logical.hpp" #include "./common/xsimd_common_math.hpp" diff --git a/include/xsimd/arch/xsimd_scalar.hpp b/include/xsimd/arch/xsimd_scalar.hpp index 762eac684..9d24ed01f 100644 --- a/include/xsimd/arch/xsimd_scalar.hpp +++ 
b/include/xsimd/arch/xsimd_scalar.hpp @@ -1235,7 +1235,6 @@ namespace xsimd { return cond ? true_br : false_br; } - } #endif diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp index 92330ef3c..130d50476 100644 --- a/include/xsimd/arch/xsimd_sse4_1.hpp +++ b/include/xsimd/arch/xsimd_sse4_1.hpp @@ -15,6 +15,7 @@ #include #include "../types/xsimd_sse4_1_register.hpp" +#include "./common/xsimd_common_cast.hpp" namespace xsimd { @@ -382,6 +383,63 @@ namespace xsimd return _mm_round_pd(self, _MM_FROUND_TO_ZERO); } + // widen + template + XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept + { + __m128i x_lo = x; + __m128i x_hi = _mm_unpackhi_epi64(x, x); + __m128i lo, hi; + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + lo = _mm_cvtepi32_epi64(x_lo); + hi = _mm_cvtepi32_epi64(x_hi); + } + else + { + lo = _mm_cvtepu32_epi64(x_lo); + hi = _mm_cvtepu32_epi64(x_hi); + } + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + lo = _mm_cvtepi16_epi32(x_lo); + hi = _mm_cvtepi16_epi32(x_hi); + } + else + { + lo = _mm_cvtepu16_epi32(x_lo); + hi = _mm_cvtepu16_epi32(x_hi); + } + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + lo = _mm_cvtepi8_epi16(x_lo); + hi = _mm_cvtepi8_epi16(x_hi); + } + else + { + lo = _mm_cvtepu8_epi16(x_lo); + hi = _mm_cvtepu8_epi16(x_hi); + } + } + return { lo, hi }; + } + template + XSIMD_INLINE std::array, 2> widen(batch const& x, requires_arch) noexcept + { + __m128 x_shuf = _mm_unpackhi_ps(x, x); + __m128d lo = _mm_cvtps_pd(x); + __m128d hi = _mm_cvtps_pd(x_shuf); + return { lo, hi }; + } + } } diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index efa50c28e..12bd9d95e 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -2719,6 +2719,21 @@ namespace xsimd return !xsimd::any(x); } + /** 
+ * @ingroup batch_conversion + * + * Widen batch \c x from type \c T to a type with twice as many bytes and + * the same sign (for integers) or from float to double. + * @param x batch of \c T + * @return two batches of \c widen_t + */ + template + XSIMD_INLINE std::array, A>, 2> widen(batch const& x) noexcept + { + detail::static_check_supported_config(); + return kernel::widen(x, A {}); + } + /** * @ingroup batch_miscellaneous * diff --git a/include/xsimd/types/xsimd_traits.hpp b/include/xsimd/types/xsimd_traits.hpp index 1abcbdbe6..1e1e6bc77 100644 --- a/include/xsimd/types/xsimd_traits.hpp +++ b/include/xsimd/types/xsimd_traits.hpp @@ -332,6 +332,38 @@ namespace xsimd template using mask_type_t = typename mask_type::type; + + namespace detail + { + template + struct widen : widen::type> + { + }; + + template <> + struct widen + { + using type = uint64_t; + }; + template <> + struct widen + { + using type = uint32_t; + }; + template <> + struct widen + { + using type = uint16_t; + }; + template <> + struct widen + { + using type = double; + }; + } + template + using widen_t = typename detail::widen::type; + } #endif diff --git a/test/test_conversion.cpp b/test/test_conversion.cpp index 153920ac0..0861cd7ea 100644 --- a/test/test_conversion.cpp +++ b/test/test_conversion.cpp @@ -244,5 +244,36 @@ TEST_CASE_TEMPLATE("[conversion]", T, uint8_t, uint16_t, uint32_t, uint64_t) } } +template +struct widening_test +{ + + void test_widen(T value) + { + xsimd::batch bvalue(value); + xsimd::batch> wvalue(value); + + auto widened_batch = xsimd::widen(bvalue); + CHECK_BATCH_EQ(widened_batch[0], wvalue); + CHECK_BATCH_EQ(widened_batch[1], wvalue); + } +}; + +TEST_CASE_TEMPLATE("[widening]", T, int8_t, int16_t, int32_t, uint8_t, uint16_t, uint32_t, float) +{ + widening_test Test; + + SUBCASE("widen") + { + Test.test_widen(1); + } + + SUBCASE("limits") + { + Test.test_widen(std::numeric_limits::max()); + Test.test_widen(std::numeric_limits::min()); + } +} + #endif #endif