2 changes: 2 additions & 0 deletions docs/source/api/cast_index.rst
@@ -47,6 +47,8 @@ Conversion:
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`to_int` | per slot conversion to integer |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`widen` | per slot conversion to a type twice as wide |
+---------------------------------------+----------------------------------------------------+

----

40 changes: 40 additions & 0 deletions include/xsimd/arch/common/xsimd_common_cast.hpp
@@ -0,0 +1,40 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_COMMON_CAST_HPP
#define XSIMD_COMMON_CAST_HPP

#include "../../types/xsimd_traits.hpp"

namespace xsimd
{
namespace kernel
{
template <class A, class T>
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<common>) noexcept
{
// Spill the input batch to an aligned scalar buffer.
alignas(A::alignment()) T buffer[batch<T, A>::size];
x.store_aligned(&buffer[0]);

// Convert element-wise into a buffer of the widened type; it holds
// batch<T, A>::size elements, i.e. exactly two batches of T_out.
using T_out = widen_t<T>;
alignas(A::alignment()) T_out out_buffer[batch<T, A>::size];
for (size_t i = 0; i < batch<T, A>::size; ++i)
out_buffer[i] = static_cast<T_out>(buffer[i]);

// Reload the two halves as widened batches.
return { batch<T_out, A>::load_aligned(&out_buffer[0]),
batch<T_out, A>::load_aligned(&out_buffer[batch<T_out, A>::size]) };
Comment on lines +32 to +33
Contributor: out_buffer and out_buffer + batch<T_out, A>::size seem clearer to me.
Contributor: (I mean doing pointer arithmetic directly instead of using the [] and & operators.)
}

}

}

#endif
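For reference, a minimal sketch of the rewrite suggested in the review comments above (hypothetical; the merged code keeps the &out_buffer[i] form):

// Hypothetical variant of the return statement using plain pointer
// arithmetic, as suggested in review. Behavior is unchanged.
return { batch<T_out, A>::load_aligned(out_buffer),
batch<T_out, A>::load_aligned(out_buffer + batch<T_out, A>::size) };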
17 changes: 17 additions & 0 deletions include/xsimd/arch/xsimd_avx.hpp
@@ -1918,6 +1918,23 @@ namespace xsimd
return {};
}
}

// widen
template <class A, class T>
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx>) noexcept
{
// Widen each 128-bit half with the sse4_2 implementation, then merge
// the resulting sse pairs back into two 256-bit batches.
auto pair_lo = widen(batch<T, sse4_2>(detail::lower_half(x)), sse4_2 {});
auto pair_hi = widen(batch<T, sse4_2>(detail::upper_half(x)), sse4_2 {});
return { detail::merge_sse(pair_lo[0], pair_lo[1]), detail::merge_sse(pair_hi[0], pair_hi[1]) };
}
template <class A>
XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<avx>) noexcept
{
// float -> double has a dedicated AVX conversion for each half.
__m256d lo = _mm256_cvtps_pd(detail::lower_half(x));
__m256d hi = _mm256_cvtps_pd(detail::upper_half(x));
return { lo, hi };
}

}
}

50 changes: 50 additions & 0 deletions include/xsimd/arch/xsimd_avx2.hpp
@@ -1248,6 +1248,56 @@ namespace xsimd
return {};
}
}

// widen
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value>::type>
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx2>) noexcept
{
// Sign- or zero-extend each 128-bit half of the input into a full
// 256-bit batch of the widened type.
__m128i x_lo = detail::lower_half(x);
__m128i x_hi = detail::upper_half(x);
__m256i lo, hi;
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
{
lo = _mm256_cvtepi32_epi64(x_lo);
hi = _mm256_cvtepi32_epi64(x_hi);
}
else
{
lo = _mm256_cvtepu32_epi64(x_lo);
hi = _mm256_cvtepu32_epi64(x_hi);
}
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
{
lo = _mm256_cvtepi16_epi32(x_lo);
hi = _mm256_cvtepi16_epi32(x_hi);
}
else
{
lo = _mm256_cvtepu16_epi32(x_lo);
hi = _mm256_cvtepu16_epi32(x_hi);
}
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
{
lo = _mm256_cvtepi8_epi16(x_lo);
hi = _mm256_cvtepi8_epi16(x_hi);
}
else
{
lo = _mm256_cvtepu8_epi16(x_lo);
hi = _mm256_cvtepu8_epi16(x_hi);
}
}
return { lo, hi };
}

}
}

20 changes: 20 additions & 0 deletions include/xsimd/arch/xsimd_avx512bw.hpp
@@ -643,6 +643,26 @@ namespace xsimd
return swizzle(self, mask.as_batch(), avx512bw {});
}

// widen
// avx512bw provides the 8-bit conversions; wider element types go
// through the avx512f implementation.
template <class A>
XSIMD_INLINE std::array<batch<widen_t<uint8_t>, A>, 2> widen(batch<uint8_t, A> const& x, requires_arch<avx512bw>) noexcept
{
__m256i x_lo = _mm512_extracti64x4_epi64(x, 0);
__m256i x_hi = _mm512_extracti64x4_epi64(x, 1);
__m512i lo = _mm512_cvtepu8_epi16(x_lo);
__m512i hi = _mm512_cvtepu8_epi16(x_hi);
return { lo, hi };
}
template <class A>
XSIMD_INLINE std::array<batch<widen_t<int8_t>, A>, 2> widen(batch<int8_t, A> const& x, requires_arch<avx512bw>) noexcept
{
__m256i x_lo = _mm512_extracti64x4_epi64(x, 0);
__m256i x_hi = _mm512_extracti64x4_epi64(x, 1);
__m512i lo = _mm512_cvtepi8_epi16(x_lo);
__m512i hi = _mm512_cvtepi8_epi16(x_hi);
return { lo, hi };
}

// zip_hi
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value>::type>
XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
51 changes: 50 additions & 1 deletion include/xsimd/arch/xsimd_avx512f.hpp
@@ -55,7 +55,7 @@ namespace xsimd
}
XSIMD_INLINE __m256 upper_half(__m512 self) noexcept
{
- return _mm512_extractf32x8_ps(self, 1);
+ // _mm512_extractf32x8_ps requires avx512dq; use avx512f-only intrinsics.
+ return _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(self), 1));
}
XSIMD_INLINE __m256d upper_half(__m512d self) noexcept
{
@@ -2642,6 +2642,55 @@ namespace xsimd
}
}

// widen
template <class A, class T>
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx512f>) noexcept
{
// Sign- or zero-extend each 256-bit half of the input into a full
// 512-bit batch of the widened type.
__m256i x_lo = detail::lower_half(x);
__m256i x_hi = detail::upper_half(x);
__m512i lo, hi;
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
{
lo = _mm512_cvtepi32_epi64(x_lo);
hi = _mm512_cvtepi32_epi64(x_hi);
}
else
{
lo = _mm512_cvtepu32_epi64(x_lo);
hi = _mm512_cvtepu32_epi64(x_hi);
}
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
{
lo = _mm512_cvtepi16_epi32(x_lo);
hi = _mm512_cvtepi16_epi32(x_hi);
}
else
{
lo = _mm512_cvtepu16_epi32(x_lo);
hi = _mm512_cvtepu16_epi32(x_hi);
}
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
// avx512f has no 8-bit conversions: widen each half with avx2 and
// merge the pairs (avx512bw provides a native override).
auto pair_lo = widen(batch<T, avx2>(x_lo), avx2 {});
auto pair_hi = widen(batch<T, avx2>(x_hi), avx2 {});
return { detail::merge_avx(pair_lo[0], pair_lo[1]), detail::merge_avx(pair_hi[0], pair_hi[1]) };
}
return { lo, hi };
}
template <class A>
XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<avx512f>) noexcept
{
__m512d lo = _mm512_cvtps_pd(detail::lower_half(x));
__m512d hi = _mm512_cvtps_pd(detail::upper_half(x));
return { lo, hi };
}

}
}

1 change: 1 addition & 0 deletions include/xsimd/arch/xsimd_common.hpp
@@ -13,6 +13,7 @@
#define XSIMD_COMMON_HPP

#include "./common/xsimd_common_arithmetic.hpp"
#include "./common/xsimd_common_cast.hpp"
#include "./common/xsimd_common_complex.hpp"
#include "./common/xsimd_common_logical.hpp"
#include "./common/xsimd_common_math.hpp"
1 change: 0 additions & 1 deletion include/xsimd/arch/xsimd_scalar.hpp
@@ -1235,7 +1235,6 @@
{
return cond ? true_br : false_br;
}

}

#endif
58 changes: 58 additions & 0 deletions include/xsimd/arch/xsimd_sse4_1.hpp
@@ -15,6 +15,7 @@
#include <type_traits>

#include "../types/xsimd_sse4_1_register.hpp"
#include "./common/xsimd_common_cast.hpp"

namespace xsimd
{
@@ -382,6 +383,63 @@ namespace xsimd
return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
}

// widen
template <class A, class T>
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<sse4_1>) noexcept
{
// The sse4.1 conversion intrinsics read only the low 64 bits of their
// operand, so duplicate the high half of x into the low lanes first.
__m128i x_lo = x;
__m128i x_hi = _mm_unpackhi_epi64(x, x);
__m128i lo, hi;
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
{
lo = _mm_cvtepi32_epi64(x_lo);
hi = _mm_cvtepi32_epi64(x_hi);
}
else
{
lo = _mm_cvtepu32_epi64(x_lo);
hi = _mm_cvtepu32_epi64(x_hi);
}
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
{
lo = _mm_cvtepi16_epi32(x_lo);
hi = _mm_cvtepi16_epi32(x_hi);
}
else
{
lo = _mm_cvtepu16_epi32(x_lo);
hi = _mm_cvtepu16_epi32(x_hi);
}
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
{
lo = _mm_cvtepi8_epi16(x_lo);
hi = _mm_cvtepi8_epi16(x_hi);
}
else
{
lo = _mm_cvtepu8_epi16(x_lo);
hi = _mm_cvtepu8_epi16(x_hi);
}
}
return { lo, hi };
}
template <class A>
XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<sse4_1>) noexcept
{
// _mm_cvtps_pd converts the two low floats, so move the upper pair
// {x2, x3} into the low lanes for the second conversion.
// (_mm_unpackhi_ps(x, x) would yield {x2, x2, x3, x3} and drop x3.)
__m128 x_shuf = _mm_movehl_ps(x, x);
__m128d lo = _mm_cvtps_pd(x);
__m128d hi = _mm_cvtps_pd(x_shuf);
return { lo, hi };
}

}

}
15 changes: 15 additions & 0 deletions include/xsimd/types/xsimd_api.hpp
@@ -2719,6 +2719,21 @@ namespace xsimd
return !xsimd::any(x);
}

/**
* @ingroup batch_conversion
*
* Widen batch \c x from type \c T to a type with twice as many bytes and
* the same signedness (for integers), or from float to double.
* @param x batch of \c T
* @return an array of the two widened batches of \c widen_t<T>
*/
template <class T, class A>
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::widen<A>(x, A {});
}
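For illustration, a hedged usage sketch (hypothetical values; batch sizes depend on the target architecture):

// Illustrative only: widen a batch of uint8_t into two batches of uint16_t.
// With 128-bit registers, 16 uint8_t lanes become 2 x 8 uint16_t lanes.
#include <cstdint>
#include "xsimd/xsimd.hpp"

void widen_demo()
{
xsimd::batch<uint8_t> x(200); // broadcast 200 to every lane
auto pair = xsimd::widen(x); // std::array of two batch<uint16_t>
// pair[0] holds the widened low half, pair[1] the widened high half.
}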

/**
* @ingroup batch_miscellaneous
*
32 changes: 32 additions & 0 deletions include/xsimd/types/xsimd_traits.hpp
@@ -332,6 +332,38 @@ namespace xsimd

template <class T>
using mask_type_t = typename mask_type<T>::type;

namespace detail
{
template <typename T>
struct widen
{
// Widen through the unsigned counterpart, then restore signedness so
// that e.g. widen<int8_t>::type is int16_t, matching the documented
// same-signedness behavior.
using unsigned_type = typename widen<typename std::make_unsigned<T>::type>::type;
using type = typename std::conditional<std::is_signed<T>::value,
typename std::make_signed<unsigned_type>::type,
unsigned_type>::type;
};

template <>
struct widen<uint32_t>
{
using type = uint64_t;
};
template <>
struct widen<uint16_t>
{
using type = uint32_t;
};
template <>
struct widen<uint8_t>
{
using type = uint16_t;
};
template <>
struct widen<float>
{
using type = double;
};
Comment on lines +338 to +362
Contributor: I think a trait that converts a byte size to a uint/float type could be more generic:

template <>
struct sized_uint<4>
{
using type = uint32_t;
};

...

And then use it with batch<sized_uint_t<2 * sizeof(T)>> (or an alias for that).

DiamonDinoia (Oct 30, 2025): I am not in favor of this. Then we would need both sized_int and sized_uint, and the API would need std::conditional<std::is_signed<T>::value, sized_int_t<2 * sizeof(T)>, sized_uint_t<2 * sizeof(T)>>.
}
template <typename T>
using widen_t = typename detail::widen<T>::type;

}

#endif
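For completeness, a minimal sketch of the sized_uint alternative floated in the review thread above (hypothetical; not part of the merged PR, and as DiamonDinoia notes it would still need a signedness dispatch):

// Hypothetical sized_uint trait from the review discussion; a sized_int
// counterpart plus std::conditional would be needed to preserve signedness.
#include <cstddef>
#include <cstdint>

template <std::size_t N>
struct sized_uint;
template <>
struct sized_uint<2>
{
using type = uint16_t;
};
template <>
struct sized_uint<4>
{
using type = uint32_t;
};
template <>
struct sized_uint<8>
{
using type = uint64_t;
};
template <std::size_t N>
using sized_uint_t = typename sized_uint<N>::type;

// Suggested usage: batch<sized_uint_t<2 * sizeof(T)>, A>.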