
Commit 252fd00

Merge pull request #434 from guyuqi/add-zip_hilo
Add zip_hi/zip_lo for shuffling operations
2 parents: afe4d7d + 3a9d2d2

17 files changed: +292 −0 lines

include/xsimd/types/xsimd_base.hpp

Lines changed: 36 additions & 0 deletions
@@ -282,6 +282,12 @@ namespace xsimd
     template <class X>
     batch_type_t<X> select(const typename simd_batch_traits<X>::batch_bool_type& cond, const simd_base<X>& a, const simd_base<X>& b);
 
+    template <class X>
+    batch_type_t<X> zip_lo(const simd_base<X>& lhs, const simd_base<X>& rhs);
+
+    template <class X>
+    batch_type_t<X> zip_hi(const simd_base<X>& lhs, const simd_base<X>& rhs);
+
     template <class X>
     typename simd_batch_traits<X>::batch_bool_type
     isnan(const simd_base<X>& x);

@@ -1777,6 +1783,36 @@ namespace xsimd
         return kernel::select(cond, a(), b());
     }
 
+    /**
+     * Unpacks and interleaves data from the LOW halves of batches \c lhs and \c rhs,
+     * storing the result in the return value.
+     * @param lhs a batch of integer, floating point or double precision values.
+     * @param rhs a batch of integer, floating point or double precision values.
+     * @return a batch of the interleaved values from the low halves.
+     */
+    template <class X>
+    inline batch_type_t<X> zip_lo(const simd_base<X>& lhs, const simd_base<X>& rhs)
+    {
+        using value_type = typename simd_batch_traits<X>::value_type;
+        using kernel = detail::batch_kernel<value_type, simd_batch_traits<X>::size>;
+        return kernel::zip_lo(lhs(), rhs());
+    }
+
+    /**
+     * Unpacks and interleaves data from the HIGH halves of batches \c lhs and \c rhs,
+     * storing the result in the return value.
+     * @param lhs a batch of integer, floating point or double precision values.
+     * @param rhs a batch of integer, floating point or double precision values.
+     * @return a batch of the interleaved values from the high halves.
+     */
+    template <class X>
+    inline batch_type_t<X> zip_hi(const simd_base<X>& lhs, const simd_base<X>& rhs)
+    {
+        using value_type = typename simd_batch_traits<X>::value_type;
+        using kernel = detail::batch_kernel<value_type, simd_batch_traits<X>::size>;
+        return kernel::zip_hi(lhs(), rhs());
+    }
+
     /**
      * Determines if the scalars in the given batch \c x are NaN values.
      * @param x batch of floating point values.
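
To make the new API concrete, here is a minimal usage sketch; the four-lane batch size and the values are illustrative assumptions, not part of the commit:

    #include "xsimd/xsimd.hpp"

    int main()
    {
        // Two illustrative four-lane batches.
        xsimd::batch<float, 4> a(1.f, 2.f, 3.f, 4.f);
        xsimd::batch<float, 4> b(5.f, 6.f, 7.f, 8.f);

        // Interleave the low halves:  {1, 5, 2, 6}
        xsimd::batch<float, 4> lo = xsimd::zip_lo(a, b);

        // Interleave the high halves: {3, 7, 4, 8}
        xsimd::batch<float, 4> hi = xsimd::zip_hi(a, b);

        return 0;
    }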

include/xsimd/types/xsimd_neon_double.hpp

Lines changed: 18 additions & 0 deletions
@@ -580,6 +580,24 @@ namespace xsimd
             return vbslq_f64(cond, a, b);
         }
 
+        static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip1q_f64(lhs, rhs);
+#else
+            return vcombine_f64(vget_low_f64(lhs), vget_low_f64(rhs));
+#endif
+        }
+
+        static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip2q_f64(lhs, rhs);
+#else
+            return vcombine_f64(vget_high_f64(lhs), vget_high_f64(rhs));
+#endif
+        }
+
         static batch_bool_type isnan(const batch_type& x)
         {
             return !(x == x);
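
With only one double per half, the zip degenerates to taking one lane from each operand, which is why the pre-ARMv8 branch is a bare vcombine. A scalar model of the two-lane case, illustrative only and not part of the commit:

    #include <array>

    // Two-lane model: zip_lo keeps the low lane of each operand,
    // zip_hi keeps the high lane of each operand.
    std::array<double, 2> zip_lo2(const std::array<double, 2>& a, const std::array<double, 2>& b)
    {
        return {a[0], b[0]}; // {a0, a1}, {b0, b1} -> {a0, b0}
    }

    std::array<double, 2> zip_hi2(const std::array<double, 2>& a, const std::array<double, 2>& b)
    {
        return {a[1], b[1]}; // {a0, a1}, {b0, b1} -> {a1, b1}
    }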

include/xsimd/types/xsimd_neon_float.hpp

Lines changed: 20 additions & 0 deletions
@@ -639,6 +639,26 @@ namespace xsimd
             return vbslq_f32(cond, a, b);
         }
 
+        static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip1q_f32(lhs, rhs);
+#else
+            float32x2x2_t tmp = vzip_f32(vget_low_f32(lhs), vget_low_f32(rhs));
+            return vcombine_f32(tmp.val[0], tmp.val[1]);
+#endif
+        }
+
+        static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip2q_f32(lhs, rhs);
+#else
+            float32x2x2_t tmp = vzip_f32(vget_high_f32(lhs), vget_high_f32(rhs));
+            return vcombine_f32(tmp.val[0], tmp.val[1]);
+#endif
+        }
+
         static batch_bool_type isnan(const batch_type& x)
        {
             return !(x == x);
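
The pre-ARMv8 branch relies on the fact that a 64-bit vzip of the two low (or high) halves already produces the interleaved lanes, split across tmp.val[0] and tmp.val[1]; vcombine merely concatenates them. A scalar sketch of that equivalence, with illustrative values:

    #include <array>
    #include <cassert>

    int main()
    {
        std::array<float, 4> lhs{1.f, 2.f, 3.f, 4.f};
        std::array<float, 4> rhs{5.f, 6.f, 7.f, 8.f};

        // Model of vzip_f32 on the low halves {1, 2} and {5, 6}:
        // val[0] = {1, 5}, val[1] = {2, 6}.
        std::array<float, 2> val0{lhs[0], rhs[0]};
        std::array<float, 2> val1{lhs[1], rhs[1]};

        // Model of vcombine_f32(val[0], val[1]) -- the same result
        // vzip1q_f32 would give on the full registers.
        std::array<float, 4> zipped{val0[0], val0[1], val1[0], val1[1]};
        assert((zipped == std::array<float, 4>{1.f, 5.f, 2.f, 6.f}));

        return 0;
    }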

include/xsimd/types/xsimd_neon_int16.hpp

Lines changed: 20 additions & 0 deletions
@@ -344,6 +344,26 @@ namespace xsimd
         {
             return vbslq_s16(cond, a, b);
         }
+
+        static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip1q_s16(lhs, rhs);
+#else
+            int16x4x2_t tmp = vzip_s16(vget_low_s16(lhs), vget_low_s16(rhs));
+            return vcombine_s16(tmp.val[0], tmp.val[1]);
+#endif
+        }
+
+        static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip2q_s16(lhs, rhs);
+#else
+            int16x4x2_t tmp = vzip_s16(vget_high_s16(lhs), vget_high_s16(rhs));
+            return vcombine_s16(tmp.val[0], tmp.val[1]);
+#endif
+        }
     };
 }

include/xsimd/types/xsimd_neon_int32.hpp

Lines changed: 20 additions & 0 deletions
@@ -424,6 +424,26 @@ namespace xsimd
         {
             return vbslq_s32(cond, a, b);
         }
+
+        static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip1q_s32(lhs, rhs);
+#else
+            int32x2x2_t tmp = vzip_s32(vget_low_s32(lhs), vget_low_s32(rhs));
+            return vcombine_s32(tmp.val[0], tmp.val[1]);
+#endif
+        }
+
+        static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip2q_s32(lhs, rhs);
+#else
+            int32x2x2_t tmp = vzip_s32(vget_high_s32(lhs), vget_high_s32(rhs));
+            return vcombine_s32(tmp.val[0], tmp.val[1]);
+#endif
+        }
     };
 }

include/xsimd/types/xsimd_neon_int64.hpp

Lines changed: 18 additions & 0 deletions
@@ -439,6 +439,24 @@ namespace xsimd
         {
             return vbslq_s64(cond, a, b);
         }
+
+        static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip1q_s64(lhs, rhs);
+#else
+            return vcombine_s64(vget_low_s64(lhs), vget_low_s64(rhs));
+#endif
+        }
+
+        static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip2q_s64(lhs, rhs);
+#else
+            return vcombine_s64(vget_high_s64(lhs), vget_high_s64(rhs));
+#endif
+        }
     };
 }

include/xsimd/types/xsimd_neon_int8.hpp

Lines changed: 20 additions & 0 deletions
@@ -345,6 +345,26 @@ namespace xsimd
         {
             return vbslq_s8(cond, a, b);
         }
+
+        static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip1q_s8(lhs, rhs);
+#else
+            int8x8x2_t tmp = vzip_s8(vget_low_s8(lhs), vget_low_s8(rhs));
+            return vcombine_s8(tmp.val[0], tmp.val[1]);
+#endif
+        }
+
+        static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip2q_s8(lhs, rhs);
+#else
+            int8x8x2_t tmp = vzip_s8(vget_high_s8(lhs), vget_high_s8(rhs));
+            return vcombine_s8(tmp.val[0], tmp.val[1]);
+#endif
+        }
     };
 }

include/xsimd/types/xsimd_neon_uint16.hpp

Lines changed: 20 additions & 0 deletions
@@ -318,6 +318,26 @@ namespace xsimd
         {
             return vbslq_u16(cond, a, b);
         }
+
+        static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip1q_u16(lhs, rhs);
+#else
+            uint16x4x2_t tmp = vzip_u16(vget_low_u16(lhs), vget_low_u16(rhs));
+            return vcombine_u16(tmp.val[0], tmp.val[1]);
+#endif
+        }
+
+        static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip2q_u16(lhs, rhs);
+#else
+            uint16x4x2_t tmp = vzip_u16(vget_high_u16(lhs), vget_high_u16(rhs));
+            return vcombine_u16(tmp.val[0], tmp.val[1]);
+#endif
+        }
     };
 }

include/xsimd/types/xsimd_neon_uint32.hpp

Lines changed: 21 additions & 0 deletions
@@ -415,6 +415,27 @@ namespace xsimd
         {
             return vbslq_u32(cond, a, b);
         }
+
+        static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip1q_u32(lhs, rhs);
+#else
+            uint32x2x2_t tmp = vzip_u32(vget_low_u32(lhs), vget_low_u32(rhs));
+            return vcombine_u32(tmp.val[0], tmp.val[1]);
+#endif
+        }
+
+        static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip2q_u32(lhs, rhs);
+#else
+            uint32x2x2_t tmp = vzip_u32(vget_high_u32(lhs), vget_high_u32(rhs));
+            return vcombine_u32(tmp.val[0], tmp.val[1]);
+#endif
+        }
+
     };
 
     inline batch<uint32_t, 4> shift_left(const batch<uint32_t, 4>& lhs, int32_t n)

include/xsimd/types/xsimd_neon_uint64.hpp

Lines changed: 19 additions & 0 deletions
@@ -467,6 +467,25 @@ namespace xsimd
         {
             return vbslq_u64(cond, a, b);
         }
+
+        static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip1q_u64(lhs, rhs);
+#else
+            return vcombine_u64(vget_low_u64(lhs), vget_low_u64(rhs));
+#endif
+        }
+
+        static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs)
+        {
+#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
+            return vzip2q_u64(lhs, rhs);
+#else
+            return vcombine_u64(vget_high_u64(lhs), vget_high_u64(rhs));
+#endif
+        }
+
     };
 
     inline batch<uint64_t, 2> shift_left(const batch<uint64_t, 2>& lhs, int32_t n)
