|
44 | 44 | ))] |
45 | 45 | 8 => transize(vtbl1_u8, self, idxs), |
46 | 46 | #[cfg(target_feature = "ssse3")] |
47 | | - 16 => transize(x86::_mm_shuffle_epi8, self, idxs), |
| 47 | + 16 => transize(x86::_mm_shuffle_epi8, self, zeroing_idxs(idxs)), |
48 | 48 | #[cfg(target_feature = "simd128")] |
49 | 49 | 16 => transize(wasm::i8x16_swizzle, self, idxs), |
50 | 50 | #[cfg(all( |
|
54 | 54 | ))] |
55 | 55 | 16 => transize(vqtbl1q_u8, self, idxs), |
56 | 56 | #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))] |
57 | | - 32 => transize_raw(avx2_pshufb, self, idxs), |
| 57 | + 32 => transize(avx2_pshufb, self, idxs), |
58 | 58 | #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))] |
59 | | - 32 => transize(x86::_mm256_permutexvar_epi8, self, idxs), |
| 59 | + 32 => transize(x86::_mm256_permutexvar_epi8, zeroing_idxs(idxs), self), |
60 | 60 | // Notable absence: avx512bw shuffle |
61 | 61 | // If avx512bw is available, odds of avx512vbmi are good |
62 | 62 | // FIXME: initial AVX512VBMI variant didn't actually pass muster |
@@ -129,45 +129,25 @@ unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> { |
129 | 129 | #[inline(always)] |
130 | 130 | unsafe fn transize<T, const N: usize>( |
131 | 131 | f: unsafe fn(T, T) -> T, |
132 | | - bytes: Simd<u8, N>, |
133 | | - idxs: Simd<u8, N>, |
| 132 | + a: Simd<u8, N>, |
| 133 | + b: Simd<u8, N>, |
134 | 134 | ) -> Simd<u8, N> |
135 | 135 | where |
136 | 136 | LaneCount<N>: SupportedLaneCount, |
137 | 137 | { |
138 | | - let idxs = zeroing_idxs(idxs); |
139 | 138 | // SAFETY: Same obligation to use this function as to use mem::transmute_copy. |
140 | | - unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) } |
| 139 | + unsafe { mem::transmute_copy(&f(mem::transmute_copy(&a), mem::transmute_copy(&b))) } |
141 | 140 | } |
142 | 141 |
|
143 | | -/// Make indices that yield 0 for this architecture |
| 142 | +/// Make indices that yield 0 for x86 |
| 143 | +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
| 144 | +#[allow(unused)] |
144 | 145 | #[inline(always)] |
145 | 146 | fn zeroing_idxs<const N: usize>(idxs: Simd<u8, N>) -> Simd<u8, N> |
146 | 147 | where |
147 | 148 | LaneCount<N>: SupportedLaneCount, |
148 | 149 | { |
149 | | - // On x86, make sure the top bit is set. |
150 | | - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] |
151 | | - let idxs = { |
152 | | - use crate::simd::cmp::SimdPartialOrd; |
153 | | - idxs.simd_lt(Simd::splat(N as u8)) |
154 | | - .select(idxs, Simd::splat(u8::MAX)) |
155 | | - }; |
156 | | - // Simply do nothing on most architectures. |
157 | | - idxs |
158 | | -} |
159 | | - |
160 | | -/// As transize but no implicit call to `zeroing_idxs`. |
161 | | -#[allow(dead_code)] |
162 | | -#[inline(always)] |
163 | | -unsafe fn transize_raw<T, const N: usize>( |
164 | | - f: unsafe fn(T, T) -> T, |
165 | | - bytes: Simd<u8, N>, |
166 | | - idxs: Simd<u8, N>, |
167 | | -) -> Simd<u8, N> |
168 | | -where |
169 | | - LaneCount<N>: SupportedLaneCount, |
170 | | -{ |
171 | | - // SAFETY: Same obligation to use this function as to use mem::transmute_copy. |
172 | | - unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) } |
| 150 | + use crate::simd::cmp::SimdPartialOrd; |
| 151 | + idxs.simd_lt(Simd::splat(N as u8)) |
| 152 | + .select(idxs, Simd::splat(u8::MAX)) |
173 | 153 | } |
0 commit comments