|
18 | 18 | //! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions |
19 | 19 | //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate |
20 | 20 |
|
21 | | -use core::hint::unreachable_unchecked; |
| 21 | + |
22 | 22 |
|
23 | 23 | use crate::core_arch::{simd::*, x86::*}; |
24 | 24 | use crate::intrinsics::simd::*; |
@@ -170,158 +170,74 @@ pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i { |
170 | 170 | #[stable(feature = "simd_x86", since = "1.27.0")] |
171 | 171 | pub fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i { |
172 | 172 | static_assert_uimm_bits!(IMM8, 8); |
| 173 | + |
| 174 | + |
| 175 | + // If palignr is shifting the pair of vectors more than the size of two |
| 176 | + // lanes, emit zero. |
| 177 | + if IMM8 >= 32 { |
| 178 | + return _mm256_setzero_si256(); |
| 179 | + } |
| 180 | + // If palignr is shifting the pair of input vectors more than one lane, |
| 181 | + // but less than two lanes, convert to shifting in zeroes. |
| 182 | + let (a, b) = if IMM8 > 16 { |
| 183 | + (_mm256_setzero_si256(), a) |
| 184 | + } else { |
| 185 | + (a, b) |
| 186 | + }; |
173 | 187 | unsafe { |
174 | | - // If palignr is shifting the pair of vectors more than the size of two |
175 | | - // lanes, emit zero. |
176 | | - if IMM8 >= 32 { |
177 | | - return _mm256_setzero_si256(); |
178 | | - } |
179 | | - // If palignr is shifting the pair of input vectors more than one lane, |
180 | | - // but less than two lanes, convert to shifting in zeroes. |
181 | | - let (a, b) = if IMM8 > 16 { |
182 | | - (_mm256_setzero_si256(), a) |
183 | | - } else { |
184 | | - (a, b) |
185 | | - }; |
186 | | - |
187 | | - let a = a.as_i8x32(); |
188 | | - let b = b.as_i8x32(); |
189 | | - |
190 | | - if IMM8 == 16 { |
191 | | - return transmute(a); |
192 | | - } |
193 | | - |
194 | | - let r: i8x32 = match IMM8 % 16 { |
195 | | - 0 => simd_shuffle!( |
196 | | - b, |
197 | | - a, |
198 | | - [ |
199 | | - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, |
200 | | - 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, |
201 | | - ], |
202 | | - ), |
203 | | - 1 => simd_shuffle!( |
204 | | - b, |
205 | | - a, |
206 | | - [ |
207 | | - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, |
208 | | - 23, 24, 25, 26, 27, 28, 29, 30, 31, 48, |
209 | | - ], |
210 | | - ), |
211 | | - 2 => simd_shuffle!( |
212 | | - b, |
213 | | - a, |
214 | | - [ |
215 | | - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, |
216 | | - 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, |
217 | | - ], |
218 | | - ), |
219 | | - 3 => simd_shuffle!( |
220 | | - b, |
221 | | - a, |
222 | | - [ |
223 | | - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, |
224 | | - 24, 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, |
225 | | - ], |
226 | | - ), |
227 | | - 4 => simd_shuffle!( |
228 | | - b, |
229 | | - a, |
230 | | - [ |
231 | | - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, |
232 | | - 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, |
233 | | - ], |
234 | | - ), |
235 | | - 5 => simd_shuffle!( |
236 | | - b, |
237 | | - a, |
238 | | - [ |
239 | | - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, |
240 | | - 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, |
241 | | - ], |
242 | | - ), |
243 | | - 6 => simd_shuffle!( |
244 | | - b, |
245 | | - a, |
246 | | - [ |
247 | | - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, |
248 | | - 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, |
249 | | - ], |
250 | | - ), |
251 | | - 7 => simd_shuffle!( |
252 | | - b, |
253 | | - a, |
254 | | - [ |
255 | | - 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, |
256 | | - 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, |
257 | | - ], |
258 | | - ), |
259 | | - 8 => simd_shuffle!( |
260 | | - b, |
261 | | - a, |
262 | | - [ |
263 | | - 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, |
264 | | - 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, |
265 | | - ], |
266 | | - ), |
267 | | - 9 => simd_shuffle!( |
268 | | - b, |
269 | | - a, |
270 | | - [ |
271 | | - 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, |
272 | | - 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, |
273 | | - ], |
274 | | - ), |
275 | | - 10 => simd_shuffle!( |
276 | | - b, |
277 | | - a, |
278 | | - [ |
279 | | - 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, |
280 | | - 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, |
281 | | - ], |
282 | | - ), |
283 | | - 11 => simd_shuffle!( |
284 | | - b, |
285 | | - a, |
286 | | - [ |
287 | | - 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, |
288 | | - 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, |
289 | | - ], |
290 | | - ), |
291 | | - 12 => simd_shuffle!( |
292 | | - b, |
293 | | - a, |
294 | | - [ |
295 | | - 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, |
296 | | - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, |
297 | | - ], |
298 | | - ), |
299 | | - 13 => simd_shuffle!( |
300 | | - b, |
301 | | - a, |
302 | | - [ |
303 | | - 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, |
304 | | - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, |
305 | | - ], |
306 | | - ), |
307 | | - 14 => simd_shuffle!( |
308 | | - b, |
309 | | - a, |
310 | | - [ |
311 | | - 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, |
312 | | - 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, |
313 | | - ], |
314 | | - ), |
315 | | - 15 => simd_shuffle!( |
316 | | - b, |
317 | | - a, |
318 | | - [ |
319 | | - 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, |
320 | | - 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, |
321 | | - ], |
322 | | - ), |
323 | | - _ => unreachable_unchecked(), |
324 | | - }; |
| 188 | + if IMM8 == 16 { |
| 189 | + return transmute(a) |
| 190 | + } |
| 191 | + } |
| 192 | + const fn mask(shift: u32, i: u32) -> u32 { |
| 193 | + let shift = shift % 16; |
| 194 | + let mod_i = i%16; |
| 195 | + if mod_i < (16 - shift) { |
| 196 | + i + shift |
| 197 | + } else { |
| 198 | + i + 16 + shift |
| 199 | + } |
| 200 | + } |
| 201 | + |
| 202 | + unsafe { |
| 203 | + let r: i8x32 = simd_shuffle!( |
| 204 | + b.as_i8x32(), |
| 205 | + a.as_i8x32(), |
| 206 | + [ |
| 207 | + mask(IMM8 as u32, 0), |
| 208 | + mask(IMM8 as u32, 1), |
| 209 | + mask(IMM8 as u32, 2), |
| 210 | + mask(IMM8 as u32, 3), |
| 211 | + mask(IMM8 as u32, 4), |
| 212 | + mask(IMM8 as u32, 5), |
| 213 | + mask(IMM8 as u32, 6), |
| 214 | + mask(IMM8 as u32, 7), |
| 215 | + mask(IMM8 as u32, 8), |
| 216 | + mask(IMM8 as u32, 9), |
| 217 | + mask(IMM8 as u32, 10), |
| 218 | + mask(IMM8 as u32, 11), |
| 219 | + mask(IMM8 as u32, 12), |
| 220 | + mask(IMM8 as u32, 13), |
| 221 | + mask(IMM8 as u32, 14), |
| 222 | + mask(IMM8 as u32, 15), |
| 223 | + mask(IMM8 as u32, 16), |
| 224 | + mask(IMM8 as u32, 17), |
| 225 | + mask(IMM8 as u32, 18), |
| 226 | + mask(IMM8 as u32, 19), |
| 227 | + mask(IMM8 as u32, 20), |
| 228 | + mask(IMM8 as u32, 21), |
| 229 | + mask(IMM8 as u32, 22), |
| 230 | + mask(IMM8 as u32, 23), |
| 231 | + mask(IMM8 as u32, 24), |
| 232 | + mask(IMM8 as u32, 25), |
| 233 | + mask(IMM8 as u32, 26), |
| 234 | + mask(IMM8 as u32, 27), |
| 235 | + mask(IMM8 as u32, 28), |
| 236 | + mask(IMM8 as u32, 29), |
| 237 | + mask(IMM8 as u32, 30), |
| 238 | + mask(IMM8 as u32, 31), |
| 239 | + ], |
| 240 | + ); |
325 | 241 | transmute(r) |
326 | 242 | } |
327 | 243 | } |
|
0 commit comments