@@ -131,7 +131,7 @@ unsafe fn foo_avx2() {
131131 #[cfg(target_arch = "x86_64")]
132132 use std::arch::x86_64::_mm256_add_epi64;
133133
134- _mm256_add_epi64(...);
134+ unsafe { _mm256_add_epi64(...); }
135135}
136136```
137137
@@ -287,47 +287,49 @@ unsafe fn hex_encode_sse41(mut src: &[u8], dst: &mut [u8]) {
287287 #[cfg(target_arch = "x86_64")]
288288 use std::arch::x86_64::*;
289289
290- let ascii_zero = _mm_set1_epi8(b'0' as i8);
291- let nines = _mm_set1_epi8(9);
292- let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
293- let and4bits = _mm_set1_epi8(0xf);
294-
295- let mut i = 0_isize;
296- while src.len() >= 16 {
297- let invec = _mm_loadu_si128(src.as_ptr() as *const _);
298-
299- let masked1 = _mm_and_si128(invec, and4bits);
300- let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);
301-
302- // return 0xff corresponding to the elements > 9, or 0x00 otherwise
303- let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
304- let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);
305-
306- // add '0' or the offset depending on the masks
307- let masked1 = _mm_add_epi8(
308- masked1,
309- _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1),
310- );
311- let masked2 = _mm_add_epi8(
312- masked2,
313- _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2),
314- );
315-
316- // interleave masked1 and masked2 bytes
317- let res1 = _mm_unpacklo_epi8(masked2, masked1);
318- let res2 = _mm_unpackhi_epi8(masked2, masked1);
319-
320- _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
321- _mm_storeu_si128(
322- dst.as_mut_ptr().offset(i * 2 + 16) as *mut _,
323- res2,
324- );
325- src = &src[16..];
326- i += 16;
327- }
290+ unsafe {
291+ let ascii_zero = _mm_set1_epi8(b'0' as i8);
292+ let nines = _mm_set1_epi8(9);
293+ let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
294+ let and4bits = _mm_set1_epi8(0xf);
295+
296+ let mut i = 0_isize;
297+ while src.len() >= 16 {
298+ let invec = _mm_loadu_si128(src.as_ptr() as *const _);
299+
300+ let masked1 = _mm_and_si128(invec, and4bits);
301+ let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);
302+
303+ // return 0xff corresponding to the elements > 9, or 0x00 otherwise
304+ let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
305+ let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);
306+
307+ // add '0' or the offset depending on the masks
308+ let masked1 = _mm_add_epi8(
309+ masked1,
310+ _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1),
311+ );
312+ let masked2 = _mm_add_epi8(
313+ masked2,
314+ _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2),
315+ );
316+
317+ // interleave masked1 and masked2 bytes
318+ let res1 = _mm_unpacklo_epi8(masked2, masked1);
319+ let res2 = _mm_unpackhi_epi8(masked2, masked1);
320+
321+ _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
322+ _mm_storeu_si128(
323+ dst.as_mut_ptr().offset(i * 2 + 16) as *mut _,
324+ res2,
325+ );
326+ src = &src[16..];
327+ i += 16;
328+ }
328329
329- let i = i as usize;
330- hex_encode_fallback(src, &mut dst[i * 2..]);
330+ let i = i as usize;
331+ hex_encode_fallback(src, &mut dst[i * 2..]);
332+ }
331333}
332334
333335fn hex_encode_fallback(src: &[u8], dst: &mut [u8]) {
0 commit comments