@@ -3,7 +3,6 @@
 use core::ascii::EscapeDefault;
 
 use crate::fmt::{self, Write};
-#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
 use crate::intrinsics::const_eval_select;
 use crate::{ascii, iter, ops};
 
@@ -327,175 +326,52 @@ impl<'a> fmt::Debug for EscapeAscii<'a> { |
     }
 }
 
-/// ASCII test *without* the chunk-at-a-time optimizations.
-///
-/// This is carefully structured to produce nice small code -- it's smaller in
-/// `-O` than what the "obvious" ways produces under `-C opt-level=s`. If you
-/// touch it, be sure to run (and update if needed) the assembly test.
-#[unstable(feature = "str_internals", issue = "none")]
-#[doc(hidden)]
 #[inline]
-pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool {
-    while let [rest @ .., last] = bytes {
-        if !last.is_ascii() {
+const fn is_ascii_const(mut bytes: &[u8]) -> bool {
+    while let [first, rest @ ..] = bytes {
+        if !first.is_ascii() {
             break;
         }
         bytes = rest;
     }
     bytes.is_empty()
 }
 
+/// The implementation using iterators produces a tighter loop than the
+/// implementation using pattern matching when inlined into `is_ascii_chunked`,
+/// so we keep duplicate implementations of the scalar case until iterators
+/// are usable in const contexts.
+#[inline(always)]
+fn is_ascii_scalar(bytes: &[u8]) -> bool {
+    bytes.iter().all(u8::is_ascii)
+}
+
 /// Optimized ASCII test that will use usize-at-a-time operations instead of
 /// byte-at-a-time operations (when possible).
-///
-/// The algorithm we use here is pretty simple. If `s` is too short, we just
-/// check each byte and be done with it. Otherwise:
-///
-/// - Read the first word with an unaligned load.
-/// - Align the pointer, read subsequent words until end with aligned loads.
-/// - Read the last `usize` from `s` with an unaligned load.
-///
-/// If any of these loads produces something for which `contains_nonascii`
-/// (above) returns true, then we know the answer is false.
-#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
 #[inline]
 #[rustc_allow_const_fn_unstable(const_eval_select)] // fallback impl has same behavior
-const fn is_ascii(s: &[u8]) -> bool {
+const fn is_ascii(bytes: &[u8]) -> bool {
     // The runtime version behaves the same as the compiletime version, it's
     // just more optimized.
     const_eval_select!(
-        @capture { s: &[u8] } -> bool:
+        @capture { bytes: &[u8] } -> bool:
         if const {
-            is_ascii_simple(s)
+            is_ascii_const(bytes)
         } else {
-            /// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
-            /// from `../str/mod.rs`, which does something similar for utf8 validation.
-            const fn contains_nonascii(v: usize) -> bool {
-                const NONASCII_MASK: usize = usize::repeat_u8(0x80);
-                (NONASCII_MASK & v) != 0
-            }
-
-            const USIZE_SIZE: usize = size_of::<usize>();
-
-            let len = s.len();
-            let align_offset = s.as_ptr().align_offset(USIZE_SIZE);
-
-            // If we wouldn't gain anything from the word-at-a-time implementation, fall
-            // back to a scalar loop.
-            //
-            // We also do this for architectures where `size_of::<usize>()` isn't
-            // sufficient alignment for `usize`, because it's a weird edge case.
-            if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < align_of::<usize>() {
-                return is_ascii_simple(s);
-            }
-
-            // We always read the first word unaligned, which means `align_offset` is
-            // 0, we'd read the same value again for the aligned read.
-            let offset_to_aligned = if align_offset == 0 { USIZE_SIZE } else { align_offset };
-
-            let start = s.as_ptr();
-            // SAFETY: We verify `len < USIZE_SIZE` above.
-            let first_word = unsafe { (start as *const usize).read_unaligned() };
-
-            if contains_nonascii(first_word) {
-                return false;
-            }
-            // We checked this above, somewhat implicitly. Note that `offset_to_aligned`
-            // is either `align_offset` or `USIZE_SIZE`, both of which are explicitly
-            // checked above.
-            debug_assert!(offset_to_aligned <= len);
-
-            // SAFETY: word_ptr is the (properly aligned) usize ptr we use to read the
-            // middle chunk of the slice.
-            let mut word_ptr = unsafe { start.add(offset_to_aligned) as *const usize };
-
-            // `byte_pos` is the byte index of `word_ptr`, used for loop end checks.
-            let mut byte_pos = offset_to_aligned;
-
-            // Paranoia check about alignment, since we're about to do a bunch of
-            // unaligned loads. In practice this should be impossible barring a bug in
-            // `align_offset` though.
-            // While this method is allowed to spuriously fail in CTFE, if it doesn't
-            // have alignment information it should have given a `usize::MAX` for
-            // `align_offset` earlier, sending things through the scalar path instead of
-            // this one, so this check should pass if it's reachable.
-            debug_assert!(word_ptr.is_aligned_to(align_of::<usize>()));
-
-            // Read subsequent words until the last aligned word, excluding the last
-            // aligned word by itself to be done in tail check later, to ensure that
-            // tail is always one `usize` at most to extra branch `byte_pos == len`.
-            while byte_pos < len - USIZE_SIZE {
-                // Sanity check that the read is in bounds
-                debug_assert!(byte_pos + USIZE_SIZE <= len);
-                // And that our assumptions about `byte_pos` hold.
-                debug_assert!(word_ptr.cast::<u8>() == start.wrapping_add(byte_pos));
-
-                // SAFETY: We know `word_ptr` is properly aligned (because of
-                // `align_offset`), and we know that we have enough bytes between `word_ptr` and the end
-                let word = unsafe { word_ptr.read() };
-                if contains_nonascii(word) {
-                    return false;
-                }
-
-                byte_pos += USIZE_SIZE;
-                // SAFETY: We know that `byte_pos <= len - USIZE_SIZE`, which means that
-                // after this `add`, `word_ptr` will be at most one-past-the-end.
-                word_ptr = unsafe { word_ptr.add(1) };
-            }
-
-            // Sanity check to ensure there really is only one `usize` left. This should
-            // be guaranteed by our loop condition.
-            debug_assert!(byte_pos <= len && len - byte_pos <= USIZE_SIZE);
-
-            // SAFETY: This relies on `len >= USIZE_SIZE`, which we check at the start.
-            let last_word = unsafe { (start.add(len - USIZE_SIZE) as *const usize).read_unaligned() };
-
-            !contains_nonascii(last_word)
+            const CHUNK_SIZE: usize = if cfg!(all(target_arch = "x86_64", target_feature = "sse2")) {
+                4 * size_of::<usize>()
+            } else {
+                2 * size_of::<usize>()
+            };
+            is_ascii_chunked::<CHUNK_SIZE>(bytes)
         }
     )
 }
 
-/// ASCII test optimized to use the `pmovmskb` instruction available on `x86-64`
-/// platforms.
-///
-/// Other platforms are not likely to benefit from this code structure, so they
-/// use SWAR techniques to test for ASCII in `usize`-sized chunks.
-#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+/// Tests for ASCII-ness `CHUNK_SIZE` bytes at a time.
+/// This loop should be simple enough that LLVM can auto-vectorize it.
 #[inline]
-const fn is_ascii(bytes: &[u8]) -> bool {
-    // Process chunks of 32 bytes at a time in the fast path to enable
-    // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
-    // can be OR'd together and then the resulting vector can be tested for
-    // non-ASCII bytes.
-    const CHUNK_SIZE: usize = 32;
-
-    let mut i = 0;
-
-    while i + CHUNK_SIZE <= bytes.len() {
-        let chunk_end = i + CHUNK_SIZE;
-
-        // Get LLVM to produce a `pmovmskb` instruction on x86-64 which
-        // creates a mask from the most significant bit of each byte.
-        // ASCII bytes are less than 128 (0x80), so their most significant
-        // bit is unset.
-        let mut count = 0;
-        while i < chunk_end {
-            count += bytes[i].is_ascii() as u8;
-            i += 1;
-        }
-
-        // All bytes should be <= 127 so count is equal to chunk size.
-        if count != CHUNK_SIZE as u8 {
-            return false;
-        }
-    }
-
-    // Process the remaining `bytes.len() % N` bytes.
-    let mut is_ascii = true;
-    while i < bytes.len() {
-        is_ascii &= bytes[i].is_ascii();
-        i += 1;
-    }
-
-    is_ascii
+fn is_ascii_chunked<const CHUNK_SIZE: usize>(bytes: &[u8]) -> bool {
+    let (chunks, remainder) = bytes.as_chunks::<CHUNK_SIZE>();
+    chunks.iter().all(|chunk| is_ascii_scalar(chunk)) && is_ascii_scalar(remainder)
 }
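
A side note on why the patch keeps two scalar implementations (`is_ascii_const` and `is_ascii_scalar`): `Iterator::all` is a trait method, and calling trait methods in `const fn` is still unstable, so the const-evaluable path has to be spelled with slice patterns. A minimal standalone sketch of the distinction, usable outside `core`; the names `all_ascii_const` and `all_ascii_iter` are illustrative, not from the patch:

```rust
// Compiles as a `const fn`: slice patterns, `while let`, and
// `u8::is_ascii` (itself a `const fn`) are all allowed in const contexts.
const fn all_ascii_const(mut bytes: &[u8]) -> bool {
    while let [first, rest @ ..] = bytes {
        if !first.is_ascii() {
            return false;
        }
        bytes = rest;
    }
    true
}

// Only compiles as a plain `fn`: `Iterator::all` is a trait method, and
// trait methods cannot yet be called in `const fn` on stable Rust.
fn all_ascii_iter(bytes: &[u8]) -> bool {
    bytes.iter().all(u8::is_ascii)
}

fn main() {
    // Evaluated entirely at compile time.
    const AT_COMPILE_TIME: bool = all_ascii_const(b"ferris");
    assert!(AT_COMPILE_TIME);
    // "café" contains the non-ASCII bytes 0xC3 0xA9.
    assert!(!all_ascii_iter("caf\u{e9}".as_bytes()));
}
```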
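For experimenting with the chunked fast path outside `core`, here is a rough standalone sketch. It assumes `slice::as_chunks` (stabilized in Rust 1.88); `is_ascii_by_chunks` is an illustrative stand-in for the patch's `is_ascii_chunked`, cross-checked against the naive byte loop. With the patch's parameters, x86_64 with SSE2 gets chunks of 4 × 8 = 32 bytes, the same width the deleted `pmovmskb` path processed per iteration.

```rust
/// Chunk-at-a-time ASCII test mirroring the shape of `is_ascii_chunked`.
/// The fixed-size `[u8; CHUNK_SIZE]` chunks give LLVM a bounded inner loop
/// it can auto-vectorize; the short remainder is checked byte by byte.
fn is_ascii_by_chunks<const CHUNK_SIZE: usize>(bytes: &[u8]) -> bool {
    let (chunks, remainder) = bytes.as_chunks::<CHUNK_SIZE>();
    chunks.iter().all(|chunk| chunk.iter().all(u8::is_ascii))
        && remainder.iter().all(u8::is_ascii)
}

fn main() {
    assert!(is_ascii_by_chunks::<32>(b"hello, world"));
    assert!(!is_ascii_by_chunks::<32>("h\u{e9}llo".as_bytes()));
    // Cross-check against the byte-at-a-time test on lengths that do and
    // don't divide evenly into chunks.
    for n in 0..100usize {
        let v: Vec<u8> = (0..n).map(|i| (i * 37) as u8).collect();
        assert_eq!(is_ascii_by_chunks::<16>(&v), v.iter().all(u8::is_ascii));
    }
}
```

The apparent win over the deleted version is that nothing here is x86-specific: per the doc comment in the patch, the loop is kept simple enough that LLVM can vectorize it to whatever width the target offers, with only `CHUNK_SIZE` tuned per target.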