@@ -327,175 +327,52 @@ impl<'a> fmt::Debug for EscapeAscii<'a> {
     }
 }

-/// ASCII test *without* the chunk-at-a-time optimizations.
-///
-/// This is carefully structured to produce nice small code -- it's smaller in
-/// `-O` than what the "obvious" ways produce under `-C opt-level=s`. If you
-/// touch it, be sure to run (and update if needed) the assembly test.
-#[unstable(feature = "str_internals", issue = "none")]
-#[doc(hidden)]
 #[inline]
-pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool {
-    while let [rest @ .., last] = bytes {
-        if !last.is_ascii() {
+const fn is_ascii_const(mut bytes: &[u8]) -> bool {
+    while let [first, rest @ ..] = bytes {
+        if !first.is_ascii() {
             break;
         }
         bytes = rest;
     }
     bytes.is_empty()
 }

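For reference, slice patterns and `u8::is_ascii` are both usable in `const fn`, which is what keeps this fallback compile-time evaluable. A minimal standalone sketch mirroring `is_ascii_const` (the name `all_ascii` is hypothetical):

// Standalone sketch: the whole check can run at compile time because
// slice patterns and `u8::is_ascii` are const-friendly.
const fn all_ascii(mut bytes: &[u8]) -> bool {
    while let [first, rest @ ..] = bytes {
        if !first.is_ascii() {
            break;
        }
        bytes = rest;
    }
    // Only an all-ASCII input walks the slice down to empty.
    bytes.is_empty()
}

const GREETING_IS_ASCII: bool = all_ascii(b"hello");

fn main() {
    assert!(GREETING_IS_ASCII);
    assert!(!all_ascii("héllo".as_bytes()));
}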
+/// The implementation using iterators produces a tighter loop than the
+/// implementation using pattern-matching when inlined into `is_ascii_chunked`.
+/// So we have duplicate implementations of the scalar case until iterators are
+/// usable in const contexts.
+#[inline(always)]
+fn is_ascii_scalar(bytes: &[u8]) -> bool {
+    bytes.iter().all(u8::is_ascii)
+}
+
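A quick equivalence sketch (a hypothetical test, not part of this patch): the iterator-based and pattern-match-based scalar checks must agree on every input, since only their codegen differs:

// Hypothetical check that the two scalar variants agree; `is_ascii_iter`
// stands in for `is_ascii_scalar`. The `u8::is_ascii` function item fits
// `Iterator::all` directly because `iter()` yields `&u8`.
fn is_ascii_iter(bytes: &[u8]) -> bool {
    bytes.iter().all(u8::is_ascii)
}

fn main() {
    let inputs: [&[u8]; 4] = [b"", b"ascii only", "café".as_bytes(), b"\x80"];
    for input in inputs {
        // Compare against the plain definition of ASCII: every byte < 0x80.
        assert_eq!(is_ascii_iter(input), input.iter().all(|b| *b < 0x80));
    }
}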
 /// Optimized ASCII test that will use usize-at-a-time operations instead of
 /// byte-at-a-time operations (when possible).
-///
-/// The algorithm we use here is pretty simple. If `s` is too short, we just
-/// check each byte and be done with it. Otherwise:
-///
-/// - Read the first word with an unaligned load.
-/// - Align the pointer, read subsequent words until the end with aligned loads.
-/// - Read the last `usize` from `s` with an unaligned load.
-///
-/// If any of these loads produces something for which `contains_nonascii`
-/// (above) returns true, then we know the answer is false.
-#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
 #[inline]
 #[rustc_allow_const_fn_unstable(const_eval_select)] // fallback impl has same behavior
-const fn is_ascii(s: &[u8]) -> bool {
+const fn is_ascii(bytes: &[u8]) -> bool {
     // The runtime version behaves the same as the compiletime version, it's
     // just more optimized.
     const_eval_select!(
-        @capture { s: &[u8] } -> bool:
+        @capture { bytes: &[u8] } -> bool:
         if const {
-            is_ascii_simple(s)
+            is_ascii_const(bytes)
         } else {
-            /// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
-            /// from `../str/mod.rs`, which does something similar for utf8 validation.
-            const fn contains_nonascii(v: usize) -> bool {
-                const NONASCII_MASK: usize = usize::repeat_u8(0x80);
-                (NONASCII_MASK & v) != 0
-            }
-
-            const USIZE_SIZE: usize = size_of::<usize>();
-
-            let len = s.len();
-            let align_offset = s.as_ptr().align_offset(USIZE_SIZE);
-
-            // If we wouldn't gain anything from the word-at-a-time implementation, fall
-            // back to a scalar loop.
-            //
-            // We also do this for architectures where `size_of::<usize>()` isn't
-            // sufficient alignment for `usize`, because it's a weird edge case.
-            if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < align_of::<usize>() {
-                return is_ascii_simple(s);
-            }
-
-            // We always read the first word unaligned, which means that if `align_offset`
-            // is 0, we'd read the same value again for the aligned read.
-            let offset_to_aligned = if align_offset == 0 { USIZE_SIZE } else { align_offset };
-
-            let start = s.as_ptr();
-            // SAFETY: We verified `len >= USIZE_SIZE` above.
-            let first_word = unsafe { (start as *const usize).read_unaligned() };
-
-            if contains_nonascii(first_word) {
-                return false;
-            }
-            // We checked this above, somewhat implicitly. Note that `offset_to_aligned`
-            // is either `align_offset` or `USIZE_SIZE`, both of which are explicitly
-            // checked above.
-            debug_assert!(offset_to_aligned <= len);
-
-            // SAFETY: `word_ptr` is the (properly aligned) usize ptr we use to read the
-            // middle chunk of the slice.
-            let mut word_ptr = unsafe { start.add(offset_to_aligned) as *const usize };
-
-            // `byte_pos` is the byte index of `word_ptr`, used for loop end checks.
-            let mut byte_pos = offset_to_aligned;
-
-            // Paranoia check about alignment, since we're about to do a bunch of
-            // aligned loads. In practice this should be impossible barring a bug in
-            // `align_offset` though.
-            // While this method is allowed to spuriously fail in CTFE, if it doesn't
-            // have alignment information it should have given a `usize::MAX` for
-            // `align_offset` earlier, sending things through the scalar path instead of
-            // this one, so this check should pass if it's reachable.
-            debug_assert!(word_ptr.is_aligned_to(align_of::<usize>()));
-
-            // Read subsequent words until the last aligned word, leaving the last
-            // aligned word for the tail check below. This keeps the tail at most one
-            // `usize` long and avoids an extra `byte_pos == len` branch.
-            while byte_pos < len - USIZE_SIZE {
-                // Sanity check that the read is in bounds.
-                debug_assert!(byte_pos + USIZE_SIZE <= len);
-                // And that our assumptions about `byte_pos` hold.
-                debug_assert!(word_ptr.cast::<u8>() == start.wrapping_add(byte_pos));
-
-                // SAFETY: We know `word_ptr` is properly aligned (because of
-                // `align_offset`), and we know that we have enough bytes between
-                // `word_ptr` and the end.
-                let word = unsafe { word_ptr.read() };
-                if contains_nonascii(word) {
-                    return false;
-                }
-
-                byte_pos += USIZE_SIZE;
-                // SAFETY: We know that `byte_pos <= len - USIZE_SIZE`, which means that
-                // after this `add`, `word_ptr` will be at most one-past-the-end.
-                word_ptr = unsafe { word_ptr.add(1) };
-            }
-
-            // Sanity check to ensure there really is only one `usize` left. This should
-            // be guaranteed by our loop condition.
-            debug_assert!(byte_pos <= len && len - byte_pos <= USIZE_SIZE);
-
-            // SAFETY: This relies on `len >= USIZE_SIZE`, which we check at the start.
-            let last_word = unsafe { (start.add(len - USIZE_SIZE) as *const usize).read_unaligned() };
-
-            !contains_nonascii(last_word)
+            const CHUNK_SIZE: usize = if cfg!(all(target_arch = "x86_64", target_feature = "sse2")) {
+                4 * size_of::<usize>()
+            } else {
+                2 * size_of::<usize>()
+            };
+            is_ascii_chunked::<CHUNK_SIZE>(bytes)
         }
     )
 }

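Whichever branch `const_eval_select!` takes, the observable behavior is the same. Assuming (as the file's location suggests) that this internal `is_ascii` backs the stable `<[u8]>::is_ascii`, a minimal check of that public surface:

fn main() {
    // Stable API that the internal `is_ascii` above ultimately serves.
    assert!(b"hello, world".is_ascii());
    // 'é' encodes as the two UTF-8 bytes 0xC3 0xA9, both >= 0x80.
    assert!(!"café".as_bytes().is_ascii());
    // The empty slice is trivially ASCII.
    assert!(b"".is_ascii());
}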
-/// ASCII test optimized to use the `pmovmskb` instruction available on `x86-64`
-/// platforms.
-///
-/// Other platforms are not likely to benefit from this code structure, so they
-/// use SWAR techniques to test for ASCII in `usize`-sized chunks.
-#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+/// Test for ASCII-ness `CHUNK_SIZE` bytes at a time.
+/// This loop should be simple enough that LLVM can auto-vectorise it.
 #[inline]
-const fn is_ascii(bytes: &[u8]) -> bool {
-    // Process chunks of 32 bytes at a time in the fast path to enable
-    // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
-    // can be OR'd together and then the resulting vector can be tested for
-    // non-ASCII bytes.
-    const CHUNK_SIZE: usize = 32;
-
-    let mut i = 0;
-
-    while i + CHUNK_SIZE <= bytes.len() {
-        let chunk_end = i + CHUNK_SIZE;
-
-        // Get LLVM to produce a `pmovmskb` instruction on x86-64, which
-        // creates a mask from the most significant bit of each byte.
-        // ASCII bytes are less than 128 (0x80), so their most significant
-        // bit is unset.
-        let mut count = 0;
-        while i < chunk_end {
-            count += bytes[i].is_ascii() as u8;
-            i += 1;
-        }
-
-        // All bytes should be <= 127, so `count` equals the chunk size.
-        if count != CHUNK_SIZE as u8 {
-            return false;
-        }
-    }
-
-    // Process the remaining `bytes.len() % CHUNK_SIZE` bytes.
-    let mut is_ascii = true;
-    while i < bytes.len() {
-        is_ascii &= bytes[i].is_ascii();
-        i += 1;
-    }
-
-    is_ascii
+fn is_ascii_chunked<const CHUNK_SIZE: usize>(bytes: &[u8]) -> bool {
+    let (chunks, remainder) = bytes.as_chunks::<CHUNK_SIZE>();
+    chunks.iter().all(|chunk| is_ascii_scalar(chunk)) && is_ascii_scalar(remainder)
 }
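The splitting here is `slice::as_chunks`, which yields the complete `CHUNK_SIZE`-byte arrays plus a short tail (stable on recent toolchains; older ones needed the nightly `slice_as_chunks` feature). A small sketch of that shape, with a chunk size of 4 chosen purely for illustration:

fn main() {
    let bytes = b"abcdefghij";
    // `as_chunks` returns the complete N-byte chunks plus the remainder;
    // N must be non-zero, which is checked at compile time.
    let (chunks, remainder) = bytes.as_chunks::<4>();
    assert_eq!(chunks, &[*b"abcd", *b"efgh"]);
    assert_eq!(remainder, b"ij");

    // The ASCII test then only needs `all` over fixed-size chunks, a loop
    // shape LLVM can unroll and auto-vectorise.
    let all_ascii = chunks.iter().all(|c| c.iter().all(u8::is_ascii))
        && remainder.iter().all(u8::is_ascii);
    assert!(all_ascii);
}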