1111mod error;
1212mod iter;
1313mod traits;
14+ mod validations;
1415
1516use self :: pattern:: Pattern ;
1617use self :: pattern:: { DoubleEndedSearcher , ReverseSearcher , Searcher } ;
@@ -62,10 +63,15 @@ pub use iter::SplitAsciiWhitespace;
6263#[ unstable( feature = "split_inclusive" , issue = "72360" ) ]
6364use iter:: SplitInclusive ;
6465
66+ #[ unstable( feature = "str_internals" , issue = "none" ) ]
67+ pub use validations:: next_code_point;
68+
6569use iter:: MatchIndicesInternal ;
6670use iter:: SplitInternal ;
6771use iter:: { MatchesInternal , SplitNInternal } ;
6872
73+ use validations:: { run_utf8_validation, truncate_to_char_boundary} ;
74+
6975/*
7076Section: Creating a string
7177*/
@@ -257,102 +263,6 @@ pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str {
257263 unsafe { & mut * ( v as * mut [ u8 ] as * mut str ) }
258264}
259265
260- /// Returns the initial code point accumulator taken from the first byte `byte` of a sequence.
261- /// The first byte is special: only its bottom 5 bits are wanted for width 2, 4 bits
262- /// for width 3, and 3 bits for width 4 (`width` is passed in by the caller).
263- #[ inline]
264- fn utf8_first_byte ( byte : u8 , width : u32 ) -> u32 {
265- ( byte & ( 0x7F >> width) ) as u32 // `0x7F >> width` keeps the low (7 - width) bits of `byte`
266- }
267-
268- /// Returns `ch` shifted left by 6 bits, OR-ed with the `CONT_MASK` bits of continuation byte `byte`.
269- #[ inline]
270- fn utf8_acc_cont_byte ( ch : u32 , byte : u8 ) -> u32 {
271- ( ch << 6 ) | ( byte & CONT_MASK ) as u32 // the `as u32` cast applies to the masked byte only, before the OR
272- }
273-
274- /// Checks whether `byte` is a UTF-8 continuation byte, i.e. whether its bits outside
275- /// `CONT_MASK` equal `TAG_CONT_U8` (the byte starts with the bits `10`).
276- #[ inline]
277- fn utf8_is_cont_byte ( byte : u8 ) -> bool {
278- ( byte & !CONT_MASK ) == TAG_CONT_U8
279- }
280-
281- #[ inline]
282- fn unwrap_or_0 ( opt : Option < & u8 > ) -> u8 { // Copies the byte out of `opt`, substituting 0 for `None`.
283- match opt {
284- Some ( & byte) => byte,
285- None => 0 , // a missing byte is reported as zero rather than as an error
286- }
287- }
288-
289- /// Reads the next code point out of a byte iterator (assuming a UTF-8-like encoding).
290- /// Returns `None` only when `bytes` is already exhausted; no byte is validated here, and a missing trailing byte is read as 0.
291- #[ unstable( feature = "str_internals" , issue = "none" ) ]
292- #[ inline]
293- pub fn next_code_point < ' a , I : Iterator < Item = & ' a u8 > > ( bytes : & mut I ) -> Option < u32 > {
294- // Decode UTF-8: a first byte below 128 is returned as-is
295- let x = * bytes. next ( ) ?;
296- if x < 128 {
297- return Some ( x as u32 ) ;
298- }
299-
300- // Multibyte case follows
301- // Decode from a byte combination out of: [[[x y] z] w]
302- // NOTE: Performance is sensitive to the exact formulation here
303- let init = utf8_first_byte ( x, 2 ) ; // width 2 mask: keeps the low 5 bits of `x`
304- let y = unwrap_or_0 ( bytes. next ( ) ) ;
305- let mut ch = utf8_acc_cont_byte ( init, y) ; // result if the sequence is only [x y]
306- if x >= 0xE0 {
307- // [[x y z] w] case
308- // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
309- let z = unwrap_or_0 ( bytes. next ( ) ) ;
310- let y_z = utf8_acc_cont_byte ( ( y & CONT_MASK ) as u32 , z) ; // 12 bits: payload of `y` then payload of `z`
311- ch = init << 12 | y_z; // `<<` binds tighter than `|`: (init << 12) | y_z
312- if x >= 0xF0 {
313- // [x y z w] case
314- // use only the lower 3 bits of `init`
315- let w = unwrap_or_0 ( bytes. next ( ) ) ;
316- ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ( y_z, w) ; // 3 bits of `x` above the 18 bits from y, z, w
317- }
318- }
319-
320- Some ( ch)
321- }
322-
323- /// Reads the last code point out of a byte iterator (assuming a UTF-8-like encoding).
324- /// Returns `None` only when `bytes` is already exhausted; a missing earlier byte is read as 0.
325- #[ inline]
326- fn next_code_point_reverse < ' a , I > ( bytes : & mut I ) -> Option < u32 >
327- where
328- I : DoubleEndedIterator < Item = & ' a u8 > ,
329- {
330- // Decode UTF-8: a last byte below 128 is returned as-is
331- let w = match * bytes. next_back ( ) ? {
332- next_byte if next_byte < 128 => return Some ( next_byte as u32 ) ,
333- back_byte => back_byte,
334- } ;
335-
336- // Multibyte case follows
337- // Decode from a byte combination out of: [x [y [z w]]]
338- let mut ch;
339- let z = unwrap_or_0 ( bytes. next_back ( ) ) ;
340- ch = utf8_first_byte ( z, 2 ) ; // provisional: treat `z` as the first byte of a 2-byte sequence
341- if utf8_is_cont_byte ( z) { // `z` is itself a continuation byte, so keep reading backwards
342- let y = unwrap_or_0 ( bytes. next_back ( ) ) ;
343- ch = utf8_first_byte ( y, 3 ) ; // provisional: treat `y` as the first byte of a 3-byte sequence
344- if utf8_is_cont_byte ( y) {
345- let x = unwrap_or_0 ( bytes. next_back ( ) ) ;
346- ch = utf8_first_byte ( x, 4 ) ; // `x` is taken as the first byte of a 4-byte sequence
347- ch = utf8_acc_cont_byte ( ch, y) ;
348- }
349- ch = utf8_acc_cont_byte ( ch, z) ;
350- }
351- ch = utf8_acc_cont_byte ( ch, w) ; // `w`, the byte read first, contributes the lowest 6 bits
352-
353- Some ( ch)
354- }
355-
356266impl_fn_for_zst ! {
357267 /// A nameable, cloneable fn type
358268 #[ derive( Clone ) ]
@@ -363,184 +273,6 @@ impl_fn_for_zst! {
363273 } ;
364274}
365275
366- /*
367- Section: UTF-8 validation
368- */
369-
370- // Mask with the high bit (0x80) of every byte set; the cast uses truncation to fit the u64 literal into usize
371- const NONASCII_MASK : usize = 0x80808080_80808080u64 as usize ;
372-
373- /// Returns `true` if any byte in the word `x` is nonascii (>= 128), i.e. has its high bit set.
374- #[ inline]
375- fn contains_nonascii ( x : usize ) -> bool {
376- ( x & NONASCII_MASK ) != 0
377- }
378-
379- /// Walks through `v` checking that it's a valid UTF-8 sequence,
380- /// returning `Ok(())` in that case, or, if it is invalid, `Err(err)` where `err.valid_up_to` is the index at which the rejected sequence starts.
381- #[ inline( always) ]
382- fn run_utf8_validation ( v : & [ u8 ] ) -> Result < ( ) , Utf8Error > {
383- let mut index = 0 ;
384- let len = v. len ( ) ;
385-
386- let usize_bytes = mem:: size_of :: < usize > ( ) ;
387- let ascii_block_size = 2 * usize_bytes; // the ASCII fast path below reads two words per iteration
388- let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 } ; // `index < blocks_end` means a whole block still fits in `v`
389- let align = v. as_ptr ( ) . align_offset ( usize_bytes) ;
390-
391- while index < len {
392- let old_offset = index; // start of the sequence being checked; reported as `valid_up_to` on error
393- macro_rules! err {
394- ( $error_len: expr) => {
395- return Err ( Utf8Error { valid_up_to: old_offset, error_len: $error_len } ) ;
396- } ;
397- }
398-
399- macro_rules! next {
400- ( ) => { {
401- index += 1 ;
402- // we needed data, but there was none: error! (reported with `error_len: None`)
403- if index >= len {
404- err!( None )
405- }
406- v[ index]
407- } } ;
408- }
409-
410- let first = v[ index] ;
411- if first >= 128 {
412- let w = UTF8_CHAR_WIDTH [ first as usize ] ;
413- // 2-byte encoding is for codepoints \u{0080} to \u{07ff}
414- // first C2 80 last DF BF
415- // 3-byte encoding is for codepoints \u{0800} to \u{ffff}
416- // first E0 A0 80 last EF BF BF
417- // excluding surrogates codepoints \u{d800} to \u{dfff}
418- // ED A0 80 to ED BF BF
419- // 4-byte encoding is for codepoints \u{10000} to \u{10ffff}
420- // first F0 90 80 80 last F4 8F BF BF
421- //
422- // Use the UTF-8 syntax from the RFC
423- //
424- // https://tools.ietf.org/html/rfc3629
425- // UTF8-1 = %x00-7F
426- // UTF8-2 = %xC2-DF UTF8-tail
427- // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
428- // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
429- // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
430- // %xF4 %x80-8F 2( UTF8-tail )
431- match w {
432- 2 => {
433- if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
434- err ! ( Some ( 1 ) )
435- }
436- }
437- 3 => {
438- match ( first, next ! ( ) ) {
439- ( 0xE0 , 0xA0 ..=0xBF )
440- | ( 0xE1 ..=0xEC , 0x80 ..=0xBF )
441- | ( 0xED , 0x80 ..=0x9F )
442- | ( 0xEE ..=0xEF , 0x80 ..=0xBF ) => { }
443- _ => err ! ( Some ( 1 ) ) ,
444- }
445- if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
446- err ! ( Some ( 2 ) )
447- }
448- }
449- 4 => {
450- match ( first, next ! ( ) ) {
451- ( 0xF0 , 0x90 ..=0xBF ) | ( 0xF1 ..=0xF3 , 0x80 ..=0xBF ) | ( 0xF4 , 0x80 ..=0x8F ) => { }
452- _ => err ! ( Some ( 1 ) ) ,
453- }
454- if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
455- err ! ( Some ( 2 ) )
456- }
457- if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
458- err ! ( Some ( 3 ) )
459- }
460- }
461- _ => err ! ( Some ( 1 ) ) , // any other table width for a byte >= 128 means it cannot start a sequence
462- }
463- index += 1 ; // step past the last byte consumed by `next!()`
464- } else {
465- // Ascii case, try to skip forward quickly.
466- // When the pointer is aligned, read 2 words of data per iteration
467- // until we find a word containing a non-ascii byte.
468- if align != usize:: MAX && align. wrapping_sub ( index) % usize_bytes == 0 {
469- let ptr = v. as_ptr ( ) ;
470- while index < blocks_end {
471- // SAFETY: since `align - index` and `ascii_block_size` are
472- // multiples of `usize_bytes`, `block = ptr.add(index)` is
473- // always aligned with a `usize` so it's safe to dereference
474- // both `block` and `block.offset(1)`.
475- unsafe {
476- let block = ptr. add ( index) as * const usize ;
477- // break if there is a nonascii byte in either of the two words
478- let zu = contains_nonascii ( * block) ;
479- let zv = contains_nonascii ( * block. offset ( 1 ) ) ;
480- if zu | zv {
481- break ;
482- }
483- }
484- index += ascii_block_size;
485- }
486- // step from the point where the wordwise loop stopped, one byte at a time
487- while index < len && v[ index] < 128 {
488- index += 1 ;
489- }
490- } else {
491- index += 1 ; // not at a word-aligned position: consume this single ASCII byte
492- }
493- }
494- }
495-
496- Ok ( ( ) )
497- }
498-
499- // Table indexed by byte value, giving the width stored for that byte as a first byte; see https://tools.ietf.org/html/rfc3629
500- static UTF8_CHAR_WIDTH : [ u8 ; 256 ] = [
501- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
502- 1 , // 0x1F
503- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
504- 1 , // 0x3F
505- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
506- 1 , // 0x5F
507- 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
508- 1 , // 0x7F (end of the width-1 range 0x00..=0x7F)
509- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
510- 0 , // 0x9F
511- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
512- 0 , // 0xBF (0x80..=0xBF are all 0)
513- 0 , 0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
514- 2 , // 0xDF (0xC0 and 0xC1 are 0; 0xC2..=0xDF are 2)
515- 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , // 0xEF
516- 4 , 4 , 4 , 4 , 4 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0xFF (0xF0..=0xF4 are 4; 0xF5..=0xFF are 0)
517- ] ;
518-
519- /// Given a first byte `b`, determines how many bytes are in this UTF-8 character by looking `b` up in `UTF8_CHAR_WIDTH` (which holds 0 for some byte values).
520- #[ unstable( feature = "str_internals" , issue = "none" ) ]
521- #[ inline]
522- pub fn utf8_char_width ( b : u8 ) -> usize {
523- UTF8_CHAR_WIDTH [ b as usize ] as usize
524- }
525-
526- /// Mask of the value bits of a continuation byte (its low six bits).
527- const CONT_MASK : u8 = 0b0011_1111 ;
528- /// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte: the top two bits are `10`.
529- const TAG_CONT_U8 : u8 = 0b1000_0000 ;
530-
531- // Truncates `s` to a byte length at most equal to `max`, backing up to a char boundary if `max` falls inside a character.
532- // Returns `true` if it was truncated, and the new str; returns `false` and `s` unchanged when `max >= s.len()`.
533- fn truncate_to_char_boundary ( s : & str , mut max : usize ) -> ( bool , & str ) {
534- if max >= s. len ( ) {
535- ( false , s)
536- } else {
537- while !s. is_char_boundary ( max) {
538- max -= 1 ; // index 0 is always a char boundary, so this cannot underflow
539- }
540- ( true , & s[ ..max] )
541- }
542- }
543-
544276#[ inline( never) ]
545277#[ cold]
546278#[ track_caller]
0 commit comments