@@ -120,6 +120,87 @@ const fn contains_nonascii(x: usize) -> bool {
120120 ( x & NONASCII_MASK ) != 0
121121}
122122
123+ /// Reads the first code point out of a byte slice validating whether it’s
124+ /// valid.
125+ ///
126+ /// This is different than [`next_code_point`] in that it doesn’t assume
127+ /// argument is well-formed UTF-8-like string. Together with the character its
128+ /// encoded length is returned.
129+ ///
130+ /// If front of the bytes slice doesn’t contain valid UTF-8 bytes sequence (that
131+ /// includes a WTF-8 encoded surrogate) returns `None`.
132+ ///
133+ /// ```
134+ /// #![feature(str_internals)]
135+ /// use core::str::try_next_code_point;
136+ ///
137+ /// assert_eq!(Some(('f', 1)), try_next_code_point(b"foo".as_ref()));
138+ /// assert_eq!(Some(('Ż', 2)), try_next_code_point("Żółw".as_bytes()));
139+ /// assert_eq!(None, try_next_code_point(b"\xffoo".as_ref()));
140+ /// ```
141+ #[ unstable( feature = "str_internals" , issue = "none" ) ]
142+ #[ inline]
143+ pub const fn try_next_code_point ( bytes : & [ u8 ] ) -> Option < ( char , usize ) > {
144+ let first = match bytes. first ( ) {
145+ Some ( & byte) => byte,
146+ None => return None ,
147+ } ;
148+ let ( value, length) = if first < 0x80 {
149+ ( first as u32 , 1 )
150+ } else if let Ok ( ( cp, len) ) = try_finish_byte_sequence ( first, bytes, 0 ) {
151+ ( cp, len)
152+ } else {
153+ return None ;
154+ } ;
155+ // SAFETY: We’ve just verified value is correct Unicode scalar value.
156+ // Either ASCII (first branch of the if-else-if-else) or non-ASCII Unicode
157+ // character (second branch).
158+ Some ( ( unsafe { char:: from_u32_unchecked ( value) } , length) )
159+ }
160+
161+ /// Reads the last code point out of a byte slice validating whether it’s
162+ /// valid.
163+ ///
164+ /// This is different than `next_code_point_reverse` in that it doesn’t assume
165+ /// argument is well-formed UTF-8-like string. Together with the character its
166+ /// encoded length is returned.
167+ ///
168+ /// If back of the bytes slice doesn’t contain valid UTF-8 bytes sequence (that
169+ /// includes a WTF-8 encoded surrogate) returns `None`.
170+ ///
171+ /// ```
172+ /// #![feature(str_internals)]
173+ /// use core::str::try_next_code_point_reverse;
174+ ///
175+ /// assert_eq!(Some(('o', 1)), try_next_code_point_reverse(b"foo".as_ref()));
176+ /// assert_eq!(Some(('‽', 3)), try_next_code_point_reverse("Uh‽".as_bytes()));
177+ /// assert_eq!(None, try_next_code_point_reverse(b"foo\xff".as_ref()));
178+ /// ```
179+ #[ unstable( feature = "str_internals" , issue = "none" ) ]
180+ #[ inline]
181+ pub const fn try_next_code_point_reverse ( bytes : & [ u8 ] ) -> Option < ( char , usize ) > {
182+ let mut n = 1 ;
183+ let limit = bytes. len ( ) ;
184+ let limit = if limit < 4 { limit } else { 4 } ; // not .min(4) because of const
185+ while n <= limit && !bytes[ bytes. len ( ) - n] . is_utf8_char_boundary ( ) {
186+ n += 1 ;
187+ }
188+ if n <= limit {
189+ // It’s not clear to me why, but range indexing isn’t const here,
190+ // i.e. `&bytes[bytes.len() - n..]` doesn’t compile. Because of that
191+ // I’m resorting to unsafe block with from_raw_parts.
192+ // SAFETY: n ≤ limit ≤ bytes.len() thus bytes.len() - n ≥ 0 and we
193+ // have n remaining bytes.
194+ let bytes = unsafe { crate :: slice:: from_raw_parts ( bytes. as_ptr ( ) . add ( bytes. len ( ) - n) , n) } ;
195+ if let Some ( ( chr, len) ) = try_next_code_point ( bytes) {
196+ if n == len {
197+ return Some ( ( chr, len) ) ;
198+ }
199+ }
200+ }
201+ None
202+ }
203+
123204/// Walks through `v` checking that it's a valid UTF-8 sequence,
124205/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
125206#[ inline( always) ]
@@ -134,78 +215,13 @@ pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
134215 let align = v. as_ptr ( ) . align_offset ( usize_bytes) ;
135216
136217 while index < len {
137- let old_offset = index;
138- macro_rules! err {
139- ( $error_len: expr) => {
140- return Err ( Utf8Error { valid_up_to: old_offset, error_len: $error_len } )
141- } ;
142- }
143-
144- macro_rules! next {
145- ( ) => { {
146- index += 1 ;
147- // we needed data, but there was none: error!
148- if index >= len {
149- err!( None )
150- }
151- v[ index]
152- } } ;
153- }
154-
218+ let valid_up_to = index;
155219 let first = v[ index] ;
156220 if first >= 128 {
157- let w = utf8_char_width ( first) ;
158- // 2-byte encoding is for codepoints \u{0080} to \u{07ff}
159- // first C2 80 last DF BF
160- // 3-byte encoding is for codepoints \u{0800} to \u{ffff}
161- // first E0 A0 80 last EF BF BF
162- // excluding surrogates codepoints \u{d800} to \u{dfff}
163- // ED A0 80 to ED BF BF
164- // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
165- // first F0 90 80 80 last F4 8F BF BF
166- //
167- // Use the UTF-8 syntax from the RFC
168- //
169- // https://tools.ietf.org/html/rfc3629
170- // UTF8-1 = %x00-7F
171- // UTF8-2 = %xC2-DF UTF8-tail
172- // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
173- // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
174- // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
175- // %xF4 %x80-8F 2( UTF8-tail )
176- match w {
177- 2 => {
178- if next ! ( ) as i8 >= -64 {
179- err ! ( Some ( 1 ) )
180- }
181- }
182- 3 => {
183- match ( first, next ! ( ) ) {
184- ( 0xE0 , 0xA0 ..=0xBF )
185- | ( 0xE1 ..=0xEC , 0x80 ..=0xBF )
186- | ( 0xED , 0x80 ..=0x9F )
187- | ( 0xEE ..=0xEF , 0x80 ..=0xBF ) => { }
188- _ => err ! ( Some ( 1 ) ) ,
189- }
190- if next ! ( ) as i8 >= -64 {
191- err ! ( Some ( 2 ) )
192- }
193- }
194- 4 => {
195- match ( first, next ! ( ) ) {
196- ( 0xF0 , 0x90 ..=0xBF ) | ( 0xF1 ..=0xF3 , 0x80 ..=0xBF ) | ( 0xF4 , 0x80 ..=0x8F ) => { }
197- _ => err ! ( Some ( 1 ) ) ,
198- }
199- if next ! ( ) as i8 >= -64 {
200- err ! ( Some ( 2 ) )
201- }
202- if next ! ( ) as i8 >= -64 {
203- err ! ( Some ( 3 ) )
204- }
205- }
206- _ => err ! ( Some ( 1 ) ) ,
221+ match try_finish_byte_sequence ( first, v, index) {
222+ Ok ( ( _value, length) ) => index += length,
223+ Err ( error_len) => return Err ( Utf8Error { valid_up_to, error_len } ) ,
207224 }
208- index += 1 ;
209225 } else {
210226 // Ascii case, try to skip forward quickly.
211227 // When the pointer is aligned, read 2 words of data per iteration
@@ -241,6 +257,93 @@ pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
241257 Ok ( ( ) )
242258}
243259
260+ /// Try to finish an UTF-8 byte sequence.
261+ ///
262+ /// Assumes that `bytes[index] == first` and than `first >= 128`, i.e. that
263+ /// `index` points at the beginning of a non-ASCII UTF-8 sequence in `bytes`.
264+ ///
265+ /// If the byte sequence at the index is correct, returns decoded code point and
266+ /// length of the sequence. If it was invalid returns number of invalid bytes
267+ /// or None if read was cut short.
268+ #[ inline( always) ]
269+ #[ rustc_const_unstable( feature = "str_internals" , issue = "none" ) ]
270+ const fn try_finish_byte_sequence (
271+ first : u8 ,
272+ bytes : & [ u8 ] ,
273+ index : usize ,
274+ ) -> Result < ( u32 , usize ) , Option < u8 > > {
275+ macro_rules! get {
276+ ( raw $offset: expr) => {
277+ if index + $offset < bytes. len( ) {
278+ bytes[ index + $offset]
279+ } else {
280+ return Err ( None )
281+ }
282+ } ;
283+ ( cont $offset: expr) => { {
284+ let byte = get!( raw $offset) ;
285+ if !utf8_is_cont_byte( byte) {
286+ return Err ( Some ( $offset as u8 ) )
287+ }
288+ byte
289+ } }
290+ }
291+
292+ // 2-byte encoding is for codepoints \u{0080} to \u{07ff}
293+ // first C2 80 last DF BF
294+ // 3-byte encoding is for codepoints \u{0800} to \u{ffff}
295+ // first E0 A0 80 last EF BF BF
296+ // excluding surrogates codepoints \u{d800} to \u{dfff}
297+ // ED A0 80 to ED BF BF
298+ // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
299+ // first F0 90 80 80 last F4 8F BF BF
300+ //
301+ // Use the UTF-8 syntax from the RFC
302+ //
303+ // https://tools.ietf.org/html/rfc3629
304+ // UTF8-1 = %x00-7F
305+ // UTF8-2 = %xC2-DF UTF8-tail
306+ // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
307+ // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
308+ // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
309+ // %xF4 %x80-8F 2( UTF8-tail )
310+ match utf8_char_width ( first) {
311+ 2 => {
312+ let second = get ! ( cont 1 ) ;
313+ let value = utf8_first_byte ( first, 3 ) ;
314+ let value = utf8_acc_cont_byte ( value, second) ;
315+ Ok ( ( value, 2 ) )
316+ }
317+ 3 => {
318+ let second = get ! ( raw 1 ) ;
319+ match ( first, second) {
320+ ( 0xE0 , 0xA0 ..=0xBF )
321+ | ( 0xE1 ..=0xEC , 0x80 ..=0xBF )
322+ | ( 0xED , 0x80 ..=0x9F )
323+ | ( 0xEE ..=0xEF , 0x80 ..=0xBF ) => { }
324+ _ => return Err ( Some ( 1 ) ) ,
325+ }
326+ let value = utf8_first_byte ( first, 3 ) ;
327+ let value = utf8_acc_cont_byte ( value, second) ;
328+ let value = utf8_acc_cont_byte ( value, get ! ( cont 2 ) ) ;
329+ Ok ( ( value, 3 ) )
330+ }
331+ 4 => {
332+ let second = get ! ( raw 1 ) ;
333+ match ( first, second) {
334+ ( 0xF0 , 0x90 ..=0xBF ) | ( 0xF1 ..=0xF3 , 0x80 ..=0xBF ) | ( 0xF4 , 0x80 ..=0x8F ) => { }
335+ _ => return Err ( Some ( 1 ) ) ,
336+ }
337+ let value = utf8_first_byte ( first, 4 ) ;
338+ let value = utf8_acc_cont_byte ( value, second) ;
339+ let value = utf8_acc_cont_byte ( value, get ! ( cont 2 ) ) ;
340+ let value = utf8_acc_cont_byte ( value, get ! ( cont 3 ) ) ;
341+ Ok ( ( value, 4 ) )
342+ }
343+ _ => Err ( Some ( 1 ) ) ,
344+ }
345+ }
346+
244347// https://tools.ietf.org/html/rfc3629
245348const UTF8_CHAR_WIDTH : & [ u8 ; 256 ] = & [
246349 // 1 2 3 4 5 6 7 8 9 A B C D E F
0 commit comments