11//! Operations related to UTF-8 validation.
22
33use super :: Utf8Error ;
4+ use super :: error:: Utf8ErrorLen ;
45use crate :: intrinsics:: const_eval_select;
56
67/// Returns the initial codepoint accumulator for the first byte.
@@ -210,25 +211,26 @@ const fn is_utf8_first_byte(byte: u8) -> bool {
210211/// The caller must ensure `bytes[..i]` is a valid UTF-8 prefix and `st` is the DFA state after
211212/// executing on `bytes[..i]`.
212213#[ inline]
213- const unsafe fn resolve_error_location ( st : u32 , bytes : & [ u8 ] , i : usize ) -> ( usize , u8 ) {
214+ const unsafe fn resolve_error_location ( st : u32 , bytes : & [ u8 ] , i : usize ) -> Utf8Error {
214215 // There are two cases:
215216 // 1. [valid UTF-8..] | *here
216217 // The previous state must be ACCEPT for the case 1, and `valid_up_to = i`.
217218 // 2. [valid UTF-8..] | valid first byte, [valid continuation byte...], *here
218219 // `valid_up_to` is at the latest non-continuation byte, which must exist and
219220 // be in range `(i-3)..i`.
// `st` is the state reached after `bytes[..i]`; it decides which of the two cases applies.
220- if st & STATE_MASK == ST_ACCEPT {
221- ( i, 1 )
221+ let ( valid_up_to , error_len ) = if st & STATE_MASK == ST_ACCEPT {
// Case 1: no sequence is pending, so the valid prefix ends exactly at `i`.
222+ ( i, Utf8ErrorLen :: One )
222223 // SAFETY: UTF-8 first byte must exist if we are in an intermediate state.
223224 // We use pointer here because `get_unchecked` is not const fn.
224225 } else if is_utf8_first_byte ( unsafe { bytes. as_ptr ( ) . add ( i - 1 ) . read ( ) } ) {
225- ( i - 1 , 1 )
226+ ( i - 1 , Utf8ErrorLen :: One )
226227 // SAFETY: Same as above.
227228 } else if is_utf8_first_byte ( unsafe { bytes. as_ptr ( ) . add ( i - 2 ) . read ( ) } ) {
228- ( i - 2 , 2 )
229+ ( i - 2 , Utf8ErrorLen :: Two )
229230 } else {
// Neither `bytes[i - 1]` nor `bytes[i - 2]` is a first byte, so the pending sequence
// is taken to start at `i - 3`, the last position left in the `(i-3)..i` range.
230- ( i - 3 , 3 )
231- }
231+ ( i - 3 , Utf8ErrorLen :: Three )
232+ } ;
// Bundle the resolved position and length into the error value handed back to the caller.
233+ Utf8Error { valid_up_to, error_len }
232234}
233235
234236// The simpler but slower algorithm to run DFA with error handling.
@@ -245,8 +247,7 @@ const unsafe fn run_with_error_handling(
245247 let new_st = next_state ( * st, bytes[ i] ) ;
246248 if new_st & STATE_MASK == ST_ERROR {
247249 // SAFETY: Guaranteed by the caller.
248- let ( valid_up_to, error_len) = unsafe { resolve_error_location ( * st, bytes, i) } ;
249- return Err ( Utf8Error { valid_up_to, error_len : Some ( error_len) } ) ;
250+ return Err ( unsafe { resolve_error_location ( * st, bytes, i) } ) ;
250251 }
251252 * st = new_st;
252253 i += 1 ;
@@ -256,7 +257,7 @@ const unsafe fn run_with_error_handling(
256257
257258/// Walks through `bytes` checking that it's a valid UTF-8 sequence,
258259/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
259- #[ inline( always ) ]
260+ #[ inline]
260261#[ rustc_allow_const_fn_unstable( const_eval_select) ] // fallback impl has same behavior
261262pub ( super ) const fn run_utf8_validation ( bytes : & [ u8 ] ) -> Result < ( ) , Utf8Error > {
262263 const_eval_select ( ( bytes, ) , run_utf8_validation_const, run_utf8_validation_rt)
@@ -273,8 +274,9 @@ const fn run_utf8_validation_const(bytes: &[u8]) -> Result<(), Utf8Error> {
273274 Ok ( ( ) )
274275 } else {
275276 // SAFETY: `st` is the last state after execution without encountering any error.
276- let ( valid_up_to, _) = unsafe { resolve_error_location ( st, bytes, bytes. len ( ) ) } ;
277- Err ( Utf8Error { valid_up_to, error_len : None } )
277+ let mut err = unsafe { resolve_error_location ( st, bytes, bytes. len ( ) ) } ;
278+ err. error_len = Utf8ErrorLen :: Eof ;
279+ Err ( err)
278280 }
279281 }
280282 }
@@ -333,8 +335,9 @@ fn run_utf8_validation_rt(bytes: &[u8]) -> Result<(), Utf8Error> {
333335
334336 if st & STATE_MASK != ST_ACCEPT {
335337 // SAFETY: Same as above.
336- let ( valid_up_to, _) = unsafe { resolve_error_location ( st, bytes, bytes. len ( ) ) } ;
337- return Err ( Utf8Error { valid_up_to, error_len : None } ) ;
338+ let mut err = unsafe { resolve_error_location ( st, bytes, bytes. len ( ) ) } ;
339+ err. error_len = Utf8ErrorLen :: Eof ;
340+ return Err ( err) ;
338341 }
339342
340343 Ok ( ( ) )
0 commit comments