Use UTF-8 error length enum to reduce register spill

oxalica · oxalica · commit dbba13b45968 · 2025-03-07T22:50:07.000-05:00
When using `error_len: Option<u8>`, `Result<(), Utf8Error>` is returned
on the stack, which produces suboptimal stack shuffling operations. This
causes a 50%-200% latency increase on the error path.
diff --git a/library/core/src/str/error.rs b/library/core/src/str/error.rs
@@ -42,11 +42,24 @@ use crate::fmt;
 ///     }
 /// }
 /// ```
-#[derive(Copy, Eq, PartialEq, Clone, Debug)]
+#[derive(Copy, Eq, PartialEq, Clone)]
 #[stable(feature = "rust1", since = "1.0.0")]
 pub struct Utf8Error {
     pub(super) valid_up_to: usize,
-    pub(super) error_len: Option<u8>,
+    // Use a single value instead of the tagged enum `Option<u8>` so that `Result<(), Utf8Error>`
+    // fits in two machine words and `run_utf8_validation` does not need to return values on the
+    // stack on x86(_64). Register spill is very expensive in `run_utf8_validation` and can add up
+    // to a 200% latency penalty on the error path.
+    pub(super) error_len: Utf8ErrorLen,
+}
+
+#[derive(Copy, Eq, PartialEq, Clone)]
+#[repr(u8)]
+pub(super) enum Utf8ErrorLen {
+    Eof = 0,
+    One,
+    Two,
+    Three,
 }
 
 impl Utf8Error {
@@ -100,18 +113,28 @@ impl Utf8Error {
     #[must_use]
     #[inline]
     pub const fn error_len(&self) -> Option<usize> {
-        // FIXME(const-hack): This should become `map` again, once it's `const`
         match self.error_len {
-            Some(len) => Some(len as usize),
-            None => None,
+            Utf8ErrorLen::Eof => None,
+            // FIXME(136972): Direct `match` gives suboptimal codegen involving two table lookups.
+            len => Some(len as usize),
         }
     }
 }
 
+#[stable(feature = "rust1", since = "1.0.0")]
+impl fmt::Debug for Utf8Error {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Utf8Error")
+            .field("valid_up_to", &self.valid_up_to)
+            .field("error_len", &self.error_len())
+            .finish()
+    }
+}
+
 #[stable(feature = "rust1", since = "1.0.0")]
 impl fmt::Display for Utf8Error {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        if let Some(error_len) = self.error_len {
+        if let Some(error_len) = self.error_len() {
             write!(
                 f,
                 "invalid utf-8 sequence of {} bytes from index {}",
diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs
@@ -1,4 +1,5 @@
 use super::from_utf8_unchecked;
+use super::validations::run_utf8_validation;
 use crate::fmt;
 use crate::fmt::{Formatter, Write};
 use crate::iter::FusedIterator;
@@ -196,8 +197,10 @@ impl<'a> Iterator for Utf8Chunks<'a> {
             return None;
         }
 
-        match super::from_utf8(self.source) {
-            Ok(valid) => {
+        match run_utf8_validation(self.source) {
+            Ok(()) => {
+                // SAFETY: The whole `source` is valid UTF-8.
+                let valid = unsafe { from_utf8_unchecked(&self.source) };
                 // Truncate the slice, no need to touch the pointer.
                 self.source = &self.source[..0];
                 Some(Utf8Chunk { valid, invalid: &[] })
diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs
@@ -1,6 +1,7 @@
 //! Operations related to UTF-8 validation.
 
 use super::Utf8Error;
+use super::error::Utf8ErrorLen;
 use crate::intrinsics::const_eval_select;
 
 /// Returns the initial codepoint accumulator for the first byte.
@@ -210,25 +211,26 @@ const fn is_utf8_first_byte(byte: u8) -> bool {
 /// The caller must ensure `bytes[..i]` is a valid UTF-8 prefix and `st` is the DFA state after
 /// executing on `bytes[..i]`.
 #[inline]
-const unsafe fn resolve_error_location(st: u32, bytes: &[u8], i: usize) -> (usize, u8) {
+const unsafe fn resolve_error_location(st: u32, bytes: &[u8], i: usize) -> Utf8Error {
     // There are two cases:
     // 1. [valid UTF-8..] | *here
     //    The previous state must be ACCEPT for the case 1, and `valid_up_to = i`.
     // 2. [valid UTF-8..] | valid first byte, [valid continuation byte...], *here
     //    `valid_up_to` is at the latest non-continuation byte, which must exist and
     //    be in range `(i-3)..i`.
-    if st & STATE_MASK == ST_ACCEPT {
-        (i, 1)
+    let (valid_up_to, error_len) = if st & STATE_MASK == ST_ACCEPT {
+        (i, Utf8ErrorLen::One)
     // SAFETY: UTF-8 first byte must exist if we are in an intermediate state.
     // We use pointer here because `get_unchecked` is not const fn.
     } else if is_utf8_first_byte(unsafe { bytes.as_ptr().add(i - 1).read() }) {
-        (i - 1, 1)
+        (i - 1, Utf8ErrorLen::One)
     // SAFETY: Same as above.
     } else if is_utf8_first_byte(unsafe { bytes.as_ptr().add(i - 2).read() }) {
-        (i - 2, 2)
+        (i - 2, Utf8ErrorLen::Two)
     } else {
-        (i - 3, 3)
-    }
+        (i - 3, Utf8ErrorLen::Three)
+    };
+    Utf8Error { valid_up_to, error_len }
 }
 
 // The simpler but slower algorithm to run DFA with error handling.
@@ -245,8 +247,7 @@ const unsafe fn run_with_error_handling(
         let new_st = next_state(*st, bytes[i]);
         if new_st & STATE_MASK == ST_ERROR {
             // SAFETY: Guaranteed by the caller.
-            let (valid_up_to, error_len) = unsafe { resolve_error_location(*st, bytes, i) };
-            return Err(Utf8Error { valid_up_to, error_len: Some(error_len) });
+            return Err(unsafe { resolve_error_location(*st, bytes, i) });
         }
         *st = new_st;
         i += 1;
@@ -256,7 +257,7 @@ const unsafe fn run_with_error_handling(
 
 /// Walks through `v` checking that it's a valid UTF-8 sequence,
 /// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
-#[inline(always)]
+#[inline]
 #[rustc_allow_const_fn_unstable(const_eval_select)] // fallback impl has same behavior
 pub(super) const fn run_utf8_validation(bytes: &[u8]) -> Result<(), Utf8Error> {
     const_eval_select((bytes,), run_utf8_validation_const, run_utf8_validation_rt)
@@ -273,8 +274,9 @@ const fn run_utf8_validation_const(bytes: &[u8]) -> Result<(), Utf8Error> {
                 Ok(())
             } else {
                 // SAFETY: `st` is the last state after execution without encountering any error.
-                let (valid_up_to, _) = unsafe { resolve_error_location(st, bytes, bytes.len()) };
-                Err(Utf8Error { valid_up_to, error_len: None })
+                let mut err = unsafe { resolve_error_location(st, bytes, bytes.len()) };
+                err.error_len = Utf8ErrorLen::Eof;
+                Err(err)
             }
         }
     }
@@ -333,8 +335,9 @@ fn run_utf8_validation_rt(bytes: &[u8]) -> Result<(), Utf8Error> {
 
     if st & STATE_MASK != ST_ACCEPT {
         // SAFETY: Same as above.
-        let (valid_up_to, _) = unsafe { resolve_error_location(st, bytes, bytes.len()) };
-        return Err(Utf8Error { valid_up_to, error_len: None });
+        let mut err = unsafe { resolve_error_location(st, bytes, bytes.len()) };
+        err.error_len = Utf8ErrorLen::Eof;
+        return Err(err);
     }
 
     Ok(())