From 7149260b090718a7a87b070cafe72bfbe8ed0a7c Mon Sep 17 00:00:00 2001 From: WaterWhisperer Date: Mon, 27 Oct 2025 17:21:03 +0800 Subject: [PATCH] feat(uri): make `Authority/PathAndQuery::from_static` const --- src/uri/authority.rs | 223 +++++++++++++++++++++++++------------------ src/uri/path.rs | 80 +++++++++++++++- 2 files changed, 208 insertions(+), 95 deletions(-) diff --git a/src/uri/authority.rs b/src/uri/authority.rs index 07aa6795..67754e45 100644 --- a/src/uri/authority.rs +++ b/src/uri/authority.rs @@ -8,6 +8,19 @@ use bytes::Bytes; use super::{ErrorKind, InvalidUri, Port, URI_CHARS}; use crate::byte_str::ByteStr; +/// Validation result for authority parsing. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum AuthorityError { + Empty, + InvalidUriChar, + InvalidAuthority, + TooManyColons, + MismatchedBrackets, + InvalidBracketUsage, + EmptyAfterAt, + InvalidPercent, +} + /// Represents the authority component of a URI. #[derive(Clone)] pub struct Authority { @@ -45,9 +58,14 @@ impl Authority { /// let authority = Authority::from_static("example.com"); /// assert_eq!(authority.host(), "example.com"); /// ``` - pub fn from_static(src: &'static str) -> Self { - Authority::from_shared(Bytes::from_static(src.as_bytes())) - .expect("static str is not valid authority") + #[inline] + pub const fn from_static(src: &'static str) -> Self { + match validate_authority_bytes(src.as_bytes()) { + Ok(end) if end == src.len() => Authority { + data: ByteStr::from_static(src), + }, + _ => panic!("static str is not valid authority"), // invalid, or has trailing data + } } /// Attempt to convert a `Bytes` buffer to a `Authority`. @@ -69,95 +87,19 @@ impl Authority { // Postcondition: for all Ok() returns, s[..ret.unwrap()] is valid UTF-8 where // ret is the return value.
pub(super) fn parse(s: &[u8]) -> Result<usize, InvalidUri> { - let mut colon_cnt = 0u32; - let mut start_bracket = false; - let mut end_bracket = false; - let mut has_percent = false; - let mut end = s.len(); - let mut at_sign_pos = None; - const MAX_COLONS: u32 = 8; // e.g., [FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80 - - // Among other things, this loop checks that every byte in s up to the - // first '/', '?', or '#' is a valid URI character (or in some contexts, - // a '%'). This means that each such byte is a valid single-byte UTF-8 - // code point. - for (i, &b) in s.iter().enumerate() { - match URI_CHARS[b as usize] { - b'/' | b'?' | b'#' => { - end = i; - break; - } - b':' => { - if colon_cnt >= MAX_COLONS { - return Err(ErrorKind::InvalidAuthority.into()); - } - colon_cnt += 1; - } - b'[' => { - if has_percent || start_bracket { - // Something other than the userinfo has a `%`, so reject it. - return Err(ErrorKind::InvalidAuthority.into()); - } - start_bracket = true; - } - b']' => { - if (!start_bracket) || end_bracket { - return Err(ErrorKind::InvalidAuthority.into()); - } - end_bracket = true; - - // Those were part of an IPv6 hostname, so forget them... - colon_cnt = 0; - has_percent = false; - } - b'@' => { - at_sign_pos = Some(i); - - // Those weren't a port colon, but part of the - // userinfo, so it needs to be forgotten. - colon_cnt = 0; - has_percent = false; - } - 0 if b == b'%' => { - // Per https://tools.ietf.org/html/rfc3986#section-3.2.1 and - // https://url.spec.whatwg.org/#authority-state - // the userinfo can have a percent-encoded username and password, - // so record that a `%` was found. If this turns out to be - // part of the userinfo, this flag will be cleared. - // Also per https://tools.ietf.org/html/rfc6874, percent-encoding can - // be used to indicate a zone identifier. - // If the flag hasn't been cleared at the end, that means this - // was part of the hostname (and not part of an IPv6 address), and - // will fail with an error.
- has_percent = true; - } - 0 => { - return Err(ErrorKind::InvalidUriChar.into()); - } - _ => {} + validate_authority_bytes(s).or_else(|e| { + Err(match e { + AuthorityError::Empty => return Ok(0), // empty input is not an error for `parse` + AuthorityError::InvalidUriChar => ErrorKind::InvalidUriChar, + AuthorityError::InvalidAuthority + | AuthorityError::MismatchedBrackets + | AuthorityError::InvalidBracketUsage + | AuthorityError::EmptyAfterAt + | AuthorityError::InvalidPercent + | AuthorityError::TooManyColons => ErrorKind::InvalidAuthority, } - } - - if start_bracket ^ end_bracket { - return Err(ErrorKind::InvalidAuthority.into()); - } - - if colon_cnt > 1 { - // Things like 'localhost:8080:3030' are rejected. - return Err(ErrorKind::InvalidAuthority.into()); - } - - if end > 0 && at_sign_pos == Some(end - 1) { - // If there's nothing after an `@`, this is bonkers. - return Err(ErrorKind::InvalidAuthority.into()); - } - - if has_percent { - // Something after the userinfo has a `%`, so reject it. - return Err(ErrorKind::InvalidAuthority.into()); - } - - Ok(end) + .into()) + }) } // Parse bytes as an Authority, not allowing an empty string. @@ -528,6 +470,105 @@ where }) } +/// Shared validation logic for authority bytes. +/// Returns the end position of valid authority bytes, or an error. +const fn validate_authority_bytes(s: &[u8]) -> Result<usize, AuthorityError> { + if s.is_empty() { + return Err(AuthorityError::Empty); + } + + let mut colon_cnt: u32 = 0; + let mut start_bracket = false; + let mut end_bracket = false; + let mut has_percent = false; + let mut end = s.len(); + let mut at_sign_pos: usize = s.len(); + const MAX_COLONS: u32 = 8; // e.g., [FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80 + + let mut i = 0; + // Among other things, this loop checks that every byte in s up to the + // first '/', '?', or '#' is a valid URI character (or in some contexts, + // a '%'). This means that each such byte is a valid single-byte UTF-8 + // code point.
+ while i < s.len() { + let b = s[i]; + let ch = URI_CHARS[b as usize]; + + if ch == b'/' || ch == b'?' || ch == b'#' { + end = i; + break; + } + + if ch == 0 { + if b == b'%' { + // Per https://tools.ietf.org/html/rfc3986#section-3.2.1 and + // https://url.spec.whatwg.org/#authority-state + // the userinfo can have a percent-encoded username and password, + // so record that a `%` was found. If this turns out to be + // part of the userinfo, this flag will be cleared. + // Also per https://tools.ietf.org/html/rfc6874, percent-encoding can + // be used to indicate a zone identifier. + // If the flag hasn't been cleared at the end, that means this + // was part of the hostname (and not part of an IPv6 address), and + // will fail with an error. + has_percent = true; + } else { + return Err(AuthorityError::InvalidUriChar); + } + } else if ch == b':' { + if colon_cnt >= MAX_COLONS { + return Err(AuthorityError::TooManyColons); + } + colon_cnt += 1; + } else if ch == b'[' { + if has_percent || start_bracket { + // Something other than the userinfo has a `%`, so reject it. + return Err(AuthorityError::InvalidBracketUsage); + } + start_bracket = true; + } else if ch == b']' { + if !start_bracket || end_bracket { + return Err(AuthorityError::InvalidBracketUsage); + } + end_bracket = true; + + // Those were part of an IPv6 hostname, so forget them... + colon_cnt = 0; + has_percent = false; + } else if ch == b'@' { + at_sign_pos = i; + + // Those weren't a port colon, but part of the + // userinfo, so it needs to be forgotten. + colon_cnt = 0; + has_percent = false; + } + + i += 1; + } + + if start_bracket != end_bracket { + return Err(AuthorityError::MismatchedBrackets); + } + + if colon_cnt > 1 { + // Things like 'localhost:8080:3030' are rejected. + return Err(AuthorityError::InvalidAuthority); + } + + if end > 0 && at_sign_pos == end - 1 { + // If there's nothing after an `@`, this is bonkers. 
+ return Err(AuthorityError::EmptyAfterAt); + } + + if has_percent { + // Something after the userinfo has a `%`, so reject it. + return Err(AuthorityError::InvalidPercent); + } + + Ok(end) +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/uri/path.rs b/src/uri/path.rs index 42db1f92..8f9356e8 100644 --- a/src/uri/path.rs +++ b/src/uri/path.rs @@ -7,6 +7,14 @@ use bytes::Bytes; use super::{ErrorKind, InvalidUri}; use crate::byte_str::ByteStr; +/// Validation result for path and query parsing. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum PathAndQueryError { + InvalidPathChar, + InvalidQueryChar, + FragmentNotAllowed, +} + /// Represents the path component of a URI #[derive(Clone)] pub struct PathAndQuery { @@ -138,10 +146,14 @@ impl PathAndQuery { /// assert_eq!(v.query(), Some("world")); /// ``` #[inline] - pub fn from_static(src: &'static str) -> Self { - let src = Bytes::from_static(src.as_bytes()); - - PathAndQuery::from_shared(src).unwrap() + pub const fn from_static(src: &'static str) -> Self { + match validate_path_and_query_bytes(src.as_bytes()) { + Ok(query) => PathAndQuery { + data: ByteStr::from_static(src), + query, + }, + Err(_) => panic!("static str is not valid path"), + } } /// Attempt to convert a `Bytes` buffer to a `PathAndQuery`. @@ -467,6 +479,66 @@ impl PartialOrd for String { } } +/// Shared validation logic for path and query bytes. +/// Returns the query position (or NONE), or an error. +const fn validate_path_and_query_bytes(bytes: &[u8]) -> Result<u16, PathAndQueryError> { + let mut query: u16 = NONE; + let mut i: usize = 0; + + // path ... + while i < bytes.len() { + let b = bytes[i]; + if b == b'?'
{ + query = i as u16; + i += 1; + break; + } else if b == b'#' { + return Err(PathAndQueryError::FragmentNotAllowed); + } else { + let allowed = b == 0x21 + || (b >= 0x24 && b <= 0x3B) + || b == 0x3D + || (b >= 0x40 && b <= 0x5F) + || (b >= 0x61 && b <= 0x7A) + || b == 0x7C + || b == 0x7E + || b == b'"' + || b == b'{' + || b == b'}' + || (b >= 0x7F); + + if !allowed { + return Err(PathAndQueryError::InvalidPathChar); + } + } + i += 1; + } + + // query ... + if query != NONE { + while i < bytes.len() { + let b = bytes[i]; + if b == b'#' { + return Err(PathAndQueryError::FragmentNotAllowed); + } + + let allowed = b == 0x21 + || (b >= 0x24 && b <= 0x3B) + || b == 0x3D + || (b >= 0x3F && b <= 0x7E) + || (b >= 0x7F); + + if !allowed { + return Err(PathAndQueryError::InvalidQueryChar); + } + + i += 1; + } + } + + Ok(query) +} + #[cfg(test)] mod tests { use super::*;