From 28d312a27b770dc508fb1ae327089ce099bee652 Mon Sep 17 00:00:00 2001 From: ridiculousfish Date: Mon, 20 Mar 2023 11:39:06 -0700 Subject: [PATCH 01/10] Update generate-bindings instructions to install bindgen-cli `cargo install bindgen-cli` is how it's done these days. --- pcre2-sys/generate-bindings | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pcre2-sys/generate-bindings b/pcre2-sys/generate-bindings index 260cfd6..7cf0251 100755 --- a/pcre2-sys/generate-bindings +++ b/pcre2-sys/generate-bindings @@ -5,7 +5,7 @@ if ! command -V bindgen > /dev/null 2>&1; then echo "bindgen must be installed" >&2 - echo "to install: cargo install bindgen" >&2 + echo "to install: cargo install bindgen-cli" >&2 exit 1 fi if ! [ -f "$PCRE2SYS_HEADER" ]; then From 0637728f943f733f20b7a128f292d499f90734f0 Mon Sep 17 00:00:00 2001 From: ridiculousfish Date: Sat, 3 Feb 2024 10:18:17 -0800 Subject: [PATCH 02/10] Switch from jit to jit_if_available in tests The PCRE2 jit is disabled dependent on the platform in build.rs. If it is disabled, then tests which assume the jit is available will fail. Fix these tests by switching to jit_if_available. This fixes the static build on macOS. 
--- src/bytes.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bytes.rs b/src/bytes.rs index 5f02c4e..2129119 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -1343,7 +1343,7 @@ mod tests { let re = RegexBuilder::new() .extended(true) .utf(true) - .jit(true) + .jit_if_available(true) .build(pattern) .unwrap(); let matched = re.find(hay.as_bytes()).unwrap().unwrap(); @@ -1364,7 +1364,7 @@ mod tests { let re = RegexBuilder::new() .extended(true) .utf(true) - .jit(true) + .jit_if_available(true) .build(pattern) .unwrap(); let matched = re.find(hay.as_bytes()).unwrap().unwrap(); From 41a8085a2d0f1c154c7f6f6b8ec2d43aa2661d51 Mon Sep 17 00:00:00 2001 From: ridiculousfish Date: Tue, 21 Mar 2023 14:26:15 -0700 Subject: [PATCH 03/10] Factor bytes into regex_impl and prepare for UTF-32 This moves the "bytes" module into regex_impl, and equips it with a trait, in preparation for UTF-32 matching. --- pcre2-sys/generate-bindings | 8 +- src/bytes.rs | 1120 +---------------------------------- src/ffi.rs | 377 +++++++++--- src/lib.rs | 1 + src/regex_impl.rs | 1104 ++++++++++++++++++++++++++++++++++ 5 files changed, 1438 insertions(+), 1172 deletions(-) create mode 100644 src/regex_impl.rs diff --git a/pcre2-sys/generate-bindings b/pcre2-sys/generate-bindings index 7cf0251..0630da1 100755 --- a/pcre2-sys/generate-bindings +++ b/pcre2-sys/generate-bindings @@ -14,6 +14,12 @@ if ! 
[ -f "$PCRE2SYS_HEADER" ]; then exit 1 fi +if [ -z "$PCRE2_CODE_UNIT_WIDTH" ]; then + echo "The PCRE2_CODE_UNIT_WIDTH environment variable must be set" >&2 + echo "Valid values are 8, 16, and 32" >&2 + exit 1 +fi + bindgen \ "$PCRE2SYS_HEADER" \ --ctypes-prefix '::libc' \ @@ -22,4 +28,4 @@ bindgen \ --allowlist-var '^PCRE2_.*' \ --blocklist-function '^.*_callout_.*' \ --blocklist-type '^.*_callout_.*' \ - -- -DPCRE2_CODE_UNIT_WIDTH=8 > "$PCRE2SYS_BINDINGS" + -- -DPCRE2_CODE_UNIT_WIDTH=${PCRE2_CODE_UNIT_WIDTH} > "$PCRE2SYS_BINDINGS" diff --git a/src/bytes.rs b/src/bytes.rs index 2129119..da474df 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -1,1103 +1,25 @@ -use std::{ - collections::HashMap, - panic::{RefUnwindSafe, UnwindSafe}, - sync::Arc, -}; - -use pcre2_sys::{ - PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MATCH_INVALID_UTF, - PCRE2_MULTILINE, PCRE2_NEWLINE_ANYCRLF, PCRE2_UCP, PCRE2_UNSET, PCRE2_UTF, -}; +use crate::ffi::CodeUnitWidth8; +pub use crate::regex_impl::Match as MatchImpl; -use crate::{ - error::Error, - ffi::{Code, CompileContext, MatchConfig, MatchData}, - pool::{Pool, PoolGuard}, +#[doc(inline)] +pub use crate::regex_impl::{ + Regex as RegexImpl, RegexBuilder as RegexBuilderImpl, }; -/// Match represents a single match of a regex in a subject string. -/// -/// The lifetime parameter `'s` refers to the lifetime of the matched portion -/// of the subject string. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub struct Match<'s> { - subject: &'s [u8], - start: usize, - end: usize, -} - -impl<'s> Match<'s> { - /// Returns the starting byte offset of the match in the subject. - #[inline] - pub fn start(&self) -> usize { - self.start - } - - /// Returns the ending byte offset of the match in the subject. - #[inline] - pub fn end(&self) -> usize { - self.end - } - - /// Returns the matched portion of the subject string. 
- #[inline] - pub fn as_bytes(&self) -> &'s [u8] { - &self.subject[self.start..self.end] - } - - /// Creates a new match from the given subject string and byte offsets. - fn new(subject: &'s [u8], start: usize, end: usize) -> Match<'s> { - Match { subject, start, end } - } - - #[cfg(test)] - fn as_pair(&self) -> (usize, usize) { - (self.start, self.end) - } -} - -#[derive(Clone, Debug)] -struct Config { - /// PCRE2_CASELESS - caseless: bool, - /// PCRE2_DOTALL - dotall: bool, - /// PCRE2_EXTENDED - extended: bool, - /// PCRE2_MULTILINE - multi_line: bool, - /// PCRE2_NEWLINE_ANYCRLF - crlf: bool, - /// PCRE2_UCP - ucp: bool, - /// PCRE2_UTF - utf: bool, - /// use pcre2_jit_compile - jit: JITChoice, - /// Match-time specific configuration knobs. - match_config: MatchConfig, -} - -#[derive(Clone, Debug)] -enum JITChoice { - /// Never do JIT compilation. - Never, - /// Always do JIT compilation and return an error if it fails. - Always, - /// Attempt to do JIT compilation but silently fall back to non-JIT. - Attempt, -} - -impl Default for Config { - fn default() -> Config { - Config { - caseless: false, - dotall: false, - extended: false, - multi_line: false, - crlf: false, - ucp: false, - utf: false, - jit: JITChoice::Never, - match_config: MatchConfig::default(), - } - } -} - -/// A builder for configuring the compilation of a PCRE2 regex. -#[derive(Clone, Debug)] -pub struct RegexBuilder { - config: Config, -} - -impl RegexBuilder { - /// Create a new builder with a default configuration. - pub fn new() -> RegexBuilder { - RegexBuilder { config: Config::default() } - } - - /// Compile the given pattern into a PCRE regex using the current - /// configuration. - /// - /// If there was a problem compiling the pattern, then an error is - /// returned. 
- pub fn build(&self, pattern: &str) -> Result<Regex, Error> { - let mut options = 0; - if self.config.caseless { - options |= PCRE2_CASELESS; - } - if self.config.dotall { - options |= PCRE2_DOTALL; - } - if self.config.extended { - options |= PCRE2_EXTENDED; - } - if self.config.multi_line { - options |= PCRE2_MULTILINE; - } - if self.config.ucp { - options |= PCRE2_UCP; - options |= PCRE2_UTF; - options |= PCRE2_MATCH_INVALID_UTF; - } - if self.config.utf { - options |= PCRE2_UTF; - } - - let mut ctx = CompileContext::new(); - if self.config.crlf { - ctx.set_newline(PCRE2_NEWLINE_ANYCRLF) - .expect("PCRE2_NEWLINE_ANYCRLF is a legal value"); - } - - let mut code = Code::new(pattern, options, ctx)?; - match self.config.jit { - JITChoice::Never => {} // fallthrough - JITChoice::Always => { - code.jit_compile()?; - } - JITChoice::Attempt => { - if let Err(err) = code.jit_compile() { - log::debug!("JIT compilation failed: {}", err); - } - } - } - let capture_names = code.capture_names()?; - let mut idx = HashMap::new(); - for (i, group) in capture_names.iter().enumerate() { - if let Some(ref name) = *group { - idx.insert(name.to_string(), i); - } - } - let code = Arc::new(code); - let match_data = { - let config = self.config.match_config.clone(); - let code = Arc::clone(&code); - let create: MatchDataPoolFn = - Box::new(move || MatchData::new(config.clone(), &code)); - Pool::new(create) - }; - Ok(Regex { - config: Arc::new(self.config.clone()), - pattern: pattern.to_string(), - code, - capture_names: Arc::new(capture_names), - capture_names_idx: Arc::new(idx), - match_data, - }) - } - - /// Enables case insensitive matching. - /// - /// If the `utf` option is also set, then Unicode case folding is used - /// to determine case insensitivity. When the `utf` option is not set, - /// then only standard ASCII case insensitivity is considered. - /// - /// This option corresponds to the `i` flag. 
- pub fn caseless(&mut self, yes: bool) -> &mut RegexBuilder { - self.config.caseless = yes; - self - } - - /// Enables "dot all" matching. - /// - /// When enabled, the `.` metacharacter in the pattern matches any - /// character, include `\n`. When disabled (the default), `.` will match - /// any character except for `\n`. - /// - /// This option corresponds to the `s` flag. - pub fn dotall(&mut self, yes: bool) -> &mut RegexBuilder { - self.config.dotall = yes; - self - } - - /// Enable "extended" mode in the pattern, where whitespace is ignored. - /// - /// This option corresponds to the `x` flag. - pub fn extended(&mut self, yes: bool) -> &mut RegexBuilder { - self.config.extended = yes; - self - } - - /// Enable multiline matching mode. - /// - /// When enabled, the `^` and `$` anchors will match both at the beginning - /// and end of a subject string, in addition to matching at the start of - /// a line and the end of a line. When disabled, the `^` and `$` anchors - /// will only match at the beginning and end of a subject string. - /// - /// This option corresponds to the `m` flag. - pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { - self.config.multi_line = yes; - self - } - - /// Enable matching of CRLF as a line terminator. - /// - /// When enabled, anchors such as `^` and `$` will match any of the - /// following as a line terminator: `\r`, `\n` or `\r\n`. - /// - /// This is disabled by default, in which case, only `\n` is recognized as - /// a line terminator. - pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder { - self.config.crlf = yes; - self - } - - /// Enable Unicode matching mode. - /// - /// When enabled, the following patterns become Unicode aware: `\b`, `\B`, - /// `\d`, `\D`, `\s`, `\S`, `\w`, `\W`. - /// - /// When set, this implies UTF matching mode. It is not possible to enable - /// Unicode matching mode without enabling UTF matching mode. - /// - /// This is disabled by default. 
- pub fn ucp(&mut self, yes: bool) -> &mut RegexBuilder { - self.config.ucp = yes; - self - } - - /// Enable UTF matching mode. - /// - /// When enabled, characters are treated as sequences of code units that - /// make up a single codepoint instead of as single bytes. For example, - /// this will cause `.` to match any single UTF-8 encoded codepoint, where - /// as when this is disabled, `.` will any single byte (except for `\n` in - /// both cases, unless "dot all" mode is enabled). - /// - /// This is disabled by default. - pub fn utf(&mut self, yes: bool) -> &mut RegexBuilder { - self.config.utf = yes; - self - } - - /// This is now deprecated and is a no-op. - /// - /// Previously, this option permitted disabling PCRE2's UTF-8 validity - /// check, which could result in undefined behavior if the haystack was - /// not valid UTF-8. But PCRE2 introduced a new option, `PCRE2_MATCH_INVALID_UTF`, - /// in 10.34 which this crate always sets. When this option is enabled, - /// PCRE2 claims to not have undefined behavior when the haystack is - /// invalid UTF-8. - /// - /// Therefore, disabling the UTF-8 check is not something that is exposed - /// by this crate. - #[deprecated( - since = "0.2.4", - note = "now a no-op due to new PCRE2 features" - )] - pub fn disable_utf_check(&mut self) -> &mut RegexBuilder { - self - } - - /// Enable PCRE2's JIT and return an error if it's not available. - /// - /// This generally speeds up matching quite a bit. The downside is that it - /// can increase the time it takes to compile a pattern. - /// - /// If the JIT isn't available or if JIT compilation returns an error, then - /// regex compilation will fail with the corresponding error. - /// - /// This is disabled by default, and always overrides `jit_if_available`. - pub fn jit(&mut self, yes: bool) -> &mut RegexBuilder { - if yes { - self.config.jit = JITChoice::Always; - } else { - self.config.jit = JITChoice::Never; - } - self - } - - /// Enable PCRE2's JIT if it's available. 
- /// - /// This generally speeds up matching quite a bit. The downside is that it - /// can increase the time it takes to compile a pattern. - /// - /// If the JIT isn't available or if JIT compilation returns an error, - /// then a debug message with the error will be emitted and the regex will - /// otherwise silently fall back to non-JIT matching. - /// - /// This is disabled by default, and always overrides `jit`. - pub fn jit_if_available(&mut self, yes: bool) -> &mut RegexBuilder { - if yes { - self.config.jit = JITChoice::Attempt; - } else { - self.config.jit = JITChoice::Never; - } - self - } - - /// Set the maximum size of PCRE2's JIT stack, in bytes. If the JIT is - /// not enabled, then this has no effect. - /// - /// When `None` is given, no custom JIT stack will be created, and instead, - /// the default JIT stack is used. When the default is used, its maximum - /// size is 32 KB. - /// - /// When this is set, then a new JIT stack will be created with the given - /// maximum size as its limit. - /// - /// Increasing the stack size can be useful for larger regular expressions. - /// - /// By default, this is set to `None`. - pub fn max_jit_stack_size( - &mut self, - bytes: Option<usize>, - ) -> &mut RegexBuilder { - self.config.match_config.max_jit_stack_size = bytes; - self - } -} - -/// A compiled PCRE2 regular expression. +/// A compiled PCRE2 regular expression for matching bytes. /// /// This regex is safe to use from multiple threads simultaneously. For top /// performance, it is better to clone a new regex for each thread. -pub struct Regex { - /// The configuration used to build the regex. - config: Arc<Config>, - /// The original pattern string. - pattern: String, - /// The underlying compiled PCRE2 object. - code: Arc<Code>, - /// The capture group names for this regex. - capture_names: Arc<Vec<Option<String>>>, - /// A map from capture group name to capture group index. - capture_names_idx: Arc<HashMap<String, usize>>, - /// A pool of mutable scratch data used by PCRE2 during matching. 
- match_data: MatchDataPool, -} - -impl Clone for Regex { - fn clone(&self) -> Regex { - let match_data = { - let config = self.config.match_config.clone(); - let code = Arc::clone(&self.code); - let create: MatchDataPoolFn = - Box::new(move || MatchData::new(config.clone(), &code)); - Pool::new(create) - }; - Regex { - config: Arc::clone(&self.config), - pattern: self.pattern.clone(), - code: Arc::clone(&self.code), - capture_names: Arc::clone(&self.capture_names), - capture_names_idx: Arc::clone(&self.capture_names_idx), - match_data, - } - } -} - -impl std::fmt::Debug for Regex { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "Regex({:?})", self.pattern) - } -} - -impl Regex { - /// Compiles a regular expression using the default configuration. - /// - /// Once compiled, it can be used repeatedly to search, split or replace - /// text in a string. - /// - /// If an invalid expression is given, then an error is returned. - /// - /// To configure compilation options for the regex, use the - /// [`RegexBuilder`](struct.RegexBuilder.html). - pub fn new(pattern: &str) -> Result<Regex, Error> { - RegexBuilder::new().build(pattern) - } - - /// Returns true if and only if the regex matches the subject string given. - /// - /// # Example - /// - /// Test if some text contains at least one word with exactly 13 ASCII word - /// bytes: - /// - /// ```rust - /// # fn example() -> Result<(), ::pcre2::Error> { - /// use pcre2::bytes::Regex; - /// - /// let text = b"I categorically deny having triskaidekaphobia."; - /// assert!(Regex::new(r"\b\w{13}\b")?.is_match(text)?); - /// # Ok(()) }; example().unwrap() - /// ``` - pub fn is_match(&self, subject: &[u8]) -> Result<bool, Error> { - self.is_match_at(subject, 0) - } - - /// Returns the start and end byte range of the leftmost-first match in - /// `subject`. If no match exists, then `None` is returned. 
- /// - /// # Example - /// - /// Find the start and end location of the first word with exactly 13 - /// ASCII word bytes: - /// - /// ```rust - /// # fn example() -> Result<(), ::pcre2::Error> { - /// use pcre2::bytes::Regex; - /// - /// let text = b"I categorically deny having triskaidekaphobia."; - /// let mat = Regex::new(r"\b\w{13}\b")?.find(text)?.unwrap(); - /// assert_eq!((mat.start(), mat.end()), (2, 15)); - /// # Ok(()) }; example().unwrap() - /// ``` - pub fn find<'s>( - &self, - subject: &'s [u8], - ) -> Result<Option<Match<'s>>, Error> { - self.find_at(subject, 0) - } - - /// Returns an iterator for each successive non-overlapping match in - /// `subject`, returning the start and end byte indices with respect to - /// `subject`. - /// - /// # Example - /// - /// Find the start and end location of every word with exactly 13 ASCII - /// word bytes: - /// - /// ```rust - /// # fn example() -> Result<(), ::pcre2::Error> { - /// use pcre2::bytes::Regex; - /// - /// let text = b"Retroactively relinquishing remunerations is reprehensible."; - /// for result in Regex::new(r"\b\w{13}\b")?.find_iter(text) { - /// let mat = result?; - /// println!("{:?}", mat); - /// } - /// # Ok(()) }; example().unwrap() - /// ``` - pub fn find_iter<'r, 's>(&'r self, subject: &'s [u8]) -> Matches<'r, 's> { - Matches { - re: self, - match_data: self.match_data(), - subject, - last_end: 0, - last_match: None, - } - } - - /// Returns the capture groups corresponding to the leftmost-first - /// match in `subject`. Capture group `0` always corresponds to the entire - /// match. If no match is found, then `None` is returned. - /// - /// # Examples - /// - /// Say you have some text with movie names and their release years, - /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text - /// looking like that, while also extracting the movie name and its release - /// year separately. 
- /// - /// ```rust - /// # fn example() -> Result<(), ::pcre2::Error> { - /// use pcre2::bytes::Regex; - /// - /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)")?; - /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; - /// let caps = re.captures(text)?.unwrap(); - /// assert_eq!(&caps[1], &b"Citizen Kane"[..]); - /// assert_eq!(&caps[2], &b"1941"[..]); - /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]); - /// // You can also access the groups by index using the Index notation. - /// // Note that this will panic on an invalid index. - /// assert_eq!(&caps[1], b"Citizen Kane"); - /// assert_eq!(&caps[2], b"1941"); - /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); - /// # Ok(()) }; example().unwrap() - /// ``` - /// - /// Note that the full match is at capture group `0`. Each subsequent - /// capture group is indexed by the order of its opening `(`. - /// - /// We can make this example a bit clearer by using *named* capture groups: - /// - /// ```rust - /// # fn example() -> Result<(), ::pcre2::Error> { - /// use pcre2::bytes::Regex; - /// - /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")?; - /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; - /// let caps = re.captures(text)?.unwrap(); - /// assert_eq!(&caps["title"], &b"Citizen Kane"[..]); - /// assert_eq!(&caps["year"], &b"1941"[..]); - /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]); - /// // You can also access the groups by name using the Index notation. - /// // Note that this will panic on an invalid group name. - /// assert_eq!(&caps["title"], b"Citizen Kane"); - /// assert_eq!(&caps["year"], b"1941"); - /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); - /// # Ok(()) }; example().unwrap() - /// ``` - /// - /// Here we name the capture groups, which we can access with the `name` - /// method or the `Index` notation with a `&str`. 
Note that the named - /// capture groups are still accessible with `get` or the `Index` notation - /// with a `usize`. - /// - /// The `0`th capture group is always unnamed, so it must always be - /// accessed with `get(0)` or `[0]`. - pub fn captures<'s>( - &self, - subject: &'s [u8], - ) -> Result<Option<Captures<'s>>, Error> { - let mut locs = self.capture_locations(); - Ok(self.captures_read(&mut locs, subject)?.map(move |_| Captures { - subject, - locs, - idx: Arc::clone(&self.capture_names_idx), - })) - } +pub type Regex = RegexImpl<CodeUnitWidth8>; - /// Returns an iterator over all the non-overlapping capture groups matched - /// in `subject`. This is operationally the same as `find_iter`, except it - /// yields information about capturing group matches. - /// - /// # Example - /// - /// We can use this to find all movie titles and their release years in - /// some text, where the movie is formatted like "'Title' (xxxx)": - /// - /// ```rust - /// # fn example() -> Result<(), ::pcre2::Error> { - /// use std::str; - /// - /// use pcre2::bytes::Regex; - /// - /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")?; - /// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; - /// for result in re.captures_iter(text) { - /// let caps = result?; - /// let title = str::from_utf8(&caps["title"]).unwrap(); - /// let year = str::from_utf8(&caps["year"]).unwrap(); - /// println!("Movie: {:?}, Released: {:?}", title, year); - /// } - /// // Output: - /// // Movie: Citizen Kane, Released: 1941 - /// // Movie: The Wizard of Oz, Released: 1939 - /// // Movie: M, Released: 1931 - /// # Ok(()) }; example().unwrap() - /// ``` - pub fn captures_iter<'r, 's>( - &'r self, - subject: &'s [u8], - ) -> CaptureMatches<'r, 's> { - CaptureMatches { re: self, subject, last_end: 0, last_match: None } - } -} - -/// Advanced or "lower level" search methods. 
-impl Regex { - /// Returns the same as is_match, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - pub fn is_match_at( - &self, - subject: &[u8], - start: usize, - ) -> Result<bool, Error> { - assert!( - start <= subject.len(), - "start ({}) must be <= subject.len() ({})", - start, - subject.len() - ); - - let options = 0; - let mut match_data = self.match_data(); - // SAFETY: We don't use any dangerous PCRE2 options. - let res = - unsafe { match_data.find(&self.code, subject, start, options) }; - PoolGuard::put(match_data); - res - } - - /// Returns the same as find, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - pub fn find_at<'s>( - &self, - subject: &'s [u8], - start: usize, - ) -> Result<Option<Match<'s>>, Error> { - let mut match_data = self.match_data(); - let res = - self.find_at_with_match_data(&mut match_data, subject, start); - PoolGuard::put(match_data); - res - } - - /// Like find_at, but accepts match data instead of acquiring one itself. - /// - /// This is useful for implementing the iterator, which permits avoiding - /// the synchronization overhead of acquiring the match data. - #[inline(always)] - fn find_at_with_match_data<'s>( - &self, - match_data: &mut MatchDataPoolGuard<'_>, - subject: &'s [u8], - start: usize, - ) -> Result<Option<Match<'s>>, Error> { - assert!( - start <= subject.len(), - "start ({}) must be <= subject.len() ({})", - start, - subject.len() - ); - - let options = 0; - // SAFETY: We don't use any dangerous PCRE2 options. - if unsafe { !match_data.find(&self.code, subject, start, options)? 
} { - return Ok(None); - } - let ovector = match_data.ovector(); - let (s, e) = (ovector[0], ovector[1]); - Ok(Some(Match::new(&subject, s, e))) - } - - /// This is like `captures`, but uses - /// [`CaptureLocations`](struct.CaptureLocations.html) - /// instead of - /// [`Captures`](struct.Captures.html) in order to amortize allocations. - /// - /// To create a `CaptureLocations` value, use the - /// `Regex::capture_locations` method. - /// - /// This returns the overall match if this was successful, which is always - /// equivalent to the `0`th capture group. - pub fn captures_read<'s>( - &self, - locs: &mut CaptureLocations, - subject: &'s [u8], - ) -> Result<Option<Match<'s>>, Error> { - self.captures_read_at(locs, subject, 0) - } - - /// Returns the same as `captures_read`, but starts the search at the given - /// offset and populates the capture locations given. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - pub fn captures_read_at<'s>( - &self, - locs: &mut CaptureLocations, - subject: &'s [u8], - start: usize, - ) -> Result<Option<Match<'s>>, Error> { - assert!( - start <= subject.len(), - "start ({}) must be <= subject.len() ({})", - start, - subject.len() - ); - - let options = 0; - // SAFETY: We don't use any dangerous PCRE2 options. - if unsafe { !locs.data.find(&self.code, subject, start, options)? } { - return Ok(None); - } - let ovector = locs.data.ovector(); - let (s, e) = (ovector[0], ovector[1]); - Ok(Some(Match::new(&subject, s, e))) - } -} - -/// Auxiliary methods. -impl Regex { - /// Returns the original pattern string for this regex. - pub fn as_str(&self) -> &str { - &self.pattern - } - - /// Returns a sequence of all capturing groups and their names, if present. 
- /// - /// The length of the slice returned is always equal to the result of - /// `captures_len`, which is the number of capturing groups (including the - /// capturing group for the entire pattern). - /// - /// Each entry in the slice is the name of the corresponding capturing - /// group, if one exists. The first capturing group (at index `0`) is - /// always unnamed. - /// - /// Capturing groups are indexed by the order of the opening parenthesis. - pub fn capture_names(&self) -> &[Option<String>] { - &self.capture_names - } - - /// Returns the number of capturing groups in the pattern. - /// - /// This is always 1 more than the number of syntactic groups in the - /// pattern, since the first group always corresponds to the entire match. - pub fn captures_len(&self) -> usize { - self.code.capture_count().expect("a valid capture count from PCRE2") - } - - /// Returns an empty set of capture locations that can be reused in - /// multiple calls to `captures_read` or `captures_read_at`. - pub fn capture_locations(&self) -> CaptureLocations { - CaptureLocations { - code: Arc::clone(&self.code), - data: self.new_match_data(), - } - } - - fn match_data(&self) -> MatchDataPoolGuard<'_> { - self.match_data.get() - } - - fn new_match_data(&self) -> MatchData { - MatchData::new(self.config.match_config.clone(), &self.code) - } -} - -/// CaptureLocations is a low level representation of the raw offsets of each -/// submatch. -/// -/// Primarily, this type is useful when using `Regex` APIs such as -/// `captures_read`, which permits amortizing the allocation in which capture -/// match locations are stored. -/// -/// In order to build a value of this type, you'll need to call the -/// `capture_locations` method on the `Regex` being used to execute the search. -/// The value returned can then be reused in subsequent searches. 
-pub struct CaptureLocations { - code: Arc<Code>, - data: MatchData, -} - -impl Clone for CaptureLocations { - fn clone(&self) -> CaptureLocations { - CaptureLocations { - code: Arc::clone(&self.code), - data: MatchData::new(self.data.config().clone(), &self.code), - } - } -} - -impl std::fmt::Debug for CaptureLocations { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let mut offsets: Vec<Option<usize>> = vec![]; - for &offset in self.data.ovector() { - if offset == PCRE2_UNSET { - offsets.push(None); - } else { - offsets.push(Some(offset)); - } - } - write!(f, "CaptureLocations(")?; - f.debug_list().entries(offsets).finish()?; - write!(f, ")") - } -} - -impl CaptureLocations { - /// Returns the start and end positions of the Nth capture group. - /// - /// This returns `None` if `i` is not a valid capture group or if the - /// capture group did not match anything. - /// - /// The positions returned are always byte indices with respect to the - /// original subject string matched. - #[inline] - pub fn get(&self, i: usize) -> Option<(usize, usize)> { - let ovec = self.data.ovector(); - let s = match ovec.get(i * 2) { - None => return None, - Some(&s) if s == PCRE2_UNSET => return None, - Some(&s) => s, - }; - let e = match ovec.get(i * 2 + 1) { - None => return None, - Some(&e) if e == PCRE2_UNSET => return None, - Some(&e) => e, - }; - Some((s, e)) - } - - /// Returns the total number of capturing groups. - /// - /// This is always at least `1` since every regex has at least `1` - /// capturing group that corresponds to the entire match. - #[inline] - pub fn len(&self) -> usize { - self.data.ovector().len() / 2 - } -} - -/// Captures represents a group of captured byte strings for a single match. -/// -/// The 0th capture always corresponds to the entire match. Each subsequent -/// index corresponds to the next capture group in the regex. 
If a capture -/// group is named, then the matched byte string is *also* available via the -/// `name` method. (Note that the 0th capture is always unnamed and so must be -/// accessed with the `get` method.) -/// -/// Positions returned from a capture group are always byte indices. -/// -/// `'s` is the lifetime of the matched subject string. -pub struct Captures<'s> { - subject: &'s [u8], - locs: CaptureLocations, - idx: Arc<HashMap<String, usize>>, -} - -impl<'s> Captures<'s> { - /// Returns the match associated with the capture group at index `i`. If - /// `i` does not correspond to a capture group, or if the capture group - /// did not participate in the match, then `None` is returned. - /// - /// # Examples - /// - /// Get the text of the match with a default of an empty string if this - /// group didn't participate in the match: - /// - /// ```rust - /// # fn example() -> Result<(), ::pcre2::Error> { - /// use pcre2::bytes::Regex; - /// - /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))")?; - /// let caps = re.captures(b"abc123")?.unwrap(); - /// - /// let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes()); - /// let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes()); - /// assert_eq!(text1, &b"123"[..]); - /// assert_eq!(text2, &b""[..]); - /// # Ok(()) }; example().unwrap() - /// ``` - pub fn get(&self, i: usize) -> Option<Match<'s>> { - self.locs.get(i).map(|(s, e)| Match::new(self.subject, s, e)) - } - - /// Returns the match for the capture group named `name`. If `name` isn't a - /// valid capture group or didn't match anything, then `None` is returned. - pub fn name(&self, name: &str) -> Option<Match<'s>> { - self.idx.get(name).and_then(|&i| self.get(i)) - } - - /// Returns the number of captured groups. - /// - /// This is always at least `1`, since every regex has at least one capture - /// group that corresponds to the full match. 
- #[inline] - pub fn len(&self) -> usize { - self.locs.len() - } -} - -impl<'s> std::fmt::Debug for Captures<'s> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - f.debug_tuple("Captures").field(&CapturesDebug(self)).finish() - } -} - -struct CapturesDebug<'c, 's: 'c>(&'c Captures<'s>); - -impl<'c, 's> std::fmt::Debug for CapturesDebug<'c, 's> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - fn escape_bytes(bytes: &[u8]) -> String { - let mut s = String::new(); - for &b in bytes { - s.push_str(&escape_byte(b)); - } - s - } - - fn escape_byte(byte: u8) -> String { - use std::ascii::escape_default; - - let escaped: Vec<u8> = escape_default(byte).collect(); - String::from_utf8_lossy(&escaped).into_owned() - } - - // We'd like to show something nice here, even if it means an - // allocation to build a reverse index. - let slot_to_name: HashMap<&usize, &String> = - self.0.idx.iter().map(|(a, b)| (b, a)).collect(); - let mut map = f.debug_map(); - for slot in 0..self.0.len() { - let m = self - .0 - .locs - .get(slot) - .map(|(s, e)| escape_bytes(&self.0.subject[s..e])); - if let Some(name) = slot_to_name.get(&slot) { - map.entry(&name, &m); - } else { - map.entry(&slot, &m); - } - } - map.finish() - } -} - -/// Get a group by index. -/// -/// `'s` is the lifetime of the matched subject string. -/// -/// The subject can't outlive the `Captures` object if this method is -/// used, because of how `Index` is defined (normally `a[i]` is part -/// of `a` and can't outlive it); to do that, use `get()` instead. -/// -/// # Panics -/// -/// If there is no group at the given index. -impl<'s> std::ops::Index<usize> for Captures<'s> { - type Output = [u8]; - - fn index(&self, i: usize) -> &[u8] { - self.get(i) - .map(|m| m.as_bytes()) - .unwrap_or_else(|| panic!("no group at index '{}'", i)) - } -} - -/// Get a group by name. 
-/// -/// `'s` is the lifetime of the matched subject string and `'i` is the lifetime -/// of the group name (the index). -/// -/// The text can't outlive the `Captures` object if this method is -/// used, because of how `Index` is defined (normally `a[i]` is part -/// of `a` and can't outlive it); to do that, use `name` instead. -/// -/// # Panics -/// -/// If there is no group named by the given value. -impl<'s, 'i> std::ops::Index<&'i str> for Captures<'s> { - type Output = [u8]; - - fn index<'a>(&'a self, name: &'i str) -> &'a [u8] { - self.name(name) - .map(|m| m.as_bytes()) - .unwrap_or_else(|| panic!("no group named '{}'", name)) - } -} - -/// An iterator over all non-overlapping matches for a particular subject -/// string. -/// -/// The iterator yields matches (if no error occurred while searching) -/// corresponding to the start and end of the match. The indices are byte -/// offsets. The iterator stops when no more matches can be found. -/// -/// `'r` is the lifetime of the compiled regular expression and `'s` is the -/// lifetime of the subject string. -pub struct Matches<'r, 's> { - re: &'r Regex, - match_data: MatchDataPoolGuard<'r>, - subject: &'s [u8], - last_end: usize, - last_match: Option<usize>, -} - -impl<'r, 's> Iterator for Matches<'r, 's> { - type Item = Result<Match<'s>, Error>; - - fn next(&mut self) -> Option<Result<Match<'s>, Error>> { - if self.last_end > self.subject.len() { - return None; - } - let res = self.re.find_at_with_match_data( - &mut self.match_data, - self.subject, - self.last_end, - ); - let m = match res { - Err(err) => return Some(Err(err)), - Ok(None) => return None, - Ok(Some(m)) => m, - }; - if m.start() == m.end() { - // This is an empty match. To ensure we make progress, start - // the next search at the smallest possible starting position - // of the next match following this one. - self.last_end = m.end() + 1; - // Don't accept empty matches immediately following a match. - // Just move on to the next match. 
- if Some(m.end()) == self.last_match { - return self.next(); - } - } else { - self.last_end = m.end(); - } - self.last_match = Some(m.end()); - Some(Ok(m)) - } -} +/// A builder for configuring the compilation of a PCRE2 regex. +pub type RegexBuilder = RegexBuilderImpl<CodeUnitWidth8>; -/// An iterator that yields all non-overlapping capture groups matching a -/// particular regular expression. -/// -/// The iterator stops when no more matches can be found. +/// Match represents a single match of a regex in a subject string. /// -/// `'r` is the lifetime of the compiled regular expression and `'s` is the -/// lifetime of the subject string. -pub struct CaptureMatches<'r, 's> { - re: &'r Regex, - subject: &'s [u8], - last_end: usize, - last_match: Option<usize>, -} - -impl<'r, 's> Iterator for CaptureMatches<'r, 's> { - type Item = Result<Captures<'s>, Error>; - - fn next(&mut self) -> Option<Result<Captures<'s>, Error>> { - if self.last_end > self.subject.len() { - return None; - } - let mut locs = self.re.capture_locations(); - let res = - self.re.captures_read_at(&mut locs, self.subject, self.last_end); - let m = match res { - Err(err) => return Some(Err(err)), - Ok(None) => return None, - Ok(Some(m)) => m, - }; - if m.start() == m.end() { - // This is an empty match. To ensure we make progress, start - // the next search at the smallest possible starting position - // of the next match following this one. - self.last_end = m.end() + 1; - // Don't accept empty matches immediately following a match. - // Just move on to the next match. - if Some(m.end()) == self.last_match { - return self.next(); - } - } else { - self.last_end = m.end(); - } - self.last_match = Some(m.end()); - Some(Ok(Captures { - subject: self.subject, - locs, - idx: Arc::clone(&self.re.capture_names_idx), - })) - } -} - -/// A type alias for our pool of `MatchData` that fixes the type parameters to -/// what we actually use in practice. 
-type MatchDataPool = Pool<MatchData, MatchDataPoolFn>; - -/// Same as above, but for the guard returned by a pool. -type MatchDataPoolGuard<'a> = PoolGuard<'a, MatchData, MatchDataPoolFn>; - -/// The type of the closure we use to create new caches. We need to spell out -/// all of the marker traits or else we risk leaking !MARKER impls. -type MatchDataPoolFn = - Box<dyn Fn() -> MatchData + Send + Sync + UnwindSafe + RefUnwindSafe>; +/// The lifetime parameter `'s` refers to the lifetime of the matched portion +/// of the subject string. +pub type Match<'s> = MatchImpl<'s, CodeUnitWidth8>; #[cfg(test)] mod tests { @@ -1186,6 +108,13 @@ mod tests { assert_eq!(re.find(b("β")).unwrap().unwrap().as_pair(), (0, 2)); } + #[test] + fn fmt_debug_works() { + let re = RegexBuilder::new().utf(false).build(".").unwrap(); + let m = re.find(b("x")).unwrap().unwrap(); + let _ = format!("{:?}", m); + } + #[test] fn jit4lyfe() { if is_jit_available() { @@ -1247,10 +176,11 @@ mod tests { ); // Test our internal map as well. - assert_eq!(re.capture_names_idx.len(), 3); - assert_eq!(re.capture_names_idx["foo"], 1); - assert_eq!(re.capture_names_idx["a"], 3); - assert_eq!(re.capture_names_idx["springsteen"], 4); + let capture_names_idx = re.get_capture_names_idxs(); + assert_eq!(capture_names_idx.len(), 3); + assert_eq!(capture_names_idx["foo"], 1); + assert_eq!(capture_names_idx["a"], 3); + assert_eq!(capture_names_idx["springsteen"], 4); } #[test] diff --git a/src/ffi.rs b/src/ffi.rs index bef2ab8..c170e79 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -5,17 +5,251 @@ unsafety, but this layer will take care of the obvious things, such as resource management and error handling. 
*/ -use std::{cmp, ptr, slice}; - +use crate::error::Error; +use std::{ + cmp, + marker::PhantomData, + panic::{RefUnwindSafe, UnwindSafe}, + ptr, slice, +}; use {libc::c_void, pcre2_sys::*}; -use crate::error::Error; +#[allow(non_camel_case_types)] +pub trait CodeUnitWidth: std::fmt::Debug + 'static { + type pcre2_code: UnwindSafe + RefUnwindSafe; + type pcre2_compile_context: UnwindSafe + RefUnwindSafe; + type pcre2_match_context; + type pcre2_match_data; + type pcre2_jit_stack; + type PCRE2_SPTR; + type SubjectChar: Copy; + type Pattern: Clone + std::fmt::Debug; + + fn escape_subject(subject: &[Self::SubjectChar]) -> String; + + fn pattern_to_sptr_len( + pattern: &Self::Pattern, + ) -> (Self::PCRE2_SPTR, usize); + fn subject_to_sptr_len( + subject: &[Self::SubjectChar], + ) -> (Self::PCRE2_SPTR, usize); + + unsafe fn pcre2_config( + arg1: u32, + arg2: *mut ::libc::c_void, + ) -> ::libc::c_int; + unsafe fn pcre2_code_free(arg1: *mut Self::pcre2_code); + unsafe fn pcre2_compile( + arg1: Self::PCRE2_SPTR, + arg2: usize, + arg3: u32, + arg4: *mut ::libc::c_int, + arg5: *mut ::libc::size_t, + arg6: *mut Self::pcre2_compile_context, + ) -> *mut Self::pcre2_code; + unsafe fn pcre2_pattern_info( + arg1: *const Self::pcre2_code, + arg2: u32, + arg3: *mut ::libc::c_void, + ) -> ::libc::c_int; + + unsafe fn pcre2_match( + arg1: *const Self::pcre2_code, + arg2: Self::PCRE2_SPTR, + arg3: usize, + arg4: usize, + arg5: u32, + arg6: *mut Self::pcre2_match_data, + arg7: *mut Self::pcre2_match_context, + ) -> ::libc::c_int; + + unsafe fn pcre2_jit_stack_create( + arg1: ::libc::size_t, + arg2: ::libc::size_t, + ) -> *mut Self::pcre2_jit_stack; + unsafe fn pcre2_jit_compile( + arg1: *mut Self::pcre2_code, + arg2: u32, + ) -> ::libc::c_int; + unsafe fn pcre2_jit_stack_assign( + arg1: *mut Self::pcre2_match_context, + arg3: *mut ::libc::c_void, + ); + unsafe fn pcre2_jit_stack_free(arg1: *mut Self::pcre2_jit_stack); + + unsafe fn pcre2_compile_context_create() -> *mut 
Self::pcre2_compile_context; + unsafe fn pcre2_set_newline( + arg1: *mut Self::pcre2_compile_context, + arg2: u32, + ) -> ::libc::c_int; + unsafe fn pcre2_compile_context_free( + arg1: *mut Self::pcre2_compile_context, + ); + + unsafe fn pcre2_match_context_create() -> *mut Self::pcre2_match_context; + unsafe fn pcre2_match_context_free(arg1: *mut Self::pcre2_match_context); + + unsafe fn pcre2_match_data_create_from_pattern( + arg1: *const Self::pcre2_code, + ) -> *mut Self::pcre2_match_data; + unsafe fn pcre2_match_data_free(arg1: *mut Self::pcre2_match_data); + + unsafe fn pcre2_get_ovector_pointer( + arg1: *mut Self::pcre2_match_data, + ) -> *mut usize; + unsafe fn pcre2_get_ovector_count( + arg1: *mut Self::pcre2_match_data, + ) -> u32; +} + +#[derive(Debug)] +pub struct CodeUnitWidth8; + +impl CodeUnitWidth for CodeUnitWidth8 { + type pcre2_code = pcre2_code_8; + type PCRE2_SPTR = PCRE2_SPTR8; + type pcre2_compile_context = pcre2_compile_context_8; + type pcre2_match_context = pcre2_match_context_8; + type pcre2_match_data = pcre2_match_data_8; + type pcre2_jit_stack = pcre2_jit_stack_8; + type SubjectChar = u8; + type Pattern = String; + + fn escape_subject(subject: &[Self::SubjectChar]) -> String { + use std::ascii::escape_default; + // Escape bytes. 
+ let mut s = String::new(); + for &b in subject { + let escaped: Vec<u8> = escape_default(b).collect(); + s.push_str(&String::from_utf8_lossy(&escaped)); + } + s + } + + fn pattern_to_sptr_len( + pattern: &Self::Pattern, + ) -> (Self::PCRE2_SPTR, usize) { + (pattern.as_ptr(), pattern.len()) + } + + fn subject_to_sptr_len( + subject: &[Self::SubjectChar], + ) -> (Self::PCRE2_SPTR, usize) { + (subject.as_ptr(), subject.len()) + } + + unsafe fn pcre2_config( + arg1: u32, + arg2: *mut ::libc::c_void, + ) -> ::libc::c_int { + pcre2_config_8(arg1, arg2) + } + unsafe fn pcre2_code_free(arg1: *mut Self::pcre2_code) { + pcre2_code_free_8(arg1) + } + unsafe fn pcre2_compile( + arg1: Self::PCRE2_SPTR, + arg2: usize, + arg3: u32, + arg4: *mut ::libc::c_int, + arg5: *mut ::libc::size_t, + arg6: *mut Self::pcre2_compile_context, + ) -> *mut Self::pcre2_code { + pcre2_compile_8(arg1, arg2, arg3, arg4, arg5, arg6) + } + + unsafe fn pcre2_jit_stack_create( + arg1: ::libc::size_t, + arg2: ::libc::size_t, + ) -> *mut Self::pcre2_jit_stack { + pcre2_jit_stack_create_8(arg1, arg2, ptr::null_mut()) + } + unsafe fn pcre2_jit_compile( + arg1: *mut Self::pcre2_code, + arg2: u32, + ) -> ::libc::c_int { + pcre2_jit_compile_8(arg1, arg2) + } + unsafe fn pcre2_jit_stack_assign( + arg1: *mut Self::pcre2_match_context, + arg3: *mut ::libc::c_void, + ) { + pcre2_jit_stack_assign_8(arg1, None, arg3) + } + unsafe fn pcre2_jit_stack_free(arg1: *mut Self::pcre2_jit_stack) { + pcre2_jit_stack_free_8(arg1) + } + + unsafe fn pcre2_pattern_info( + arg1: *const Self::pcre2_code, + arg2: u32, + arg3: *mut ::libc::c_void, + ) -> ::libc::c_int { + pcre2_pattern_info_8(arg1, arg2, arg3) + } + + unsafe fn pcre2_match( + arg1: *const Self::pcre2_code, + arg2: Self::PCRE2_SPTR, + arg3: usize, + arg4: usize, + arg5: u32, + arg6: *mut Self::pcre2_match_data, + arg7: *mut Self::pcre2_match_context, + ) -> ::libc::c_int { + pcre2_match_8(arg1, arg2, arg3, arg4, arg5, arg6, arg7) + } + + unsafe fn 
pcre2_compile_context_create() -> *mut Self::pcre2_compile_context + { + pcre2_compile_context_create_8(ptr::null_mut()) + } + unsafe fn pcre2_match_context_free(arg1: *mut Self::pcre2_match_context) { + pcre2_match_context_free_8(arg1) + } + + unsafe fn pcre2_set_newline( + arg1: *mut Self::pcre2_compile_context, + arg2: u32, + ) -> ::libc::c_int { + pcre2_set_newline_8(arg1, arg2) + } + unsafe fn pcre2_compile_context_free( + arg1: *mut Self::pcre2_compile_context, + ) { + pcre2_compile_context_free_8(arg1) + } + unsafe fn pcre2_match_context_create() -> *mut Self::pcre2_match_context { + pcre2_match_context_create_8(ptr::null_mut()) + } + + unsafe fn pcre2_match_data_create_from_pattern( + arg1: *const Self::pcre2_code, + ) -> *mut Self::pcre2_match_data { + pcre2_match_data_create_from_pattern_8(arg1, ptr::null_mut()) + } + unsafe fn pcre2_match_data_free(arg1: *mut Self::pcre2_match_data) { + pcre2_match_data_free_8(arg1) + } + + unsafe fn pcre2_get_ovector_pointer( + arg1: *mut Self::pcre2_match_data, + ) -> *mut usize { + pcre2_get_ovector_pointer_8(arg1) + } + unsafe fn pcre2_get_ovector_count( + arg1: *mut Self::pcre2_match_data, + ) -> u32 { + pcre2_get_ovector_count_8(arg1) + } +} /// Returns true if and only if PCRE2 believes that JIT is available. pub fn is_jit_available() -> bool { + type W = CodeUnitWidth8; let mut rc: u32 = 0; let error_code = unsafe { - pcre2_config_8(PCRE2_CONFIG_JIT, &mut rc as *mut _ as *mut c_void) + W::pcre2_config(PCRE2_CONFIG_JIT, &mut rc as *mut _ as *mut c_void) }; if error_code < 0 { // If PCRE2_CONFIG_JIT is a bad option, then there's a bug somewhere. @@ -61,14 +295,14 @@ pub fn escape(pattern: &str) -> String { } /// A low level representation of a compiled PCRE2 code object. -pub(crate) struct Code { - code: *mut pcre2_code_8, +pub(crate) struct Code<W: CodeUnitWidth> { + code: *mut W::pcre2_code, compiled_jit: bool, // We hang on to this but don't use it so that it gets freed when the // compiled code gets freed. 
It's not clear whether this is necessary or // not, but presumably doesn't cost us much to be conservative. #[allow(dead_code)] - ctx: CompileContext, + ctx: CompileContext<W>, } // SAFETY: Compiled PCRE2 code objects are immutable once built and explicitly @@ -77,28 +311,30 @@ pub(crate) struct Code { // One hitch here is that JIT compiling can write into a PCRE2 code object, but // we only ever JIT compile immediately after first building the code object // and before making it available to the caller. -unsafe impl Send for Code {} -unsafe impl Sync for Code {} +unsafe impl<W: CodeUnitWidth> Send for Code<W> {} +unsafe impl<W: CodeUnitWidth> Sync for Code<W> {} -impl Drop for Code { +impl<W: CodeUnitWidth> Drop for Code<W> { fn drop(&mut self) { - unsafe { pcre2_code_free_8(self.code) } + unsafe { W::pcre2_code_free(self.code) } } } -impl Code { +impl<W: CodeUnitWidth> Code<W> { /// Compile the given pattern with the given options. If there was a /// problem compiling the pattern, then return an error. pub(crate) fn new( - pattern: &str, + pattern: &W::Pattern, options: u32, - mut ctx: CompileContext, - ) -> Result<Code, Error> { + mut ctx: CompileContext<W>, + ) -> Result<Self, Error> { let (mut error_code, mut error_offset) = (0, 0); + let (pat_sptr, pat_len) = W::pattern_to_sptr_len(pattern); + let code = unsafe { - pcre2_compile_8( - pattern.as_ptr(), - pattern.len(), + W::pcre2_compile( + pat_sptr, + pat_len, options, &mut error_code, &mut error_offset, @@ -118,7 +354,7 @@ impl Code { /// an error. pub(crate) fn jit_compile(&mut self) -> Result<(), Error> { let error_code = - unsafe { pcre2_jit_compile_8(self.code, PCRE2_JIT_COMPLETE) }; + unsafe { W::pcre2_jit_compile(self.code, PCRE2_JIT_COMPLETE) }; if error_code == 0 { self.compiled_jit = true; Ok(()) @@ -168,7 +404,7 @@ impl Code { } /// Return the underlying raw pointer to the code object. 
- pub(crate) fn as_ptr(&self) -> *const pcre2_code_8 { + pub(crate) fn as_ptr(&self) -> *const W::pcre2_code { self.code } @@ -188,7 +424,7 @@ impl Code { fn raw_name_table(&self) -> Result<*const u8, Error> { let mut bytes: *const u8 = ptr::null(); let rc = unsafe { - pcre2_pattern_info_8( + W::pcre2_pattern_info( self.as_ptr(), PCRE2_INFO_NAMETABLE, &mut bytes as *mut *const u8 as *mut c_void, @@ -205,7 +441,7 @@ impl Code { fn name_count(&self) -> Result<usize, Error> { let mut count: u32 = 0; let rc = unsafe { - pcre2_pattern_info_8( + W::pcre2_pattern_info( self.as_ptr(), PCRE2_INFO_NAMECOUNT, &mut count as *mut u32 as *mut c_void, @@ -227,7 +463,7 @@ impl Code { fn name_entry_size(&self) -> Result<usize, Error> { let mut size: u32 = 0; let rc = unsafe { - pcre2_pattern_info_8( + W::pcre2_pattern_info( self.as_ptr(), PCRE2_INFO_NAMEENTRYSIZE, &mut size as *mut u32 as *mut c_void, @@ -246,7 +482,7 @@ impl Code { pub(crate) fn capture_count(&self) -> Result<usize, Error> { let mut count: u32 = 0; let rc = unsafe { - pcre2_pattern_info_8( + W::pcre2_pattern_info( self.as_ptr(), PCRE2_INFO_CAPTURECOUNT, &mut count as *mut u32 as *mut c_void, @@ -261,25 +497,27 @@ impl Code { } /// A low level representation of PCRE2's compilation context. -pub(crate) struct CompileContext(*mut pcre2_compile_context_8); +pub(crate) struct CompileContext<W: CodeUnitWidth>( + *mut W::pcre2_compile_context, +); // SAFETY: Compile contexts are safe to read from multiple threads // simultaneously. No interior mutability is used, so Sync is safe. 
-unsafe impl Send for CompileContext {} -unsafe impl Sync for CompileContext {} +unsafe impl<W: CodeUnitWidth> Send for CompileContext<W> {} +unsafe impl<W: CodeUnitWidth> Sync for CompileContext<W> {} -impl Drop for CompileContext { +impl<W: CodeUnitWidth> Drop for CompileContext<W> { fn drop(&mut self) { - unsafe { pcre2_compile_context_free_8(self.0) } + unsafe { W::pcre2_compile_context_free(self.0) } } } -impl CompileContext { +impl<W: CodeUnitWidth> CompileContext<W> { /// Create a new empty compilation context. /// /// If memory could not be allocated for the context, then this panics. - pub(crate) fn new() -> CompileContext { - let ctx = unsafe { pcre2_compile_context_create_8(ptr::null_mut()) }; + pub(crate) fn new() -> Self { + let ctx = unsafe { W::pcre2_compile_context_create() }; assert!(!ctx.is_null(), "could not allocate compile context"); CompileContext(ctx) } @@ -290,7 +528,7 @@ impl CompileContext { /// PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, PCRE2_NEWLINE_ANY or /// PCRE2_NEWLINE_NUL. Using any other value results in an error. pub(crate) fn set_newline(&mut self, value: u32) -> Result<(), Error> { - let rc = unsafe { pcre2_set_newline_8(self.0, value) }; + let rc = unsafe { W::pcre2_set_newline(self.0, value) }; if rc == 0 { Ok(()) } else { @@ -298,7 +536,7 @@ impl CompileContext { } } - fn as_mut_ptr(&mut self) -> *mut pcre2_compile_context_8 { + fn as_mut_ptr(&mut self) -> *mut W::pcre2_compile_context { self.0 } } @@ -322,13 +560,14 @@ impl Default for MatchConfig { /// Technically, a single match data block can be used with multiple regexes /// (not simultaneously), but in practice, we just create a single match data /// block for each regex for each thread it's used in. 
-pub(crate) struct MatchData { +pub(crate) struct MatchData<W: CodeUnitWidth> { config: MatchConfig, - match_context: *mut pcre2_match_context_8, - match_data: *mut pcre2_match_data_8, - jit_stack: Option<*mut pcre2_jit_stack_8>, + match_context: *mut W::pcre2_match_context, + match_data: *mut W::pcre2_match_data, + jit_stack: Option<*mut W::pcre2_jit_stack>, ovector_ptr: *const usize, ovector_count: u32, + _marker: PhantomData<W>, } // SAFETY: Match data blocks can be freely sent from one thread to another, @@ -337,36 +576,31 @@ pub(crate) struct MatchData { // data block for executing a search, which statically prevents simultaneous // reading/writing. It is legal to read match data blocks from multiple threads // simultaneously. -unsafe impl Send for MatchData {} -unsafe impl Sync for MatchData {} +unsafe impl<W: CodeUnitWidth> Send for MatchData<W> {} +unsafe impl<W: CodeUnitWidth> Sync for MatchData<W> {} -impl Drop for MatchData { +impl<W: CodeUnitWidth> Drop for MatchData<W> { fn drop(&mut self) { unsafe { if let Some(stack) = self.jit_stack { - pcre2_jit_stack_free_8(stack); + W::pcre2_jit_stack_free(stack); } - pcre2_match_data_free_8(self.match_data); - pcre2_match_context_free_8(self.match_context); + W::pcre2_match_data_free(self.match_data); + W::pcre2_match_context_free(self.match_context); } } } -impl MatchData { +impl<W: CodeUnitWidth> MatchData<W> { /// Create a new match data block from a compiled PCRE2 code object. /// /// This panics if memory could not be allocated for the block. 
- pub(crate) fn new(config: MatchConfig, code: &Code) -> MatchData { - let match_context = - unsafe { pcre2_match_context_create_8(ptr::null_mut()) }; + pub(crate) fn new(config: MatchConfig, code: &Code<W>) -> MatchData<W> { + let match_context = unsafe { W::pcre2_match_context_create() }; assert!(!match_context.is_null(), "failed to allocate match context"); - let match_data = unsafe { - pcre2_match_data_create_from_pattern_8( - code.as_ptr(), - ptr::null_mut(), - ) - }; + let match_data = + unsafe { W::pcre2_match_data_create_from_pattern(code.as_ptr()) }; assert!(!match_data.is_null(), "failed to allocate match data block"); let jit_stack = match config.max_jit_stack_size { @@ -374,18 +608,13 @@ impl MatchData { Some(_) if !code.compiled_jit => None, Some(max) => { let stack = unsafe { - pcre2_jit_stack_create_8( - cmp::min(max, 32 * 1 << 10), - max, - ptr::null_mut(), - ) + W::pcre2_jit_stack_create(cmp::min(max, 32 * 1 << 10), max) }; assert!(!stack.is_null(), "failed to allocate JIT stack"); unsafe { - pcre2_jit_stack_assign_8( + W::pcre2_jit_stack_assign( match_context, - None, stack as *mut c_void, ) }; @@ -393,9 +622,9 @@ impl MatchData { } }; - let ovector_ptr = unsafe { pcre2_get_ovector_pointer_8(match_data) }; + let ovector_ptr = unsafe { W::pcre2_get_ovector_pointer(match_data) }; assert!(!ovector_ptr.is_null(), "got NULL ovector pointer"); - let ovector_count = unsafe { pcre2_get_ovector_count_8(match_data) }; + let ovector_count = unsafe { W::pcre2_get_ovector_count(match_data) }; MatchData { config, match_context, @@ -403,6 +632,7 @@ impl MatchData { jit_stack, ovector_ptr, ovector_count, + _marker: PhantomData, } } @@ -428,8 +658,8 @@ impl MatchData { /// valid UTF-8, then the result is undefined. 
pub(crate) unsafe fn find( &mut self, - code: &Code, - mut subject: &[u8], + code: &Code<W>, + mut subject: &[W::SubjectChar], start: usize, options: u32, ) -> Result<bool, Error> { @@ -438,18 +668,18 @@ impl MatchData { // from, e.g., an empty `Vec<u8>` may not have a valid // pointer, since creating an empty `Vec` is guaranteed // to not allocate. - const EMPTY: &[u8] = &[]; if subject.is_empty() { - subject = EMPTY; + subject = &[]; } + let (subj_ptr, subj_len) = W::subject_to_sptr_len(subject); - let rc = pcre2_match_8( + let rc = W::pcre2_match( code.as_ptr(), - subject.as_ptr(), - subject.len(), + subj_ptr, + subj_len, start, options, - self.as_mut_ptr(), + self.match_data, self.match_context, ); if rc == PCRE2_ERROR_NOMATCH { @@ -465,11 +695,6 @@ impl MatchData { } } - /// Return a mutable reference to the underlying match data. - fn as_mut_ptr(&mut self) -> *mut pcre2_match_data_8 { - self.match_data - } - /// Return the ovector corresponding to this match data. /// /// The ovector represents match offsets as pairs. This always returns diff --git a/src/lib.rs b/src/lib.rs index 4de2df9..c117277 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,3 +25,4 @@ pub mod bytes; mod error; mod ffi; mod pool; +mod regex_impl; diff --git a/src/regex_impl.rs b/src/regex_impl.rs new file mode 100644 index 0000000..be7506c --- /dev/null +++ b/src/regex_impl.rs @@ -0,0 +1,1104 @@ +use std::{ + collections::HashMap, + fmt, + ops::Index, + panic::{RefUnwindSafe, UnwindSafe}, + sync::Arc, +}; + +use log::debug; +use pcre2_sys::{ + PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MATCH_INVALID_UTF, + PCRE2_MULTILINE, PCRE2_NEWLINE_ANYCRLF, PCRE2_UCP, PCRE2_UNSET, PCRE2_UTF, +}; + +use crate::{ + error::Error, + ffi::{Code, CodeUnitWidth, CompileContext, MatchConfig, MatchData}, + pool::{Pool, PoolGuard}, +}; + +/// Match represents a single match of a regex in a subject string. 
+/// +/// The lifetime parameter `'s` refers to the lifetime of the matched portion +/// of the subject string. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Match<'s, W: CodeUnitWidth> { + subject: &'s [W::SubjectChar], + start: usize, + end: usize, +} + +impl<'s, W: CodeUnitWidth> Match<'s, W> { + /// Returns the starting byte offset of the match in the subject. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the ending byte offset of the match in the subject. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns the matched portion of the subject string. + #[inline] + pub fn as_bytes(&self) -> &'s [W::SubjectChar] { + &self.subject[self.start..self.end] + } + + /// Creates a new match from the given subject string and byte offsets. + fn new(subject: &'s [W::SubjectChar], start: usize, end: usize) -> Self { + Match { subject, start, end } + } + + #[cfg(test)] + pub(crate) fn as_pair(&self) -> (usize, usize) { + (self.start, self.end) + } +} + +#[derive(Clone, Debug)] +struct Config { + /// PCRE2_CASELESS + caseless: bool, + /// PCRE2_DOTALL + dotall: bool, + /// PCRE2_EXTENDED + extended: bool, + /// PCRE2_MULTILINE + multi_line: bool, + /// PCRE2_NEWLINE_ANYCRLF + crlf: bool, + /// PCRE2_UCP + ucp: bool, + /// PCRE2_UTF + utf: bool, + /// use pcre2_jit_compile + jit: JITChoice, + /// Match-time specific configuration knobs. + match_config: MatchConfig, +} + +#[derive(Clone, Debug)] +enum JITChoice { + /// Never do JIT compilation. + Never, + /// Always do JIT compilation and return an error if it fails. + Always, + /// Attempt to do JIT compilation but silently fall back to non-JIT. 
+ Attempt, +} + +impl Default for Config { + fn default() -> Config { + Config { + caseless: false, + dotall: false, + extended: false, + multi_line: false, + crlf: false, + ucp: false, + utf: false, + jit: JITChoice::Never, + match_config: MatchConfig::default(), + } + } +} + +/// A builder for configuring the compilation of a PCRE2 regex. +/// This takes a phantom parameter to aid type inference. +#[derive(Clone, Debug)] +pub struct RegexBuilder<W: CodeUnitWidth> { + config: Config, + _phantom: std::marker::PhantomData<W>, +} + +impl<W: CodeUnitWidth> RegexBuilder<W> { + /// Create a new builder with a default configuration. + pub fn new() -> Self { + RegexBuilder { + config: Config::default(), + _phantom: std::marker::PhantomData, + } + } + + /// Compile the given pattern into a PCRE regex using the current + /// configuration. + /// + /// If there was a problem compiling the pattern, then an error is + /// returned. + pub fn build<Pat: Into<W::Pattern>>( + &self, + pattern: Pat, + ) -> Result<Regex<W>, Error> { + let mut options = 0; + if self.config.caseless { + options |= PCRE2_CASELESS; + } + if self.config.dotall { + options |= PCRE2_DOTALL; + } + if self.config.extended { + options |= PCRE2_EXTENDED; + } + if self.config.multi_line { + options |= PCRE2_MULTILINE; + } + if self.config.ucp { + options |= PCRE2_UCP; + options |= PCRE2_UTF; + options |= PCRE2_MATCH_INVALID_UTF; + } + if self.config.utf { + options |= PCRE2_UTF; + } + + let mut ctx = CompileContext::new(); + if self.config.crlf { + ctx.set_newline(PCRE2_NEWLINE_ANYCRLF) + .expect("PCRE2_NEWLINE_ANYCRLF is a legal value"); + } + + let pattern = pattern.into(); + let mut code = Code::new(&pattern, options, ctx)?; + match self.config.jit { + JITChoice::Never => {} // fallthrough + JITChoice::Always => { + code.jit_compile()?; + } + JITChoice::Attempt => { + if let Err(err) = code.jit_compile() { + debug!("JIT compilation failed: {}", err); + } + } + } + let capture_names = code.capture_names()?; + 
let mut idx = HashMap::new(); + for (i, group) in capture_names.iter().enumerate() { + if let Some(ref name) = *group { + idx.insert(name.to_string(), i); + } + } + let code = Arc::new(code); + let match_data = { + let config = self.config.match_config.clone(); + let code = Arc::clone(&code); + let create: MatchDataPoolFn<W> = + Box::new(move || MatchData::new(config.clone(), &code)); + Pool::new(create) + }; + Ok(Regex { + config: Arc::new(self.config.clone()), + pattern, + code, + capture_names: Arc::new(capture_names), + capture_names_idx: Arc::new(idx), + match_data, + }) + } + + /// Enables case insensitive matching. + /// + /// If the `utf` option is also set, then Unicode case folding is used + /// to determine case insensitivity. When the `utf` option is not set, + /// then only standard ASCII case insensitivity is considered. + /// + /// This option corresponds to the `i` flag. + pub fn caseless(&mut self, yes: bool) -> &mut Self { + self.config.caseless = yes; + self + } + + /// Enables "dot all" matching. + /// + /// When enabled, the `.` metacharacter in the pattern matches any + /// character, include `\n`. When disabled (the default), `.` will match + /// any character except for `\n`. + /// + /// This option corresponds to the `s` flag. + pub fn dotall(&mut self, yes: bool) -> &mut Self { + self.config.dotall = yes; + self + } + + /// Enable "extended" mode in the pattern, where whitespace is ignored. + /// + /// This option corresponds to the `x` flag. + pub fn extended(&mut self, yes: bool) -> &mut Self { + self.config.extended = yes; + self + } + + /// Enable multiline matching mode. + /// + /// When enabled, the `^` and `$` anchors will match both at the beginning + /// and end of a subject string, in addition to matching at the start of + /// a line and the end of a line. When disabled, the `^` and `$` anchors + /// will only match at the beginning and end of a subject string. + /// + /// This option corresponds to the `m` flag. 
    pub fn multi_line(&mut self, yes: bool) -> &mut Self {
        self.config.multi_line = yes;
        self
    }

    /// Enable matching of CRLF as a line terminator.
    ///
    /// When enabled, anchors such as `^` and `$` will match any of the
    /// following as a line terminator: `\r`, `\n` or `\r\n`.
    ///
    /// This is disabled by default, in which case, only `\n` is recognized as
    /// a line terminator.
    pub fn crlf(&mut self, yes: bool) -> &mut Self {
        self.config.crlf = yes;
        self
    }

    /// Enable Unicode matching mode.
    ///
    /// When enabled, the following patterns become Unicode aware: `\b`, `\B`,
    /// `\d`, `\D`, `\s`, `\S`, `\w`, `\W`.
    ///
    /// When set, this implies UTF matching mode. It is not possible to enable
    /// Unicode matching mode without enabling UTF matching mode.
    ///
    /// This is disabled by default.
    pub fn ucp(&mut self, yes: bool) -> &mut Self {
        self.config.ucp = yes;
        self
    }

    /// Enable UTF matching mode.
    ///
    /// When enabled, characters are treated as sequences of code units that
    /// make up a single codepoint instead of as single bytes. For example,
    /// this will cause `.` to match any single UTF-8 encoded codepoint,
    /// whereas when this is disabled, `.` will match any single byte (except
    /// for `\n` in both cases, unless "dot all" mode is enabled).
    ///
    /// This is disabled by default.
    pub fn utf(&mut self, yes: bool) -> &mut Self {
        self.config.utf = yes;
        self
    }

    /// This is now deprecated and is a no-op.
    ///
    /// Previously, this option permitted disabling PCRE2's UTF-8 validity
    /// check, which could result in undefined behavior if the haystack was
    /// not valid UTF-8. But PCRE2 introduced a new option, `PCRE2_MATCH_INVALID_UTF`,
    /// in 10.34 which this crate always sets. When this option is enabled,
    /// PCRE2 claims to not have undefined behavior when the haystack is
    /// invalid UTF-8.
    ///
    /// Therefore, disabling the UTF-8 check is not something that is exposed
    /// by this crate.
+ #[deprecated( + since = "0.2.4", + note = "now a no-op due to new PCRE2 features" + )] + pub fn disable_utf_check(&mut self) -> &mut Self { + self + } + + /// Enable PCRE2's JIT and return an error if it's not available. + /// + /// This generally speeds up matching quite a bit. The downside is that it + /// can increase the time it takes to compile a pattern. + /// + /// If the JIT isn't available or if JIT compilation returns an error, then + /// regex compilation will fail with the corresponding error. + /// + /// This is disabled by default, and always overrides `jit_if_available`. + pub fn jit(&mut self, yes: bool) -> &mut Self { + if yes { + self.config.jit = JITChoice::Always; + } else { + self.config.jit = JITChoice::Never; + } + self + } + + /// Enable PCRE2's JIT if it's available. + /// + /// This generally speeds up matching quite a bit. The downside is that it + /// can increase the time it takes to compile a pattern. + /// + /// If the JIT isn't available or if JIT compilation returns an error, + /// then a debug message with the error will be emitted and the regex will + /// otherwise silently fall back to non-JIT matching. + /// + /// This is disabled by default, and always overrides `jit`. + pub fn jit_if_available(&mut self, yes: bool) -> &mut Self { + if yes { + self.config.jit = JITChoice::Attempt; + } else { + self.config.jit = JITChoice::Never; + } + self + } + + /// Set the maximum size of PCRE2's JIT stack, in bytes. If the JIT is + /// not enabled, then this has no effect. + /// + /// When `None` is given, no custom JIT stack will be created, and instead, + /// the default JIT stack is used. When the default is used, its maximum + /// size is 32 KB. + /// + /// When this is set, then a new JIT stack will be created with the given + /// maximum size as its limit. + /// + /// Increasing the stack size can be useful for larger regular expressions. + /// + /// By default, this is set to `None`. 
+ pub fn max_jit_stack_size(&mut self, bytes: Option<usize>) -> &mut Self { + self.config.match_config.max_jit_stack_size = bytes; + self + } +} + +/// A compiled PCRE2 regular expression. +/// +/// This regex is safe to use from multiple threads simultaneously. For top +/// performance, it is better to clone a new regex for each thread. +pub struct Regex<W: CodeUnitWidth> { + /// The configuration used to build the regex. + config: Arc<Config>, + /// The original pattern string. + pattern: W::Pattern, + /// The underlying compiled PCRE2 object. + code: Arc<Code<W>>, + /// The capture group names for this regex. + capture_names: Arc<Vec<Option<String>>>, + /// A map from capture group name to capture group index. + capture_names_idx: Arc<HashMap<String, usize>>, + /// A pool of mutable scratch data used by PCRE2 during matching. + match_data: MatchDataPool<W>, +} + +impl<W: CodeUnitWidth> Clone for Regex<W> { + fn clone(&self) -> Self { + let match_data = { + let config = self.config.match_config.clone(); + let code = Arc::clone(&self.code); + let create: MatchDataPoolFn<W> = + Box::new(move || MatchData::new(config.clone(), &code)); + Pool::new(create) + }; + Self { + config: Arc::clone(&self.config), + pattern: self.pattern.clone(), + code: Arc::clone(&self.code), + capture_names: Arc::clone(&self.capture_names), + capture_names_idx: Arc::clone(&self.capture_names_idx), + match_data, + } + } +} + +impl<W: CodeUnitWidth> fmt::Debug for Regex<W> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "Regex({:?})", self.pattern) + } +} + +impl<W: CodeUnitWidth> Regex<W> { + /// Compiles a regular expression using the default configuration. + /// + /// Once compiled, it can be used repeatedly to search, split or replace + /// text in a string. + /// + /// If an invalid expression is given, then an error is returned. + /// + /// To configure compilation options for the regex, use the + /// [`RegexBuilder`](struct.RegexBuilder.html). 
    pub fn new<Pat: Into<W::Pattern>>(pattern: Pat) -> Result<Self, Error> {
        RegexBuilder::new().build(pattern)
    }

    /// Returns true if and only if the regex matches the subject string given.
    ///
    /// This is `is_match_at` with the search starting at offset `0`.
    ///
    /// # Example
    ///
    /// Test if some text contains at least one word with exactly 13 ASCII word
    /// bytes:
    ///
    /// ```rust
    /// # fn example() -> Result<(), ::pcre2::Error> {
    /// use pcre2::bytes::Regex;
    ///
    /// let text = b"I categorically deny having triskaidekaphobia.";
    /// assert!(Regex::new(r"\b\w{13}\b")?.is_match(text)?);
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn is_match(&self, subject: &[W::SubjectChar]) -> Result<bool, Error> {
        self.is_match_at(subject, 0)
    }

    /// Returns the start and end byte range of the leftmost-first match in
    /// `subject`. If no match exists, then `None` is returned.
    ///
    /// This is `find_at` with the search starting at offset `0`.
    ///
    /// # Example
    ///
    /// Find the start and end location of the first word with exactly 13
    /// ASCII word bytes:
    ///
    /// ```rust
    /// # fn example() -> Result<(), ::pcre2::Error> {
    /// use pcre2::bytes::Regex;
    ///
    /// let text = b"I categorically deny having triskaidekaphobia.";
    /// let mat = Regex::new(r"\b\w{13}\b")?.find(text)?.unwrap();
    /// assert_eq!((mat.start(), mat.end()), (2, 15));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn find<'s>(
        &self,
        subject: &'s [W::SubjectChar],
    ) -> Result<Option<Match<'s, W>>, Error> {
        self.find_at(subject, 0)
    }

    /// Returns an iterator for each successive non-overlapping match in
    /// `subject`, returning the start and end byte indices with respect to
    /// `subject`.
    ///
    /// The iterator takes one match data value from this regex's pool when
    /// it is created and reuses it for every search it performs.
    ///
    /// # Example
    ///
    /// Find the start and end location of every word with exactly 13 ASCII
    /// word bytes:
    ///
    /// ```rust
    /// # fn example() -> Result<(), ::pcre2::Error> {
    /// use pcre2::bytes::Regex;
    ///
    /// let text = b"Retroactively relinquishing remunerations is reprehensible.";
    /// for result in Regex::new(r"\b\w{13}\b")?.find_iter(text) {
    ///     let mat = result?;
    ///     println!("{:?}", mat);
    /// }
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn find_iter<'r, 's>(
        &'r self,
        subject: &'s [W::SubjectChar],
    ) -> Matches<'r, 's, W> {
        Matches {
            re: self,
            match_data: self.match_data(),
            subject,
            last_end: 0,
            last_match: None,
        }
    }

    /// Returns the capture groups corresponding to the leftmost-first
    /// match in `subject`. Capture group `0` always corresponds to the entire
    /// match. If no match is found, then `None` is returned.
    ///
    /// # Examples
    ///
    /// Say you have some text with movie names and their release years,
    /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text
    /// looking like that, while also extracting the movie name and its release
    /// year separately.
    ///
    /// ```rust
    /// # fn example() -> Result<(), ::pcre2::Error> {
    /// use pcre2::bytes::Regex;
    ///
    /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)")?;
    /// let text = b"Not my favorite movie: 'Citizen Kane' (1941).";
    /// let caps = re.captures(text)?.unwrap();
    /// assert_eq!(&caps[1], &b"Citizen Kane"[..]);
    /// assert_eq!(&caps[2], &b"1941"[..]);
    /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]);
    /// // You can also access the groups by index using the Index notation.
    /// // Note that this will panic on an invalid index.
    /// assert_eq!(&caps[1], b"Citizen Kane");
    /// assert_eq!(&caps[2], b"1941");
    /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
    /// # Ok(()) }; example().unwrap()
    /// ```
    ///
    /// Note that the full match is at capture group `0`.
Each subsequent + /// capture group is indexed by the order of its opening `(`. + /// + /// We can make this example a bit clearer by using *named* capture groups: + /// + /// ```rust + /// # fn example() -> Result<(), ::pcre2::Error> { + /// use pcre2::bytes::Regex; + /// + /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")?; + /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text)?.unwrap(); + /// assert_eq!(&caps["title"], &b"Citizen Kane"[..]); + /// assert_eq!(&caps["year"], &b"1941"[..]); + /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]); + /// // You can also access the groups by name using the Index notation. + /// // Note that this will panic on an invalid group name. + /// assert_eq!(&caps["title"], b"Citizen Kane"); + /// assert_eq!(&caps["year"], b"1941"); + /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); + /// # Ok(()) }; example().unwrap() + /// ``` + /// + /// Here we name the capture groups, which we can access with the `name` + /// method or the `Index` notation with a `&str`. Note that the named + /// capture groups are still accessible with `get` or the `Index` notation + /// with a `usize`. + /// + /// The `0`th capture group is always unnamed, so it must always be + /// accessed with `get(0)` or `[0]`. + pub fn captures<'s>( + &self, + subject: &'s [W::SubjectChar], + ) -> Result<Option<Captures<'s, W>>, Error> { + let mut locs = self.capture_locations(); + Ok(self.captures_read(&mut locs, subject)?.map(move |_| Captures { + subject, + locs, + idx: Arc::clone(&self.capture_names_idx), + })) + } + + /// Returns an iterator over all the non-overlapping capture groups matched + /// in `subject`. This is operationally the same as `find_iter`, except it + /// yields information about capturing group matches. 
    ///
    /// # Example
    ///
    /// We can use this to find all movie titles and their release years in
    /// some text, where the movie is formatted like "'Title' (xxxx)":
    ///
    /// ```rust
    /// # fn example() -> Result<(), ::pcre2::Error> {
    /// use std::str;
    ///
    /// use pcre2::bytes::Regex;
    ///
    /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")?;
    /// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
    /// for result in re.captures_iter(text) {
    ///     let caps = result?;
    ///     let title = str::from_utf8(&caps["title"]).unwrap();
    ///     let year = str::from_utf8(&caps["year"]).unwrap();
    ///     println!("Movie: {:?}, Released: {:?}", title, year);
    /// }
    /// // Output (`{:?}` prints the strings with their quotes):
    /// // Movie: "Citizen Kane", Released: "1941"
    /// // Movie: "The Wizard of Oz", Released: "1939"
    /// // Movie: "M", Released: "1931"
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn captures_iter<'r, 's>(
        &'r self,
        subject: &'s [W::SubjectChar],
    ) -> CaptureMatches<'r, 's, W> {
        CaptureMatches { re: self, subject, last_end: 0, last_match: None }
    }

    /// Test helper to access capture name indexes.
    #[cfg(test)]
    pub(crate) fn get_capture_names_idxs(&self) -> &HashMap<String, usize> {
        &self.capture_names_idx
    }
}

/// Advanced or "lower level" search methods.
impl<W: CodeUnitWidth> Regex<W> {
    /// Returns the same as is_match, but starts the search at the given
    /// offset.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, the `\A` anchor can only
    /// match when `start == 0`.
    ///
    /// # Panics
    ///
    /// Panics if `start > subject.len()`.
    pub fn is_match_at(
        &self,
        subject: &[W::SubjectChar],
        start: usize,
    ) -> Result<bool, Error> {
        assert!(
            start <= subject.len(),
            "start ({}) must be <= subject.len() ({})",
            start,
            subject.len()
        );

        let options = 0;
        // Borrow scratch match data from the pool for this single search.
        let mut match_data = self.match_data();
        // SAFETY: We don't use any dangerous PCRE2 options.
        let res =
            unsafe { match_data.find(&self.code, subject, start, options) };
        // Hand the match data back to the pool before returning the result.
        PoolGuard::put(match_data);
        res
    }

    /// Returns the same as find, but starts the search at the given
    /// offset.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, the `\A` anchor can only
    /// match when `start == 0`.
    ///
    /// # Panics
    ///
    /// Panics if `start > subject.len()`.
    pub fn find_at<'s>(
        &self,
        subject: &'s [W::SubjectChar],
        start: usize,
    ) -> Result<Option<Match<'s, W>>, Error> {
        let mut match_data = self.match_data();
        let res =
            self.find_at_with_match_data(&mut match_data, subject, start);
        // Hand the match data back to the pool before returning the result.
        PoolGuard::put(match_data);
        res
    }

    /// Like find_at, but accepts match data instead of acquiring one itself.
    ///
    /// This is useful for implementing the iterator, which permits avoiding
    /// the synchronization overhead of acquiring the match data.
    ///
    /// # Panics
    ///
    /// Panics if `start > subject.len()`.
    #[inline(always)]
    fn find_at_with_match_data<'s>(
        &self,
        match_data: &mut MatchDataPoolGuard<'_, W>,
        subject: &'s [W::SubjectChar],
        start: usize,
    ) -> Result<Option<Match<'s, W>>, Error> {
        assert!(
            start <= subject.len(),
            "start ({}) must be <= subject.len() ({})",
            start,
            subject.len()
        );

        let options = 0;
        // SAFETY: We don't use any dangerous PCRE2 options.
        if unsafe { !match_data.find(&self.code, subject, start, options)? } {
            return Ok(None);
        }
        // The first ovector pair holds the bounds of the overall match.
        let ovector = match_data.ovector();
        let (s, e) = (ovector[0], ovector[1]);
        Ok(Some(Match::new(subject, s, e)))
    }

    /// This is like `captures`, but uses
    /// [`CaptureLocations`](struct.CaptureLocations.html)
    /// instead of
    /// [`Captures`](struct.Captures.html) in order to amortize allocations.
    ///
    /// To create a `CaptureLocations` value, use the
    /// `Regex::capture_locations` method.
    ///
    /// This returns the overall match if this was successful, which is always
    /// equivalent to the `0`th capture group.
    pub fn captures_read<'s>(
        &self,
        locs: &mut CaptureLocations<W>,
        subject: &'s [W::SubjectChar],
    ) -> Result<Option<Match<'s, W>>, Error> {
        self.captures_read_at(locs, subject, 0)
    }

    /// Returns the same as `captures_read`, but starts the search at the given
    /// offset and populates the capture locations given.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, the `\A` anchor can only
    /// match when `start == 0`.
    ///
    /// # Panics
    ///
    /// Panics if `start > subject.len()`.
    pub fn captures_read_at<'s>(
        &self,
        locs: &mut CaptureLocations<W>,
        subject: &'s [W::SubjectChar],
        start: usize,
    ) -> Result<Option<Match<'s, W>>, Error> {
        assert!(
            start <= subject.len(),
            "start ({}) must be <= subject.len() ({})",
            start,
            subject.len()
        );

        let options = 0;
        // NOTE(review): `locs` is presumably expected to come from this
        // regex's `capture_locations`; that is not checked here.
        // SAFETY: We don't use any dangerous PCRE2 options.
        if unsafe { !locs.data.find(&self.code, subject, start, options)? } {
            return Ok(None);
        }
        // The first ovector pair holds the bounds of the overall match.
        let ovector = locs.data.ovector();
        let (s, e) = (ovector[0], ovector[1]);
        Ok(Some(Match::new(subject, s, e)))
    }
}

/// Auxiliary methods.
impl<W: CodeUnitWidth> Regex<W> {
    /// Returns the original pattern string for this regex.
    pub fn as_str(&self) -> &W::Pattern {
        &self.pattern
    }

    /// Returns a sequence of all capturing groups and their names, if present.
    ///
    /// The length of the slice returned is always equal to the result of
    /// `captures_len`, which is the number of capturing groups (including the
    /// capturing group for the entire pattern).
    ///
    /// Each entry in the slice is the name of the corresponding capturing
    /// group, if one exists. The first capturing group (at index `0`) is
    /// always unnamed.
    ///
    /// Capturing groups are indexed by the order of the opening parenthesis.
    pub fn capture_names(&self) -> &[Option<String>] {
        &self.capture_names
    }

    /// Returns the number of capturing groups in the pattern.
    ///
    /// This is always 1 more than the number of syntactic groups in the
    /// pattern, since the first group always corresponds to the entire match.
    ///
    /// # Panics
    ///
    /// Panics if the compiled code fails to report a capture count.
    pub fn captures_len(&self) -> usize {
        self.code.capture_count().expect("a valid capture count from PCRE2")
    }

    /// Returns an empty set of capture locations that can be reused in
    /// multiple calls to `captures_read` or `captures_read_at`.
    pub fn capture_locations(&self) -> CaptureLocations<W> {
        CaptureLocations {
            code: Arc::clone(&self.code),
            data: self.new_match_data(),
        }
    }

    /// Borrows a match data value from this regex's pool.
    fn match_data(&self) -> MatchDataPoolGuard<'_, W> {
        self.match_data.get()
    }

    /// Creates a new match data value outside the pool, using this regex's
    /// match configuration and compiled code.
    fn new_match_data(&self) -> MatchData<W> {
        MatchData::new(self.config.match_config.clone(), &self.code)
    }
}

/// CaptureLocations is a low level representation of the raw offsets of each
/// submatch.
///
/// Primarily, this type is useful when using `Regex` APIs such as
/// `captures_read`, which permits amortizing the allocation in which capture
/// match locations are stored.
///
/// In order to build a value of this type, you'll need to call the
/// `capture_locations` method on the `Regex` being used to execute the search.
/// The value returned can then be reused in subsequent searches.
+pub struct CaptureLocations<W: CodeUnitWidth> { + code: Arc<Code<W>>, + data: MatchData<W>, +} + +impl<W: CodeUnitWidth> Clone for CaptureLocations<W> { + fn clone(&self) -> Self { + CaptureLocations { + code: Arc::clone(&self.code), + data: MatchData::new(self.data.config().clone(), &self.code), + } + } +} + +impl<W: CodeUnitWidth> fmt::Debug for CaptureLocations<W> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut offsets: Vec<Option<usize>> = vec![]; + for &offset in self.data.ovector() { + if offset == PCRE2_UNSET { + offsets.push(None); + } else { + offsets.push(Some(offset)); + } + } + write!(f, "CaptureLocations(")?; + f.debug_list().entries(offsets).finish()?; + write!(f, ")") + } +} + +impl<W: CodeUnitWidth> CaptureLocations<W> { + /// Returns the start and end positions of the Nth capture group. + /// + /// This returns `None` if `i` is not a valid capture group or if the + /// capture group did not match anything. + /// + /// The positions returned are always byte indices with respect to the + /// original subject string matched. + #[inline] + pub fn get(&self, i: usize) -> Option<(usize, usize)> { + let ovec = self.data.ovector(); + let s = match ovec.get(i * 2) { + None => return None, + Some(&s) if s == PCRE2_UNSET => return None, + Some(&s) => s, + }; + let e = match ovec.get(i * 2 + 1) { + None => return None, + Some(&e) if e == PCRE2_UNSET => return None, + Some(&e) => e, + }; + Some((s, e)) + } + + /// Returns the total number of capturing groups. + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + #[inline] + pub fn len(&self) -> usize { + self.data.ovector().len() / 2 + } +} + +/// Captures represents a group of captured byte strings for a single match. +/// +/// The 0th capture always corresponds to the entire match. Each subsequent +/// index corresponds to the next capture group in the regex. 
If a capture +/// group is named, then the matched byte string is *also* available via the +/// `name` method. (Note that the 0th capture is always unnamed and so must be +/// accessed with the `get` method.) +/// +/// Positions returned from a capture group are always byte indices. +/// +/// `'s` is the lifetime of the matched subject string. +pub struct Captures<'s, W: CodeUnitWidth> { + subject: &'s [W::SubjectChar], + locs: CaptureLocations<W>, + idx: Arc<HashMap<String, usize>>, +} + +impl<'s, W: CodeUnitWidth> Captures<'s, W> { + /// Returns the match associated with the capture group at index `i`. If + /// `i` does not correspond to a capture group, or if the capture group + /// did not participate in the match, then `None` is returned. + /// + /// # Examples + /// + /// Get the text of the match with a default of an empty string if this + /// group didn't participate in the match: + /// + /// ```rust + /// # fn example() -> Result<(), ::pcre2::Error> { + /// use pcre2::bytes::Regex; + /// + /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))")?; + /// let caps = re.captures(b"abc123")?.unwrap(); + /// + /// let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes()); + /// let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes()); + /// assert_eq!(text1, &b"123"[..]); + /// assert_eq!(text2, &b""[..]); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn get(&self, i: usize) -> Option<Match<'s, W>> { + self.locs.get(i).map(|(s, e)| Match::new(self.subject, s, e)) + } + + /// Returns the match for the capture group named `name`. If `name` isn't a + /// valid capture group or didn't match anything, then `None` is returned. + pub fn name(&self, name: &str) -> Option<Match<'s, W>> { + self.idx.get(name).and_then(|&i| self.get(i)) + } + + /// Returns the number of captured groups. + /// + /// This is always at least `1`, since every regex has at least one capture + /// group that corresponds to the full match. 
+ #[inline] + pub fn len(&self) -> usize { + self.locs.len() + } +} + +impl<'s, W: CodeUnitWidth> fmt::Debug for Captures<'s, W> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("Captures").field(&CapturesDebug(self)).finish() + } +} + +struct CapturesDebug<'c, 's: 'c, W: CodeUnitWidth>(&'c Captures<'s, W>); + +impl<'c, 's, W: CodeUnitWidth> fmt::Debug for CapturesDebug<'c, 's, W> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // We'd like to show something nice here, even if it means an + // allocation to build a reverse index. + let slot_to_name: HashMap<&usize, &String> = + self.0.idx.iter().map(|(a, b)| (b, a)).collect(); + let mut map = f.debug_map(); + for slot in 0..self.0.len() { + let m = self + .0 + .locs + .get(slot) + .map(|(s, e)| W::escape_subject(&self.0.subject[s..e])); + if let Some(name) = slot_to_name.get(&slot) { + map.entry(&name, &m); + } else { + map.entry(&slot, &m); + } + } + map.finish() + } +} + +/// Get a group by index. +/// +/// `'s` is the lifetime of the matched subject string. +/// +/// The subject can't outlive the `Captures` object if this method is +/// used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it); to do that, use `get()` instead. +/// +/// # Panics +/// +/// If there is no group at the given index. +impl<'s, W: CodeUnitWidth> Index<usize> for Captures<'s, W> { + type Output = [W::SubjectChar]; + + fn index(&self, i: usize) -> &Self::Output { + self.get(i) + .map(|m| m.as_bytes()) + .unwrap_or_else(|| panic!("no group at index '{}'", i)) + } +} + +/// Get a group by name. +/// +/// `'s` is the lifetime of the matched subject string and `'i` is the lifetime +/// of the group name (the index). +/// +/// The text can't outlive the `Captures` object if this method is +/// used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it); to do that, use `name` instead. 
+/// +/// # Panics +/// +/// If there is no group named by the given value. +impl<'s, 'i, W: CodeUnitWidth> Index<&'i str> for Captures<'s, W> { + type Output = [W::SubjectChar]; + + fn index<'a>(&'a self, name: &'i str) -> &'a [W::SubjectChar] { + self.name(name) + .map(|m| m.as_bytes()) + .unwrap_or_else(|| panic!("no group named '{}'", name)) + } +} + +/// An iterator over all non-overlapping matches for a particular subject +/// string. +/// +/// The iterator yields matches (if no error occurred while searching) +/// corresponding to the start and end of the match. The indices are byte +/// offsets. The iterator stops when no more matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'s` is the +/// lifetime of the subject string. +pub struct Matches<'r, 's, W: CodeUnitWidth> { + re: &'r Regex<W>, + match_data: MatchDataPoolGuard<'r, W>, + subject: &'s [W::SubjectChar], + last_end: usize, + last_match: Option<usize>, +} + +impl<'r, 's, W: CodeUnitWidth> Iterator for Matches<'r, 's, W> { + type Item = Result<Match<'s, W>, Error>; + + fn next(&mut self) -> Option<Self::Item> { + if self.last_end > self.subject.len() { + return None; + } + let res = self.re.find_at_with_match_data( + &mut self.match_data, + self.subject, + self.last_end, + ); + let m = match res { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + if m.start() == m.end() { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = m.end() + 1; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. 
+ if Some(m.end()) == self.last_match { + return self.next(); + } + } else { + self.last_end = m.end(); + } + self.last_match = Some(m.end()); + Some(Ok(m)) + } +} + +/// An iterator that yields all non-overlapping capture groups matching a +/// particular regular expression. +/// +/// The iterator stops when no more matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'s` is the +/// lifetime of the subject string. +pub struct CaptureMatches<'r, 's, W: CodeUnitWidth> { + re: &'r Regex<W>, + subject: &'s [W::SubjectChar], + last_end: usize, + last_match: Option<usize>, +} + +impl<'r, 's, W: CodeUnitWidth> Iterator for CaptureMatches<'r, 's, W> { + type Item = Result<Captures<'s, W>, Error>; + + fn next(&mut self) -> Option<Result<Captures<'s, W>, Error>> { + if self.last_end > self.subject.len() { + return None; + } + let mut locs = self.re.capture_locations(); + let res = + self.re.captures_read_at(&mut locs, self.subject, self.last_end); + let m = match res { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + if m.start() == m.end() { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = m.end() + 1; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(m.end()) == self.last_match { + return self.next(); + } + } else { + self.last_end = m.end(); + } + self.last_match = Some(m.end()); + Some(Ok(Captures { + subject: self.subject, + locs, + idx: Arc::clone(&self.re.capture_names_idx), + })) + } +} + +/// A type alias for our pool of `MatchData` that fixes the type parameters to +/// what we actually use in practice. +type MatchDataPool<W> = Pool<MatchData<W>, MatchDataPoolFn<W>>; + +/// Same as above, but for the guard returned by a pool. 
+type MatchDataPoolGuard<'a, W> = + PoolGuard<'a, MatchData<W>, MatchDataPoolFn<W>>; + +/// The type of the closure we use to create new caches. We need to spell out +/// all of the marker traits or else we risk leaking !MARKER impls. +type MatchDataPoolFn<W> = + Box<dyn Fn() -> MatchData<W> + Send + Sync + UnwindSafe + RefUnwindSafe>; From 17fb76a863017543404aedb62afa1f59d5ba231d Mon Sep 17 00:00:00 2001 From: ridiculousfish <corydoras@ridiculousfish.com> Date: Wed, 22 Mar 2023 17:27:19 -0700 Subject: [PATCH 04/10] Add support for UTF-32 matching This adds a module `utf32` which mirrors the module in `bytes`. It uses `CodeUnitWidth32` to provide the implementation. --- Cargo.toml | 5 + pcre2-sys/Cargo.toml | 3 + pcre2-sys/build.rs | 47 ++++--- src/ffi.rs | 277 ++++++++++++++++++++++++++++++++++------ src/lib.rs | 6 + src/utf32.rs | 294 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 578 insertions(+), 54 deletions(-) create mode 100644 src/utf32.rs diff --git a/Cargo.toml b/Cargo.toml index 322dd4b..e4a15c1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,3 +19,8 @@ members = ["pcre2-sys"] libc = "0.2.146" log = "0.4.19" pcre2-sys = { version = "0.2.7", path = "pcre2-sys" } + +[features] + +# Enable matching on UTF-32 strings +utf32 = ["pcre2-sys/utf32"] diff --git a/pcre2-sys/Cargo.toml b/pcre2-sys/Cargo.toml index c66466c..3f7027a 100644 --- a/pcre2-sys/Cargo.toml +++ b/pcre2-sys/Cargo.toml @@ -18,3 +18,6 @@ libc = "0.2.146" [build-dependencies] cc = { version = "1.0.73", features = ["parallel"] } pkg-config = "0.3.27" + +[features] +utf32 = [] diff --git a/pcre2-sys/build.rs b/pcre2-sys/build.rs index 4a6fcad..f98c1fe 100644 --- a/pcre2-sys/build.rs +++ b/pcre2-sys/build.rs @@ -21,30 +21,22 @@ use std::path::PathBuf; -fn main() { - println!("cargo:rerun-if-env-changed=PCRE2_SYS_STATIC"); - +// Build and link against a PCRE2 library with the given code unit width, +// which should be "8" or "32". 
+fn build_1_pcre2_lib(code_unit_width: &str) { let target = std::env::var("TARGET").unwrap(); - // let out = PathBuf::from(std::env::var_os("OUT_DIR").unwrap()); let upstream = PathBuf::from("upstream"); - - // Don't link to a system library if we want a static build. - let want_static = pcre2_sys_static().unwrap_or(target.contains("musl")); - if !want_static && pkg_config::probe_library("libpcre2-8").is_ok() { - return; - } - // Set some config options. We mostly just use the default values. We do // this in lieu of patching config.h since it's easier. let mut builder = cc::Build::new(); builder - .define("PCRE2_CODE_UNIT_WIDTH", "8") + .define("PCRE2_CODE_UNIT_WIDTH", code_unit_width) .define("HAVE_STDLIB_H", "1") .define("HAVE_MEMMOVE", "1") .define("HAVE_CONFIG_H", "1") .define("PCRE2_STATIC", "1") .define("STDC_HEADERS", "1") - .define("SUPPORT_PCRE2_8", "1") + .define(&format!("SUPPORT_PCRE2_{}", code_unit_width), "1") .define("SUPPORT_UNICODE", "1"); if target.contains("windows") { builder.define("HAVE_WINDOWS_H", "1"); @@ -78,7 +70,34 @@ fn main() { { builder.debug(true); } - builder.compile("libpcre2.a"); + builder.compile(&format!("libpcre2-{}.a", code_unit_width)); +} + +fn main() { + println!("cargo:rerun-if-env-changed=PCRE2_SYS_STATIC"); + + let target = std::env::var("TARGET").unwrap(); + let do_utf32 = feature_enabled("UTF32"); + + // Don't link to a system library if we want a static build. + let want_static = pcre2_sys_static().unwrap_or(target.contains("musl")); + if want_static || pkg_config::probe_library("libpcre2-8").is_err() { + build_1_pcre2_lib("8"); + } + if do_utf32 + && (want_static || pkg_config::probe_library("libpcre2-32").is_err()) + { + build_1_pcre2_lib("32"); + } +} + +// Return whether a given feature is enabled. 
+fn feature_enabled(feature: &str) -> bool { + let env_var_name = format!("CARGO_FEATURE_{}", feature); + match std::env::var(&env_var_name) { + Ok(s) => s == "1", + Err(_) => false, + } } fn pcre2_sys_static() -> Option<bool> { diff --git a/src/ffi.rs b/src/ffi.rs index c170e79..ddc7c7c 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -14,6 +14,73 @@ use std::{ }; use {libc::c_void, pcre2_sys::*}; +pub trait NameTableEntry { + /// The index of the named subpattern. + fn index(&self) -> usize; + + /// The name of the named subpattern. + fn name(&self) -> String; +} + +#[allow(non_camel_case_types)] +#[repr(C)] +pub struct name_table_entry_8 { + match_index_msb: u8, + match_index_lsb: u8, + + // In C, the 'name' field is a flexible array member. + // This does not contribute to the sizeof the struct. + name: u8, +} + +#[allow(non_camel_case_types)] +#[repr(C)] +pub struct name_table_entry_32 { + match_index: u32, + name: u32, // See above re: flexible array member +} + +impl NameTableEntry for name_table_entry_8 { + fn index(&self) -> usize { + ((self.match_index_msb as usize) << 8) + | (self.match_index_lsb as usize) + } + + fn name(&self) -> String { + // The name is nul-terminated. + let name = &self.name as *const u8; + let mut len = 0; + while unsafe { *name.offset(len as isize) } != 0 { + len += 1; + } + let bytes = unsafe { slice::from_raw_parts(name, len) }; + String::from_utf8_lossy(bytes).into_owned() + } +} + +impl NameTableEntry for name_table_entry_32 { + fn index(&self) -> usize { + self.match_index as usize + } + + fn name(&self) -> String { + // The name is nul-terminated. 
+ let replacement: char = '\u{FFFD}'; + let name = &self.name as *const u32; + let mut len = 0; + let mut result = String::new(); + loop { + let c = unsafe { *name.offset(len) }; + if c == 0 { + break; + } + result.push(char::from_u32(c).unwrap_or(replacement)); + len += 1; + } + result + } +} + #[allow(non_camel_case_types)] pub trait CodeUnitWidth: std::fmt::Debug + 'static { type pcre2_code: UnwindSafe + RefUnwindSafe; @@ -21,7 +88,9 @@ pub trait CodeUnitWidth: std::fmt::Debug + 'static { type pcre2_match_context; type pcre2_match_data; type pcre2_jit_stack; + type PCRE2_CHAR; type PCRE2_SPTR; + type name_table_entry: NameTableEntry; type SubjectChar: Copy; type Pattern: Clone + std::fmt::Debug; @@ -104,14 +173,15 @@ pub trait CodeUnitWidth: std::fmt::Debug + 'static { #[derive(Debug)] pub struct CodeUnitWidth8; - impl CodeUnitWidth for CodeUnitWidth8 { type pcre2_code = pcre2_code_8; + type PCRE2_CHAR = PCRE2_UCHAR8; type PCRE2_SPTR = PCRE2_SPTR8; type pcre2_compile_context = pcre2_compile_context_8; type pcre2_match_context = pcre2_match_context_8; type pcre2_match_data = pcre2_match_data_8; type pcre2_jit_stack = pcre2_jit_stack_8; + type name_table_entry = name_table_entry_8; type SubjectChar = u8; type Pattern = String; @@ -120,7 +190,7 @@ impl CodeUnitWidth for CodeUnitWidth8 { // Escape bytes. 
let mut s = String::new(); for &b in subject { - let escaped: Vec<u8> = escape_default(b).collect(); + let escaped = escape_default(b).collect::<Vec<_>>(); s.push_str(&String::from_utf8_lossy(&escaped)); } s @@ -244,6 +314,153 @@ impl CodeUnitWidth for CodeUnitWidth8 { } } +#[derive(Debug)] +pub struct CodeUnitWidth32; +impl CodeUnitWidth for CodeUnitWidth32 { + type pcre2_code = pcre2_code_32; + type PCRE2_CHAR = PCRE2_UCHAR32; + type PCRE2_SPTR = PCRE2_SPTR32; + type pcre2_compile_context = pcre2_compile_context_32; + type pcre2_match_context = pcre2_match_context_32; + type pcre2_match_data = pcre2_match_data_32; + type pcre2_jit_stack = pcre2_jit_stack_32; + type name_table_entry = name_table_entry_32; + type SubjectChar = char; + type Pattern = Box<[char]>; + + fn escape_subject(subject: &[Self::SubjectChar]) -> String { + use std::ascii::escape_default; + // Escape bytes. + let mut s = String::new(); + for &c in subject { + let mut bytes = [0; 4]; + for &b in c.encode_utf8(&mut bytes).as_bytes() { + // Escape the byte. 
+ let escaped = escape_default(b).collect::<Vec<_>>(); + s.push_str(&String::from_utf8_lossy(&escaped)); + } + } + s + } + + fn pattern_to_sptr_len( + pattern: &Self::Pattern, + ) -> (Self::PCRE2_SPTR, usize) { + (pattern.as_ptr() as *const u32, pattern.len()) + } + + fn subject_to_sptr_len( + subject: &[Self::SubjectChar], + ) -> (Self::PCRE2_SPTR, usize) { + (subject.as_ptr() as *const u32, subject.len()) + } + + unsafe fn pcre2_config( + arg1: u32, + arg2: *mut ::libc::c_void, + ) -> ::libc::c_int { + pcre2_config_32(arg1, arg2) + } + unsafe fn pcre2_code_free(arg1: *mut Self::pcre2_code) { + pcre2_code_free_32(arg1) + } + unsafe fn pcre2_compile( + arg1: Self::PCRE2_SPTR, + arg2: usize, + arg3: u32, + arg4: *mut ::libc::c_int, + arg5: *mut ::libc::size_t, + arg6: *mut Self::pcre2_compile_context, + ) -> *mut Self::pcre2_code { + pcre2_compile_32(arg1, arg2, arg3, arg4, arg5, arg6) + } + + unsafe fn pcre2_jit_stack_create( + arg1: ::libc::size_t, + arg2: ::libc::size_t, + ) -> *mut Self::pcre2_jit_stack { + pcre2_jit_stack_create_32(arg1, arg2, ptr::null_mut()) + } + unsafe fn pcre2_jit_compile( + arg1: *mut Self::pcre2_code, + arg2: u32, + ) -> ::libc::c_int { + pcre2_jit_compile_32(arg1, arg2) + } + unsafe fn pcre2_jit_stack_assign( + arg1: *mut Self::pcre2_match_context, + arg3: *mut ::libc::c_void, + ) { + pcre2_jit_stack_assign_32(arg1, None, arg3) + } + unsafe fn pcre2_jit_stack_free(arg1: *mut Self::pcre2_jit_stack) { + pcre2_jit_stack_free_32(arg1) + } + + unsafe fn pcre2_pattern_info( + arg1: *const Self::pcre2_code, + arg2: u32, + arg3: *mut ::libc::c_void, + ) -> ::libc::c_int { + pcre2_pattern_info_32(arg1, arg2, arg3) + } + + unsafe fn pcre2_match( + arg1: *const Self::pcre2_code, + arg2: Self::PCRE2_SPTR, + arg3: usize, + arg4: usize, + arg5: u32, + arg6: *mut Self::pcre2_match_data, + arg7: *mut Self::pcre2_match_context, + ) -> ::libc::c_int { + pcre2_match_32(arg1, arg2, arg3, arg4, arg5, arg6, arg7) + } + + unsafe fn 
pcre2_compile_context_create() -> *mut Self::pcre2_compile_context + { + pcre2_compile_context_create_32(ptr::null_mut()) + } + unsafe fn pcre2_match_context_free(arg1: *mut Self::pcre2_match_context) { + pcre2_match_context_free_32(arg1) + } + + unsafe fn pcre2_set_newline( + arg1: *mut Self::pcre2_compile_context, + arg2: u32, + ) -> ::libc::c_int { + pcre2_set_newline_32(arg1, arg2) + } + unsafe fn pcre2_compile_context_free( + arg1: *mut Self::pcre2_compile_context, + ) { + pcre2_compile_context_free_32(arg1) + } + unsafe fn pcre2_match_context_create() -> *mut Self::pcre2_match_context { + pcre2_match_context_create_32(ptr::null_mut()) + } + + unsafe fn pcre2_match_data_create_from_pattern( + arg1: *const Self::pcre2_code, + ) -> *mut Self::pcre2_match_data { + pcre2_match_data_create_from_pattern_32(arg1, ptr::null_mut()) + } + unsafe fn pcre2_match_data_free(arg1: *mut Self::pcre2_match_data) { + pcre2_match_data_free_32(arg1) + } + + unsafe fn pcre2_get_ovector_pointer( + arg1: *mut Self::pcre2_match_data, + ) -> *mut usize { + pcre2_get_ovector_pointer_32(arg1) + } + unsafe fn pcre2_get_ovector_count( + arg1: *mut Self::pcre2_match_data, + ) -> u32 { + pcre2_get_ovector_count_32(arg1) + } +} + /// Returns true if and only if PCRE2 believes that JIT is available. pub fn is_jit_available() -> bool { type W = CodeUnitWidth8; @@ -380,25 +597,20 @@ impl<W: CodeUnitWidth> Code<W> { // and search for PCRE2_INFO_NAMETABLE. let name_count = self.name_count()?; - let size = self.name_entry_size()?; - let table = unsafe { - slice::from_raw_parts(self.raw_name_table()?, name_count * size) - }; - + let name_entry_size_in_bytes = + self.name_entry_size()? 
* std::mem::size_of::<W::PCRE2_CHAR>(); + let name_table = self.raw_name_table()?; let mut names = vec![None; self.capture_count()?]; for i in 0..name_count { - let entry = &table[i * size..(i + 1) * size]; - let name = &entry[2..]; - let nulat = name - .iter() - .position(|&b| b == 0) - .expect("a NUL in name table entry"); - let index = (entry[0] as usize) << 8 | (entry[1] as usize); - names[index] = String::from_utf8(name[..nulat].to_vec()) - .map(Some) - // We require our pattern to be valid UTF-8, so all capture - // names should also be valid UTF-8. - .expect("valid UTF-8 for capture name"); + let entry = unsafe { + name_table + .cast::<u8>() + .add(i * name_entry_size_in_bytes) + .cast::<W::name_table_entry>() + .as_ref() + .unwrap() + }; + names[entry.index()] = Some(entry.name()); } Ok(names) } @@ -408,32 +620,22 @@ impl<W: CodeUnitWidth> Code<W> { self.code } - /// Returns the raw name table, where each entry in the table corresponds - /// to a mapping between a named capturing group and the index of that - /// capturing group. The encoding for each item is as follows: - /// - /// * 2 bytes encoding the capture index (big-endian) - /// * N bytes encoding the code units of the name - /// * 1 byte for the NUL terminator - /// * M padding bytes, corresponding to the difference in length between - /// this name and the longest name. - /// - /// In particular, each entry uses the same number of bytes. + /// Returns a pointer to the array of name table entries. /// /// Entries are in alphabetical order. 
- fn raw_name_table(&self) -> Result<*const u8, Error> { - let mut bytes: *const u8 = ptr::null(); + fn raw_name_table(&self) -> Result<*const W::name_table_entry, Error> { + let mut table: *const W::name_table_entry = ptr::null(); let rc = unsafe { W::pcre2_pattern_info( self.as_ptr(), PCRE2_INFO_NAMETABLE, - &mut bytes as *mut *const u8 as *mut c_void, + &mut table as *mut *const W::name_table_entry as *mut c_void, ) }; if rc != 0 { Err(Error::info(rc)) } else { - Ok(bytes) + Ok(table) } } @@ -454,12 +656,7 @@ impl<W: CodeUnitWidth> Code<W> { } } - /// Returns the entry size of each name in the name table. - /// - /// This appears to correspond to `3` plus the size of the longest named - /// capturing group. The extra 3 bytes correspond to a NUL terminator and - /// two prefix bytes corresponding to a big-endian encoding of the index - /// of the capture group. + /// Returns the entry size of each name in the name table, in code units. fn name_entry_size(&self) -> Result<usize, Error> { let mut size: u32 = 0; let rc = unsafe { diff --git a/src/lib.rs b/src/lib.rs index c117277..13e9186 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,3 +26,9 @@ mod error; mod ffi; mod pool; mod regex_impl; + +/** +PCRE2 regular expressions for matching on UTF-32 slices. +*/ +#[cfg(feature = "utf32")] +pub mod utf32; diff --git a/src/utf32.rs b/src/utf32.rs new file mode 100644 index 0000000..f27b655 --- /dev/null +++ b/src/utf32.rs @@ -0,0 +1,294 @@ +use crate::ffi::CodeUnitWidth32; +pub use crate::regex_impl::Match as MatchImpl; + +#[doc(inline)] +pub use crate::regex_impl::{ + Regex as RegexImpl, RegexBuilder as RegexBuilderImpl, +}; + +/// A compiled PCRE2 regular expression for matching sequences of Rust chars. +/// +/// This regex is safe to use from multiple threads simultaneously. For top +/// performance, it is better to clone a new regex for each thread. +pub type Regex = RegexImpl<CodeUnitWidth32>; + +/// A builder for configuring the compilation of a PCRE2 regex. 
+pub type RegexBuilder = RegexBuilderImpl<CodeUnitWidth32>; + +/// Match represents a single match of a regex in a subject string. +/// +/// The lifetime parameter `'s` refers to the lifetime of the matched portion +/// of the subject string. +pub type Match<'s> = MatchImpl<'s, CodeUnitWidth32>; + +#[cfg(test)] +mod tests { + use super::{Regex, RegexBuilder}; + use crate::is_jit_available; + + fn b(string: &str) -> Box<[char]> { + string.chars().collect::<Vec<_>>().into_boxed_slice() + } + + fn find_iter_tuples(re: &Regex, subject: &[char]) -> Vec<(usize, usize)> { + let mut tuples = vec![]; + for result in re.find_iter(subject) { + let m = result.unwrap(); + tuples.push((m.start(), m.end())); + } + tuples + } + + fn cap_iter_tuples(re: &Regex, subject: &str) -> Vec<(usize, usize)> { + let subject = subject.chars().collect::<Vec<_>>(); + let mut tuples = vec![]; + for result in re.captures_iter(&subject) { + let caps = result.unwrap(); + let m = caps.get(0).unwrap(); + tuples.push((m.start(), m.end())); + } + tuples + } + + #[test] + fn caseless() { + let re = RegexBuilder::new().caseless(true).build(b("a")).unwrap(); + assert!(re.is_match(&b("A")).unwrap()); + + let re = RegexBuilder::new() + .caseless(true) + .ucp(true) + .build(b("β")) + .unwrap(); + assert!(re.is_match(&b("Β")).unwrap()); + } + + #[test] + fn crlf() { + let subject = &b("a\r\n"); + let re = RegexBuilder::new().crlf(true).build(b("a$")).unwrap(); + let m = re.find(subject).unwrap().unwrap(); + assert_eq!(m.as_pair(), (0, 1)); + } + + #[test] + fn dotall() { + let re = RegexBuilder::new().dotall(false).build(b(".")).unwrap(); + assert!(!re.is_match(&b("\n")).unwrap()); + + let re = RegexBuilder::new().dotall(true).build(b(".")).unwrap(); + assert!(re.is_match(&b("\n")).unwrap()); + } + + #[test] + fn extended() { + let re = RegexBuilder::new().extended(true).build(b("a b c")).unwrap(); + assert!(re.is_match(&b("abc")).unwrap()); + } + + #[test] + fn multi_line() { + let re = + 
RegexBuilder::new().multi_line(false).build(b("^abc$")).unwrap(); + assert!(!re.is_match(&b("foo\nabc\nbar")).unwrap()); + + let re = + RegexBuilder::new().multi_line(true).build(b("^abc$")).unwrap(); + assert!(re.is_match(&b("foo\nabc\nbar")).unwrap()); + } + + #[test] + fn ucp() { + let re = RegexBuilder::new().ucp(false).build(b(r"\w")).unwrap(); + assert!(!re.is_match(&b("β")).unwrap()); + + let re = RegexBuilder::new().ucp(true).build(b(r"\w")).unwrap(); + assert!(re.is_match(&b("β")).unwrap()); + } + + #[test] + fn utf() { + let re = RegexBuilder::new().utf(false).build(b(".")).unwrap(); + assert_eq!(re.find(&b("β")).unwrap().unwrap().as_pair(), (0, 1)); + + let re = RegexBuilder::new().utf(true).build(b(".")).unwrap(); + assert_eq!(re.find(&b("β")).unwrap().unwrap().as_pair(), (0, 1)); + } + + #[test] + fn fmt_debug_works() { + let subject = &b("x"); + let re = RegexBuilder::new().utf(false).build(b(".")).unwrap(); + let m = re.find(subject).unwrap().unwrap(); + let _ = format!("{:?}", m); + } + + #[test] + fn jit4lyfe() { + if is_jit_available() { + let re = RegexBuilder::new().jit(true).build(b(r"\w")).unwrap(); + assert!(re.is_match(&b("a")).unwrap()); + } else { + // Check that if JIT isn't enabled, then we get an error if we + // require JIT. + RegexBuilder::new().jit(true).build(b(r"\w")).unwrap_err(); + } + } + + // Unlike jit4lyfe, this tests that everything works when requesting the + // JIT only if it's available. In jit4lyfe, we require the JIT or fail. + // If the JIT isn't available, then in this test, we simply don't use it. 
+ #[test] + fn jit_if_available() { + let re = RegexBuilder::new() + .jit_if_available(true) + .build(b(r"\w")) + .unwrap(); + assert!(re.is_match(&b("a")).unwrap()); + } + + // This tests a regression caused a segfault in the pcre2 library + // https://github.com/BurntSushi/rust-pcre2/issues/10 + #[test] + fn jit_test_lazy_alloc_subject() { + let subject: Vec<char> = vec![]; + + let re = RegexBuilder::new() + .jit_if_available(true) + .build(b(r"xxxx|xxxx|xxxx")) + .unwrap(); + assert!(!re.is_match(&subject).unwrap()); + } + + #[test] + fn utf_with_invalid_data() { + let re = RegexBuilder::new().build(b(r".")).unwrap(); + assert_eq!(re.find(&b("\u{FF}")).unwrap().unwrap().as_pair(), (0, 1)); + + let re = RegexBuilder::new().utf(true).build(b(r".")).unwrap(); + assert_eq!(re.find(&b("\u{FF}")).unwrap().unwrap().as_pair(), (0, 1)); + } + + #[test] + fn capture_names() { + let re = RegexBuilder::new() + .build(b(r"(?P<foo>abc)|(def)|(?P<a>ghi)|(?P<springsteen>jkl)")) + .unwrap(); + assert_eq!( + re.capture_names().to_vec(), + vec![ + None, + Some("foo".to_string()), + None, + Some("a".to_string()), + Some("springsteen".to_string()), + ] + ); + + // Test our internal map as well. 
+ let capture_names_idx = re.get_capture_names_idxs(); + assert_eq!(capture_names_idx.len(), 3); + assert_eq!(capture_names_idx["foo"], 1); + assert_eq!(capture_names_idx["a"], 3); + assert_eq!(capture_names_idx["springsteen"], 4); + } + + #[test] + fn captures_get() { + let subject = &b("abc123"); + let re = Regex::new(b(r"[a-z]+(?:([0-9]+)|([A-Z]+))")).unwrap(); + let caps = re.captures(subject).unwrap().unwrap(); + + let text1: &[char] = caps.get(1).map_or(&[], |m| m.as_bytes()); + let text2: &[char] = caps.get(2).map_or(&[], |m| m.as_bytes()); + assert_eq!(text1, &*b("123")); + assert_eq!(text2, &*b("")); + } + + #[test] + fn find_iter_empty() { + let re = Regex::new(b(r"(?m:^)")).unwrap(); + assert_eq!(find_iter_tuples(&re, &b("")), &[(0, 0)]); + assert_eq!(find_iter_tuples(&re, &b("\n")), &[(0, 0)]); + assert_eq!(find_iter_tuples(&re, &b("\n\n")), &[(0, 0), (1, 1)]); + assert_eq!(find_iter_tuples(&re, &b("\na\n")), &[(0, 0), (1, 1)]); + assert_eq!( + find_iter_tuples(&re, &b("\na\n\n")), + vec![(0, 0), (1, 1), (3, 3),] + ); + } + + #[test] + fn captures_iter_empty() { + let re = Regex::new(b(r"(?m:^)")).unwrap(); + assert_eq!(cap_iter_tuples(&re, ""), &[(0, 0)]); + assert_eq!(cap_iter_tuples(&re, "\n"), &[(0, 0)]); + assert_eq!(cap_iter_tuples(&re, "\n\n"), &[(0, 0), (1, 1)]); + assert_eq!(cap_iter_tuples(&re, "\na\n"), &[(0, 0), (1, 1)]); + assert_eq!( + cap_iter_tuples(&re, "\na\n\n"), + &[(0, 0), (1, 1), (3, 3),] + ); + } + + #[test] + fn max_jit_stack_size_does_something() { + if !is_jit_available() { + return; + } + + let hundred = "\ + ABCDEFGHIJKLMNOPQRSTUVWXY\ + ABCDEFGHIJKLMNOPQRSTUVWXY\ + ABCDEFGHIJKLMNOPQRSTUVWXY\ + ABCDEFGHIJKLMNOPQRSTUVWXY\ + "; + let hay = format!("{}", hundred.repeat(100)); + + // First, try a regex that checks that we can blow the JIT stack limit. 
+ let re = RegexBuilder::new() + .ucp(true) + .jit(true) + .max_jit_stack_size(Some(1)) + .build(b(r"((((\w{10})){100}))+")) + .unwrap(); + let result = re.is_match(&b(&hay)); + if result.is_ok() { + // Skip this test, since for some reason we weren't able to blow + // the stack limit. + return; + } + let err = result.unwrap_err(); + assert!(err.to_string().contains("JIT stack limit reached")); + + // Now bump up the JIT stack limit and check that it succeeds. + let re = RegexBuilder::new() + .ucp(true) + .jit_if_available(true) + .max_jit_stack_size(Some(1 << 20)) + .build(b(r"((((\w{10})){100}))+")) + .unwrap(); + assert!(re.is_match(&b(&hay)).unwrap()); + } + + #[test] + fn find_utf_emoji_as_chars() { + let hay : Vec<char> = "0123456789😀👍🏼🎉abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".chars().collect(); + let pattern: Vec<char> = r"(*UTF) + (?x) (?#: Allow comments and whitespace.) + + [^\N{U+0000}-\N{U+007F}] (?#: Non-ascii code points.) + + (?#: One or more times.) + " + .chars() + .collect(); + let re = RegexBuilder::new() + .extended(true) + .utf(true) + .jit_if_available(true) + .build(pattern) + .unwrap(); + let matched = re.find(&hay).unwrap().unwrap(); + assert!(matched.as_bytes().iter().copied().eq("😀👍🏼🎉".chars())); + } +} From 27c6eb0be4f7e682547b2c60c71ce5bf87e42655 Mon Sep 17 00:00:00 2001 From: ridiculousfish <corydoras@ridiculousfish.com> Date: Sat, 3 Feb 2024 10:54:02 -0800 Subject: [PATCH 05/10] Add crate feature for JIT This adds a new crate feature `jit` to enable the JIT. It is on by default. 
--- Cargo.toml | 4 ++++ pcre2-sys/Cargo.toml | 1 + pcre2-sys/build.rs | 4 +++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index e4a15c1..92494e4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,10 @@ log = "0.4.19" pcre2-sys = { version = "0.2.7", path = "pcre2-sys" } [features] +default = ["jit"] # Enable matching on UTF-32 strings utf32 = ["pcre2-sys/utf32"] + +# Enable the PCRE2 JIT +jit = ["pcre2-sys/jit"] diff --git a/pcre2-sys/Cargo.toml b/pcre2-sys/Cargo.toml index 3f7027a..9dfc689 100644 --- a/pcre2-sys/Cargo.toml +++ b/pcre2-sys/Cargo.toml @@ -21,3 +21,4 @@ pkg-config = "0.3.27" [features] utf32 = [] +jit = [] diff --git a/pcre2-sys/build.rs b/pcre2-sys/build.rs index f98c1fe..32b94ca 100644 --- a/pcre2-sys/build.rs +++ b/pcre2-sys/build.rs @@ -41,7 +41,9 @@ fn build_1_pcre2_lib(code_unit_width: &str) { if target.contains("windows") { builder.define("HAVE_WINDOWS_H", "1"); } - enable_jit(&target, &mut builder); + if feature_enabled("JIT") { + enable_jit(&target, &mut builder); + } builder.include(upstream.join("src")).include(upstream.join("include")); for result in std::fs::read_dir(upstream.join("src")).unwrap() { From 2893b6f4b23eb72e8fe0584974c29deb5b55dd24 Mon Sep 17 00:00:00 2001 From: ridiculousfish <corydoras@ridiculousfish.com> Date: Sat, 25 Mar 2023 15:19:21 -0700 Subject: [PATCH 06/10] Expose the PCRE2_NEVER_UTF flag By default, PCRE2 enables the strange sequence "(*UTF)" which turns on UTF validity checking for both patterns and subjects. This is hinted at as a potential security concern in the man page: "If the data string is very long, such a check might use sufficiently many resources as to cause your application to lose performance" For this reason, pcre2 provides a flag to avoid interpreting this sequence. Re-expose that in rust-pcre2, under the clearer name `block_utf_pattern_directive`. 
--- src/regex_impl.rs | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/regex_impl.rs b/src/regex_impl.rs index be7506c..067b6fa 100644 --- a/src/regex_impl.rs +++ b/src/regex_impl.rs @@ -9,7 +9,8 @@ use std::{ use log::debug; use pcre2_sys::{ PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MATCH_INVALID_UTF, - PCRE2_MULTILINE, PCRE2_NEWLINE_ANYCRLF, PCRE2_UCP, PCRE2_UNSET, PCRE2_UTF, + PCRE2_MULTILINE, PCRE2_NEVER_UTF, PCRE2_NEWLINE_ANYCRLF, PCRE2_UCP, + PCRE2_UNSET, PCRE2_UTF, }; use crate::{ @@ -75,6 +76,8 @@ struct Config { ucp: bool, /// PCRE2_UTF utf: bool, + /// PCRE2_NEVER_UTF + block_utf_pattern_directive: bool, /// use pcre2_jit_compile jit: JITChoice, /// Match-time specific configuration knobs. @@ -101,6 +104,7 @@ impl Default for Config { crlf: false, ucp: false, utf: false, + block_utf_pattern_directive: false, jit: JITChoice::Never, match_config: MatchConfig::default(), } @@ -154,6 +158,9 @@ impl<W: CodeUnitWidth> RegexBuilder<W> { if self.config.utf { options |= PCRE2_UTF; } + if self.config.block_utf_pattern_directive { + options |= PCRE2_NEVER_UTF; + } let mut ctx = CompileContext::new(); if self.config.crlf { @@ -284,6 +291,16 @@ impl<W: CodeUnitWidth> RegexBuilder<W> { self } + /// Prevent patterns from opting in to UTF matching mode in spite of any flags. + /// + /// This causes the directive `(*UTF)` in the pattern to emit an error. + /// This does not affect any other flags controlling UTF matching mode; + /// it merely disables a particular syntax item in the pattern. + pub fn block_utf_pattern_directive(&mut self, yes: bool) -> &mut Self { + self.config.block_utf_pattern_directive = yes; + self + } + /// This is now deprecated and is a no-op. 
/// /// Previously, this option permitted disabling PCRE2's UTF-8 validity From 338a96687c9d12a7202999f6f03b80c3c1559f46 Mon Sep 17 00:00:00 2001 From: ridiculousfish <corydoras@ridiculousfish.com> Date: Sat, 25 Mar 2023 15:46:24 -0700 Subject: [PATCH 07/10] Mark Error::error_message as public Prior to this commit, rust-pcre2 would wrap pcre2's error messages with a prefix like "PCRE2: error compiling pattern:". However some clients want the raw error message as returned by pcre2. Allow access to this. --- src/error.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/error.rs b/src/error.rs index 88c7ae7..a94eac6 100644 --- a/src/error.rs +++ b/src/error.rs @@ -89,7 +89,7 @@ impl Error { } /// Returns the error message from PCRE2. - fn error_message(&self) -> String { + pub fn error_message(&self) -> String { // PCRE2 docs say a buffer size of 120 bytes is enough, but we're // cautious and double it. let mut buf = [0u8; 240]; From f933dc9fd3712a0233b0a70a88ac98820acf0c7f Mon Sep 17 00:00:00 2001 From: ridiculousfish <corydoras@ridiculousfish.com> Date: Sat, 25 Mar 2023 16:32:37 -0700 Subject: [PATCH 08/10] Make is_jit_available() require a CodeUnitWidth This both respects the PCRE2 API better and allows us to compile without UTF8 support. 
--- src/bytes.rs | 6 +++--- src/ffi.rs | 5 +++-- src/utf32.rs | 6 +++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/bytes.rs b/src/bytes.rs index da474df..72aeba9 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -23,7 +23,7 @@ pub type Match<'s> = MatchImpl<'s, CodeUnitWidth8>; #[cfg(test)] mod tests { - use super::{Regex, RegexBuilder}; + use super::{CodeUnitWidth8, Regex, RegexBuilder}; use crate::is_jit_available; fn b(string: &str) -> &[u8] { @@ -117,7 +117,7 @@ mod tests { #[test] fn jit4lyfe() { - if is_jit_available() { + if is_jit_available::<CodeUnitWidth8>() { let re = RegexBuilder::new().jit(true).build(r"\w").unwrap(); assert!(re.is_match(b("a")).unwrap()); } else { @@ -222,7 +222,7 @@ mod tests { #[test] fn max_jit_stack_size_does_something() { - if !is_jit_available() { + if !is_jit_available::<CodeUnitWidth8>() { return; } diff --git a/src/ffi.rs b/src/ffi.rs index ddc7c7c..36c7176 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -173,6 +173,7 @@ pub trait CodeUnitWidth: std::fmt::Debug + 'static { #[derive(Debug)] pub struct CodeUnitWidth8; + impl CodeUnitWidth for CodeUnitWidth8 { type pcre2_code = pcre2_code_8; type PCRE2_CHAR = PCRE2_UCHAR8; @@ -316,6 +317,7 @@ impl CodeUnitWidth for CodeUnitWidth8 { #[derive(Debug)] pub struct CodeUnitWidth32; + impl CodeUnitWidth for CodeUnitWidth32 { type pcre2_code = pcre2_code_32; type PCRE2_CHAR = PCRE2_UCHAR32; @@ -462,8 +464,7 @@ impl CodeUnitWidth for CodeUnitWidth32 { } /// Returns true if and only if PCRE2 believes that JIT is available. 
-pub fn is_jit_available() -> bool { - type W = CodeUnitWidth8; +pub fn is_jit_available<W: CodeUnitWidth>() -> bool { let mut rc: u32 = 0; let error_code = unsafe { W::pcre2_config(PCRE2_CONFIG_JIT, &mut rc as *mut _ as *mut c_void) diff --git a/src/utf32.rs b/src/utf32.rs index f27b655..8085dea 100644 --- a/src/utf32.rs +++ b/src/utf32.rs @@ -23,7 +23,7 @@ pub type Match<'s> = MatchImpl<'s, CodeUnitWidth32>; #[cfg(test)] mod tests { - use super::{Regex, RegexBuilder}; + use super::{CodeUnitWidth32, Regex, RegexBuilder}; use crate::is_jit_available; fn b(string: &str) -> Box<[char]> { @@ -125,7 +125,7 @@ mod tests { #[test] fn jit4lyfe() { - if is_jit_available() { + if is_jit_available::<CodeUnitWidth32>() { let re = RegexBuilder::new().jit(true).build(b(r"\w")).unwrap(); assert!(re.is_match(&b("a")).unwrap()); } else { @@ -233,7 +233,7 @@ mod tests { #[test] fn max_jit_stack_size_does_something() { - if !is_jit_available() { + if !is_jit_available::<CodeUnitWidth32>() { return; } From f56601b9e915f8f7e294fcfe1e7f58a5c305d14d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20H=C3=B8rl=C3=BCck=20Berg?= <36937807+henrikhorluck@users.noreply.github.com> Date: Sun, 18 Jun 2023 17:58:55 +0200 Subject: [PATCH 09/10] Add support for capture groups and substring replacement This adds a new regex function `capture`, which captures matching substrings, using a new type `Captures`. It also adds new functions `replace` and `replace_all`, allowing substring replacement. 
--- src/bytes.rs | 14 ++++ src/ffi.rs | 118 +++++++++++++++++++++++++++++++- src/regex_impl.rs | 130 ++++++++++++++++++++++++++++++++++- src/utf32.rs | 169 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 427 insertions(+), 4 deletions(-) diff --git a/src/bytes.rs b/src/bytes.rs index 72aeba9..512b3d0 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -1,4 +1,5 @@ use crate::ffi::CodeUnitWidth8; +pub use crate::regex_impl::Captures as CapturesImpl; pub use crate::regex_impl::Match as MatchImpl; #[doc(inline)] @@ -21,6 +22,19 @@ pub type RegexBuilder = RegexBuilderImpl<CodeUnitWidth8>; /// of the subject string. pub type Match<'s> = MatchImpl<'s, CodeUnitWidth8>; +/// `Captures` represents a group of captured byte strings for a single match. +/// +/// The 0th capture always corresponds to the entire match. Each subsequent +/// index corresponds to the next capture group in the regex. If a capture +/// group is named, then the matched byte string is *also* available via the +/// `name` method. (Note that the 0th capture is always unnamed and so must be +/// accessed with the `get` method.) +/// +/// Positions returned from a capture group are always byte indices. +/// +/// `'s` is the lifetime of the matched subject string. 
+pub type Captures<'s> = CapturesImpl<'s, CodeUnitWidth8>; + #[cfg(test)] mod tests { use super::{CodeUnitWidth8, Regex, RegexBuilder}; diff --git a/src/ffi.rs b/src/ffi.rs index 36c7176..1e381b9 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -88,7 +88,7 @@ pub trait CodeUnitWidth: std::fmt::Debug + 'static { type pcre2_match_context; type pcre2_match_data; type pcre2_jit_stack; - type PCRE2_CHAR; + type PCRE2_CHAR: Default + Copy + TryInto<Self::SubjectChar>; type PCRE2_SPTR; type name_table_entry: NameTableEntry; type SubjectChar: Copy; @@ -169,6 +169,20 @@ pub trait CodeUnitWidth: std::fmt::Debug + 'static { unsafe fn pcre2_get_ovector_count( arg1: *mut Self::pcre2_match_data, ) -> u32; + + unsafe fn pcre2_substitute( + code: *const Self::pcre2_code, + subject: Self::PCRE2_SPTR, + length: usize, + startoffset: usize, + options: u32, + match_data: *mut Self::pcre2_match_data, + mcontext: *mut Self::pcre2_match_context, + replacement: Self::PCRE2_SPTR, + rlength: usize, + outputbuffer: *mut Self::PCRE2_CHAR, + outputlengthptr: *mut usize, + ) -> ::libc::c_int; } #[derive(Debug)] @@ -313,6 +327,33 @@ impl CodeUnitWidth for CodeUnitWidth8 { ) -> u32 { pcre2_get_ovector_count_8(arg1) } + unsafe fn pcre2_substitute( + code: *const Self::pcre2_code, + subject: Self::PCRE2_SPTR, + length: usize, + startoffset: usize, + options: u32, + match_data: *mut Self::pcre2_match_data, + mcontext: *mut Self::pcre2_match_context, + replacement: Self::PCRE2_SPTR, + rlength: usize, + outputbuffer: *mut Self::PCRE2_CHAR, + outputlengthptr: *mut usize, + ) -> ::libc::c_int { + pcre2_substitute_8( + code, + subject, + length, + startoffset, + options, + match_data, + mcontext, + replacement, + rlength, + outputbuffer, + outputlengthptr, + ) + } } #[derive(Debug)] @@ -461,6 +502,34 @@ impl CodeUnitWidth for CodeUnitWidth32 { ) -> u32 { pcre2_get_ovector_count_32(arg1) } + + unsafe fn pcre2_substitute( + code: *const Self::pcre2_code, + subject: Self::PCRE2_SPTR, + length: usize, + 
startoffset: usize, + options: u32, + match_data: *mut Self::pcre2_match_data, + mcontext: *mut Self::pcre2_match_context, + replacement: Self::PCRE2_SPTR, + rlength: usize, + outputbuffer: *mut Self::PCRE2_CHAR, + outputlengthptr: *mut usize, + ) -> ::libc::c_int { + pcre2_substitute_32( + code, + subject, + length, + startoffset, + options, + match_data, + mcontext, + replacement, + rlength, + outputbuffer, + outputlengthptr, + ) + } } /// Returns true if and only if PCRE2 believes that JIT is available. @@ -692,6 +761,53 @@ impl<W: CodeUnitWidth> Code<W> { Ok(1 + count as usize) } } + + pub unsafe fn substitute( + &self, + mut subject: &[W::SubjectChar], + mut replacement: &[W::SubjectChar], + start: usize, + options: u32, + output: &mut [W::PCRE2_CHAR], + output_len: &mut usize, + ) -> Result<usize, Error> { + // When the subject is empty, we use an empty slice + // with a known valid pointer. Otherwise, slices derived + // from, e.g., an empty `Vec<u8>` may not have a valid + // pointer, since creating an empty `Vec` is guaranteed + // to not allocate. + if subject.is_empty() { + subject = &[]; + } + if replacement.is_empty() { + replacement = &[]; + } + let (subj_ptr, subj_len) = W::subject_to_sptr_len(subject); + let (repl_ptr, repl_len) = W::subject_to_sptr_len(replacement); + + // safety: we allow arbitrary options, security contract is on the caller + let rc = unsafe { + W::pcre2_substitute( + self.code, + subj_ptr, + subj_len, + start, + options, + ptr::null_mut(), + // should probably not be null for performance reasons? + ptr::null_mut(), + repl_ptr, + repl_len, + output.as_mut_ptr() as *mut W::PCRE2_CHAR, + output_len as *mut usize, + ) + }; + if rc >= 0 { + return Ok(rc as usize); + } + // this might warrant a new error type + Err(Error::info(rc)) + } } /// A low level representation of PCRE2's compilation context. 
diff --git a/src/regex_impl.rs b/src/regex_impl.rs index 067b6fa..d1361d3 100644 --- a/src/regex_impl.rs +++ b/src/regex_impl.rs @@ -1,4 +1,5 @@ use std::{ + borrow::Cow, collections::HashMap, fmt, ops::Index, @@ -8,8 +9,10 @@ use std::{ use log::debug; use pcre2_sys::{ - PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MATCH_INVALID_UTF, - PCRE2_MULTILINE, PCRE2_NEVER_UTF, PCRE2_NEWLINE_ANYCRLF, PCRE2_UCP, + PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_ERROR_NOMEMORY, PCRE2_EXTENDED, + PCRE2_MATCH_INVALID_UTF, PCRE2_MULTILINE, PCRE2_NEVER_UTF, + PCRE2_NEWLINE_ANYCRLF, PCRE2_SUBSTITUTE_EXTENDED, PCRE2_SUBSTITUTE_GLOBAL, + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, PCRE2_SUBSTITUTE_UNSET_EMPTY, PCRE2_UCP, PCRE2_UNSET, PCRE2_UTF, }; @@ -623,6 +626,127 @@ impl<W: CodeUnitWidth> Regex<W> { pub(crate) fn get_capture_names_idxs(&self) -> &HashMap<String, usize> { &self.capture_names_idx } + + /// Replace the first match in the subject string with the replacement. + /// If `extended` is true, enable PCRE2's extended replacement syntax. + pub fn replace<'s>( + &self, + subject: &'s [W::SubjectChar], + replacement: &[W::SubjectChar], + extended: bool, + ) -> Result<Cow<'s, [W::SubjectChar]>, Error> + where + [<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned, + { + self.replace_impl(subject, replacement, false, extended) + } + + /// Replace all non-overlapping matches in the subject string with the replacement. + /// If `extended` is true, enable PCRE2's extended replacement syntax. 
+ pub fn replace_all<'s>( + &self, + subject: &'s [W::SubjectChar], + replacement: &[W::SubjectChar], + extended: bool, + ) -> Result<Cow<'s, [W::SubjectChar]>, Error> + where + [<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned, + { + self.replace_impl(subject, replacement, true, extended) + } + + #[inline] + fn replace_impl<'s>( + &self, + subject: &'s [W::SubjectChar], + replacement: &[W::SubjectChar], + replace_all: bool, + extended: bool, + ) -> Result<Cow<'s, [W::SubjectChar]>, Error> + where + [<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned, + { + let mut options: u32 = 0; + options |= PCRE2_SUBSTITUTE_OVERFLOW_LENGTH; + // TODO: this should probably be configurable from user-side + options |= PCRE2_SUBSTITUTE_UNSET_EMPTY; + if extended { + options |= PCRE2_SUBSTITUTE_EXTENDED; + } + if replace_all { + options |= PCRE2_SUBSTITUTE_GLOBAL; + } + + // We prefer to allocate on the stack but fall back to the heap. + // Note that PCRE2 has the following behavior with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH: + // - We supply the initial output buffer size in `capacity`. This should have sufficient + // capacity for the terminating NUL character. + // - If the capacity is NOT sufficient, PCRE2 returns the new required capacity, also + // including the terminating NUL character. + // - If the capacity IS sufficient, PCRE2 returns the number of characters written, NOT + // including the terminating NUL character. + // Example: our initial capacity is 256. If the returned string needs to be of length 512, + // then PCRE2 will report NOMEMORY and set capacity to 513. After reallocating we pass in + // a capacity of 513; it succeeds and sets capacity to 512, which is the length of the result. 
+ let mut stack_storage: [W::PCRE2_CHAR; 256] = + [W::PCRE2_CHAR::default(); 256]; + let mut heap_storage = Vec::new(); + let mut output = stack_storage.as_mut(); + let mut capacity = output.len(); + + let mut rc = unsafe { + self.code.substitute( + subject, + replacement, + 0, + options, + output, + &mut capacity, + ) + }; + + if let Err(e) = &rc { + if e.code() == PCRE2_ERROR_NOMEMORY { + if heap_storage.try_reserve_exact(capacity).is_err() { + return Err(rc.unwrap_err()); + } + heap_storage.resize(capacity, W::PCRE2_CHAR::default()); + output = &mut heap_storage; + capacity = output.len(); + rc = unsafe { + self.code.substitute( + subject, + replacement, + 0, + options, + output, + &mut capacity, + ) + }; + } + } + + let s = match rc? { + 0 => Cow::Borrowed(subject), + _ => { + // capacity has been updated with the length of the result (excluding nul terminator). + let output = &output[..capacity]; + + // All inputs contained valid chars, so we expect all outputs to as well. + let to_char = |c: W::PCRE2_CHAR| -> W::SubjectChar { + c.try_into().unwrap_or_else(|_| { + panic!("all output expected to be valid chars") + }) + }; + + // this is really just a type cast + let x: Vec<W::SubjectChar> = + output.iter().copied().map(to_char).collect(); + Cow::Owned(x) + } + }; + Ok(s) + } } /// Advanced or "lower level" search methods. @@ -870,7 +994,7 @@ impl<W: CodeUnitWidth> CaptureLocations<W> { } } -/// Captures represents a group of captured byte strings for a single match. +/// `Captures` represents a group of captured strings for a single match. /// /// The 0th capture always corresponds to the entire match. Each subsequent /// index corresponds to the next capture group in the regex. 
If a capture diff --git a/src/utf32.rs b/src/utf32.rs index 8085dea..59b0cbd 100644 --- a/src/utf32.rs +++ b/src/utf32.rs @@ -1,4 +1,5 @@ use crate::ffi::CodeUnitWidth32; +pub use crate::regex_impl::Captures as CapturesImpl; pub use crate::regex_impl::Match as MatchImpl; #[doc(inline)] @@ -21,8 +22,23 @@ pub type RegexBuilder = RegexBuilderImpl<CodeUnitWidth32>; /// of the subject string. pub type Match<'s> = MatchImpl<'s, CodeUnitWidth32>; +/// `Captures` represents a group of captured character strings for a single match. +/// +/// The 0th capture always corresponds to the entire match. Each subsequent +/// index corresponds to the next capture group in the regex. If a capture +/// group is named, then the matched string is *also* available via the +/// `name` method. (Note that the 0th capture is always unnamed and so must be +/// accessed with the `get` method.) +/// +/// Positions returned from a capture group are always character indices. +/// +/// `'s` is the lifetime of the matched subject string. 
+pub type Captures<'s> = CapturesImpl<'s, CodeUnitWidth32>; + #[cfg(test)] mod tests { + use std::borrow::Cow; + use super::{CodeUnitWidth32, Regex, RegexBuilder}; use crate::is_jit_available; @@ -97,6 +113,159 @@ mod tests { assert!(re.is_match(&b("foo\nabc\nbar")).unwrap()); } + #[test] + fn replace() { + let re = RegexBuilder::new().build(b(".")).unwrap(); + let s = b("abc"); + let r = b(""); + let replaced = re.replace(&s, &r, true).unwrap(); + assert!( + matches!(replaced, Cow::Owned(_)), + "a replacement should give a new string" + ); + let replaced = replaced.into_owned(); + assert_eq!(replaced, &*b("bc")); + } + + #[test] + fn replace_no_match() { + let re = RegexBuilder::new().build(b("d")).unwrap(); + let s = b("abc"); + let r = b(""); + let replaced = re.replace(&s, &r, true).unwrap(); + assert!( + matches!(replaced, Cow::Borrowed(_)), + "when there is no match, the original string should be returned" + ); + let replaced = replaced.into_owned(); + assert_eq!(replaced, &*b("abc")); + } + + #[test] + fn replace_with_replacement() { + let re = RegexBuilder::new().build(b("b")).unwrap(); + let s = b("abc"); + let r = b("d"); + let replaced = re.replace(&s, &r, true).unwrap(); + assert!( + matches!(replaced, Cow::Owned(_)), + "a replacement should give a new string" + ); + let replaced = replaced.into_owned(); + assert_eq!(replaced, &*b("adc")); + } + + #[test] + fn replace_first_occurrence() { + let re = RegexBuilder::new().build(b("a")).unwrap(); + let s = b("aaa"); + let r = b("b"); + let replaced = re.replace(&s, &r, false).unwrap(); + assert!( + matches!(replaced, Cow::Owned(_)), + "a replacement should give a new string" + ); + let replaced = replaced.into_owned(); + assert_eq!(replaced, &*b("baa")); + } + + #[test] + fn replace_multiple_occurrences() { + let re = RegexBuilder::new().build(b("a")).unwrap(); + let s = b("aaa"); + let r = b("b"); + let replaced = re.replace_all(&s, &r, false).unwrap(); + assert!( + matches!(replaced, Cow::Owned(_)), + "a 
replacement should give a new string" + ); + let replaced = replaced.into_owned(); + assert_eq!(replaced, &*b("bbb")); + } + + #[test] + fn replace_empty_string() { + let re = RegexBuilder::new().build(b("")).unwrap(); + let s = b("abc"); + let r = b("d"); + let replaced = re.replace(&s, &r, true).unwrap(); + assert!( + matches!(replaced, Cow::Owned(_)), + "a replacement should give a new string" + ); + let replaced = replaced.into_owned(); + assert_eq!(replaced, &*b("dabc")); + } + + #[test] + fn replace_empty_with_empty() { + let re = RegexBuilder::new().build(b("")).unwrap(); + let s = b(""); + let r = b(""); + let replaced = re.replace(&s, &r, true).unwrap().into_owned(); + assert_eq!(replaced, &*b("")); + } + + #[test] + fn replace_long_string() { + let long_string = vec!['a'; 1024]; // 1024 chars of 'a', larger than the 256-unit stack buffer + let re = RegexBuilder::new().build(b("a")).unwrap(); + let r = b("b"); + let replaced = re.replace(&long_string, &r, false).unwrap(); + assert!( + matches!(replaced, Cow::Owned(_)), + "a replacement should give a new string" + ); + let replaced = replaced.into_owned(); + let mut expected = long_string.clone(); + expected[0] = 'b'; + assert_eq!(replaced, expected); + } + + #[test] + fn replace_long_string_all() { + let long_string = vec!['a'; 1024]; + let re = RegexBuilder::new().build(b("a")).unwrap(); + let r = b("b"); + let replaced = re.replace_all(&long_string, &r, false).unwrap(); + assert!( + matches!(replaced, Cow::Owned(_)), + "a replacement should give a new string" + ); + let replaced = replaced.into_owned(); + let all_b = vec!['b'; 1024]; + assert_eq!(replaced, all_b); + } + + #[test] + fn replace_long_string_all_elongating() { + let long_string = vec!['a'; 1024]; + let re = RegexBuilder::new().build(b("a")).unwrap(); + let r = b("bx"); + let replaced = re.replace_all(&long_string, &r, false).unwrap(); + assert!( + matches!(replaced, Cow::Owned(_)), + "a replacement should give a new string" + ); + let replaced = 
replaced.into_owned(); + let mut all_bx = Vec::new(); + for _ in long_string { + all_bx.push('b'); + all_bx.push('x'); + } + assert_eq!(replaced, all_bx); + } + + #[test] + fn replace_long_string_all_disappearing() { + let long_string = vec!['a'; 1024]; + let re = RegexBuilder::new().build(b("a")).unwrap(); + let r = b(""); + let replaced = re.replace_all(&long_string, &r, false).unwrap(); + let replaced = replaced.into_owned(); + assert_eq!(replaced, &[]); + } + #[test] fn ucp() { let re = RegexBuilder::new().ucp(false).build(b(r"\w")).unwrap(); From f0e5adb2a43559e10fcfdcb7b099978221333a30 Mon Sep 17 00:00:00 2001 From: ridiculousfish <corydoras@ridiculousfish.com> Date: Sat, 3 Feb 2024 18:19:43 -0800 Subject: [PATCH 10/10] Update the CI workflow to build and test UTF32 --- .github/workflows/ci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7aed53b..d5fc411 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,14 +50,14 @@ jobs: with: toolchain: ${{ matrix.rust }} - name: Build - run: cargo build --verbose --all + run: cargo build --verbose --all --features utf32 - name: Build docs - run: cargo doc --verbose --all + run: cargo doc --verbose --all --features utf32 - name: Run tests - run: cargo test --verbose --all + run: cargo test --verbose --all --features utf32 - name: Run tests with static build shell: bash - run: PCRE2_SYS_STATIC=1 cargo test --verbose --all + run: PCRE2_SYS_STATIC=1 cargo test --verbose --all --features utf32 rustfmt: name: rustfmt