diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7aed53b..d5fc411 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,14 +50,14 @@ jobs: with: toolchain: ${{ matrix.rust }} - name: Build - run: cargo build --verbose --all + run: cargo build --verbose --all --features utf32 - name: Build docs - run: cargo doc --verbose --all + run: cargo doc --verbose --all --features utf32 - name: Run tests - run: cargo test --verbose --all + run: cargo test --verbose --all --features utf32 - name: Run tests with static build shell: bash - run: PCRE2_SYS_STATIC=1 cargo test --verbose --all + run: PCRE2_SYS_STATIC=1 cargo test --verbose --all --features utf32 rustfmt: name: rustfmt diff --git a/Cargo.toml b/Cargo.toml index 322dd4b..92494e4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,3 +19,12 @@ members = ["pcre2-sys"] libc = "0.2.146" log = "0.4.19" pcre2-sys = { version = "0.2.7", path = "pcre2-sys" } + +[features] +default = ["jit"] + +# Enable matching on UTF-32 strings +utf32 = ["pcre2-sys/utf32"] + +# Enable the PCRE2 JIT +jit = ["pcre2-sys/jit"] diff --git a/pcre2-sys/Cargo.toml b/pcre2-sys/Cargo.toml index c66466c..9dfc689 100644 --- a/pcre2-sys/Cargo.toml +++ b/pcre2-sys/Cargo.toml @@ -18,3 +18,7 @@ libc = "0.2.146" [build-dependencies] cc = { version = "1.0.73", features = ["parallel"] } pkg-config = "0.3.27" + +[features] +utf32 = [] +jit = [] diff --git a/pcre2-sys/build.rs b/pcre2-sys/build.rs index 4a6fcad..32b94ca 100644 --- a/pcre2-sys/build.rs +++ b/pcre2-sys/build.rs @@ -21,35 +21,29 @@ use std::path::PathBuf; -fn main() { - println!("cargo:rerun-if-env-changed=PCRE2_SYS_STATIC"); - +// Build and link against a PCRE2 library with the given code unit width, +// which should be "8" or "32". 
+fn build_1_pcre2_lib(code_unit_width: &str) { let target = std::env::var("TARGET").unwrap(); - // let out = PathBuf::from(std::env::var_os("OUT_DIR").unwrap()); let upstream = PathBuf::from("upstream"); - - // Don't link to a system library if we want a static build. - let want_static = pcre2_sys_static().unwrap_or(target.contains("musl")); - if !want_static && pkg_config::probe_library("libpcre2-8").is_ok() { - return; - } - // Set some config options. We mostly just use the default values. We do // this in lieu of patching config.h since it's easier. let mut builder = cc::Build::new(); builder - .define("PCRE2_CODE_UNIT_WIDTH", "8") + .define("PCRE2_CODE_UNIT_WIDTH", code_unit_width) .define("HAVE_STDLIB_H", "1") .define("HAVE_MEMMOVE", "1") .define("HAVE_CONFIG_H", "1") .define("PCRE2_STATIC", "1") .define("STDC_HEADERS", "1") - .define("SUPPORT_PCRE2_8", "1") + .define(&format!("SUPPORT_PCRE2_{}", code_unit_width), "1") .define("SUPPORT_UNICODE", "1"); if target.contains("windows") { builder.define("HAVE_WINDOWS_H", "1"); } - enable_jit(&target, &mut builder); + if feature_enabled("JIT") { + enable_jit(&target, &mut builder); + } builder.include(upstream.join("src")).include(upstream.join("include")); for result in std::fs::read_dir(upstream.join("src")).unwrap() { @@ -78,7 +72,34 @@ fn main() { { builder.debug(true); } - builder.compile("libpcre2.a"); + builder.compile(&format!("libpcre2-{}.a", code_unit_width)); +} + +fn main() { + println!("cargo:rerun-if-env-changed=PCRE2_SYS_STATIC"); + + let target = std::env::var("TARGET").unwrap(); + let do_utf32 = feature_enabled("UTF32"); + + // Don't link to a system library if we want a static build. 
+ let want_static = pcre2_sys_static().unwrap_or(target.contains("musl")); + if want_static || pkg_config::probe_library("libpcre2-8").is_err() { + build_1_pcre2_lib("8"); + } + if do_utf32 + && (want_static || pkg_config::probe_library("libpcre2-32").is_err()) + { + build_1_pcre2_lib("32"); + } +} + +// Return whether a given feature is enabled. +fn feature_enabled(feature: &str) -> bool { + let env_var_name = format!("CARGO_FEATURE_{}", feature); + match std::env::var(&env_var_name) { + Ok(s) => s == "1", + Err(_) => false, + } } fn pcre2_sys_static() -> Option { diff --git a/pcre2-sys/generate-bindings b/pcre2-sys/generate-bindings index 260cfd6..0630da1 100755 --- a/pcre2-sys/generate-bindings +++ b/pcre2-sys/generate-bindings @@ -5,7 +5,7 @@ if ! command -V bindgen > /dev/null 2>&1; then echo "bindgen must be installed" >&2 - echo "to install: cargo install bindgen" >&2 + echo "to install: cargo install bindgen-cli" >&2 exit 1 fi if ! [ -f "$PCRE2SYS_HEADER" ]; then @@ -14,6 +14,12 @@ if ! 
[ -f "$PCRE2SYS_HEADER" ]; then exit 1 fi +if [ -z "$PCRE2_CODE_UNIT_WIDTH" ]; then + echo "The PCRE2_CODE_UNIT_WIDTH environment variable must be set" >&2 + echo "Valid values are 8, 16, and 32" >&2 + exit 1 +fi + bindgen \ "$PCRE2SYS_HEADER" \ --ctypes-prefix '::libc' \ @@ -22,4 +28,4 @@ bindgen \ --allowlist-var '^PCRE2_.*' \ --blocklist-function '^.*_callout_.*' \ --blocklist-type '^.*_callout_.*' \ - -- -DPCRE2_CODE_UNIT_WIDTH=8 > "$PCRE2SYS_BINDINGS" + -- -DPCRE2_CODE_UNIT_WIDTH=${PCRE2_CODE_UNIT_WIDTH} > "$PCRE2SYS_BINDINGS" diff --git a/src/bytes.rs b/src/bytes.rs index 5f02c4e..512b3d0 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -1,841 +1,28 @@ -use std::{ - collections::HashMap, - panic::{RefUnwindSafe, UnwindSafe}, - sync::Arc, -}; - -use pcre2_sys::{ - PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MATCH_INVALID_UTF, - PCRE2_MULTILINE, PCRE2_NEWLINE_ANYCRLF, PCRE2_UCP, PCRE2_UNSET, PCRE2_UTF, -}; +use crate::ffi::CodeUnitWidth8; +pub use crate::regex_impl::Captures as CapturesImpl; +pub use crate::regex_impl::Match as MatchImpl; -use crate::{ - error::Error, - ffi::{Code, CompileContext, MatchConfig, MatchData}, - pool::{Pool, PoolGuard}, +#[doc(inline)] +pub use crate::regex_impl::{ + Regex as RegexImpl, RegexBuilder as RegexBuilderImpl, }; -/// Match represents a single match of a regex in a subject string. -/// -/// The lifetime parameter `'s` refers to the lifetime of the matched portion -/// of the subject string. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub struct Match<'s> { - subject: &'s [u8], - start: usize, - end: usize, -} - -impl<'s> Match<'s> { - /// Returns the starting byte offset of the match in the subject. - #[inline] - pub fn start(&self) -> usize { - self.start - } - - /// Returns the ending byte offset of the match in the subject. - #[inline] - pub fn end(&self) -> usize { - self.end - } - - /// Returns the matched portion of the subject string. 
- #[inline] - pub fn as_bytes(&self) -> &'s [u8] { - &self.subject[self.start..self.end] - } - - /// Creates a new match from the given subject string and byte offsets. - fn new(subject: &'s [u8], start: usize, end: usize) -> Match<'s> { - Match { subject, start, end } - } - - #[cfg(test)] - fn as_pair(&self) -> (usize, usize) { - (self.start, self.end) - } -} - -#[derive(Clone, Debug)] -struct Config { - /// PCRE2_CASELESS - caseless: bool, - /// PCRE2_DOTALL - dotall: bool, - /// PCRE2_EXTENDED - extended: bool, - /// PCRE2_MULTILINE - multi_line: bool, - /// PCRE2_NEWLINE_ANYCRLF - crlf: bool, - /// PCRE2_UCP - ucp: bool, - /// PCRE2_UTF - utf: bool, - /// use pcre2_jit_compile - jit: JITChoice, - /// Match-time specific configuration knobs. - match_config: MatchConfig, -} - -#[derive(Clone, Debug)] -enum JITChoice { - /// Never do JIT compilation. - Never, - /// Always do JIT compilation and return an error if it fails. - Always, - /// Attempt to do JIT compilation but silently fall back to non-JIT. - Attempt, -} - -impl Default for Config { - fn default() -> Config { - Config { - caseless: false, - dotall: false, - extended: false, - multi_line: false, - crlf: false, - ucp: false, - utf: false, - jit: JITChoice::Never, - match_config: MatchConfig::default(), - } - } -} - -/// A builder for configuring the compilation of a PCRE2 regex. -#[derive(Clone, Debug)] -pub struct RegexBuilder { - config: Config, -} - -impl RegexBuilder { - /// Create a new builder with a default configuration. - pub fn new() -> RegexBuilder { - RegexBuilder { config: Config::default() } - } - - /// Compile the given pattern into a PCRE regex using the current - /// configuration. - /// - /// If there was a problem compiling the pattern, then an error is - /// returned. 
- pub fn build(&self, pattern: &str) -> Result { - let mut options = 0; - if self.config.caseless { - options |= PCRE2_CASELESS; - } - if self.config.dotall { - options |= PCRE2_DOTALL; - } - if self.config.extended { - options |= PCRE2_EXTENDED; - } - if self.config.multi_line { - options |= PCRE2_MULTILINE; - } - if self.config.ucp { - options |= PCRE2_UCP; - options |= PCRE2_UTF; - options |= PCRE2_MATCH_INVALID_UTF; - } - if self.config.utf { - options |= PCRE2_UTF; - } - - let mut ctx = CompileContext::new(); - if self.config.crlf { - ctx.set_newline(PCRE2_NEWLINE_ANYCRLF) - .expect("PCRE2_NEWLINE_ANYCRLF is a legal value"); - } - - let mut code = Code::new(pattern, options, ctx)?; - match self.config.jit { - JITChoice::Never => {} // fallthrough - JITChoice::Always => { - code.jit_compile()?; - } - JITChoice::Attempt => { - if let Err(err) = code.jit_compile() { - log::debug!("JIT compilation failed: {}", err); - } - } - } - let capture_names = code.capture_names()?; - let mut idx = HashMap::new(); - for (i, group) in capture_names.iter().enumerate() { - if let Some(ref name) = *group { - idx.insert(name.to_string(), i); - } - } - let code = Arc::new(code); - let match_data = { - let config = self.config.match_config.clone(); - let code = Arc::clone(&code); - let create: MatchDataPoolFn = - Box::new(move || MatchData::new(config.clone(), &code)); - Pool::new(create) - }; - Ok(Regex { - config: Arc::new(self.config.clone()), - pattern: pattern.to_string(), - code, - capture_names: Arc::new(capture_names), - capture_names_idx: Arc::new(idx), - match_data, - }) - } - - /// Enables case insensitive matching. - /// - /// If the `utf` option is also set, then Unicode case folding is used - /// to determine case insensitivity. When the `utf` option is not set, - /// then only standard ASCII case insensitivity is considered. - /// - /// This option corresponds to the `i` flag. 
- pub fn caseless(&mut self, yes: bool) -> &mut RegexBuilder { - self.config.caseless = yes; - self - } - - /// Enables "dot all" matching. - /// - /// When enabled, the `.` metacharacter in the pattern matches any - /// character, include `\n`. When disabled (the default), `.` will match - /// any character except for `\n`. - /// - /// This option corresponds to the `s` flag. - pub fn dotall(&mut self, yes: bool) -> &mut RegexBuilder { - self.config.dotall = yes; - self - } - - /// Enable "extended" mode in the pattern, where whitespace is ignored. - /// - /// This option corresponds to the `x` flag. - pub fn extended(&mut self, yes: bool) -> &mut RegexBuilder { - self.config.extended = yes; - self - } - - /// Enable multiline matching mode. - /// - /// When enabled, the `^` and `$` anchors will match both at the beginning - /// and end of a subject string, in addition to matching at the start of - /// a line and the end of a line. When disabled, the `^` and `$` anchors - /// will only match at the beginning and end of a subject string. - /// - /// This option corresponds to the `m` flag. - pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { - self.config.multi_line = yes; - self - } - - /// Enable matching of CRLF as a line terminator. - /// - /// When enabled, anchors such as `^` and `$` will match any of the - /// following as a line terminator: `\r`, `\n` or `\r\n`. - /// - /// This is disabled by default, in which case, only `\n` is recognized as - /// a line terminator. - pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder { - self.config.crlf = yes; - self - } - - /// Enable Unicode matching mode. - /// - /// When enabled, the following patterns become Unicode aware: `\b`, `\B`, - /// `\d`, `\D`, `\s`, `\S`, `\w`, `\W`. - /// - /// When set, this implies UTF matching mode. It is not possible to enable - /// Unicode matching mode without enabling UTF matching mode. - /// - /// This is disabled by default. 
- pub fn ucp(&mut self, yes: bool) -> &mut RegexBuilder { - self.config.ucp = yes; - self - } - - /// Enable UTF matching mode. - /// - /// When enabled, characters are treated as sequences of code units that - /// make up a single codepoint instead of as single bytes. For example, - /// this will cause `.` to match any single UTF-8 encoded codepoint, where - /// as when this is disabled, `.` will any single byte (except for `\n` in - /// both cases, unless "dot all" mode is enabled). - /// - /// This is disabled by default. - pub fn utf(&mut self, yes: bool) -> &mut RegexBuilder { - self.config.utf = yes; - self - } - - /// This is now deprecated and is a no-op. - /// - /// Previously, this option permitted disabling PCRE2's UTF-8 validity - /// check, which could result in undefined behavior if the haystack was - /// not valid UTF-8. But PCRE2 introduced a new option, `PCRE2_MATCH_INVALID_UTF`, - /// in 10.34 which this crate always sets. When this option is enabled, - /// PCRE2 claims to not have undefined behavior when the haystack is - /// invalid UTF-8. - /// - /// Therefore, disabling the UTF-8 check is not something that is exposed - /// by this crate. - #[deprecated( - since = "0.2.4", - note = "now a no-op due to new PCRE2 features" - )] - pub fn disable_utf_check(&mut self) -> &mut RegexBuilder { - self - } - - /// Enable PCRE2's JIT and return an error if it's not available. - /// - /// This generally speeds up matching quite a bit. The downside is that it - /// can increase the time it takes to compile a pattern. - /// - /// If the JIT isn't available or if JIT compilation returns an error, then - /// regex compilation will fail with the corresponding error. - /// - /// This is disabled by default, and always overrides `jit_if_available`. - pub fn jit(&mut self, yes: bool) -> &mut RegexBuilder { - if yes { - self.config.jit = JITChoice::Always; - } else { - self.config.jit = JITChoice::Never; - } - self - } - - /// Enable PCRE2's JIT if it's available. 
- /// - /// This generally speeds up matching quite a bit. The downside is that it - /// can increase the time it takes to compile a pattern. - /// - /// If the JIT isn't available or if JIT compilation returns an error, - /// then a debug message with the error will be emitted and the regex will - /// otherwise silently fall back to non-JIT matching. - /// - /// This is disabled by default, and always overrides `jit`. - pub fn jit_if_available(&mut self, yes: bool) -> &mut RegexBuilder { - if yes { - self.config.jit = JITChoice::Attempt; - } else { - self.config.jit = JITChoice::Never; - } - self - } - - /// Set the maximum size of PCRE2's JIT stack, in bytes. If the JIT is - /// not enabled, then this has no effect. - /// - /// When `None` is given, no custom JIT stack will be created, and instead, - /// the default JIT stack is used. When the default is used, its maximum - /// size is 32 KB. - /// - /// When this is set, then a new JIT stack will be created with the given - /// maximum size as its limit. - /// - /// Increasing the stack size can be useful for larger regular expressions. - /// - /// By default, this is set to `None`. - pub fn max_jit_stack_size( - &mut self, - bytes: Option, - ) -> &mut RegexBuilder { - self.config.match_config.max_jit_stack_size = bytes; - self - } -} - -/// A compiled PCRE2 regular expression. +/// A compiled PCRE2 regular expression for matching bytes. /// /// This regex is safe to use from multiple threads simultaneously. For top /// performance, it is better to clone a new regex for each thread. -pub struct Regex { - /// The configuration used to build the regex. - config: Arc, - /// The original pattern string. - pattern: String, - /// The underlying compiled PCRE2 object. - code: Arc, - /// The capture group names for this regex. - capture_names: Arc>>, - /// A map from capture group name to capture group index. - capture_names_idx: Arc>, - /// A pool of mutable scratch data used by PCRE2 during matching. 
- match_data: MatchDataPool, -} - -impl Clone for Regex { - fn clone(&self) -> Regex { - let match_data = { - let config = self.config.match_config.clone(); - let code = Arc::clone(&self.code); - let create: MatchDataPoolFn = - Box::new(move || MatchData::new(config.clone(), &code)); - Pool::new(create) - }; - Regex { - config: Arc::clone(&self.config), - pattern: self.pattern.clone(), - code: Arc::clone(&self.code), - capture_names: Arc::clone(&self.capture_names), - capture_names_idx: Arc::clone(&self.capture_names_idx), - match_data, - } - } -} - -impl std::fmt::Debug for Regex { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "Regex({:?})", self.pattern) - } -} - -impl Regex { - /// Compiles a regular expression using the default configuration. - /// - /// Once compiled, it can be used repeatedly to search, split or replace - /// text in a string. - /// - /// If an invalid expression is given, then an error is returned. - /// - /// To configure compilation options for the regex, use the - /// [`RegexBuilder`](struct.RegexBuilder.html). - pub fn new(pattern: &str) -> Result { - RegexBuilder::new().build(pattern) - } - - /// Returns true if and only if the regex matches the subject string given. - /// - /// # Example - /// - /// Test if some text contains at least one word with exactly 13 ASCII word - /// bytes: - /// - /// ```rust - /// # fn example() -> Result<(), ::pcre2::Error> { - /// use pcre2::bytes::Regex; - /// - /// let text = b"I categorically deny having triskaidekaphobia."; - /// assert!(Regex::new(r"\b\w{13}\b")?.is_match(text)?); - /// # Ok(()) }; example().unwrap() - /// ``` - pub fn is_match(&self, subject: &[u8]) -> Result { - self.is_match_at(subject, 0) - } - - /// Returns the start and end byte range of the leftmost-first match in - /// `subject`. If no match exists, then `None` is returned. 
- /// - /// # Example - /// - /// Find the start and end location of the first word with exactly 13 - /// ASCII word bytes: - /// - /// ```rust - /// # fn example() -> Result<(), ::pcre2::Error> { - /// use pcre2::bytes::Regex; - /// - /// let text = b"I categorically deny having triskaidekaphobia."; - /// let mat = Regex::new(r"\b\w{13}\b")?.find(text)?.unwrap(); - /// assert_eq!((mat.start(), mat.end()), (2, 15)); - /// # Ok(()) }; example().unwrap() - /// ``` - pub fn find<'s>( - &self, - subject: &'s [u8], - ) -> Result>, Error> { - self.find_at(subject, 0) - } - - /// Returns an iterator for each successive non-overlapping match in - /// `subject`, returning the start and end byte indices with respect to - /// `subject`. - /// - /// # Example - /// - /// Find the start and end location of every word with exactly 13 ASCII - /// word bytes: - /// - /// ```rust - /// # fn example() -> Result<(), ::pcre2::Error> { - /// use pcre2::bytes::Regex; - /// - /// let text = b"Retroactively relinquishing remunerations is reprehensible."; - /// for result in Regex::new(r"\b\w{13}\b")?.find_iter(text) { - /// let mat = result?; - /// println!("{:?}", mat); - /// } - /// # Ok(()) }; example().unwrap() - /// ``` - pub fn find_iter<'r, 's>(&'r self, subject: &'s [u8]) -> Matches<'r, 's> { - Matches { - re: self, - match_data: self.match_data(), - subject, - last_end: 0, - last_match: None, - } - } - - /// Returns the capture groups corresponding to the leftmost-first - /// match in `subject`. Capture group `0` always corresponds to the entire - /// match. If no match is found, then `None` is returned. - /// - /// # Examples - /// - /// Say you have some text with movie names and their release years, - /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text - /// looking like that, while also extracting the movie name and its release - /// year separately. 
- /// - /// ```rust - /// # fn example() -> Result<(), ::pcre2::Error> { - /// use pcre2::bytes::Regex; - /// - /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)")?; - /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; - /// let caps = re.captures(text)?.unwrap(); - /// assert_eq!(&caps[1], &b"Citizen Kane"[..]); - /// assert_eq!(&caps[2], &b"1941"[..]); - /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]); - /// // You can also access the groups by index using the Index notation. - /// // Note that this will panic on an invalid index. - /// assert_eq!(&caps[1], b"Citizen Kane"); - /// assert_eq!(&caps[2], b"1941"); - /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); - /// # Ok(()) }; example().unwrap() - /// ``` - /// - /// Note that the full match is at capture group `0`. Each subsequent - /// capture group is indexed by the order of its opening `(`. - /// - /// We can make this example a bit clearer by using *named* capture groups: - /// - /// ```rust - /// # fn example() -> Result<(), ::pcre2::Error> { - /// use pcre2::bytes::Regex; - /// - /// let re = Regex::new(r"'(?P[^']+)'\s+\((?P<year>\d{4})\)")?; - /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; - /// let caps = re.captures(text)?.unwrap(); - /// assert_eq!(&caps["title"], &b"Citizen Kane"[..]); - /// assert_eq!(&caps["year"], &b"1941"[..]); - /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]); - /// // You can also access the groups by name using the Index notation. - /// // Note that this will panic on an invalid group name. - /// assert_eq!(&caps["title"], b"Citizen Kane"); - /// assert_eq!(&caps["year"], b"1941"); - /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); - /// # Ok(()) }; example().unwrap() - /// ``` - /// - /// Here we name the capture groups, which we can access with the `name` - /// method or the `Index` notation with a `&str`. 
Note that the named - /// capture groups are still accessible with `get` or the `Index` notation - /// with a `usize`. - /// - /// The `0`th capture group is always unnamed, so it must always be - /// accessed with `get(0)` or `[0]`. - pub fn captures<'s>( - &self, - subject: &'s [u8], - ) -> Result<Option<Captures<'s>>, Error> { - let mut locs = self.capture_locations(); - Ok(self.captures_read(&mut locs, subject)?.map(move |_| Captures { - subject, - locs, - idx: Arc::clone(&self.capture_names_idx), - })) - } - - /// Returns an iterator over all the non-overlapping capture groups matched - /// in `subject`. This is operationally the same as `find_iter`, except it - /// yields information about capturing group matches. - /// - /// # Example - /// - /// We can use this to find all movie titles and their release years in - /// some text, where the movie is formatted like "'Title' (xxxx)": - /// - /// ```rust - /// # fn example() -> Result<(), ::pcre2::Error> { - /// use std::str; - /// - /// use pcre2::bytes::Regex; - /// - /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")?; - /// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; - /// for result in re.captures_iter(text) { - /// let caps = result?; - /// let title = str::from_utf8(&caps["title"]).unwrap(); - /// let year = str::from_utf8(&caps["year"]).unwrap(); - /// println!("Movie: {:?}, Released: {:?}", title, year); - /// } - /// // Output: - /// // Movie: Citizen Kane, Released: 1941 - /// // Movie: The Wizard of Oz, Released: 1939 - /// // Movie: M, Released: 1931 - /// # Ok(()) }; example().unwrap() - /// ``` - pub fn captures_iter<'r, 's>( - &'r self, - subject: &'s [u8], - ) -> CaptureMatches<'r, 's> { - CaptureMatches { re: self, subject, last_end: 0, last_match: None } - } -} - -/// Advanced or "lower level" search methods. -impl Regex { - /// Returns the same as is_match, but starts the search at the given - /// offset. 
- /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - pub fn is_match_at( - &self, - subject: &[u8], - start: usize, - ) -> Result<bool, Error> { - assert!( - start <= subject.len(), - "start ({}) must be <= subject.len() ({})", - start, - subject.len() - ); - - let options = 0; - let mut match_data = self.match_data(); - // SAFETY: We don't use any dangerous PCRE2 options. - let res = - unsafe { match_data.find(&self.code, subject, start, options) }; - PoolGuard::put(match_data); - res - } - - /// Returns the same as find, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - pub fn find_at<'s>( - &self, - subject: &'s [u8], - start: usize, - ) -> Result<Option<Match<'s>>, Error> { - let mut match_data = self.match_data(); - let res = - self.find_at_with_match_data(&mut match_data, subject, start); - PoolGuard::put(match_data); - res - } - - /// Like find_at, but accepts match data instead of acquiring one itself. - /// - /// This is useful for implementing the iterator, which permits avoiding - /// the synchronization overhead of acquiring the match data. - #[inline(always)] - fn find_at_with_match_data<'s>( - &self, - match_data: &mut MatchDataPoolGuard<'_>, - subject: &'s [u8], - start: usize, - ) -> Result<Option<Match<'s>>, Error> { - assert!( - start <= subject.len(), - "start ({}) must be <= subject.len() ({})", - start, - subject.len() - ); - - let options = 0; - // SAFETY: We don't use any dangerous PCRE2 options. - if unsafe { !match_data.find(&self.code, subject, start, options)? 
} { - return Ok(None); - } - let ovector = match_data.ovector(); - let (s, e) = (ovector[0], ovector[1]); - Ok(Some(Match::new(&subject, s, e))) - } - - /// This is like `captures`, but uses - /// [`CaptureLocations`](struct.CaptureLocations.html) - /// instead of - /// [`Captures`](struct.Captures.html) in order to amortize allocations. - /// - /// To create a `CaptureLocations` value, use the - /// `Regex::capture_locations` method. - /// - /// This returns the overall match if this was successful, which is always - /// equivalent to the `0`th capture group. - pub fn captures_read<'s>( - &self, - locs: &mut CaptureLocations, - subject: &'s [u8], - ) -> Result<Option<Match<'s>>, Error> { - self.captures_read_at(locs, subject, 0) - } - - /// Returns the same as `captures_read`, but starts the search at the given - /// offset and populates the capture locations given. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - pub fn captures_read_at<'s>( - &self, - locs: &mut CaptureLocations, - subject: &'s [u8], - start: usize, - ) -> Result<Option<Match<'s>>, Error> { - assert!( - start <= subject.len(), - "start ({}) must be <= subject.len() ({})", - start, - subject.len() - ); - - let options = 0; - // SAFETY: We don't use any dangerous PCRE2 options. - if unsafe { !locs.data.find(&self.code, subject, start, options)? } { - return Ok(None); - } - let ovector = locs.data.ovector(); - let (s, e) = (ovector[0], ovector[1]); - Ok(Some(Match::new(&subject, s, e))) - } -} - -/// Auxiliary methods. -impl Regex { - /// Returns the original pattern string for this regex. - pub fn as_str(&self) -> &str { - &self.pattern - } - - /// Returns a sequence of all capturing groups and their names, if present. 
- /// - /// The length of the slice returned is always equal to the result of - /// `captures_len`, which is the number of capturing groups (including the - /// capturing group for the entire pattern). - /// - /// Each entry in the slice is the name of the corresponding capturing - /// group, if one exists. The first capturing group (at index `0`) is - /// always unnamed. - /// - /// Capturing groups are indexed by the order of the opening parenthesis. - pub fn capture_names(&self) -> &[Option<String>] { - &self.capture_names - } - - /// Returns the number of capturing groups in the pattern. - /// - /// This is always 1 more than the number of syntactic groups in the - /// pattern, since the first group always corresponds to the entire match. - pub fn captures_len(&self) -> usize { - self.code.capture_count().expect("a valid capture count from PCRE2") - } - - /// Returns an empty set of capture locations that can be reused in - /// multiple calls to `captures_read` or `captures_read_at`. - pub fn capture_locations(&self) -> CaptureLocations { - CaptureLocations { - code: Arc::clone(&self.code), - data: self.new_match_data(), - } - } +pub type Regex = RegexImpl<CodeUnitWidth8>; - fn match_data(&self) -> MatchDataPoolGuard<'_> { - self.match_data.get() - } - - fn new_match_data(&self) -> MatchData { - MatchData::new(self.config.match_config.clone(), &self.code) - } -} +/// A builder for configuring the compilation of a PCRE2 regex. +pub type RegexBuilder = RegexBuilderImpl<CodeUnitWidth8>; -/// CaptureLocations is a low level representation of the raw offsets of each -/// submatch. -/// -/// Primarily, this type is useful when using `Regex` APIs such as -/// `captures_read`, which permits amortizing the allocation in which capture -/// match locations are stored. +/// Match represents a single match of a regex in a subject string. 
/// -/// In order to build a value of this type, you'll need to call the -/// `capture_locations` method on the `Regex` being used to execute the search. -/// The value returned can then be reused in subsequent searches. -pub struct CaptureLocations { - code: Arc<Code>, - data: MatchData, -} - -impl Clone for CaptureLocations { - fn clone(&self) -> CaptureLocations { - CaptureLocations { - code: Arc::clone(&self.code), - data: MatchData::new(self.data.config().clone(), &self.code), - } - } -} - -impl std::fmt::Debug for CaptureLocations { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let mut offsets: Vec<Option<usize>> = vec![]; - for &offset in self.data.ovector() { - if offset == PCRE2_UNSET { - offsets.push(None); - } else { - offsets.push(Some(offset)); - } - } - write!(f, "CaptureLocations(")?; - f.debug_list().entries(offsets).finish()?; - write!(f, ")") - } -} - -impl CaptureLocations { - /// Returns the start and end positions of the Nth capture group. - /// - /// This returns `None` if `i` is not a valid capture group or if the - /// capture group did not match anything. - /// - /// The positions returned are always byte indices with respect to the - /// original subject string matched. - #[inline] - pub fn get(&self, i: usize) -> Option<(usize, usize)> { - let ovec = self.data.ovector(); - let s = match ovec.get(i * 2) { - None => return None, - Some(&s) if s == PCRE2_UNSET => return None, - Some(&s) => s, - }; - let e = match ovec.get(i * 2 + 1) { - None => return None, - Some(&e) if e == PCRE2_UNSET => return None, - Some(&e) => e, - }; - Some((s, e)) - } - - /// Returns the total number of capturing groups. - /// - /// This is always at least `1` since every regex has at least `1` - /// capturing group that corresponds to the entire match. - #[inline] - pub fn len(&self) -> usize { - self.data.ovector().len() / 2 - } -} +/// The lifetime parameter `'s` refers to the lifetime of the matched portion +/// of the subject string. 
+pub type Match<'s> = MatchImpl<'s, CodeUnitWidth8>; -/// Captures represents a group of captured byte strings for a single match. +/// `Captures` represents a group of captured byte strings for a single match. /// /// The 0th capture always corresponds to the entire match. Each subsequent /// index corresponds to the next capture group in the regex. If a capture @@ -846,262 +33,11 @@ impl CaptureLocations { /// Positions returned from a capture group are always byte indices. /// /// `'s` is the lifetime of the matched subject string. -pub struct Captures<'s> { - subject: &'s [u8], - locs: CaptureLocations, - idx: Arc<HashMap<String, usize>>, -} - -impl<'s> Captures<'s> { - /// Returns the match associated with the capture group at index `i`. If - /// `i` does not correspond to a capture group, or if the capture group - /// did not participate in the match, then `None` is returned. - /// - /// # Examples - /// - /// Get the text of the match with a default of an empty string if this - /// group didn't participate in the match: - /// - /// ```rust - /// # fn example() -> Result<(), ::pcre2::Error> { - /// use pcre2::bytes::Regex; - /// - /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))")?; - /// let caps = re.captures(b"abc123")?.unwrap(); - /// - /// let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes()); - /// let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes()); - /// assert_eq!(text1, &b"123"[..]); - /// assert_eq!(text2, &b""[..]); - /// # Ok(()) }; example().unwrap() - /// ``` - pub fn get(&self, i: usize) -> Option<Match<'s>> { - self.locs.get(i).map(|(s, e)| Match::new(self.subject, s, e)) - } - - /// Returns the match for the capture group named `name`. If `name` isn't a - /// valid capture group or didn't match anything, then `None` is returned. - pub fn name(&self, name: &str) -> Option<Match<'s>> { - self.idx.get(name).and_then(|&i| self.get(i)) - } - - /// Returns the number of captured groups. 
- /// - /// This is always at least `1`, since every regex has at least one capture - /// group that corresponds to the full match. - #[inline] - pub fn len(&self) -> usize { - self.locs.len() - } -} - -impl<'s> std::fmt::Debug for Captures<'s> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - f.debug_tuple("Captures").field(&CapturesDebug(self)).finish() - } -} - -struct CapturesDebug<'c, 's: 'c>(&'c Captures<'s>); - -impl<'c, 's> std::fmt::Debug for CapturesDebug<'c, 's> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - fn escape_bytes(bytes: &[u8]) -> String { - let mut s = String::new(); - for &b in bytes { - s.push_str(&escape_byte(b)); - } - s - } - - fn escape_byte(byte: u8) -> String { - use std::ascii::escape_default; - - let escaped: Vec<u8> = escape_default(byte).collect(); - String::from_utf8_lossy(&escaped).into_owned() - } - - // We'd like to show something nice here, even if it means an - // allocation to build a reverse index. - let slot_to_name: HashMap<&usize, &String> = - self.0.idx.iter().map(|(a, b)| (b, a)).collect(); - let mut map = f.debug_map(); - for slot in 0..self.0.len() { - let m = self - .0 - .locs - .get(slot) - .map(|(s, e)| escape_bytes(&self.0.subject[s..e])); - if let Some(name) = slot_to_name.get(&slot) { - map.entry(&name, &m); - } else { - map.entry(&slot, &m); - } - } - map.finish() - } -} - -/// Get a group by index. -/// -/// `'s` is the lifetime of the matched subject string. -/// -/// The subject can't outlive the `Captures` object if this method is -/// used, because of how `Index` is defined (normally `a[i]` is part -/// of `a` and can't outlive it); to do that, use `get()` instead. -/// -/// # Panics -/// -/// If there is no group at the given index. 
-impl<'s> std::ops::Index<usize> for Captures<'s> { - type Output = [u8]; - - fn index(&self, i: usize) -> &[u8] { - self.get(i) - .map(|m| m.as_bytes()) - .unwrap_or_else(|| panic!("no group at index '{}'", i)) - } -} - -/// Get a group by name. -/// -/// `'s` is the lifetime of the matched subject string and `'i` is the lifetime -/// of the group name (the index). -/// -/// The text can't outlive the `Captures` object if this method is -/// used, because of how `Index` is defined (normally `a[i]` is part -/// of `a` and can't outlive it); to do that, use `name` instead. -/// -/// # Panics -/// -/// If there is no group named by the given value. -impl<'s, 'i> std::ops::Index<&'i str> for Captures<'s> { - type Output = [u8]; - - fn index<'a>(&'a self, name: &'i str) -> &'a [u8] { - self.name(name) - .map(|m| m.as_bytes()) - .unwrap_or_else(|| panic!("no group named '{}'", name)) - } -} - -/// An iterator over all non-overlapping matches for a particular subject -/// string. -/// -/// The iterator yields matches (if no error occurred while searching) -/// corresponding to the start and end of the match. The indices are byte -/// offsets. The iterator stops when no more matches can be found. -/// -/// `'r` is the lifetime of the compiled regular expression and `'s` is the -/// lifetime of the subject string. -pub struct Matches<'r, 's> { - re: &'r Regex, - match_data: MatchDataPoolGuard<'r>, - subject: &'s [u8], - last_end: usize, - last_match: Option<usize>, -} - -impl<'r, 's> Iterator for Matches<'r, 's> { - type Item = Result<Match<'s>, Error>; - - fn next(&mut self) -> Option<Result<Match<'s>, Error>> { - if self.last_end > self.subject.len() { - return None; - } - let res = self.re.find_at_with_match_data( - &mut self.match_data, - self.subject, - self.last_end, - ); - let m = match res { - Err(err) => return Some(Err(err)), - Ok(None) => return None, - Ok(Some(m)) => m, - }; - if m.start() == m.end() { - // This is an empty match. 
To ensure we make progress, start - // the next search at the smallest possible starting position - // of the next match following this one. - self.last_end = m.end() + 1; - // Don't accept empty matches immediately following a match. - // Just move on to the next match. - if Some(m.end()) == self.last_match { - return self.next(); - } - } else { - self.last_end = m.end(); - } - self.last_match = Some(m.end()); - Some(Ok(m)) - } -} - -/// An iterator that yields all non-overlapping capture groups matching a -/// particular regular expression. -/// -/// The iterator stops when no more matches can be found. -/// -/// `'r` is the lifetime of the compiled regular expression and `'s` is the -/// lifetime of the subject string. -pub struct CaptureMatches<'r, 's> { - re: &'r Regex, - subject: &'s [u8], - last_end: usize, - last_match: Option<usize>, -} - -impl<'r, 's> Iterator for CaptureMatches<'r, 's> { - type Item = Result<Captures<'s>, Error>; - - fn next(&mut self) -> Option<Result<Captures<'s>, Error>> { - if self.last_end > self.subject.len() { - return None; - } - let mut locs = self.re.capture_locations(); - let res = - self.re.captures_read_at(&mut locs, self.subject, self.last_end); - let m = match res { - Err(err) => return Some(Err(err)), - Ok(None) => return None, - Ok(Some(m)) => m, - }; - if m.start() == m.end() { - // This is an empty match. To ensure we make progress, start - // the next search at the smallest possible starting position - // of the next match following this one. - self.last_end = m.end() + 1; - // Don't accept empty matches immediately following a match. - // Just move on to the next match. 
- if Some(m.end()) == self.last_match { - return self.next(); - } - } else { - self.last_end = m.end(); - } - self.last_match = Some(m.end()); - Some(Ok(Captures { - subject: self.subject, - locs, - idx: Arc::clone(&self.re.capture_names_idx), - })) - } -} - -/// A type alias for our pool of `MatchData` that fixes the type parameters to -/// what we actually use in practice. -type MatchDataPool = Pool<MatchData, MatchDataPoolFn>; - -/// Same as above, but for the guard returned by a pool. -type MatchDataPoolGuard<'a> = PoolGuard<'a, MatchData, MatchDataPoolFn>; - -/// The type of the closure we use to create new caches. We need to spell out -/// all of the marker traits or else we risk leaking !MARKER impls. -type MatchDataPoolFn = - Box<dyn Fn() -> MatchData + Send + Sync + UnwindSafe + RefUnwindSafe>; +pub type Captures<'s> = CapturesImpl<'s, CodeUnitWidth8>; #[cfg(test)] mod tests { - use super::{Regex, RegexBuilder}; + use super::{CodeUnitWidth8, Regex, RegexBuilder}; use crate::is_jit_available; fn b(string: &str) -> &[u8] { @@ -1186,9 +122,16 @@ mod tests { assert_eq!(re.find(b("β")).unwrap().unwrap().as_pair(), (0, 2)); } + #[test] + fn fmt_debug_works() { + let re = RegexBuilder::new().utf(false).build(".").unwrap(); + let m = re.find(b("x")).unwrap().unwrap(); + let _ = format!("{:?}", m); + } + #[test] fn jit4lyfe() { - if is_jit_available() { + if is_jit_available::<CodeUnitWidth8>() { let re = RegexBuilder::new().jit(true).build(r"\w").unwrap(); assert!(re.is_match(b("a")).unwrap()); } else { @@ -1247,10 +190,11 @@ mod tests { ); // Test our internal map as well. 
- assert_eq!(re.capture_names_idx.len(), 3); - assert_eq!(re.capture_names_idx["foo"], 1); - assert_eq!(re.capture_names_idx["a"], 3); - assert_eq!(re.capture_names_idx["springsteen"], 4); + let capture_names_idx = re.get_capture_names_idxs(); + assert_eq!(capture_names_idx.len(), 3); + assert_eq!(capture_names_idx["foo"], 1); + assert_eq!(capture_names_idx["a"], 3); + assert_eq!(capture_names_idx["springsteen"], 4); } #[test] @@ -1292,7 +236,7 @@ mod tests { #[test] fn max_jit_stack_size_does_something() { - if !is_jit_available() { + if !is_jit_available::<CodeUnitWidth8>() { return; } @@ -1343,7 +287,7 @@ mod tests { let re = RegexBuilder::new() .extended(true) .utf(true) - .jit(true) + .jit_if_available(true) .build(pattern) .unwrap(); let matched = re.find(hay.as_bytes()).unwrap().unwrap(); @@ -1364,7 +308,7 @@ mod tests { let re = RegexBuilder::new() .extended(true) .utf(true) - .jit(true) + .jit_if_available(true) .build(pattern) .unwrap(); let matched = re.find(hay.as_bytes()).unwrap().unwrap(); diff --git a/src/error.rs b/src/error.rs index 88c7ae7..a94eac6 100644 --- a/src/error.rs +++ b/src/error.rs @@ -89,7 +89,7 @@ impl Error { } /// Returns the error message from PCRE2. - fn error_message(&self) -> String { + pub fn error_message(&self) -> String { // PCRE2 docs say a buffer size of 120 bytes is enough, but we're // cautious and double it. let mut buf = [0u8; 240]; diff --git a/src/ffi.rs b/src/ffi.rs index bef2ab8..1e381b9 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -5,17 +5,538 @@ unsafety, but this layer will take care of the obvious things, such as resource management and error handling. */ -use std::{cmp, ptr, slice}; - +use crate::error::Error; +use std::{ + cmp, + marker::PhantomData, + panic::{RefUnwindSafe, UnwindSafe}, + ptr, slice, +}; use {libc::c_void, pcre2_sys::*}; -use crate::error::Error; +pub trait NameTableEntry { + /// The index of the named subpattern. + fn index(&self) -> usize; + + /// The name of the named subpattern. 
+ fn name(&self) -> String; +} + +#[allow(non_camel_case_types)] +#[repr(C)] +pub struct name_table_entry_8 { + match_index_msb: u8, + match_index_lsb: u8, + + // In C, the 'name' field is a flexible array member. + // This does not contribute to the sizeof the struct. + name: u8, +} + +#[allow(non_camel_case_types)] +#[repr(C)] +pub struct name_table_entry_32 { + match_index: u32, + name: u32, // See above re: flexible array member +} + +impl NameTableEntry for name_table_entry_8 { + fn index(&self) -> usize { + ((self.match_index_msb as usize) << 8) + | (self.match_index_lsb as usize) + } + + fn name(&self) -> String { + // The name is nul-terminated. + let name = &self.name as *const u8; + let mut len = 0; + while unsafe { *name.offset(len as isize) } != 0 { + len += 1; + } + let bytes = unsafe { slice::from_raw_parts(name, len) }; + String::from_utf8_lossy(bytes).into_owned() + } +} + +impl NameTableEntry for name_table_entry_32 { + fn index(&self) -> usize { + self.match_index as usize + } + + fn name(&self) -> String { + // The name is nul-terminated. 
+ let replacement: char = '\u{FFFD}'; + let name = &self.name as *const u32; + let mut len = 0; + let mut result = String::new(); + loop { + let c = unsafe { *name.offset(len) }; + if c == 0 { + break; + } + result.push(char::from_u32(c).unwrap_or(replacement)); + len += 1; + } + result + } +} + +#[allow(non_camel_case_types)] +pub trait CodeUnitWidth: std::fmt::Debug + 'static { + type pcre2_code: UnwindSafe + RefUnwindSafe; + type pcre2_compile_context: UnwindSafe + RefUnwindSafe; + type pcre2_match_context; + type pcre2_match_data; + type pcre2_jit_stack; + type PCRE2_CHAR: Default + Copy + TryInto<Self::SubjectChar>; + type PCRE2_SPTR; + type name_table_entry: NameTableEntry; + type SubjectChar: Copy; + type Pattern: Clone + std::fmt::Debug; + + fn escape_subject(subject: &[Self::SubjectChar]) -> String; + + fn pattern_to_sptr_len( + pattern: &Self::Pattern, + ) -> (Self::PCRE2_SPTR, usize); + fn subject_to_sptr_len( + subject: &[Self::SubjectChar], + ) -> (Self::PCRE2_SPTR, usize); + + unsafe fn pcre2_config( + arg1: u32, + arg2: *mut ::libc::c_void, + ) -> ::libc::c_int; + unsafe fn pcre2_code_free(arg1: *mut Self::pcre2_code); + unsafe fn pcre2_compile( + arg1: Self::PCRE2_SPTR, + arg2: usize, + arg3: u32, + arg4: *mut ::libc::c_int, + arg5: *mut ::libc::size_t, + arg6: *mut Self::pcre2_compile_context, + ) -> *mut Self::pcre2_code; + unsafe fn pcre2_pattern_info( + arg1: *const Self::pcre2_code, + arg2: u32, + arg3: *mut ::libc::c_void, + ) -> ::libc::c_int; + + unsafe fn pcre2_match( + arg1: *const Self::pcre2_code, + arg2: Self::PCRE2_SPTR, + arg3: usize, + arg4: usize, + arg5: u32, + arg6: *mut Self::pcre2_match_data, + arg7: *mut Self::pcre2_match_context, + ) -> ::libc::c_int; + + unsafe fn pcre2_jit_stack_create( + arg1: ::libc::size_t, + arg2: ::libc::size_t, + ) -> *mut Self::pcre2_jit_stack; + unsafe fn pcre2_jit_compile( + arg1: *mut Self::pcre2_code, + arg2: u32, + ) -> ::libc::c_int; + unsafe fn pcre2_jit_stack_assign( + arg1: *mut 
Self::pcre2_match_context, + arg3: *mut ::libc::c_void, + ); + unsafe fn pcre2_jit_stack_free(arg1: *mut Self::pcre2_jit_stack); + + unsafe fn pcre2_compile_context_create() -> *mut Self::pcre2_compile_context; + unsafe fn pcre2_set_newline( + arg1: *mut Self::pcre2_compile_context, + arg2: u32, + ) -> ::libc::c_int; + unsafe fn pcre2_compile_context_free( + arg1: *mut Self::pcre2_compile_context, + ); + + unsafe fn pcre2_match_context_create() -> *mut Self::pcre2_match_context; + unsafe fn pcre2_match_context_free(arg1: *mut Self::pcre2_match_context); + + unsafe fn pcre2_match_data_create_from_pattern( + arg1: *const Self::pcre2_code, + ) -> *mut Self::pcre2_match_data; + unsafe fn pcre2_match_data_free(arg1: *mut Self::pcre2_match_data); + + unsafe fn pcre2_get_ovector_pointer( + arg1: *mut Self::pcre2_match_data, + ) -> *mut usize; + unsafe fn pcre2_get_ovector_count( + arg1: *mut Self::pcre2_match_data, + ) -> u32; + + unsafe fn pcre2_substitute( + code: *const Self::pcre2_code, + subject: Self::PCRE2_SPTR, + length: usize, + startoffset: usize, + options: u32, + match_data: *mut Self::pcre2_match_data, + mcontext: *mut Self::pcre2_match_context, + replacement: Self::PCRE2_SPTR, + rlength: usize, + outputbuffer: *mut Self::PCRE2_CHAR, + outputlengthptr: *mut usize, + ) -> ::libc::c_int; +} + +#[derive(Debug)] +pub struct CodeUnitWidth8; + +impl CodeUnitWidth for CodeUnitWidth8 { + type pcre2_code = pcre2_code_8; + type PCRE2_CHAR = PCRE2_UCHAR8; + type PCRE2_SPTR = PCRE2_SPTR8; + type pcre2_compile_context = pcre2_compile_context_8; + type pcre2_match_context = pcre2_match_context_8; + type pcre2_match_data = pcre2_match_data_8; + type pcre2_jit_stack = pcre2_jit_stack_8; + type name_table_entry = name_table_entry_8; + type SubjectChar = u8; + type Pattern = String; + + fn escape_subject(subject: &[Self::SubjectChar]) -> String { + use std::ascii::escape_default; + // Escape bytes. 
+ let mut s = String::new(); + for &b in subject { + let escaped = escape_default(b).collect::<Vec<_>>(); + s.push_str(&String::from_utf8_lossy(&escaped)); + } + s + } + + fn pattern_to_sptr_len( + pattern: &Self::Pattern, + ) -> (Self::PCRE2_SPTR, usize) { + (pattern.as_ptr(), pattern.len()) + } + + fn subject_to_sptr_len( + subject: &[Self::SubjectChar], + ) -> (Self::PCRE2_SPTR, usize) { + (subject.as_ptr(), subject.len()) + } + + unsafe fn pcre2_config( + arg1: u32, + arg2: *mut ::libc::c_void, + ) -> ::libc::c_int { + pcre2_config_8(arg1, arg2) + } + unsafe fn pcre2_code_free(arg1: *mut Self::pcre2_code) { + pcre2_code_free_8(arg1) + } + unsafe fn pcre2_compile( + arg1: Self::PCRE2_SPTR, + arg2: usize, + arg3: u32, + arg4: *mut ::libc::c_int, + arg5: *mut ::libc::size_t, + arg6: *mut Self::pcre2_compile_context, + ) -> *mut Self::pcre2_code { + pcre2_compile_8(arg1, arg2, arg3, arg4, arg5, arg6) + } + + unsafe fn pcre2_jit_stack_create( + arg1: ::libc::size_t, + arg2: ::libc::size_t, + ) -> *mut Self::pcre2_jit_stack { + pcre2_jit_stack_create_8(arg1, arg2, ptr::null_mut()) + } + unsafe fn pcre2_jit_compile( + arg1: *mut Self::pcre2_code, + arg2: u32, + ) -> ::libc::c_int { + pcre2_jit_compile_8(arg1, arg2) + } + unsafe fn pcre2_jit_stack_assign( + arg1: *mut Self::pcre2_match_context, + arg3: *mut ::libc::c_void, + ) { + pcre2_jit_stack_assign_8(arg1, None, arg3) + } + unsafe fn pcre2_jit_stack_free(arg1: *mut Self::pcre2_jit_stack) { + pcre2_jit_stack_free_8(arg1) + } + + unsafe fn pcre2_pattern_info( + arg1: *const Self::pcre2_code, + arg2: u32, + arg3: *mut ::libc::c_void, + ) -> ::libc::c_int { + pcre2_pattern_info_8(arg1, arg2, arg3) + } + + unsafe fn pcre2_match( + arg1: *const Self::pcre2_code, + arg2: Self::PCRE2_SPTR, + arg3: usize, + arg4: usize, + arg5: u32, + arg6: *mut Self::pcre2_match_data, + arg7: *mut Self::pcre2_match_context, + ) -> ::libc::c_int { + pcre2_match_8(arg1, arg2, arg3, arg4, arg5, arg6, arg7) + } + + unsafe fn 
pcre2_compile_context_create() -> *mut Self::pcre2_compile_context + { + pcre2_compile_context_create_8(ptr::null_mut()) + } + unsafe fn pcre2_match_context_free(arg1: *mut Self::pcre2_match_context) { + pcre2_match_context_free_8(arg1) + } + + unsafe fn pcre2_set_newline( + arg1: *mut Self::pcre2_compile_context, + arg2: u32, + ) -> ::libc::c_int { + pcre2_set_newline_8(arg1, arg2) + } + unsafe fn pcre2_compile_context_free( + arg1: *mut Self::pcre2_compile_context, + ) { + pcre2_compile_context_free_8(arg1) + } + unsafe fn pcre2_match_context_create() -> *mut Self::pcre2_match_context { + pcre2_match_context_create_8(ptr::null_mut()) + } + + unsafe fn pcre2_match_data_create_from_pattern( + arg1: *const Self::pcre2_code, + ) -> *mut Self::pcre2_match_data { + pcre2_match_data_create_from_pattern_8(arg1, ptr::null_mut()) + } + unsafe fn pcre2_match_data_free(arg1: *mut Self::pcre2_match_data) { + pcre2_match_data_free_8(arg1) + } + + unsafe fn pcre2_get_ovector_pointer( + arg1: *mut Self::pcre2_match_data, + ) -> *mut usize { + pcre2_get_ovector_pointer_8(arg1) + } + unsafe fn pcre2_get_ovector_count( + arg1: *mut Self::pcre2_match_data, + ) -> u32 { + pcre2_get_ovector_count_8(arg1) + } + unsafe fn pcre2_substitute( + code: *const Self::pcre2_code, + subject: Self::PCRE2_SPTR, + length: usize, + startoffset: usize, + options: u32, + match_data: *mut Self::pcre2_match_data, + mcontext: *mut Self::pcre2_match_context, + replacement: Self::PCRE2_SPTR, + rlength: usize, + outputbuffer: *mut Self::PCRE2_CHAR, + outputlengthptr: *mut usize, + ) -> ::libc::c_int { + pcre2_substitute_8( + code, + subject, + length, + startoffset, + options, + match_data, + mcontext, + replacement, + rlength, + outputbuffer, + outputlengthptr, + ) + } +} + +#[derive(Debug)] +pub struct CodeUnitWidth32; + +impl CodeUnitWidth for CodeUnitWidth32 { + type pcre2_code = pcre2_code_32; + type PCRE2_CHAR = PCRE2_UCHAR32; + type PCRE2_SPTR = PCRE2_SPTR32; + type pcre2_compile_context = 
pcre2_compile_context_32; + type pcre2_match_context = pcre2_match_context_32; + type pcre2_match_data = pcre2_match_data_32; + type pcre2_jit_stack = pcre2_jit_stack_32; + type name_table_entry = name_table_entry_32; + type SubjectChar = char; + type Pattern = Box<[char]>; + + fn escape_subject(subject: &[Self::SubjectChar]) -> String { + use std::ascii::escape_default; + // Escape bytes. + let mut s = String::new(); + for &c in subject { + let mut bytes = [0; 4]; + for &b in c.encode_utf8(&mut bytes).as_bytes() { + // Escape the byte. + let escaped = escape_default(b).collect::<Vec<_>>(); + s.push_str(&String::from_utf8_lossy(&escaped)); + } + } + s + } + + fn pattern_to_sptr_len( + pattern: &Self::Pattern, + ) -> (Self::PCRE2_SPTR, usize) { + (pattern.as_ptr() as *const u32, pattern.len()) + } + + fn subject_to_sptr_len( + subject: &[Self::SubjectChar], + ) -> (Self::PCRE2_SPTR, usize) { + (subject.as_ptr() as *const u32, subject.len()) + } + + unsafe fn pcre2_config( + arg1: u32, + arg2: *mut ::libc::c_void, + ) -> ::libc::c_int { + pcre2_config_32(arg1, arg2) + } + unsafe fn pcre2_code_free(arg1: *mut Self::pcre2_code) { + pcre2_code_free_32(arg1) + } + unsafe fn pcre2_compile( + arg1: Self::PCRE2_SPTR, + arg2: usize, + arg3: u32, + arg4: *mut ::libc::c_int, + arg5: *mut ::libc::size_t, + arg6: *mut Self::pcre2_compile_context, + ) -> *mut Self::pcre2_code { + pcre2_compile_32(arg1, arg2, arg3, arg4, arg5, arg6) + } + + unsafe fn pcre2_jit_stack_create( + arg1: ::libc::size_t, + arg2: ::libc::size_t, + ) -> *mut Self::pcre2_jit_stack { + pcre2_jit_stack_create_32(arg1, arg2, ptr::null_mut()) + } + unsafe fn pcre2_jit_compile( + arg1: *mut Self::pcre2_code, + arg2: u32, + ) -> ::libc::c_int { + pcre2_jit_compile_32(arg1, arg2) + } + unsafe fn pcre2_jit_stack_assign( + arg1: *mut Self::pcre2_match_context, + arg3: *mut ::libc::c_void, + ) { + pcre2_jit_stack_assign_32(arg1, None, arg3) + } + unsafe fn pcre2_jit_stack_free(arg1: *mut Self::pcre2_jit_stack) { + 
pcre2_jit_stack_free_32(arg1) + } + + unsafe fn pcre2_pattern_info( + arg1: *const Self::pcre2_code, + arg2: u32, + arg3: *mut ::libc::c_void, + ) -> ::libc::c_int { + pcre2_pattern_info_32(arg1, arg2, arg3) + } + + unsafe fn pcre2_match( + arg1: *const Self::pcre2_code, + arg2: Self::PCRE2_SPTR, + arg3: usize, + arg4: usize, + arg5: u32, + arg6: *mut Self::pcre2_match_data, + arg7: *mut Self::pcre2_match_context, + ) -> ::libc::c_int { + pcre2_match_32(arg1, arg2, arg3, arg4, arg5, arg6, arg7) + } + + unsafe fn pcre2_compile_context_create() -> *mut Self::pcre2_compile_context + { + pcre2_compile_context_create_32(ptr::null_mut()) + } + unsafe fn pcre2_match_context_free(arg1: *mut Self::pcre2_match_context) { + pcre2_match_context_free_32(arg1) + } + + unsafe fn pcre2_set_newline( + arg1: *mut Self::pcre2_compile_context, + arg2: u32, + ) -> ::libc::c_int { + pcre2_set_newline_32(arg1, arg2) + } + unsafe fn pcre2_compile_context_free( + arg1: *mut Self::pcre2_compile_context, + ) { + pcre2_compile_context_free_32(arg1) + } + unsafe fn pcre2_match_context_create() -> *mut Self::pcre2_match_context { + pcre2_match_context_create_32(ptr::null_mut()) + } + + unsafe fn pcre2_match_data_create_from_pattern( + arg1: *const Self::pcre2_code, + ) -> *mut Self::pcre2_match_data { + pcre2_match_data_create_from_pattern_32(arg1, ptr::null_mut()) + } + unsafe fn pcre2_match_data_free(arg1: *mut Self::pcre2_match_data) { + pcre2_match_data_free_32(arg1) + } + + unsafe fn pcre2_get_ovector_pointer( + arg1: *mut Self::pcre2_match_data, + ) -> *mut usize { + pcre2_get_ovector_pointer_32(arg1) + } + unsafe fn pcre2_get_ovector_count( + arg1: *mut Self::pcre2_match_data, + ) -> u32 { + pcre2_get_ovector_count_32(arg1) + } + + unsafe fn pcre2_substitute( + code: *const Self::pcre2_code, + subject: Self::PCRE2_SPTR, + length: usize, + startoffset: usize, + options: u32, + match_data: *mut Self::pcre2_match_data, + mcontext: *mut Self::pcre2_match_context, + replacement: 
Self::PCRE2_SPTR, + rlength: usize, + outputbuffer: *mut Self::PCRE2_CHAR, + outputlengthptr: *mut usize, + ) -> ::libc::c_int { + pcre2_substitute_32( + code, + subject, + length, + startoffset, + options, + match_data, + mcontext, + replacement, + rlength, + outputbuffer, + outputlengthptr, + ) + } +} /// Returns true if and only if PCRE2 believes that JIT is available. -pub fn is_jit_available() -> bool { +pub fn is_jit_available<W: CodeUnitWidth>() -> bool { let mut rc: u32 = 0; let error_code = unsafe { - pcre2_config_8(PCRE2_CONFIG_JIT, &mut rc as *mut _ as *mut c_void) + W::pcre2_config(PCRE2_CONFIG_JIT, &mut rc as *mut _ as *mut c_void) }; if error_code < 0 { // If PCRE2_CONFIG_JIT is a bad option, then there's a bug somewhere. @@ -61,14 +582,14 @@ pub fn escape(pattern: &str) -> String { } /// A low level representation of a compiled PCRE2 code object. -pub(crate) struct Code { - code: *mut pcre2_code_8, +pub(crate) struct Code<W: CodeUnitWidth> { + code: *mut W::pcre2_code, compiled_jit: bool, // We hang on to this but don't use it so that it gets freed when the // compiled code gets freed. It's not clear whether this is necessary or // not, but presumably doesn't cost us much to be conservative. #[allow(dead_code)] - ctx: CompileContext, + ctx: CompileContext<W>, } // SAFETY: Compiled PCRE2 code objects are immutable once built and explicitly @@ -77,28 +598,30 @@ pub(crate) struct Code { // One hitch here is that JIT compiling can write into a PCRE2 code object, but // we only ever JIT compile immediately after first building the code object // and before making it available to the caller. 
-unsafe impl Send for Code {} -unsafe impl Sync for Code {} +unsafe impl<W: CodeUnitWidth> Send for Code<W> {} +unsafe impl<W: CodeUnitWidth> Sync for Code<W> {} -impl Drop for Code { +impl<W: CodeUnitWidth> Drop for Code<W> { fn drop(&mut self) { - unsafe { pcre2_code_free_8(self.code) } + unsafe { W::pcre2_code_free(self.code) } } } -impl Code { +impl<W: CodeUnitWidth> Code<W> { /// Compile the given pattern with the given options. If there was a /// problem compiling the pattern, then return an error. pub(crate) fn new( - pattern: &str, + pattern: &W::Pattern, options: u32, - mut ctx: CompileContext, - ) -> Result<Code, Error> { + mut ctx: CompileContext<W>, + ) -> Result<Self, Error> { let (mut error_code, mut error_offset) = (0, 0); + let (pat_sptr, pat_len) = W::pattern_to_sptr_len(pattern); + let code = unsafe { - pcre2_compile_8( - pattern.as_ptr(), - pattern.len(), + W::pcre2_compile( + pat_sptr, + pat_len, options, &mut error_code, &mut error_offset, @@ -118,7 +641,7 @@ impl Code { /// an error. pub(crate) fn jit_compile(&mut self) -> Result<(), Error> { let error_code = - unsafe { pcre2_jit_compile_8(self.code, PCRE2_JIT_COMPLETE) }; + unsafe { W::pcre2_jit_compile(self.code, PCRE2_JIT_COMPLETE) }; if error_code == 0 { self.compiled_jit = true; Ok(()) @@ -144,60 +667,45 @@ impl Code { // and search for PCRE2_INFO_NAMETABLE. let name_count = self.name_count()?; - let size = self.name_entry_size()?; - let table = unsafe { - slice::from_raw_parts(self.raw_name_table()?, name_count * size) - }; - + let name_entry_size_in_bytes = + self.name_entry_size()? 
* std::mem::size_of::<W::PCRE2_CHAR>(); + let name_table = self.raw_name_table()?; let mut names = vec![None; self.capture_count()?]; for i in 0..name_count { - let entry = &table[i * size..(i + 1) * size]; - let name = &entry[2..]; - let nulat = name - .iter() - .position(|&b| b == 0) - .expect("a NUL in name table entry"); - let index = (entry[0] as usize) << 8 | (entry[1] as usize); - names[index] = String::from_utf8(name[..nulat].to_vec()) - .map(Some) - // We require our pattern to be valid UTF-8, so all capture - // names should also be valid UTF-8. - .expect("valid UTF-8 for capture name"); + let entry = unsafe { + name_table + .cast::<u8>() + .add(i * name_entry_size_in_bytes) + .cast::<W::name_table_entry>() + .as_ref() + .unwrap() + }; + names[entry.index()] = Some(entry.name()); } Ok(names) } /// Return the underlying raw pointer to the code object. - pub(crate) fn as_ptr(&self) -> *const pcre2_code_8 { + pub(crate) fn as_ptr(&self) -> *const W::pcre2_code { self.code } - /// Returns the raw name table, where each entry in the table corresponds - /// to a mapping between a named capturing group and the index of that - /// capturing group. The encoding for each item is as follows: - /// - /// * 2 bytes encoding the capture index (big-endian) - /// * N bytes encoding the code units of the name - /// * 1 byte for the NUL terminator - /// * M padding bytes, corresponding to the difference in length between - /// this name and the longest name. - /// - /// In particular, each entry uses the same number of bytes. + /// Returns a pointer to the array of name table entries. /// /// Entries are in alphabetical order. 
- fn raw_name_table(&self) -> Result<*const u8, Error> { - let mut bytes: *const u8 = ptr::null(); + fn raw_name_table(&self) -> Result<*const W::name_table_entry, Error> { + let mut table: *const W::name_table_entry = ptr::null(); let rc = unsafe { - pcre2_pattern_info_8( + W::pcre2_pattern_info( self.as_ptr(), PCRE2_INFO_NAMETABLE, - &mut bytes as *mut *const u8 as *mut c_void, + &mut table as *mut *const W::name_table_entry as *mut c_void, ) }; if rc != 0 { Err(Error::info(rc)) } else { - Ok(bytes) + Ok(table) } } @@ -205,7 +713,7 @@ impl Code { fn name_count(&self) -> Result<usize, Error> { let mut count: u32 = 0; let rc = unsafe { - pcre2_pattern_info_8( + W::pcre2_pattern_info( self.as_ptr(), PCRE2_INFO_NAMECOUNT, &mut count as *mut u32 as *mut c_void, @@ -218,16 +726,11 @@ impl Code { } } - /// Returns the entry size of each name in the name table. - /// - /// This appears to correspond to `3` plus the size of the longest named - /// capturing group. The extra 3 bytes correspond to a NUL terminator and - /// two prefix bytes corresponding to a big-endian encoding of the index - /// of the capture group. + /// Returns the entry size of each name in the name table, in code units. 
fn name_entry_size(&self) -> Result<usize, Error> { let mut size: u32 = 0; let rc = unsafe { - pcre2_pattern_info_8( + W::pcre2_pattern_info( self.as_ptr(), PCRE2_INFO_NAMEENTRYSIZE, &mut size as *mut u32 as *mut c_void, @@ -246,7 +749,7 @@ impl Code { pub(crate) fn capture_count(&self) -> Result<usize, Error> { let mut count: u32 = 0; let rc = unsafe { - pcre2_pattern_info_8( + W::pcre2_pattern_info( self.as_ptr(), PCRE2_INFO_CAPTURECOUNT, &mut count as *mut u32 as *mut c_void, @@ -258,28 +761,77 @@ impl Code { Ok(1 + count as usize) } } + + pub unsafe fn substitute( + &self, + mut subject: &[W::SubjectChar], + mut replacement: &[W::SubjectChar], + start: usize, + options: u32, + output: &mut [W::PCRE2_CHAR], + output_len: &mut usize, + ) -> Result<usize, Error> { + // When the subject is empty, we use an empty slice + // with a known valid pointer. Otherwise, slices derived + // from, e.g., an empty `Vec<u8>` may not have a valid + // pointer, since creating an empty `Vec` is guaranteed + // to not allocate. + if subject.is_empty() { + subject = &[]; + } + if replacement.is_empty() { + replacement = &[]; + } + let (subj_ptr, subj_len) = W::subject_to_sptr_len(subject); + let (repl_ptr, repl_len) = W::subject_to_sptr_len(replacement); + + // safety: we allow arbitrary options, security contract is on the caller + let rc = unsafe { + W::pcre2_substitute( + self.code, + subj_ptr, + subj_len, + start, + options, + ptr::null_mut(), + // should probably not be null for performance reasons? + ptr::null_mut(), + repl_ptr, + repl_len, + output.as_mut_ptr() as *mut W::PCRE2_CHAR, + output_len as *mut usize, + ) + }; + if rc >= 0 { + return Ok(rc as usize); + } + // this might warrant a new error type + Err(Error::info(rc)) + } } /// A low level representation of PCRE2's compilation context. 
-pub(crate) struct CompileContext(*mut pcre2_compile_context_8); +pub(crate) struct CompileContext<W: CodeUnitWidth>( + *mut W::pcre2_compile_context, +); // SAFETY: Compile contexts are safe to read from multiple threads // simultaneously. No interior mutability is used, so Sync is safe. -unsafe impl Send for CompileContext {} -unsafe impl Sync for CompileContext {} +unsafe impl<W: CodeUnitWidth> Send for CompileContext<W> {} +unsafe impl<W: CodeUnitWidth> Sync for CompileContext<W> {} -impl Drop for CompileContext { +impl<W: CodeUnitWidth> Drop for CompileContext<W> { fn drop(&mut self) { - unsafe { pcre2_compile_context_free_8(self.0) } + unsafe { W::pcre2_compile_context_free(self.0) } } } -impl CompileContext { +impl<W: CodeUnitWidth> CompileContext<W> { /// Create a new empty compilation context. /// /// If memory could not be allocated for the context, then this panics. - pub(crate) fn new() -> CompileContext { - let ctx = unsafe { pcre2_compile_context_create_8(ptr::null_mut()) }; + pub(crate) fn new() -> Self { + let ctx = unsafe { W::pcre2_compile_context_create() }; assert!(!ctx.is_null(), "could not allocate compile context"); CompileContext(ctx) } @@ -290,7 +842,7 @@ impl CompileContext { /// PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, PCRE2_NEWLINE_ANY or /// PCRE2_NEWLINE_NUL. Using any other value results in an error. 
pub(crate) fn set_newline(&mut self, value: u32) -> Result<(), Error> { - let rc = unsafe { pcre2_set_newline_8(self.0, value) }; + let rc = unsafe { W::pcre2_set_newline(self.0, value) }; if rc == 0 { Ok(()) } else { @@ -298,7 +850,7 @@ impl CompileContext { } } - fn as_mut_ptr(&mut self) -> *mut pcre2_compile_context_8 { + fn as_mut_ptr(&mut self) -> *mut W::pcre2_compile_context { self.0 } } @@ -322,13 +874,14 @@ impl Default for MatchConfig { /// Technically, a single match data block can be used with multiple regexes /// (not simultaneously), but in practice, we just create a single match data /// block for each regex for each thread it's used in. -pub(crate) struct MatchData { +pub(crate) struct MatchData<W: CodeUnitWidth> { config: MatchConfig, - match_context: *mut pcre2_match_context_8, - match_data: *mut pcre2_match_data_8, - jit_stack: Option<*mut pcre2_jit_stack_8>, + match_context: *mut W::pcre2_match_context, + match_data: *mut W::pcre2_match_data, + jit_stack: Option<*mut W::pcre2_jit_stack>, ovector_ptr: *const usize, ovector_count: u32, + _marker: PhantomData<W>, } // SAFETY: Match data blocks can be freely sent from one thread to another, @@ -337,36 +890,31 @@ pub(crate) struct MatchData { // data block for executing a search, which statically prevents simultaneous // reading/writing. It is legal to read match data blocks from multiple threads // simultaneously. 
-unsafe impl Send for MatchData {} -unsafe impl Sync for MatchData {} +unsafe impl<W: CodeUnitWidth> Send for MatchData<W> {} +unsafe impl<W: CodeUnitWidth> Sync for MatchData<W> {} -impl Drop for MatchData { +impl<W: CodeUnitWidth> Drop for MatchData<W> { fn drop(&mut self) { unsafe { if let Some(stack) = self.jit_stack { - pcre2_jit_stack_free_8(stack); + W::pcre2_jit_stack_free(stack); } - pcre2_match_data_free_8(self.match_data); - pcre2_match_context_free_8(self.match_context); + W::pcre2_match_data_free(self.match_data); + W::pcre2_match_context_free(self.match_context); } } } -impl MatchData { +impl<W: CodeUnitWidth> MatchData<W> { /// Create a new match data block from a compiled PCRE2 code object. /// /// This panics if memory could not be allocated for the block. - pub(crate) fn new(config: MatchConfig, code: &Code) -> MatchData { - let match_context = - unsafe { pcre2_match_context_create_8(ptr::null_mut()) }; + pub(crate) fn new(config: MatchConfig, code: &Code<W>) -> MatchData<W> { + let match_context = unsafe { W::pcre2_match_context_create() }; assert!(!match_context.is_null(), "failed to allocate match context"); - let match_data = unsafe { - pcre2_match_data_create_from_pattern_8( - code.as_ptr(), - ptr::null_mut(), - ) - }; + let match_data = + unsafe { W::pcre2_match_data_create_from_pattern(code.as_ptr()) }; assert!(!match_data.is_null(), "failed to allocate match data block"); let jit_stack = match config.max_jit_stack_size { @@ -374,18 +922,13 @@ impl MatchData { Some(_) if !code.compiled_jit => None, Some(max) => { let stack = unsafe { - pcre2_jit_stack_create_8( - cmp::min(max, 32 * 1 << 10), - max, - ptr::null_mut(), - ) + W::pcre2_jit_stack_create(cmp::min(max, 32 * 1 << 10), max) }; assert!(!stack.is_null(), "failed to allocate JIT stack"); unsafe { - pcre2_jit_stack_assign_8( + W::pcre2_jit_stack_assign( match_context, - None, stack as *mut c_void, ) }; @@ -393,9 +936,9 @@ impl MatchData { } }; - let ovector_ptr = unsafe { 
pcre2_get_ovector_pointer_8(match_data) }; + let ovector_ptr = unsafe { W::pcre2_get_ovector_pointer(match_data) }; assert!(!ovector_ptr.is_null(), "got NULL ovector pointer"); - let ovector_count = unsafe { pcre2_get_ovector_count_8(match_data) }; + let ovector_count = unsafe { W::pcre2_get_ovector_count(match_data) }; MatchData { config, match_context, @@ -403,6 +946,7 @@ impl MatchData { jit_stack, ovector_ptr, ovector_count, + _marker: PhantomData, } } @@ -428,8 +972,8 @@ impl MatchData { /// valid UTF-8, then the result is undefined. pub(crate) unsafe fn find( &mut self, - code: &Code, - mut subject: &[u8], + code: &Code<W>, + mut subject: &[W::SubjectChar], start: usize, options: u32, ) -> Result<bool, Error> { @@ -438,18 +982,18 @@ impl MatchData { // from, e.g., an empty `Vec<u8>` may not have a valid // pointer, since creating an empty `Vec` is guaranteed // to not allocate. - const EMPTY: &[u8] = &[]; if subject.is_empty() { - subject = EMPTY; + subject = &[]; } + let (subj_ptr, subj_len) = W::subject_to_sptr_len(subject); - let rc = pcre2_match_8( + let rc = W::pcre2_match( code.as_ptr(), - subject.as_ptr(), - subject.len(), + subj_ptr, + subj_len, start, options, - self.as_mut_ptr(), + self.match_data, self.match_context, ); if rc == PCRE2_ERROR_NOMATCH { @@ -465,11 +1009,6 @@ impl MatchData { } } - /// Return a mutable reference to the underlying match data. - fn as_mut_ptr(&mut self) -> *mut pcre2_match_data_8 { - self.match_data - } - /// Return the ovector corresponding to this match data. /// /// The ovector represents match offsets as pairs. This always returns diff --git a/src/lib.rs b/src/lib.rs index 4de2df9..13e9186 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,3 +25,10 @@ pub mod bytes; mod error; mod ffi; mod pool; +mod regex_impl; + +/** +PCRE2 regular expressions for matching on UTF-32 slices. 
+*/ +#[cfg(feature = "utf32")] +pub mod utf32; diff --git a/src/regex_impl.rs b/src/regex_impl.rs new file mode 100644 index 0000000..d1361d3 --- /dev/null +++ b/src/regex_impl.rs @@ -0,0 +1,1245 @@ +use std::{ + borrow::Cow, + collections::HashMap, + fmt, + ops::Index, + panic::{RefUnwindSafe, UnwindSafe}, + sync::Arc, +}; + +use log::debug; +use pcre2_sys::{ + PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_ERROR_NOMEMORY, PCRE2_EXTENDED, + PCRE2_MATCH_INVALID_UTF, PCRE2_MULTILINE, PCRE2_NEVER_UTF, + PCRE2_NEWLINE_ANYCRLF, PCRE2_SUBSTITUTE_EXTENDED, PCRE2_SUBSTITUTE_GLOBAL, + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, PCRE2_SUBSTITUTE_UNSET_EMPTY, PCRE2_UCP, + PCRE2_UNSET, PCRE2_UTF, +}; + +use crate::{ + error::Error, + ffi::{Code, CodeUnitWidth, CompileContext, MatchConfig, MatchData}, + pool::{Pool, PoolGuard}, +}; + +/// Match represents a single match of a regex in a subject string. +/// +/// The lifetime parameter `'s` refers to the lifetime of the matched portion +/// of the subject string. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct Match<'s, W: CodeUnitWidth> { + subject: &'s [W::SubjectChar], + start: usize, + end: usize, +} + +impl<'s, W: CodeUnitWidth> Match<'s, W> { + /// Returns the starting byte offset of the match in the subject. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the ending byte offset of the match in the subject. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns the matched portion of the subject string. + #[inline] + pub fn as_bytes(&self) -> &'s [W::SubjectChar] { + &self.subject[self.start..self.end] + } + + /// Creates a new match from the given subject string and byte offsets. 
+ fn new(subject: &'s [W::SubjectChar], start: usize, end: usize) -> Self { + Match { subject, start, end } + } + + #[cfg(test)] + pub(crate) fn as_pair(&self) -> (usize, usize) { + (self.start, self.end) + } +} + +#[derive(Clone, Debug)] +struct Config { + /// PCRE2_CASELESS + caseless: bool, + /// PCRE2_DOTALL + dotall: bool, + /// PCRE2_EXTENDED + extended: bool, + /// PCRE2_MULTILINE + multi_line: bool, + /// PCRE2_NEWLINE_ANYCRLF + crlf: bool, + /// PCRE2_UCP + ucp: bool, + /// PCRE2_UTF + utf: bool, + /// PCRE2_NEVER_UTF + block_utf_pattern_directive: bool, + /// use pcre2_jit_compile + jit: JITChoice, + /// Match-time specific configuration knobs. + match_config: MatchConfig, +} + +#[derive(Clone, Debug)] +enum JITChoice { + /// Never do JIT compilation. + Never, + /// Always do JIT compilation and return an error if it fails. + Always, + /// Attempt to do JIT compilation but silently fall back to non-JIT. + Attempt, +} + +impl Default for Config { + fn default() -> Config { + Config { + caseless: false, + dotall: false, + extended: false, + multi_line: false, + crlf: false, + ucp: false, + utf: false, + block_utf_pattern_directive: false, + jit: JITChoice::Never, + match_config: MatchConfig::default(), + } + } +} + +/// A builder for configuring the compilation of a PCRE2 regex. +/// This takes a phantom parameter to aid type inference. +#[derive(Clone, Debug)] +pub struct RegexBuilder<W: CodeUnitWidth> { + config: Config, + _phantom: std::marker::PhantomData<W>, +} + +impl<W: CodeUnitWidth> RegexBuilder<W> { + /// Create a new builder with a default configuration. + pub fn new() -> Self { + RegexBuilder { + config: Config::default(), + _phantom: std::marker::PhantomData, + } + } + + /// Compile the given pattern into a PCRE regex using the current + /// configuration. + /// + /// If there was a problem compiling the pattern, then an error is + /// returned. 
+ pub fn build<Pat: Into<W::Pattern>>( + &self, + pattern: Pat, + ) -> Result<Regex<W>, Error> { + let mut options = 0; + if self.config.caseless { + options |= PCRE2_CASELESS; + } + if self.config.dotall { + options |= PCRE2_DOTALL; + } + if self.config.extended { + options |= PCRE2_EXTENDED; + } + if self.config.multi_line { + options |= PCRE2_MULTILINE; + } + if self.config.ucp { + options |= PCRE2_UCP; + options |= PCRE2_UTF; + options |= PCRE2_MATCH_INVALID_UTF; + } + if self.config.utf { + options |= PCRE2_UTF; + } + if self.config.block_utf_pattern_directive { + options |= PCRE2_NEVER_UTF; + } + + let mut ctx = CompileContext::new(); + if self.config.crlf { + ctx.set_newline(PCRE2_NEWLINE_ANYCRLF) + .expect("PCRE2_NEWLINE_ANYCRLF is a legal value"); + } + + let pattern = pattern.into(); + let mut code = Code::new(&pattern, options, ctx)?; + match self.config.jit { + JITChoice::Never => {} // fallthrough + JITChoice::Always => { + code.jit_compile()?; + } + JITChoice::Attempt => { + if let Err(err) = code.jit_compile() { + debug!("JIT compilation failed: {}", err); + } + } + } + let capture_names = code.capture_names()?; + let mut idx = HashMap::new(); + for (i, group) in capture_names.iter().enumerate() { + if let Some(ref name) = *group { + idx.insert(name.to_string(), i); + } + } + let code = Arc::new(code); + let match_data = { + let config = self.config.match_config.clone(); + let code = Arc::clone(&code); + let create: MatchDataPoolFn<W> = + Box::new(move || MatchData::new(config.clone(), &code)); + Pool::new(create) + }; + Ok(Regex { + config: Arc::new(self.config.clone()), + pattern, + code, + capture_names: Arc::new(capture_names), + capture_names_idx: Arc::new(idx), + match_data, + }) + } + + /// Enables case insensitive matching. + /// + /// If the `utf` option is also set, then Unicode case folding is used + /// to determine case insensitivity. When the `utf` option is not set, + /// then only standard ASCII case insensitivity is considered. 
+ /// + /// This option corresponds to the `i` flag. + pub fn caseless(&mut self, yes: bool) -> &mut Self { + self.config.caseless = yes; + self + } + + /// Enables "dot all" matching. + /// + /// When enabled, the `.` metacharacter in the pattern matches any + /// character, including `\n`. When disabled (the default), `.` will match + /// any character except for `\n`. + /// + /// This option corresponds to the `s` flag. + pub fn dotall(&mut self, yes: bool) -> &mut Self { + self.config.dotall = yes; + self + } + + /// Enable "extended" mode in the pattern, where whitespace is ignored. + /// + /// This option corresponds to the `x` flag. + pub fn extended(&mut self, yes: bool) -> &mut Self { + self.config.extended = yes; + self + } + + /// Enable multiline matching mode. + /// + /// When enabled, the `^` and `$` anchors will match both at the beginning + /// and end of a subject string, in addition to matching at the start of + /// a line and the end of a line. When disabled, the `^` and `$` anchors + /// will only match at the beginning and end of a subject string. + /// + /// This option corresponds to the `m` flag. + pub fn multi_line(&mut self, yes: bool) -> &mut Self { + self.config.multi_line = yes; + self + } + + /// Enable matching of CRLF as a line terminator. + /// + /// When enabled, anchors such as `^` and `$` will match any of the + /// following as a line terminator: `\r`, `\n` or `\r\n`. + /// + /// This is disabled by default, in which case, only `\n` is recognized as + /// a line terminator. + pub fn crlf(&mut self, yes: bool) -> &mut Self { + self.config.crlf = yes; + self + } + + /// Enable Unicode matching mode. + /// + /// When enabled, the following patterns become Unicode aware: `\b`, `\B`, + /// `\d`, `\D`, `\s`, `\S`, `\w`, `\W`. + /// + /// When set, this implies UTF matching mode. It is not possible to enable + /// Unicode matching mode without enabling UTF matching mode. + /// + /// This is disabled by default. 
+ pub fn ucp(&mut self, yes: bool) -> &mut Self { + self.config.ucp = yes; + self + } + + /// Enable UTF matching mode. + /// + /// When enabled, characters are treated as sequences of code units that + /// make up a single codepoint instead of as single bytes. For example, + /// this will cause `.` to match any single UTF-8 encoded codepoint, + /// whereas when this is disabled, `.` will match any single byte (except + /// for `\n` in both cases, unless "dot all" mode is enabled). + /// + /// This is disabled by default. + pub fn utf(&mut self, yes: bool) -> &mut Self { + self.config.utf = yes; + self + } + + /// Prevent patterns from opting in to UTF matching mode in spite of any flags. + /// + /// This causes the directive `(*UTF)` in the pattern to emit an error. + /// This does not affect any other flags controlling UTF matching mode; + /// it merely disables a particular syntax item in the pattern. + pub fn block_utf_pattern_directive(&mut self, yes: bool) -> &mut Self { + self.config.block_utf_pattern_directive = yes; + self + } + + /// This is now deprecated and is a no-op. + /// + /// Previously, this option permitted disabling PCRE2's UTF-8 validity + /// check, which could result in undefined behavior if the haystack was + /// not valid UTF-8. But PCRE2 introduced a new option, `PCRE2_MATCH_INVALID_UTF`, + /// in 10.34 which this crate always sets. When this option is enabled, + /// PCRE2 claims to not have undefined behavior when the haystack is + /// invalid UTF-8. + /// + /// Therefore, disabling the UTF-8 check is not something that is exposed + /// by this crate. + #[deprecated( + since = "0.2.4", + note = "now a no-op due to new PCRE2 features" + )] + pub fn disable_utf_check(&mut self) -> &mut Self { + self + } + + /// Enable PCRE2's JIT and return an error if it's not available. + /// + /// This generally speeds up matching quite a bit. The downside is that it + /// can increase the time it takes to compile a pattern. 
+ /// + /// If the JIT isn't available or if JIT compilation returns an error, then + /// regex compilation will fail with the corresponding error. + /// + /// This is disabled by default, and always overrides `jit_if_available`. + pub fn jit(&mut self, yes: bool) -> &mut Self { + if yes { + self.config.jit = JITChoice::Always; + } else { + self.config.jit = JITChoice::Never; + } + self + } + + /// Enable PCRE2's JIT if it's available. + /// + /// This generally speeds up matching quite a bit. The downside is that it + /// can increase the time it takes to compile a pattern. + /// + /// If the JIT isn't available or if JIT compilation returns an error, + /// then a debug message with the error will be emitted and the regex will + /// otherwise silently fall back to non-JIT matching. + /// + /// This is disabled by default, and always overrides `jit`. + pub fn jit_if_available(&mut self, yes: bool) -> &mut Self { + if yes { + self.config.jit = JITChoice::Attempt; + } else { + self.config.jit = JITChoice::Never; + } + self + } + + /// Set the maximum size of PCRE2's JIT stack, in bytes. If the JIT is + /// not enabled, then this has no effect. + /// + /// When `None` is given, no custom JIT stack will be created, and instead, + /// the default JIT stack is used. When the default is used, its maximum + /// size is 32 KB. + /// + /// When this is set, then a new JIT stack will be created with the given + /// maximum size as its limit. + /// + /// Increasing the stack size can be useful for larger regular expressions. + /// + /// By default, this is set to `None`. + pub fn max_jit_stack_size(&mut self, bytes: Option<usize>) -> &mut Self { + self.config.match_config.max_jit_stack_size = bytes; + self + } +} + +/// A compiled PCRE2 regular expression. +/// +/// This regex is safe to use from multiple threads simultaneously. For top +/// performance, it is better to clone a new regex for each thread. 
+pub struct Regex<W: CodeUnitWidth> { + /// The configuration used to build the regex. + config: Arc<Config>, + /// The original pattern string. + pattern: W::Pattern, + /// The underlying compiled PCRE2 object. + code: Arc<Code<W>>, + /// The capture group names for this regex. + capture_names: Arc<Vec<Option<String>>>, + /// A map from capture group name to capture group index. + capture_names_idx: Arc<HashMap<String, usize>>, + /// A pool of mutable scratch data used by PCRE2 during matching. + match_data: MatchDataPool<W>, +} + +impl<W: CodeUnitWidth> Clone for Regex<W> { + fn clone(&self) -> Self { + let match_data = { + let config = self.config.match_config.clone(); + let code = Arc::clone(&self.code); + let create: MatchDataPoolFn<W> = + Box::new(move || MatchData::new(config.clone(), &code)); + Pool::new(create) + }; + Self { + config: Arc::clone(&self.config), + pattern: self.pattern.clone(), + code: Arc::clone(&self.code), + capture_names: Arc::clone(&self.capture_names), + capture_names_idx: Arc::clone(&self.capture_names_idx), + match_data, + } + } +} + +impl<W: CodeUnitWidth> fmt::Debug for Regex<W> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "Regex({:?})", self.pattern) + } +} + +impl<W: CodeUnitWidth> Regex<W> { + /// Compiles a regular expression using the default configuration. + /// + /// Once compiled, it can be used repeatedly to search, split or replace + /// text in a string. + /// + /// If an invalid expression is given, then an error is returned. + /// + /// To configure compilation options for the regex, use the + /// [`RegexBuilder`](struct.RegexBuilder.html). + pub fn new<Pat: Into<W::Pattern>>(pattern: Pat) -> Result<Self, Error> { + RegexBuilder::new().build(pattern) + } + + /// Returns true if and only if the regex matches the subject string given. 
+ /// + /// # Example + /// + /// Test if some text contains at least one word with exactly 13 ASCII word + /// bytes: + /// + /// ```rust + /// # fn example() -> Result<(), ::pcre2::Error> { + /// use pcre2::bytes::Regex; + /// + /// let text = b"I categorically deny having triskaidekaphobia."; + /// assert!(Regex::new(r"\b\w{13}\b")?.is_match(text)?); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn is_match(&self, subject: &[W::SubjectChar]) -> Result<bool, Error> { + self.is_match_at(subject, 0) + } + + /// Returns the start and end byte range of the leftmost-first match in + /// `subject`. If no match exists, then `None` is returned. + /// + /// # Example + /// + /// Find the start and end location of the first word with exactly 13 + /// ASCII word bytes: + /// + /// ```rust + /// # fn example() -> Result<(), ::pcre2::Error> { + /// use pcre2::bytes::Regex; + /// + /// let text = b"I categorically deny having triskaidekaphobia."; + /// let mat = Regex::new(r"\b\w{13}\b")?.find(text)?.unwrap(); + /// assert_eq!((mat.start(), mat.end()), (2, 15)); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn find<'s>( + &self, + subject: &'s [W::SubjectChar], + ) -> Result<Option<Match<'s, W>>, Error> { + self.find_at(subject, 0) + } + + /// Returns an iterator for each successive non-overlapping match in + /// `subject`, returning the start and end byte indices with respect to + /// `subject`. 
+ /// + /// # Example + /// + /// Find the start and end location of every word with exactly 13 ASCII + /// word bytes: + /// + /// ```rust + /// # fn example() -> Result<(), ::pcre2::Error> { + /// use pcre2::bytes::Regex; + /// + /// let text = b"Retroactively relinquishing remunerations is reprehensible."; + /// for result in Regex::new(r"\b\w{13}\b")?.find_iter(text) { + /// let mat = result?; + /// println!("{:?}", mat); + /// } + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn find_iter<'r, 's>( + &'r self, + subject: &'s [W::SubjectChar], + ) -> Matches<'r, 's, W> { + Matches { + re: self, + match_data: self.match_data(), + subject, + last_end: 0, + last_match: None, + } + } + + /// Returns the capture groups corresponding to the leftmost-first + /// match in `subject`. Capture group `0` always corresponds to the entire + /// match. If no match is found, then `None` is returned. + /// + /// # Examples + /// + /// Say you have some text with movie names and their release years, + /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text + /// looking like that, while also extracting the movie name and its release + /// year separately. + /// + /// ```rust + /// # fn example() -> Result<(), ::pcre2::Error> { + /// use pcre2::bytes::Regex; + /// + /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)")?; + /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text)?.unwrap(); + /// assert_eq!(&caps[1], &b"Citizen Kane"[..]); + /// assert_eq!(&caps[2], &b"1941"[..]); + /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]); + /// // You can also access the groups by index using the Index notation. + /// // Note that this will panic on an invalid index. + /// assert_eq!(&caps[1], b"Citizen Kane"); + /// assert_eq!(&caps[2], b"1941"); + /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); + /// # Ok(()) }; example().unwrap() + /// ``` + /// + /// Note that the full match is at capture group `0`. 
Each subsequent + /// capture group is indexed by the order of its opening `(`. + /// + /// We can make this example a bit clearer by using *named* capture groups: + /// + /// ```rust + /// # fn example() -> Result<(), ::pcre2::Error> { + /// use pcre2::bytes::Regex; + /// + /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")?; + /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(text)?.unwrap(); + /// assert_eq!(&caps["title"], &b"Citizen Kane"[..]); + /// assert_eq!(&caps["year"], &b"1941"[..]); + /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]); + /// // You can also access the groups by name using the Index notation. + /// // Note that this will panic on an invalid group name. + /// assert_eq!(&caps["title"], b"Citizen Kane"); + /// assert_eq!(&caps["year"], b"1941"); + /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); + /// # Ok(()) }; example().unwrap() + /// ``` + /// + /// Here we name the capture groups, which we can access with the `name` + /// method or the `Index` notation with a `&str`. Note that the named + /// capture groups are still accessible with `get` or the `Index` notation + /// with a `usize`. + /// + /// The `0`th capture group is always unnamed, so it must always be + /// accessed with `get(0)` or `[0]`. + pub fn captures<'s>( + &self, + subject: &'s [W::SubjectChar], + ) -> Result<Option<Captures<'s, W>>, Error> { + let mut locs = self.capture_locations(); + Ok(self.captures_read(&mut locs, subject)?.map(move |_| Captures { + subject, + locs, + idx: Arc::clone(&self.capture_names_idx), + })) + } + + /// Returns an iterator over all the non-overlapping capture groups matched + /// in `subject`. This is operationally the same as `find_iter`, except it + /// yields information about capturing group matches. 
+ /// + /// # Example + /// + /// We can use this to find all movie titles and their release years in + /// some text, where the movie is formatted like "'Title' (xxxx)": + /// + /// ```rust + /// # fn example() -> Result<(), ::pcre2::Error> { + /// use std::str; + /// + /// use pcre2::bytes::Regex; + /// + /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")?; + /// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; + /// for result in re.captures_iter(text) { + /// let caps = result?; + /// let title = str::from_utf8(&caps["title"]).unwrap(); + /// let year = str::from_utf8(&caps["year"]).unwrap(); + /// println!("Movie: {:?}, Released: {:?}", title, year); + /// } + /// // Output: + /// // Movie: Citizen Kane, Released: 1941 + /// // Movie: The Wizard of Oz, Released: 1939 + /// // Movie: M, Released: 1931 + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn captures_iter<'r, 's>( + &'r self, + subject: &'s [W::SubjectChar], + ) -> CaptureMatches<'r, 's, W> { + CaptureMatches { re: self, subject, last_end: 0, last_match: None } + } + + /// Test helper to access capture name indexes. + #[cfg(test)] + pub(crate) fn get_capture_names_idxs(&self) -> &HashMap<String, usize> { + &self.capture_names_idx + } + + /// Replace the first match in the subject string with the replacement + /// If `extended` is true, enable PCRE2's extended replacement syntax. + pub fn replace<'s>( + &self, + subject: &'s [W::SubjectChar], + replacement: &[W::SubjectChar], + extended: bool, + ) -> Result<Cow<'s, [W::SubjectChar]>, Error> + where + [<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned, + { + self.replace_impl(subject, replacement, false, extended) + } + + /// Replace all non-overlapping matches in the subject string with the replacement + /// If `extended` is true, enable PCRE2's extended replacement syntax. 
+    pub fn replace_all<'s>(
+        &self,
+        subject: &'s [W::SubjectChar],
+        replacement: &[W::SubjectChar],
+        extended: bool,
+    ) -> Result<Cow<'s, [W::SubjectChar]>, Error>
+    where
+        [<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned,
+    {
+        self.replace_impl(subject, replacement, true, extended)
+    }
+
+    /// Shared implementation behind the substitution entry points.
+    ///
+    /// `replace_all` adds `PCRE2_SUBSTITUTE_GLOBAL` and `extended` adds
+    /// `PCRE2_SUBSTITUTE_EXTENDED` to the options handed to PCRE2. The
+    /// result is `Cow::Borrowed(subject)` when the substitute call reports
+    /// `0`, and an owned buffer holding the rewritten subject otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Returns the error reported by the substitute call. If the output did
+    /// not fit and the larger buffer cannot be reserved, the original
+    /// `PCRE2_ERROR_NOMEMORY` error is returned.
+    #[inline]
+    fn replace_impl<'s>(
+        &self,
+        subject: &'s [W::SubjectChar],
+        replacement: &[W::SubjectChar],
+        replace_all: bool,
+        extended: bool,
+    ) -> Result<Cow<'s, [W::SubjectChar]>, Error>
+    where
+        [<W as CodeUnitWidth>::PCRE2_CHAR]: ToOwned,
+    {
+        let mut options: u32 = 0;
+        options |= PCRE2_SUBSTITUTE_OVERFLOW_LENGTH;
+        // TODO: this should probably be configurable from user-side
+        options |= PCRE2_SUBSTITUTE_UNSET_EMPTY;
+        if extended {
+            options |= PCRE2_SUBSTITUTE_EXTENDED;
+        }
+        if replace_all {
+            options |= PCRE2_SUBSTITUTE_GLOBAL;
+        }
+
+        // We prefer to allocate on the stack but fall back to the heap.
+        // Note that PCRE2 has the following behavior with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH:
+        // - We supply the initial output buffer size in `capacity`. This should have sufficient
+        //   capacity for the terminating NUL character.
+        // - If the capacity is NOT sufficient, PCRE2 returns the new required capacity, also
+        //   including the terminating NUL character.
+        // - If the capacity IS sufficient, PCRE2 returns the number of characters written, NOT
+        //   including the terminating NUL character.
+        // Example: our initial capacity is 256. If the returned string needs to be of length 512,
+        // then PCRE2 will report NOMEMORY and set capacity to 513. After reallocating we pass in
+        // a capacity of 513; it succeeds and sets capacity to 512, which is the length of the result.
+        let mut stack_storage: [W::PCRE2_CHAR; 256] =
+            [W::PCRE2_CHAR::default(); 256];
+        let mut heap_storage = Vec::new();
+        let mut output = stack_storage.as_mut();
+        let mut capacity = output.len();
+
+        // First attempt: write into the 256-element stack buffer.
+        // NOTE(review): this `unsafe` block has no SAFETY comment. It passes
+        // `output` together with `capacity == output.len()`; confirm that
+        // this is everything `substitute` requires of its caller.
+        let mut rc = unsafe {
+            self.code.substitute(
+                subject,
+                replacement,
+                0,
+                options,
+                output,
+                &mut capacity,
+            )
+        };
+
+        if let Err(e) = &rc {
+            if e.code() == PCRE2_ERROR_NOMEMORY {
+                // The stack buffer was too small and `capacity` now holds
+                // the size PCRE2 asked for (see the note above). Grow the
+                // heap buffer to exactly that size and retry once. If the
+                // reservation itself fails, surface the NOMEMORY error.
+                if heap_storage.try_reserve_exact(capacity).is_err() {
+                    return Err(rc.unwrap_err());
+                }
+                heap_storage.resize(capacity, W::PCRE2_CHAR::default());
+                output = &mut heap_storage;
+                capacity = output.len();
+                // NOTE(review): same unsafe call as above, now against the
+                // heap buffer; the same contract question applies.
+                rc = unsafe {
+                    self.code.substitute(
+                        subject,
+                        replacement,
+                        0,
+                        options,
+                        output,
+                        &mut capacity,
+                    )
+                };
+            }
+        }
+
+        let s = match rc? {
+            // A result of 0 hands the caller's subject back unchanged, with
+            // no allocation.
+            0 => Cow::Borrowed(subject),
+            _ => {
+                // capacity has been updated with the length of the result (excluding nul terminator).
+                let output = &output[..capacity];
+
+                // All inputs contained valid chars, so we expect all outputs to as well.
+                let to_char = |c: W::PCRE2_CHAR| -> W::SubjectChar {
+                    c.try_into().unwrap_or_else(|_| {
+                        panic!("all output expected to be valid chars")
+                    })
+                };
+
+                // this is really just a type cast
+                let x: Vec<W::SubjectChar> =
+                    output.iter().copied().map(to_char).collect();
+                Cow::Owned(x)
+            }
+        };
+        Ok(s)
+    }
+}
+
+/// Advanced or "lower level" search methods.
+impl<W: CodeUnitWidth> Regex<W> {
+    /// Returns the same as is_match, but starts the search at the given
+    /// offset.
+    ///
+    /// The significance of the starting point is that it takes the surrounding
+    /// context into consideration. For example, the `\A` anchor can only
+    /// match when `start == 0`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `start > subject.len()`.
+    pub fn is_match_at(
+        &self,
+        subject: &[W::SubjectChar],
+        start: usize,
+    ) -> Result<bool, Error> {
+        assert!(
+            start <= subject.len(),
+            "start ({}) must be <= subject.len() ({})",
+            start,
+            subject.len()
+        );
+
+        let options = 0;
+        let mut match_data = self.match_data();
+        // SAFETY: We don't use any dangerous PCRE2 options.
+        let res =
+            unsafe { match_data.find(&self.code, subject, start, options) };
+        PoolGuard::put(match_data);
+        res
+    }
+
+    /// Returns the same as find, but starts the search at the given
+    /// offset.
+    ///
+    /// The significance of the starting point is that it takes the surrounding
+    /// context into consideration. For example, the `\A` anchor can only
+    /// match when `start == 0`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `start > subject.len()`.
+    pub fn find_at<'s>(
+        &self,
+        subject: &'s [W::SubjectChar],
+        start: usize,
+    ) -> Result<Option<Match<'s, W>>, Error> {
+        let mut match_data = self.match_data();
+        let res =
+            self.find_at_with_match_data(&mut match_data, subject, start);
+        PoolGuard::put(match_data);
+        res
+    }
+
+    /// Like find_at, but accepts match data instead of acquiring one itself.
+    ///
+    /// This is useful for implementing the iterator, which permits avoiding
+    /// the synchronization overhead of acquiring the match data.
+    #[inline(always)]
+    fn find_at_with_match_data<'s>(
+        &self,
+        match_data: &mut MatchDataPoolGuard<'_, W>,
+        subject: &'s [W::SubjectChar],
+        start: usize,
+    ) -> Result<Option<Match<'s, W>>, Error> {
+        assert!(
+            start <= subject.len(),
+            "start ({}) must be <= subject.len() ({})",
+            start,
+            subject.len()
+        );
+
+        let options = 0;
+        // SAFETY: We don't use any dangerous PCRE2 options.
+        if unsafe { !match_data.find(&self.code, subject, start, options)? } {
+            return Ok(None);
+        }
+        // The first two ovector slots are read as the start and end of the
+        // overall match.
+        let ovector = match_data.ovector();
+        let (s, e) = (ovector[0], ovector[1]);
+        Ok(Some(Match::new(subject, s, e)))
+    }
+
+    /// This is like `captures`, but uses
+    /// [`CaptureLocations`](struct.CaptureLocations.html)
+    /// instead of
+    /// [`Captures`](struct.Captures.html) in order to amortize allocations.
+    ///
+    /// To create a `CaptureLocations` value, use the
+    /// `Regex::capture_locations` method.
+    ///
+    /// This returns the overall match if this was successful, which is always
+    /// equivalent to the `0`th capture group.
+    pub fn captures_read<'s>(
+        &self,
+        locs: &mut CaptureLocations<W>,
+        subject: &'s [W::SubjectChar],
+    ) -> Result<Option<Match<'s, W>>, Error> {
+        // Searching the whole subject is the `start == 0` case.
+        self.captures_read_at(locs, subject, 0)
+    }
+
+    /// Same search as `captures_read`, except that it begins at offset
+    /// `start` and records the group offsets into the given `locs`.
+    ///
+    /// The starting offset matters because the text before it still counts
+    /// as surrounding context. For instance, the `\A` anchor is only able to
+    /// match when `start == 0`.
+    pub fn captures_read_at<'s>(
+        &self,
+        locs: &mut CaptureLocations<W>,
+        subject: &'s [W::SubjectChar],
+        start: usize,
+    ) -> Result<Option<Match<'s, W>>, Error> {
+        assert!(
+            start <= subject.len(),
+            "start ({}) must be <= subject.len() ({})",
+            start,
+            subject.len()
+        );
+
+        // SAFETY: We don't use any dangerous PCRE2 options.
+        let found = unsafe { locs.data.find(&self.code, subject, start, 0)? };
+        if !found {
+            return Ok(None);
+        }
+        // Slots 0 and 1 of the ovector bound the overall match.
+        let offsets = locs.data.ovector();
+        let whole = Match::new(subject, offsets[0], offsets[1]);
+        Ok(Some(whole))
+    }
+}
+
+/// Auxiliary methods.
+impl<W: CodeUnitWidth> Regex<W> {
+    /// Gives back the pattern this regex was built from.
+    pub fn as_str(&self) -> &W::Pattern {
+        &self.pattern
+    }
+
+    /// Lists every capturing group together with its name, when it has one.
+    ///
+    /// The returned slice has exactly `captures_len` entries, i.e. one per
+    /// capturing group, counting the group that stands for the entire
+    /// pattern.
+    ///
+    /// An entry holds the name of the group at that index, or `None` for an
+    /// unnamed group. The group at index `0` never has a name.
+    ///
+    /// Group indices follow the order of the opening parentheses.
+    pub fn capture_names(&self) -> &[Option<String>] {
+        &self.capture_names
+    }
+
+    /// Returns the number of capturing groups in the pattern.
+    ///
+    /// This is always 1 more than the number of syntactic groups in the
+    /// pattern, since the first group always corresponds to the entire match.
+    ///
+    /// # Panics
+    ///
+    /// Panics if PCRE2 does not report a capture count for the compiled
+    /// pattern.
+    pub fn captures_len(&self) -> usize {
+        self.code.capture_count().expect("a valid capture count from PCRE2")
+    }
+
+    /// Returns an empty set of capture locations that can be reused in
+    /// multiple calls to `captures_read` or `captures_read_at`.
+    pub fn capture_locations(&self) -> CaptureLocations<W> {
+        CaptureLocations {
+            code: Arc::clone(&self.code),
+            data: self.new_match_data(),
+        }
+    }
+
+    /// Returns a guard obtained from this regex's `match_data` pool.
+    fn match_data(&self) -> MatchDataPoolGuard<'_, W> {
+        self.match_data.get()
+    }
+
+    /// Builds a new `MatchData` from this regex's match configuration and
+    /// compiled code, without going through the pool.
+    fn new_match_data(&self) -> MatchData<W> {
+        MatchData::new(self.config.match_config.clone(), &self.code)
+    }
+}
+
+/// CaptureLocations is a low level representation of the raw offsets of each
+/// submatch.
+///
+/// Primarily, this type is useful when using `Regex` APIs such as
+/// `captures_read`, which permits amortizing the allocation in which capture
+/// match locations are stored.
+///
+/// In order to build a value of this type, you'll need to call the
+/// `capture_locations` method on the `Regex` being used to execute the search.
+/// The value returned can then be reused in subsequent searches.
+pub struct CaptureLocations<W: CodeUnitWidth> {
+    // Compiled pattern; `clone` needs it to build new match data.
+    code: Arc<Code<W>>,
+    // Match data whose ovector holds the (start, end) offset pairs.
+    data: MatchData<W>,
+}
+
+impl<W: CodeUnitWidth> Clone for CaptureLocations<W> {
+    /// Builds a new `CaptureLocations` for the same compiled pattern.
+    ///
+    /// The clone is given newly created match data (with the same
+    /// configuration) rather than a copy of `self.data`.
+    // NOTE(review): presumably this means offsets already recorded in `self`
+    // are not carried over to the clone; confirm against `MatchData::new`.
+    fn clone(&self) -> Self {
+        CaptureLocations {
+            code: Arc::clone(&self.code),
+            data: MatchData::new(self.data.config().clone(), &self.code),
+        }
+    }
+}
+
+impl<W: CodeUnitWidth> fmt::Debug for CaptureLocations<W> {
+    /// Formats as `CaptureLocations([..])` with one entry per ovector slot;
+    /// unset slots are shown as `None`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // `entries` accepts any iterator, so the slots are mapped lazily
+        // instead of being collected into a temporary `Vec` first.
+        let offsets = self.data.ovector().iter().map(|&offset| {
+            if offset == PCRE2_UNSET {
+                None
+            } else {
+                Some(offset)
+            }
+        });
+        write!(f, "CaptureLocations(")?;
+        f.debug_list().entries(offsets).finish()?;
+        write!(f, ")")
+    }
+}
+
+impl<W: CodeUnitWidth> CaptureLocations<W> {
+    /// Returns the start and end positions of the Nth capture group.
+    ///
+    /// This returns `None` if `i` is not a valid capture group or if the
+    /// capture group did not match anything.
+    ///
+    /// The positions returned are offsets in code units (bytes for 8-bit
+    /// subjects, `char`s for UTF-32 subjects) with respect to the original
+    /// subject string matched.
+    #[inline]
+    pub fn get(&self, i: usize) -> Option<(usize, usize)> {
+        let ovec = self.data.ovector();
+        // Checked arithmetic: an out-of-range `i` must yield `None`. Plain
+        // `i * 2` would panic on overflow in debug builds and wrap around to
+        // a valid slot of some other group in release builds.
+        let start_slot = i.checked_mul(2)?;
+        let end_slot = start_slot.checked_add(1)?;
+        let s = ovec.get(start_slot).copied().filter(|&s| s != PCRE2_UNSET)?;
+        let e = ovec.get(end_slot).copied().filter(|&e| e != PCRE2_UNSET)?;
+        Some((s, e))
+    }
+
+    /// Returns the total number of capturing groups.
+    ///
+    /// This is always at least `1` since every regex has at least `1`
+    /// capturing group that corresponds to the entire match.
+    #[inline]
+    pub fn len(&self) -> usize {
+        // Each group occupies two ovector slots (start and end).
+        self.data.ovector().len() / 2
+    }
+}
+
+/// `Captures` represents a group of captured strings for a single match.
+///
+/// The 0th capture always corresponds to the entire match. Each subsequent
+/// index corresponds to the next capture group in the regex. If a capture
+/// group is named, then the matched text is *also* available via the
+/// `name` method. (Note that the 0th capture is always unnamed and so must be
+/// accessed with the `get` method.)
+///
+/// Positions returned from a capture group are always offsets in code units
+/// (bytes for 8-bit subjects, `char`s for UTF-32 subjects).
+///
+/// `'s` is the lifetime of the matched subject string.
+pub struct Captures<'s, W: CodeUnitWidth> {
+    subject: &'s [W::SubjectChar],
+    locs: CaptureLocations<W>,
+    // Maps a group name to its capture index.
+    idx: Arc<HashMap<String, usize>>,
+}
+
+impl<'s, W: CodeUnitWidth> Captures<'s, W> {
+    /// Returns the match associated with the capture group at index `i`. If
+    /// `i` does not correspond to a capture group, or if the capture group
+    /// did not participate in the match, then `None` is returned.
+    ///
+    /// # Examples
+    ///
+    /// Get the text of the match with a default of an empty string if this
+    /// group didn't participate in the match:
+    ///
+    /// ```rust
+    /// # fn example() -> Result<(), ::pcre2::Error> {
+    /// use pcre2::bytes::Regex;
+    ///
+    /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))")?;
+    /// let caps = re.captures(b"abc123")?.unwrap();
+    ///
+    /// let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes());
+    /// let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes());
+    /// assert_eq!(text1, &b"123"[..]);
+    /// assert_eq!(text2, &b""[..]);
+    /// # Ok(()) }; example().unwrap()
+    /// ```
+    pub fn get(&self, i: usize) -> Option<Match<'s, W>> {
+        self.locs.get(i).map(|(s, e)| Match::new(self.subject, s, e))
+    }
+
+    /// Returns the match for the capture group named `name`. If `name` isn't a
+    /// valid capture group or didn't match anything, then `None` is returned.
+    pub fn name(&self, name: &str) -> Option<Match<'s, W>> {
+        self.idx.get(name).and_then(|&i| self.get(i))
+    }
+
+    /// Returns the number of captured groups.
+    ///
+    /// This is always at least `1`, since every regex has at least one capture
+    /// group that corresponds to the full match.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.locs.len()
+    }
+}
+
+impl<'s, W: CodeUnitWidth> fmt::Debug for Captures<'s, W> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // The map of groups is rendered by the private `CapturesDebug`
+        // wrapper, nested inside a `Captures(..)` tuple.
+        let groups = CapturesDebug(self);
+        f.debug_tuple("Captures").field(&groups).finish()
+    }
+}
+
+struct CapturesDebug<'c, 's: 'c, W: CodeUnitWidth>(&'c Captures<'s, W>);
+
+impl<'c, 's, W: CodeUnitWidth> fmt::Debug for CapturesDebug<'c, 's, W> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let caps = self.0;
+        // Invert the name -> slot index so that each slot can be labelled
+        // with its group name. This costs an allocation, but it makes the
+        // output much nicer to read.
+        let names_by_slot: HashMap<usize, &str> = caps
+            .idx
+            .iter()
+            .map(|(name, &slot)| (slot, name.as_str()))
+            .collect();
+        let mut entries = f.debug_map();
+        for slot in 0..caps.len() {
+            let text = caps
+                .locs
+                .get(slot)
+                .map(|(s, e)| W::escape_subject(&caps.subject[s..e]));
+            // Named groups are keyed by name, all others by their index.
+            match names_by_slot.get(&slot) {
+                Some(name) => entries.entry(name, &text),
+                None => entries.entry(&slot, &text),
+            };
+        }
+        entries.finish()
+    }
+}
+
+/// Look up a group by its index.
+///
+/// `'s` is the lifetime of the matched subject string.
+///
+/// Because of the way `Index` is defined (`a[i]` is normally a part of `a`
+/// and so cannot outlive it), the slice obtained this way cannot outlive
+/// the `Captures` object. Use `get()` when it needs to.
+///
+/// # Panics
+///
+/// If there is no group at the given index.
+impl<'s, W: CodeUnitWidth> Index<usize> for Captures<'s, W> {
+    type Output = [W::SubjectChar];
+
+    fn index(&self, i: usize) -> &Self::Output {
+        match self.get(i) {
+            Some(m) => m.as_bytes(),
+            None => panic!("no group at index '{}'", i),
+        }
+    }
+}
+
+/// Get a group by name.
+///
+/// `'s` is the lifetime of the matched subject string and `'i` is the lifetime
+/// of the group name (the index).
+///
+/// The text can't outlive the `Captures` object if this method is
+/// used, because of how `Index` is defined (normally `a[i]` is part
+/// of `a` and can't outlive it); to do that, use `name` instead.
+///
+/// # Panics
+///
+/// If there is no group named by the given value.
+impl<'s, 'i, W: CodeUnitWidth> Index<&'i str> for Captures<'s, W> {
+    type Output = [W::SubjectChar];
+
+    fn index<'a>(&'a self, name: &'i str) -> &'a [W::SubjectChar] {
+        self.name(name)
+            .map(|m| m.as_bytes())
+            .unwrap_or_else(|| panic!("no group named '{}'", name))
+    }
+}
+
+/// An iterator over all non-overlapping matches for a particular subject
+/// string.
+///
+/// The iterator yields matches (if no error occurred while searching)
+/// corresponding to the start and end of the match. The indices are offsets
+/// in code units (bytes for 8-bit subjects, `char`s for UTF-32 subjects).
+/// The iterator stops when no more matches can be found, or right after it
+/// has yielded a search error.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'s` is the
+/// lifetime of the subject string.
+pub struct Matches<'r, 's, W: CodeUnitWidth> {
+    re: &'r Regex<W>,
+    match_data: MatchDataPoolGuard<'r, W>,
+    subject: &'s [W::SubjectChar],
+    // Offset at which the next search starts.
+    last_end: usize,
+    // End offset of the most recently yielded match, if any.
+    last_match: Option<usize>,
+}
+
+impl<'r, 's, W: CodeUnitWidth> Iterator for Matches<'r, 's, W> {
+    type Item = Result<Match<'s, W>, Error>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // A loop (instead of calling `self.next()` again) handles the
+        // "skip this empty match" case without growing the stack.
+        loop {
+            if self.last_end > self.subject.len() {
+                return None;
+            }
+            let res = self.re.find_at_with_match_data(
+                &mut self.match_data,
+                self.subject,
+                self.last_end,
+            );
+            let m = match res {
+                Err(err) => {
+                    // Leaving the cursor in place would repeat this exact
+                    // search on every later call, so a consumer that skips
+                    // errors could loop indefinitely. Park the cursor past
+                    // the end so that the error is reported once and the
+                    // iterator then finishes.
+                    self.last_end = self.subject.len() + 1;
+                    return Some(Err(err));
+                }
+                Ok(None) => return None,
+                Ok(Some(m)) => m,
+            };
+            if m.start() == m.end() {
+                // This is an empty match. To ensure we make progress, start
+                // the next search at the smallest possible starting position
+                // of the next match following this one.
+                self.last_end = m.end() + 1;
+                // Don't accept empty matches immediately following a match.
+                // Just move on to the next match.
+                if Some(m.end()) == self.last_match {
+                    continue;
+                }
+            } else {
+                self.last_end = m.end();
+            }
+            self.last_match = Some(m.end());
+            return Some(Ok(m));
+        }
+    }
+}
+
+/// An iterator that yields all non-overlapping capture groups matching a
+/// particular regular expression.
+///
+/// The iterator stops when no more matches can be found, or right after it
+/// has yielded a search error.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'s` is the
+/// lifetime of the subject string.
+pub struct CaptureMatches<'r, 's, W: CodeUnitWidth> {
+    re: &'r Regex<W>,
+    subject: &'s [W::SubjectChar],
+    // Offset at which the next search starts.
+    last_end: usize,
+    // End offset of the most recently yielded match, if any.
+    last_match: Option<usize>,
+}
+
+impl<'r, 's, W: CodeUnitWidth> Iterator for CaptureMatches<'r, 's, W> {
+    type Item = Result<Captures<'s, W>, Error>;
+
+    fn next(&mut self) -> Option<Result<Captures<'s, W>, Error>> {
+        // A loop (instead of calling `self.next()` again) handles the
+        // "skip this empty match" case without growing the stack.
+        loop {
+            if self.last_end > self.subject.len() {
+                return None;
+            }
+            // Each yielded `Captures` owns its locations, so a new set is
+            // created for every search.
+            let mut locs = self.re.capture_locations();
+            let res = self.re.captures_read_at(
+                &mut locs,
+                self.subject,
+                self.last_end,
+            );
+            let m = match res {
+                Err(err) => {
+                    // See `Matches::next`: report the error once, then end
+                    // the iteration rather than repeating the same search.
+                    self.last_end = self.subject.len() + 1;
+                    return Some(Err(err));
+                }
+                Ok(None) => return None,
+                Ok(Some(m)) => m,
+            };
+            if m.start() == m.end() {
+                // This is an empty match. To ensure we make progress, start
+                // the next search at the smallest possible starting position
+                // of the next match following this one.
+                self.last_end = m.end() + 1;
+                // Don't accept empty matches immediately following a match.
+                // Just move on to the next match.
+                if Some(m.end()) == self.last_match {
+                    continue;
+                }
+            } else {
+                self.last_end = m.end();
+            }
+            self.last_match = Some(m.end());
+            return Some(Ok(Captures {
+                subject: self.subject,
+                locs,
+                idx: Arc::clone(&self.re.capture_names_idx),
+            }));
+        }
+    }
+}
+
+/// A type alias for our pool of `MatchData` that fixes the type parameters to
+/// what we actually use in practice.
+type MatchDataPool<W> = Pool<MatchData<W>, MatchDataPoolFn<W>>;
+
+/// Same as above, but for the guard returned by a pool.
+type MatchDataPoolGuard<'a, W> =
+    PoolGuard<'a, MatchData<W>, MatchDataPoolFn<W>>;
+
+/// The type of the closure used to create new `MatchData` values. We need to
+/// spell out all of the marker traits or else we risk leaking !MARKER impls.
+type MatchDataPoolFn<W> =
+    Box<dyn Fn() -> MatchData<W> + Send + Sync + UnwindSafe + RefUnwindSafe>;
diff --git a/src/utf32.rs b/src/utf32.rs
new file mode 100644
index 0000000..59b0cbd
--- /dev/null
+++ b/src/utf32.rs
@@ -0,0 +1,463 @@
+use crate::ffi::CodeUnitWidth32;
+pub use crate::regex_impl::Captures as CapturesImpl;
+pub use crate::regex_impl::Match as MatchImpl;
+
+#[doc(inline)]
+pub use crate::regex_impl::{
+    Regex as RegexImpl, RegexBuilder as RegexBuilderImpl,
+};
+
+/// A compiled PCRE2 regular expression for matching sequences of Rust chars.
+///
+/// This regex is safe to use from multiple threads simultaneously. For top
+/// performance, it is better to clone a new regex for each thread.
+pub type Regex = RegexImpl<CodeUnitWidth32>;
+
+/// A builder for configuring the compilation of a PCRE2 regex.
+pub type RegexBuilder = RegexBuilderImpl<CodeUnitWidth32>;
+
+/// Match represents a single match of a regex in a subject string.
+///
+/// The lifetime parameter `'s` refers to the lifetime of the matched portion
+/// of the subject string.
+pub type Match<'s> = MatchImpl<'s, CodeUnitWidth32>;
+
+/// `Captures` represents a group of captured character strings for a single match.
+///
+/// The 0th capture always corresponds to the entire match. Each subsequent
+/// index corresponds to the next capture group in the regex. If a capture
+/// group is named, then the matched string is *also* available via the
+/// `name` method. (Note that the 0th capture is always unnamed and so must be
+/// accessed with the `get` method.)
+///
+/// Positions returned from a capture group are always character indices.
+///
+/// `'s` is the lifetime of the matched subject string.
+pub type Captures<'s> = CapturesImpl<'s, CodeUnitWidth32>;
+
+#[cfg(test)]
+mod tests {
+    use std::borrow::Cow;
+
+    use super::{CodeUnitWidth32, Regex, RegexBuilder};
+    use crate::is_jit_available;
+
+    fn b(string: &str) -> Box<[char]> {
+        string.chars().collect::<Vec<_>>().into_boxed_slice()
+    }
+
+    fn find_iter_tuples(re: &Regex, subject: &[char]) -> Vec<(usize, usize)> {
+        let mut tuples = vec![];
+        for result in re.find_iter(subject) {
+            let m = result.unwrap();
+            tuples.push((m.start(), m.end()));
+        }
+        tuples
+    }
+
+    fn cap_iter_tuples(re: &Regex, subject: &str) -> Vec<(usize, usize)> {
+        let subject = subject.chars().collect::<Vec<_>>();
+        let mut tuples = vec![];
+        for result in re.captures_iter(&subject) {
+            let caps = result.unwrap();
+            let m = caps.get(0).unwrap();
+            tuples.push((m.start(), m.end()));
+        }
+        tuples
+    }
+
+    #[test]
+    fn caseless() {
+        let re = RegexBuilder::new().caseless(true).build(b("a")).unwrap();
+        assert!(re.is_match(&b("A")).unwrap());
+
+        let re = RegexBuilder::new()
+            .caseless(true)
+            .ucp(true)
+            .build(b("β"))
+            .unwrap();
+        assert!(re.is_match(&b("Β")).unwrap());
+    }
+
+    #[test]
+    fn crlf() {
+        let subject = &b("a\r\n");
+        let re = RegexBuilder::new().crlf(true).build(b("a$")).unwrap();
+        let m = re.find(subject).unwrap().unwrap();
+        assert_eq!(m.as_pair(), (0, 1));
+    }
+
+    #[test]
+    fn dotall() {
+        let re = RegexBuilder::new().dotall(false).build(b(".")).unwrap();
+        assert!(!re.is_match(&b("\n")).unwrap());
+
+        let re = RegexBuilder::new().dotall(true).build(b(".")).unwrap();
+        assert!(re.is_match(&b("\n")).unwrap());
+    }
+
+    #[test]
+    fn extended() {
+        let re = RegexBuilder::new().extended(true).build(b("a b c")).unwrap();
+        assert!(re.is_match(&b("abc")).unwrap());
+    }
+
+    #[test]
+    fn multi_line() {
+        let re =
+            RegexBuilder::new().multi_line(false).build(b("^abc$")).unwrap();
+        assert!(!re.is_match(&b("foo\nabc\nbar")).unwrap());
+
+        let re =
+            RegexBuilder::new().multi_line(true).build(b("^abc$")).unwrap();
+        assert!(re.is_match(&b("foo\nabc\nbar")).unwrap());
+    }
+
+    #[test]
+    fn replace() {
+        let re = RegexBuilder::new().build(b(".")).unwrap();
+        let s = b("abc");
+        let r = b("");
+        let replaced = re.replace(&s, &r, true).unwrap();
+        assert!(
+            matches!(replaced, Cow::Owned(_)),
+            "a replacement should give a new string"
+        );
+        let replaced = replaced.into_owned();
+        assert_eq!(replaced, &*b("bc"));
+    }
+
+    #[test]
+    fn replace_no_match() {
+        let re = RegexBuilder::new().build(b("d")).unwrap();
+        let s = b("abc");
+        let r = b("");
+        let replaced = re.replace(&s, &r, true).unwrap();
+        assert!(
+            matches!(replaced, Cow::Borrowed(_)),
+            "when there is no match, the original string should be returned"
+        );
+        let replaced = replaced.into_owned();
+        assert_eq!(replaced, &*b("abc"));
+    }
+
+    #[test]
+    fn replace_with_replacement() {
+        let re = RegexBuilder::new().build(b("b")).unwrap();
+        let s = b("abc");
+        let r = b("d");
+        let replaced = re.replace(&s, &r, true).unwrap();
+        assert!(
+            matches!(replaced, Cow::Owned(_)),
+            "a replacement should give a new string"
+        );
+        let replaced = replaced.into_owned();
+        assert_eq!(replaced, &*b("adc"));
+    }
+
+    #[test]
+    fn replace_first_occurrence() {
+        let re = RegexBuilder::new().build(b("a")).unwrap();
+        let s = b("aaa");
+        let r = b("b");
+        let replaced = re.replace(&s, &r, false).unwrap();
+        assert!(
+            matches!(replaced, Cow::Owned(_)),
+            "a replacement should give a new string"
+        );
+        let replaced = replaced.into_owned();
+        assert_eq!(replaced, &*b("baa"));
+    }
+
+    #[test]
+    fn replace_multiple_occurrences() {
+        let re = RegexBuilder::new().build(b("a")).unwrap();
+        let s = b("aaa");
+        let r = b("b");
+        let replaced = re.replace_all(&s, &r, false).unwrap();
+        assert!(
+            matches!(replaced, Cow::Owned(_)),
+            "a replacement should give a new string"
+        );
+        let replaced = replaced.into_owned();
+        assert_eq!(replaced, &*b("bbb"));
+    }
+
+    #[test]
+    fn replace_empty_string() {
+        let re = RegexBuilder::new().build(b("")).unwrap();
+        let s = b("abc");
+        let r = b("d");
+        let replaced = re.replace(&s, &r, true).unwrap();
+        assert!(
+            matches!(replaced, Cow::Owned(_)),
+            "a replacement should give a new string"
+        );
+        let replaced = replaced.into_owned();
+        assert_eq!(replaced, &*b("dabc"));
+    }
+
+    #[test]
+    fn replace_empty_with_empty() {
+        let re = RegexBuilder::new().build(b("")).unwrap();
+        let s = b("");
+        let r = b("");
+        let replaced = re.replace(&s, &r, true).unwrap().into_owned();
+        assert_eq!(replaced, &*b(""));
+    }
+
+    #[test]
+    fn replace_long_string() {
+        let long_string = vec!['a'; 1024]; // 1024 chars of 'a'
+        let re = RegexBuilder::new().build(b("a")).unwrap();
+        let r = b("b");
+        let replaced = re.replace(&long_string, &r, false).unwrap();
+        assert!(
+            matches!(replaced, Cow::Owned(_)),
+            "a replacement should give a new string"
+        );
+        let replaced = replaced.into_owned();
+        let mut expected = long_string.clone();
+        expected[0] = 'b';
+        assert_eq!(replaced, expected);
+    }
+
+    #[test]
+    fn replace_long_string_all() {
+        let long_string = vec!['a'; 1024];
+        let re = RegexBuilder::new().build(b("a")).unwrap();
+        let r = b("b");
+        let replaced = re.replace_all(&long_string, &r, false).unwrap();
+        assert!(
+            matches!(replaced, Cow::Owned(_)),
+            "a replacement should give a new string"
+        );
+        let replaced = replaced.into_owned();
+        let all_b = vec!['b'; 1024];
+        assert_eq!(replaced, all_b);
+    }
+
+    #[test]
+    fn replace_long_string_all_elongating() {
+        let long_string = vec!['a'; 1024];
+        let re = RegexBuilder::new().build(b("a")).unwrap();
+        let r = b("bx");
+        let replaced = re.replace_all(&long_string, &r, false).unwrap();
+        assert!(
+            matches!(replaced, Cow::Owned(_)),
+            "a replacement should give a new string"
+        );
+        let replaced = replaced.into_owned();
+        let mut all_bx = Vec::new();
+        for _ in long_string {
+            all_bx.push('b');
+            all_bx.push('x');
+        }
+        assert_eq!(replaced, all_bx);
+    }
+
+    #[test]
+    fn replace_long_string_all_disappearing() {
+        let long_string = vec!['a'; 1024];
+        let re = RegexBuilder::new().build(b("a")).unwrap();
+        let r = b("");
+        let replaced = re.replace_all(&long_string, &r, false).unwrap();
+        let replaced = replaced.into_owned();
+        assert_eq!(replaced, &[]);
+    }
+
+    #[test]
+    fn ucp() {
+        let re = RegexBuilder::new().ucp(false).build(b(r"\w")).unwrap();
+        assert!(!re.is_match(&b("β")).unwrap());
+
+        let re = RegexBuilder::new().ucp(true).build(b(r"\w")).unwrap();
+        assert!(re.is_match(&b("β")).unwrap());
+    }
+
+    #[test]
+    fn utf() {
+        let re = RegexBuilder::new().utf(false).build(b(".")).unwrap();
+        assert_eq!(re.find(&b("β")).unwrap().unwrap().as_pair(), (0, 1));
+
+        let re = RegexBuilder::new().utf(true).build(b(".")).unwrap();
+        assert_eq!(re.find(&b("β")).unwrap().unwrap().as_pair(), (0, 1));
+    }
+
+    #[test]
+    fn fmt_debug_works() {
+        let subject = &b("x");
+        let re = RegexBuilder::new().utf(false).build(b(".")).unwrap();
+        let m = re.find(subject).unwrap().unwrap();
+        let _ = format!("{:?}", m);
+    }
+
+    #[test]
+    fn jit4lyfe() {
+        if is_jit_available::<CodeUnitWidth32>() {
+            let re = RegexBuilder::new().jit(true).build(b(r"\w")).unwrap();
+            assert!(re.is_match(&b("a")).unwrap());
+        } else {
+            // Check that if JIT isn't enabled, then we get an error if we
+            // require JIT.
+            RegexBuilder::new().jit(true).build(b(r"\w")).unwrap_err();
+        }
+    }
+
+    // Unlike jit4lyfe, this tests that everything works when requesting the
+    // JIT only if it's available. In jit4lyfe, we require the JIT or fail.
+    // If the JIT isn't available, then in this test, we simply don't use it.
+    #[test]
+    fn jit_if_available() {
+        let re = RegexBuilder::new()
+            .jit_if_available(true)
+            .build(b(r"\w"))
+            .unwrap();
+        assert!(re.is_match(&b("a")).unwrap());
+    }
+
+    // This tests a regression that caused a segfault in the pcre2 library:
+    // https://github.com/BurntSushi/rust-pcre2/issues/10
+    #[test]
+    fn jit_test_lazy_alloc_subject() {
+        let subject: Vec<char> = vec![];
+
+        let re = RegexBuilder::new()
+            .jit_if_available(true)
+            .build(b(r"xxxx|xxxx|xxxx"))
+            .unwrap();
+        assert!(!re.is_match(&subject).unwrap());
+    }
+
+    #[test]
+    fn utf_with_invalid_data() {
+        let re = RegexBuilder::new().build(b(r".")).unwrap();
+        assert_eq!(re.find(&b("\u{FF}")).unwrap().unwrap().as_pair(), (0, 1));
+
+        let re = RegexBuilder::new().utf(true).build(b(r".")).unwrap();
+        assert_eq!(re.find(&b("\u{FF}")).unwrap().unwrap().as_pair(), (0, 1));
+    }
+
+    #[test]
+    fn capture_names() {
+        let re = RegexBuilder::new()
+            .build(b(r"(?P<foo>abc)|(def)|(?P<a>ghi)|(?P<springsteen>jkl)"))
+            .unwrap();
+        assert_eq!(
+            re.capture_names().to_vec(),
+            vec![
+                None,
+                Some("foo".to_string()),
+                None,
+                Some("a".to_string()),
+                Some("springsteen".to_string()),
+            ]
+        );
+
+        // Test our internal map as well.
+        let capture_names_idx = re.get_capture_names_idxs();
+        assert_eq!(capture_names_idx.len(), 3);
+        assert_eq!(capture_names_idx["foo"], 1);
+        assert_eq!(capture_names_idx["a"], 3);
+        assert_eq!(capture_names_idx["springsteen"], 4);
+    }
+
+    #[test]
+    fn captures_get() {
+        let subject = &b("abc123");
+        let re = Regex::new(b(r"[a-z]+(?:([0-9]+)|([A-Z]+))")).unwrap();
+        let caps = re.captures(subject).unwrap().unwrap();
+
+        let text1: &[char] = caps.get(1).map_or(&[], |m| m.as_bytes());
+        let text2: &[char] = caps.get(2).map_or(&[], |m| m.as_bytes());
+        assert_eq!(text1, &*b("123"));
+        assert_eq!(text2, &*b(""));
+    }
+
+    #[test]
+    fn find_iter_empty() {
+        let re = Regex::new(b(r"(?m:^)")).unwrap();
+        assert_eq!(find_iter_tuples(&re, &b("")), &[(0, 0)]);
+        assert_eq!(find_iter_tuples(&re, &b("\n")), &[(0, 0)]);
+        assert_eq!(find_iter_tuples(&re, &b("\n\n")), &[(0, 0), (1, 1)]);
+        assert_eq!(find_iter_tuples(&re, &b("\na\n")), &[(0, 0), (1, 1)]);
+        assert_eq!(
+            find_iter_tuples(&re, &b("\na\n\n")),
+            vec![(0, 0), (1, 1), (3, 3),]
+        );
+    }
+
+    #[test]
+    fn captures_iter_empty() {
+        let re = Regex::new(b(r"(?m:^)")).unwrap();
+        assert_eq!(cap_iter_tuples(&re, ""), &[(0, 0)]);
+        assert_eq!(cap_iter_tuples(&re, "\n"), &[(0, 0)]);
+        assert_eq!(cap_iter_tuples(&re, "\n\n"), &[(0, 0), (1, 1)]);
+        assert_eq!(cap_iter_tuples(&re, "\na\n"), &[(0, 0), (1, 1)]);
+        assert_eq!(
+            cap_iter_tuples(&re, "\na\n\n"),
+            &[(0, 0), (1, 1), (3, 3),]
+        );
+    }
+
+    #[test]
+    fn max_jit_stack_size_does_something() {
+        if !is_jit_available::<CodeUnitWidth32>() {
+            return;
+        }
+
+        let hundred = "\
+            ABCDEFGHIJKLMNOPQRSTUVWXY\
+            ABCDEFGHIJKLMNOPQRSTUVWXY\
+            ABCDEFGHIJKLMNOPQRSTUVWXY\
+            ABCDEFGHIJKLMNOPQRSTUVWXY\
+        ";
+        let hay = hundred.repeat(100);
+
+        // First, try a regex that checks that we can blow the JIT stack limit.
+        let re = RegexBuilder::new()
+            .ucp(true)
+            .jit(true)
+            .max_jit_stack_size(Some(1))
+            .build(b(r"((((\w{10})){100}))+"))
+            .unwrap();
+        let result = re.is_match(&b(&hay));
+        if result.is_ok() {
+            // Skip this test, since for some reason we weren't able to blow
+            // the stack limit.
+            return;
+        }
+        let err = result.unwrap_err();
+        assert!(err.to_string().contains("JIT stack limit reached"));
+
+        // Now bump up the JIT stack limit and check that it succeeds.
+        let re = RegexBuilder::new()
+            .ucp(true)
+            .jit_if_available(true)
+            .max_jit_stack_size(Some(1 << 20))
+            .build(b(r"((((\w{10})){100}))+"))
+            .unwrap();
+        assert!(re.is_match(&b(&hay)).unwrap());
+    }
+
+    #[test]
+    fn find_utf_emoji_as_chars() {
+        let hay: Vec<char> = "0123456789😀👍🏼🎉abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".chars().collect();
+        let pattern: Vec<char> = r"(*UTF)
+        (?x) (?#: Allow comments and whitespace.)
+
+        [^\N{U+0000}-\N{U+007F}] (?#: Non-ascii code points.)
+        +                        (?#: One or more times.)
+        "
+        .chars()
+        .collect();
+        let re = RegexBuilder::new()
+            .extended(true)
+            .utf(true)
+            .jit_if_available(true)
+            .build(pattern)
+            .unwrap();
+        let matched = re.find(&hay).unwrap().unwrap();
+        assert!(matched.as_bytes().iter().copied().eq("😀👍🏼🎉".chars()));
+    }
+}