From c545cc3c243409952c9ef1eae3cdd837c05f2059 Mon Sep 17 00:00:00 2001 From: Benjamin Moody Date: Tue, 7 Nov 2023 22:18:29 -0500 Subject: [PATCH 1/2] find_iter, captures_iter: fix empty matches in UTF mode Following an empty match, the iterator (Matches or CaptureMatches) advances the last_end position so as not to return the same match twice. However, if the regex uses UTF mode, the position passed to find_at_with_match_data or captures_read_at is required to be a UTF-8 character boundary, so last_end must be advanced by a whole UTF-8 character, not just one byte. Determining whether or not the regex is using UTF mode requires PCRE2_INFO_ALLOPTIONS (checking the config is not enough.) --- src/bytes.rs | 23 +++++++++++++++++++++-- src/ffi.rs | 21 +++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/src/bytes.rs b/src/bytes.rs index 5f02c4e..30abad1 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -177,10 +177,12 @@ impl RegexBuilder { Box::new(move || MatchData::new(config.clone(), &code)); Pool::new(create) }; + let utf = code.is_utf()?; Ok(Regex { config: Arc::new(self.config.clone()), pattern: pattern.to_string(), code, + utf, capture_names: Arc::new(capture_names), capture_names_idx: Arc::new(idx), match_data, @@ -361,6 +363,8 @@ pub struct Regex { pattern: String, /// The underlying compiled PCRE2 object. code: Arc<Code>, + /// True if the regex uses UTF mode. + utf: bool, /// The capture group names for this regex. capture_names: Arc<Vec<Option<String>>>, /// A map from capture group name to capture group index. 
@@ -382,6 +386,7 @@ impl Clone for Regex { config: Arc::clone(&self.config), pattern: self.pattern.clone(), code: Arc::clone(&self.code), + utf: self.utf, capture_names: Arc::clone(&self.capture_names), capture_names_idx: Arc::clone(&self.capture_names_idx), match_data, @@ -759,6 +764,20 @@ impl Regex { fn new_match_data(&self) -> MatchData { MatchData::new(self.config.match_config.clone(), &self.code) } + + /// Determines the next possible match starting position within the + /// subject string. In UTF mode, the starting position must be a + /// UTF-8 character boundary. In non-UTF mode, any byte offset is + /// a valid starting position. + fn position_after(&self, subject: &[u8], start: usize) -> usize { + let mut pos = start + 1; + if self.utf { + while subject.get(pos).map_or(false, |b| (*b as i8) < -0x40) { + pos += 1; + } + } + pos + } } /// CaptureLocations is a low level representation of the raw offsets of each @@ -1022,7 +1041,7 @@ impl<'r, 's> Iterator for Matches<'r, 's> { // This is an empty match. To ensure we make progress, start // the next search at the smallest possible starting position // of the next match following this one. - self.last_end = m.end() + 1; + self.last_end = self.re.position_after(self.subject, m.end()); // Don't accept empty matches immediately following a match. // Just move on to the next match. if Some(m.end()) == self.last_match { @@ -1069,7 +1088,7 @@ impl<'r, 's> Iterator for CaptureMatches<'r, 's> { // This is an empty match. To ensure we make progress, start // the next search at the smallest possible starting position // of the next match following this one. - self.last_end = m.end() + 1; + self.last_end = self.re.position_after(self.subject, m.end()); // Don't accept empty matches immediately following a match. // Just move on to the next match. 
if Some(m.end()) == self.last_match { diff --git a/src/ffi.rs b/src/ffi.rs index bef2ab8..4f2beec 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -258,6 +258,27 @@ impl Code { Ok(1 + count as usize) } } + + /// Returns true if this regex uses UTF mode (matches whole UTF-8 + /// characters), or false if it uses non-UTF mode (matches + /// individual bytes). This depends on the options specified when + /// compiling the regex, and may also be affected by flags such as + /// `(*UTF)` within the pattern itself. + pub(crate) fn is_utf(&self) -> Result<bool, Error> { + let mut options: u32 = 0; + let rc = unsafe { + pcre2_pattern_info_8( + self.as_ptr(), + PCRE2_INFO_ALLOPTIONS, + &mut options as *mut u32 as *mut c_void, + ) + }; + if rc != 0 { + Err(Error::info(rc)) + } else { + Ok(options & PCRE2_UTF != 0) + } + } } /// A low level representation of PCRE2's compilation context. From fb07086b7f6020587e558ef55a8b56482c26c633 Mon Sep 17 00:00:00 2001 From: Benjamin Moody Date: Tue, 7 Nov 2023 22:37:30 -0500 Subject: [PATCH 2/2] tests: check iterating over empty matches in UTF-8 text These tests will fail if the iterator does not correctly advance the end position (by either a byte or a whole character, as appropriate) following an empty match. 
--- src/bytes.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/bytes.rs b/src/bytes.rs index 30abad1..0ed3974 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -1309,6 +1309,36 @@ mod tests { ); } + #[test] + fn find_iter_empty_utf() { + let re = Regex::new(r"(*UTF)x*").unwrap(); + assert_eq!( + find_iter_tuples(&re, "∀ÁA".as_bytes()), + vec![(0, 0), (3, 3), (5, 5), (6, 6),] + ); + + let re = Regex::new(r"x*").unwrap(); + assert_eq!( + find_iter_tuples(&re, "∀ÁA".as_bytes()), + vec![(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6),] + ); + } + + #[test] + fn captures_iter_empty_utf() { + let re = Regex::new(r"(*UTF)x*").unwrap(); + assert_eq!( + cap_iter_tuples(&re, "∀ÁA".as_bytes()), + vec![(0, 0), (3, 3), (5, 5), (6, 6),] + ); + + let re = Regex::new(r"x*").unwrap(); + assert_eq!( + cap_iter_tuples(&re, "∀ÁA".as_bytes()), + vec![(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6),] + ); + } + #[test] fn max_jit_stack_size_does_something() { if !is_jit_available() {