From c545cc3c243409952c9ef1eae3cdd837c05f2059 Mon Sep 17 00:00:00 2001
From: Benjamin Moody <benjaminmoody@gmail.com>
Date: Tue, 7 Nov 2023 22:18:29 -0500
Subject: [PATCH 1/2] find_iter, captures_iter: fix empty matches in UTF mode

Following an empty match, the iterator (Matches or CaptureMatches)
advances the last_end position so as not to return the same match
twice.

However, if the regex uses UTF mode, the position passed to
find_at_with_match_data or captures_read_at is required to be a UTF-8
character boundary, so last_end must be advanced by a whole UTF-8
character, not just one byte.

Determining whether or not the regex is using UTF mode requires
querying PCRE2_INFO_ALLOPTIONS. Checking the config is not enough,
because the pattern itself can enable UTF mode (e.g. with `(*UTF)`).
---
 src/bytes.rs | 23 +++++++++++++++++++++--
 src/ffi.rs   | 21 +++++++++++++++++++++
 2 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/src/bytes.rs b/src/bytes.rs
index 5f02c4e..30abad1 100644
--- a/src/bytes.rs
+++ b/src/bytes.rs
@@ -177,10 +177,12 @@ impl RegexBuilder {
                 Box::new(move || MatchData::new(config.clone(), &code));
             Pool::new(create)
         };
+        let utf = code.is_utf()?;
         Ok(Regex {
             config: Arc::new(self.config.clone()),
             pattern: pattern.to_string(),
             code,
+            utf,
             capture_names: Arc::new(capture_names),
             capture_names_idx: Arc::new(idx),
             match_data,
@@ -361,6 +363,8 @@ pub struct Regex {
     pattern: String,
     /// The underlying compiled PCRE2 object.
     code: Arc<Code>,
+    /// True if the regex uses UTF mode.
+    utf: bool,
     /// The capture group names for this regex.
     capture_names: Arc<Vec<Option<String>>>,
     /// A map from capture group name to capture group index.
@@ -382,6 +386,7 @@ impl Clone for Regex {
             config: Arc::clone(&self.config),
             pattern: self.pattern.clone(),
             code: Arc::clone(&self.code),
+            utf: self.utf,
             capture_names: Arc::clone(&self.capture_names),
             capture_names_idx: Arc::clone(&self.capture_names_idx),
             match_data,
@@ -759,6 +764,20 @@ impl Regex {
     fn new_match_data(&self) -> MatchData {
         MatchData::new(self.config.match_config.clone(), &self.code)
     }
+
+    /// Returns the earliest offset after `start` at which the next
+    /// search may begin. A UTF-mode regex needs that offset to lie on
+    /// a UTF-8 character boundary, so continuation bytes are skipped;
+    /// a non-UTF regex simply resumes at the following byte offset.
+    fn position_after(&self, subject: &[u8], start: usize) -> usize {
+        let next = start + 1;
+        if !self.utf {
+            return next;
+        }
+        // UTF-8 continuation bytes have the bit pattern 0b10xx_xxxx.
+        let tail = subject.get(next..).unwrap_or(&[]);
+        next + tail.iter().take_while(|&&b| (b & 0xC0) == 0x80).count()
+    }
 }
 
 /// CaptureLocations is a low level representation of the raw offsets of each
@@ -1022,7 +1041,7 @@ impl<'r, 's> Iterator for Matches<'r, 's> {
             // This is an empty match. To ensure we make progress, start
             // the next search at the smallest possible starting position
             // of the next match following this one.
-            self.last_end = m.end() + 1;
+            self.last_end = self.re.position_after(self.subject, m.end());
             // Don't accept empty matches immediately following a match.
             // Just move on to the next match.
             if Some(m.end()) == self.last_match {
@@ -1069,7 +1088,7 @@ impl<'r, 's> Iterator for CaptureMatches<'r, 's> {
             // This is an empty match. To ensure we make progress, start
             // the next search at the smallest possible starting position
             // of the next match following this one.
-            self.last_end = m.end() + 1;
+            self.last_end = self.re.position_after(self.subject, m.end());
             // Don't accept empty matches immediately following a match.
             // Just move on to the next match.
             if Some(m.end()) == self.last_match {
diff --git a/src/ffi.rs b/src/ffi.rs
index bef2ab8..4f2beec 100644
--- a/src/ffi.rs
+++ b/src/ffi.rs
@@ -258,6 +258,27 @@ impl Code {
             Ok(1 + count as usize)
         }
     }
+
+    /// Reports whether this compiled pattern operates in UTF mode:
+    /// `Ok(true)` for whole UTF-8 characters, `Ok(false)` for individual
+    /// bytes. Compile options and in-pattern flags such as `(*UTF)` both
+    /// count. A non-zero PCRE2 status is returned as `Error::info`.
+    pub(crate) fn is_utf(&self) -> Result<bool, Error> {
+        let mut all_options: u32 = 0;
+        // SAFETY (assumed, confirm): `self.as_ptr()` is a live compiled
+        // pattern and ALLOPTIONS writes one `u32` through the pointer.
+        let status = unsafe {
+            pcre2_pattern_info_8(
+                self.as_ptr(),
+                PCRE2_INFO_ALLOPTIONS,
+                &mut all_options as *mut u32 as *mut c_void,
+            )
+        };
+        if status == 0 {
+            return Ok(all_options & PCRE2_UTF != 0);
+        }
+        Err(Error::info(status))
+    }
 }
 
 /// A low level representation of PCRE2's compilation context.

From fb07086b7f6020587e558ef55a8b56482c26c633 Mon Sep 17 00:00:00 2001
From: Benjamin Moody <benjaminmoody@gmail.com>
Date: Tue, 7 Nov 2023 22:37:30 -0500
Subject: [PATCH 2/2] tests: check iterating over empty matches in UTF-8 text

These tests will fail if the iterator does not correctly advance the
end position (by either a byte or a whole character, as appropriate)
following an empty match.
---
 src/bytes.rs | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/bytes.rs b/src/bytes.rs
index 30abad1..0ed3974 100644
--- a/src/bytes.rs
+++ b/src/bytes.rs
@@ -1309,6 +1309,36 @@ mod tests {
         );
     }
 
+    #[test]
+    fn find_iter_empty_utf() {
+        // In UTF-8, "∀" is 3 bytes, "Á" is 2 bytes and "A" is 1 byte.
+        let subject = "∀ÁA".as_bytes();
+        // UTF mode: empty matches fall only on character boundaries.
+        let utf_re = Regex::new(r"(*UTF)x*").unwrap();
+        let on_boundaries = vec![(0, 0), (3, 3), (5, 5), (6, 6)];
+        assert_eq!(find_iter_tuples(&utf_re, subject), on_boundaries);
+        // Non-UTF mode: an empty match is reported at every byte offset.
+        let byte_re = Regex::new(r"x*").unwrap();
+        let on_every_byte =
+            vec![(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6)];
+        assert_eq!(find_iter_tuples(&byte_re, subject), on_every_byte);
+    }
+
+    #[test]
+    fn captures_iter_empty_utf() {
+        // In UTF-8, "∀" is 3 bytes, "Á" is 2 bytes and "A" is 1 byte.
+        let subject = "∀ÁA".as_bytes();
+        // UTF mode: empty matches fall only on character boundaries.
+        let utf_re = Regex::new(r"(*UTF)x*").unwrap();
+        let on_boundaries = vec![(0, 0), (3, 3), (5, 5), (6, 6)];
+        assert_eq!(cap_iter_tuples(&utf_re, subject), on_boundaries);
+        // Non-UTF mode: an empty match is reported at every byte offset.
+        let byte_re = Regex::new(r"x*").unwrap();
+        let on_every_byte =
+            vec![(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6)];
+        assert_eq!(cap_iter_tuples(&byte_re, subject), on_every_byte);
+    }
+
     #[test]
     fn max_jit_stack_size_does_something() {
         if !is_jit_available() {