Implement look-behind state processing

Multimodcrafter · shilangyu · commit 5cc52ea36bdd · 2025-08-25T10:36:46.000+02:00
diff --git a/regex-automata/src/nfa/thompson/builder.rs b/regex-automata/src/nfa/thompson/builder.rs
@@ -387,6 +387,7 @@ impl Builder {
         self.pattern_id = None;
         self.states.clear();
         self.start_pattern.clear();
+        self.start_look_behind.clear();
         self.captures.clear();
         self.memory_states = 0;
     }
diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs
@@ -1106,6 +1106,12 @@ impl NFA {
         self.0.lookaround_count
     }
 
+    /// Returns the start states used to seed look-behind evaluation.
+    #[inline]
+    pub fn look_behind_starts(&self) -> &[StateID] {
+        &self.0.start_look_behind
+    }
+
     // FIXME: The `look_set_prefix_all` computation was not correct, and it
     // seemed a little tricky to fix it. Since I wasn't actually using it for
     // anything, I just decided to remove it in the run up to the regex 1.9
@@ -1481,6 +1487,9 @@ impl Inner {
         for id in self.start_pattern.iter_mut() {
             *id = old_to_new[*id];
         }
+        for id in self.start_look_behind.iter_mut() {
+            *id = old_to_new[*id];
+        }
     }
 }
 
diff --git a/regex-automata/src/nfa/thompson/pikevm.rs b/regex-automata/src/nfa/thompson/pikevm.rs
@@ -1263,7 +1263,46 @@ impl PikeVM {
             ref mut curr,
             ref mut next,
             ref mut lookaround,
+            ref mut curr_lookaround,
+            ref mut next_lookaround,
         } = cache;
+
+        // This initializes the look-behind threads from the start of the
+        // input. Since capture groups are not allowed inside look-behinds,
+        // there are no Capture epsilon transitions, so passing &mut [] for
+        // the slots is fine. Nested look-behinds have a higher index but must
+        // run first, so the start states should be added in reverse order.
+        // NOTE(review): the loop below iterates forward; confirm the order.
+        for look_behind_start in self.nfa.look_behind_starts() {
+            self.epsilon_closure(
+                stack,
+                &mut [],
+                curr_lookaround,
+                lookaround,
+                input,
+                0,
+                *look_behind_start,
+            );
+        }
+
+        // This brings the look-behind threads into the state they must be for
+        // starting at input.start() instead of the beginning. This is
+        // necessary for lookbehinds to be able to match outside of the input
+        // span.
+        for lb_at in 0..input.start() {
+            self.nexts(
+                stack,
+                curr_lookaround,
+                next_lookaround,
+                lookaround,
+                input,
+                lb_at,
+                &mut [],
+            );
+            core::mem::swap(curr_lookaround, next_lookaround);
+            next_lookaround.set.clear();
+        }
+
         let mut hm = None;
         // Yes, our search doesn't end at input.end(), but includes it. This
         // is necessary because matches are delayed by one byte, just like
@@ -1374,6 +1413,17 @@ impl PikeVM {
                     stack, slots, curr, lookaround, input, at, start_id,
                 );
             }
+            // The lookbehind states must be processed first, since their
+            // result must be available for the processing of the main states.
+            self.nexts(
+                stack,
+                curr_lookaround,
+                next_lookaround,
+                lookaround,
+                input,
+                at,
+                &mut [],
+            );
             if let Some(pid) =
                 self.nexts(stack, curr, next, lookaround, input, at, slots)
             {
@@ -1387,7 +1437,9 @@ impl PikeVM {
                 break;
             }
             core::mem::swap(curr, next);
+            core::mem::swap(curr_lookaround, next_lookaround);
             next.set.clear();
+            next_lookaround.set.clear();
             at += 1;
         }
         instrument!(|c| c.eprint(&self.nfa));
@@ -1442,7 +1494,34 @@ impl PikeVM {
             ref mut curr,
             ref mut next,
             ref mut lookaround,
+            ref mut curr_lookaround,
+            ref mut next_lookaround,
         } = cache;
+
+        for look_behind_start in self.nfa.look_behind_starts() {
+            self.epsilon_closure(
+                stack,
+                &mut [],
+                curr_lookaround,
+                lookaround,
+                input,
+                0,
+                *look_behind_start,
+            );
+        }
+        for lb_at in 0..input.start() {
+            self.nexts(
+                stack,
+                curr_lookaround,
+                next_lookaround,
+                lookaround,
+                input,
+                lb_at,
+                &mut [],
+            );
+            core::mem::swap(curr_lookaround, next_lookaround);
+            next_lookaround.set.clear();
+        }
         for at in input.start()..=input.end() {
             let any_matches = !patset.is_empty();
             if curr.set.is_empty() {
@@ -1459,6 +1538,15 @@ impl PikeVM {
                     stack, slots, curr, lookaround, input, at, start_id,
                 );
             }
+            self.nexts(
+                stack,
+                curr_lookaround,
+                next_lookaround,
+                lookaround,
+                input,
+                at,
+                &mut [],
+            );
             self.nexts_overlapping(
                 stack, curr, next, lookaround, input, at, patset,
             );
@@ -1470,7 +1558,9 @@ impl PikeVM {
                 break;
             }
             core::mem::swap(curr, next);
+            core::mem::swap(curr_lookaround, next_lookaround);
             next.set.clear();
+            next_lookaround.set.clear();
         }
         instrument!(|c| c.eprint(&self.nfa));
     }
@@ -1976,6 +2066,10 @@ pub struct Cache {
     /// haystack at which look-around indexed x holds and which is <= to the
     /// current position".
     lookaround: Vec<Option<NonMaxUsize>>,
+    /// The current active states for look-behind subexpressions
+    curr_lookaround: ActiveStates,
+    /// The next set of states to be explored for look-behind subexpressions
+    next_lookaround: ActiveStates,
 }
 
 impl Cache {
@@ -1993,6 +2087,8 @@ impl Cache {
             curr: ActiveStates::new(re),
             next: ActiveStates::new(re),
             lookaround: vec![None; re.lookaround_count()],
+            curr_lookaround: ActiveStates::new(re),
+            next_lookaround: ActiveStates::new(re),
         }
     }
 
@@ -2036,6 +2132,9 @@ impl Cache {
     pub fn reset(&mut self, re: &PikeVM) {
         self.curr.reset(re);
         self.next.reset(re);
+        self.curr_lookaround.reset(re);
+        self.next_lookaround.reset(re);
+        self.lookaround = vec![None; re.lookaround_count()];
     }
 
     /// Returns the heap memory usage, in bytes, of this cache.
@@ -2063,6 +2162,10 @@ impl Cache {
         self.stack.clear();
         self.curr.setup_search(captures_slot_len);
         self.next.setup_search(captures_slot_len);
+        // capture groups are not allowed inside look-arounds, so we
+        // set the slot-length to zero.
+        self.curr_lookaround.setup_search(0);
+        self.next_lookaround.setup_search(0);
     }
 }
 

Original file line number	Diff line number	Diff line change
`@@ -387,6 +387,7 @@ impl Builder {`
`387`	`387`	`self.pattern_id = None;`
`388`	`388`	`self.states.clear();`
`389`	`389`	`self.start_pattern.clear();`
	`390`	`+ self.start_look_behind.clear();`
`390`	`391`	`self.captures.clear();`
`391`	`392`	`self.memory_states = 0;`
`392`	`393`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1106,6 +1106,12 @@ impl NFA {`
`1106`	`1106`	`self.0.lookaround_count`
`1107`	`1107`	`}`
`1108`	`1108`
	`1109`	`+ /// Returns the starting states for initializing look-behind evaluation`
	`1110`	`+ #[inline]`
	`1111`	`+ pub fn look_behind_starts(&self) -> &Vec<StateID> {`
	`1112`	`+ &self.0.start_look_behind`
	`1113`	`+ }`
	`1114`	`+`
`1109`	`1115`	``// FIXME: The `look_set_prefix_all` computation was not correct, and it``
`1110`	`1116`	`// seemed a little tricky to fix it. Since I wasn't actually using it for`
`1111`	`1117`	`// anything, I just decided to remove it in the run up to the regex 1.9`
`@@ -1481,6 +1487,9 @@ impl Inner {`
`1481`	`1487`	`for id in self.start_pattern.iter_mut() {`
`1482`	`1488`	`*id = old_to_new[*id];`
`1483`	`1489`	`}`
	`1490`	`+ for id in self.start_look_behind.iter_mut() {`
	`1491`	`+ *id = old_to_new[*id];`
	`1492`	`+ }`
`1484`	`1493`	`}`
`1485`	`1494`	`}`
`1486`	`1495`