Skip to content

Commit c2cebbb

Browse files
Multimodcrafter authored and shilangyu committed
Change compilation to disconnected components
1 parent 1e13645 commit c2cebbb

File tree

3 files changed

+43
-54
lines changed

3 files changed

+43
-54
lines changed

regex-automata/src/nfa/thompson/builder.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,8 @@ pub struct Builder {
340340
/// contains a single regex, then `start_pattern[0]` and `start_anchored`
341341
/// are always equivalent.
342342
start_pattern: Vec<StateID>,
343+
/// The starting states for each individual look-behind sub-expression.
344+
start_look_behind: Vec<StateID>,
343345
/// A map from pattern ID to capture group index to name. (If no name
344346
/// exists, then a None entry is present. Thus, all capturing groups are
345347
/// present in this mapping.)
@@ -449,6 +451,7 @@ impl Builder {
449451
remap.resize(self.states.len(), StateID::ZERO);
450452

451453
nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern);
454+
nfa.set_look_behind_starts(self.start_look_behind.as_slice());
452455
nfa.set_captures(&self.captures).map_err(BuildError::captures)?;
453456
// The idea here is to convert our intermediate states to their final
454457
// form. The only real complexity here is the process of converting
@@ -706,6 +709,12 @@ impl Builder {
706709
self.start_pattern.len()
707710
}
708711

712+
/// Adds the given `start_id` to the set of starting states that is used when
713+
/// running look-behind expressions.
714+
pub fn start_look_behind(&mut self, start_id: StateID) {
715+
self.start_look_behind.push(start_id);
716+
}
717+
709718
/// Add an "empty" NFA state.
710719
///
711720
/// An "empty" NFA state is a state with a single unconditional epsilon

regex-automata/src/nfa/thompson/compiler.rs

Lines changed: 25 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -711,11 +711,6 @@ pub struct Compiler {
711711
/// State used for caching common suffixes when compiling reverse UTF-8
712712
/// automata (for Unicode character classes).
713713
utf8_suffix: RefCell<Utf8SuffixMap>,
714-
/// Top level alternation state which is used to run all look-around
715-
/// assertion checks in lockstep with the main expression. Each look-around
716-
/// expression is compiled to a set of states that is patched into this
717-
/// state, and this state is updated on each new pattern being compiled.
718-
lookaround_alt: RefCell<Option<StateID>>,
719714
/// The next index to use for a look-around expression.
720715
lookaround_index: RefCell<SmallIndex>,
721716
}
@@ -730,7 +725,6 @@ impl Compiler {
730725
utf8_state: RefCell::new(Utf8State::new()),
731726
trie_state: RefCell::new(RangeTrie::new()),
732727
utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
733-
lookaround_alt: RefCell::new(None),
734728
lookaround_index: RefCell::new(SmallIndex::ZERO),
735729
}
736730
}
@@ -993,32 +987,11 @@ impl Compiler {
993987

994988
let compiled = self.c_alt_iter(exprs.iter().map(|e| {
995989
let _ = self.start_pattern()?;
996-
let has_lookarounds =
997-
(e.borrow() as &Hir).properties().contains_lookaround_expr();
998-
let mut top_level_alt = if has_lookarounds {
999-
self.add_union()?
1000-
} else {
1001-
StateID::ZERO
1002-
};
1003-
if has_lookarounds {
1004-
let lookaround_prefix =
1005-
self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?;
1006-
let lookaround_alt = self.add_union()?;
1007-
self.patch(lookaround_prefix.end, lookaround_alt)?;
1008-
self.patch(top_level_alt, lookaround_prefix.start)?;
1009-
self.lookaround_alt.borrow_mut().replace(lookaround_alt);
1010-
}
1011990
let one = self.c_cap(0, None, e.borrow())?;
1012991
let match_state_id = self.add_match()?;
1013992
self.patch(one.end, match_state_id)?;
1014-
if has_lookarounds {
1015-
self.patch(top_level_alt, one.start)?;
1016-
} else {
1017-
top_level_alt = one.start;
1018-
}
1019-
let _ = self.finish_pattern(top_level_alt)?;
1020-
self.lookaround_alt.borrow_mut().take();
1021-
Ok(ThompsonRef { start: top_level_alt, end: match_state_id })
993+
let _ = self.finish_pattern(one.start)?;
994+
Ok(ThompsonRef { start: one.start, end: match_state_id })
1022995
}))?;
1023996
self.patch(unanchored_prefix.end, compiled.start)?;
1024997
let nfa = self
@@ -1052,25 +1025,25 @@ impl Compiler {
10521025
&self,
10531026
lookaround: &LookAround,
10541027
) -> Result<ThompsonRef, BuildError> {
1055-
let sub = self.c(lookaround.sub())?;
1056-
let pos = match lookaround {
1057-
LookAround::NegativeLookBehind(_) => false,
1058-
LookAround::PositiveLookBehind(_) => true,
1059-
};
10601028
let idx = *self.lookaround_index.borrow();
10611029
*self.lookaround_index.borrow_mut() = SmallIndex::new(idx.one_more())
10621030
.map_err(|e| {
10631031
BuildError::too_many_lookarounds(e.attempted() as usize)
10641032
})?;
1033+
let pos = match lookaround {
1034+
LookAround::NegativeLookBehind(_) => false,
1035+
LookAround::PositiveLookBehind(_) => true,
1036+
};
10651037
let check = self.add_check_lookaround(idx, pos)?;
1038+
1039+
let unanchored =
1040+
self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?;
1041+
1042+
let sub = self.c(lookaround.sub())?;
10661043
let write = self.add_write_lookaround(idx)?;
1044+
self.patch(unanchored.end, sub.start)?;
10671045
self.patch(sub.end, write)?;
1068-
self.patch(
1069-
self.lookaround_alt
1070-
.borrow()
1071-
.expect("Cannot compile look-around outside pattern"),
1072-
sub.start,
1073-
)?;
1046+
self.builder.borrow_mut().start_look_behind(unanchored.start);
10741047
Ok(ThompsonRef { start: check, end: check })
10751048
}
10761049

@@ -2169,13 +2142,12 @@ mod tests {
21692142
&[
21702143
s_bin_union(2, 1),
21712144
s_range(0, 255, 0),
2172-
s_bin_union(3, 6),
2145+
s_check_lookaround(0, true, 7),
21732146
s_bin_union(5, 4),
21742147
s_range(0, 255, 3),
2175-
s_look(Look::Start, 7),
2176-
s_check_lookaround(0, true, 8),
2148+
s_look(Look::Start, 6),
21772149
s_write_lookaround(0),
2178-
s_byte(b'a', 9),
2150+
s_byte(b'a', 8),
21792151
s_match(0)
21802152
]
21812153
);
@@ -2310,28 +2282,27 @@ mod tests {
23102282
assert_eq!(
23112283
build(r"(?<=a)").states(),
23122284
&[
2313-
s_bin_union(1, 4),
2285+
s_check_lookaround(0, true, 5),
23142286
s_bin_union(3, 2),
23152287
s_range(b'\x00', b'\xFF', 1),
2316-
s_byte(b'a', 5),
2317-
s_check_lookaround(0, true, 6),
2288+
s_byte(b'a', 4),
23182289
s_write_lookaround(0),
23192290
s_match(0)
23202291
]
23212292
);
23222293
assert_eq!(
23232294
build(r"(?<=a(?<!b))").states(),
23242295
&[
2325-
s_bin_union(1, 8),
2296+
s_check_lookaround(0, true, 10),
23262297
s_bin_union(3, 2),
23272298
s_range(b'\x00', b'\xFF', 1),
2328-
s_bin_union(5, 4),
2329-
s_byte(b'a', 6),
2330-
s_byte(b'b', 7),
2331-
s_check_lookaround(0, false, 9),
2332-
s_write_lookaround(0),
2333-
s_check_lookaround(1, true, 10),
2299+
s_byte(b'a', 4),
2300+
s_check_lookaround(1, false, 9),
2301+
s_bin_union(7, 6),
2302+
s_range(b'\x00', b'\xFF', 5),
2303+
s_byte(b'b', 8),
23342304
s_write_lookaround(1),
2305+
s_write_lookaround(0),
23352306
s_match(0)
23362307
]
23372308
);

regex-automata/src/nfa/thompson/nfa.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1270,6 +1270,8 @@ pub(super) struct Inner {
12701270
/// This is needed to initialize the table for storing the result of
12711271
/// look-around evaluation.
12721272
lookaround_count: usize,
1273+
/// Contains the start states for each of the look-behind subexpressions.
1274+
start_look_behind: Vec<StateID>,
12731275
/// Heap memory used indirectly by NFA states and other things (like the
12741276
/// various capturing group representations above). Since each state
12751277
/// might use a different amount of heap, we need to keep track of this
@@ -1419,6 +1421,13 @@ impl Inner {
14191421
self.start_pattern = start_pattern.to_vec();
14201422
}
14211423

1424+
pub(super) fn set_look_behind_starts(
1425+
&mut self,
1426+
look_behind_starts: &[StateID],
1427+
) {
1428+
self.start_look_behind = look_behind_starts.to_vec();
1429+
}
1430+
14221431
/// Sets the UTF-8 mode of this NFA.
14231432
pub(super) fn set_utf8(&mut self, yes: bool) {
14241433
self.utf8 = yes;

0 commit comments

Comments
 (0)