Skip to content

Commit 5cc52ea

Browse files
Multimodcrafter and shilangyu
authored and committed
Implement look-behind state processing
1 parent c2cebbb commit 5cc52ea

File tree

3 files changed

+113
-0
lines changed

3 files changed

+113
-0
lines changed

regex-automata/src/nfa/thompson/builder.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,7 @@ impl Builder {
387387
self.pattern_id = None;
388388
self.states.clear();
389389
self.start_pattern.clear();
390+
self.start_look_behind.clear();
390391
self.captures.clear();
391392
self.memory_states = 0;
392393
}

regex-automata/src/nfa/thompson/nfa.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1106,6 +1106,12 @@ impl NFA {
11061106
self.0.lookaround_count
11071107
}
11081108

1109+
/// Returns the start states of the look-behind sub-expressions in this NFA.
///
/// The PikeVM computes an epsilon closure from each of these states to seed
/// its look-behind threads before the main search loop runs.
1110+
#[inline]
1111+
pub fn look_behind_starts(&self) -> &Vec<StateID> {
1112+
&self.0.start_look_behind
1113+
}
1114+
11091115
// FIXME: The `look_set_prefix_all` computation was not correct, and it
11101116
// seemed a little tricky to fix it. Since I wasn't actually using it for
11111117
// anything, I just decided to remove it in the run up to the regex 1.9
@@ -1481,6 +1487,9 @@ impl Inner {
14811487
for id in self.start_pattern.iter_mut() {
14821488
*id = old_to_new[*id];
14831489
}
1490+
for id in self.start_look_behind.iter_mut() {
1491+
*id = old_to_new[*id];
1492+
}
14841493
}
14851494
}
14861495

regex-automata/src/nfa/thompson/pikevm.rs

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1263,7 +1263,46 @@ impl PikeVM {
12631263
ref mut curr,
12641264
ref mut next,
12651265
ref mut lookaround,
1266+
ref mut curr_lookaround,
1267+
ref mut next_lookaround,
12661268
} = cache;
1269+
1270+
// This initializes the look-behind threads from the start of the input.
1271+
// Note: since capture groups are not allowed inside look-behinds,
1272+
// there won't be any Capture epsilon transitions and hence it is ok to
1273+
// use &mut [] for the slots parameter. We need to add the start states
1274+
// in reverse because nested look-behinds have a higher index but must
1275+
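// be executed first. NOTE(review): the loop below iterates in forward
// order; confirm that `look_behind_starts()` is already stored reversed.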
1276+
for look_behind_start in self.nfa.look_behind_starts() {
1277+
self.epsilon_closure(
1278+
stack,
1279+
&mut [],
1280+
curr_lookaround,
1281+
lookaround,
1282+
input,
1283+
0,
1284+
*look_behind_start,
1285+
);
1286+
}
1287+
1288+
// This brings the look-behind threads into the state they must be for
1289+
// starting at input.start() instead of the beginning. This is
1290+
// necessary for lookbehinds to be able to match outside of the input
1291+
// span.
1292+
for lb_at in 0..input.start() {
1293+
self.nexts(
1294+
stack,
1295+
curr_lookaround,
1296+
next_lookaround,
1297+
lookaround,
1298+
input,
1299+
lb_at,
1300+
&mut [],
1301+
);
1302+
core::mem::swap(curr_lookaround, next_lookaround);
1303+
next_lookaround.set.clear();
1304+
}
1305+
12671306
let mut hm = None;
12681307
// Yes, our search doesn't end at input.end(), but includes it. This
12691308
// is necessary because matches are delayed by one byte, just like
@@ -1374,6 +1413,17 @@ impl PikeVM {
13741413
stack, slots, curr, lookaround, input, at, start_id,
13751414
);
13761415
}
1416+
// The lookbehind states must be processed first, since their
1417+
// result must be available for the processing of the main states.
1418+
self.nexts(
1419+
stack,
1420+
curr_lookaround,
1421+
next_lookaround,
1422+
lookaround,
1423+
input,
1424+
at,
1425+
&mut [],
1426+
);
13771427
if let Some(pid) =
13781428
self.nexts(stack, curr, next, lookaround, input, at, slots)
13791429
{
@@ -1387,7 +1437,9 @@ impl PikeVM {
13871437
break;
13881438
}
13891439
core::mem::swap(curr, next);
1440+
core::mem::swap(curr_lookaround, next_lookaround);
13901441
next.set.clear();
1442+
next_lookaround.set.clear();
13911443
at += 1;
13921444
}
13931445
instrument!(|c| c.eprint(&self.nfa));
@@ -1442,7 +1494,34 @@ impl PikeVM {
14421494
ref mut curr,
14431495
ref mut next,
14441496
ref mut lookaround,
1497+
ref mut curr_lookaround,
1498+
ref mut next_lookaround,
14451499
} = cache;
1500+
1501+
for look_behind_start in self.nfa.look_behind_starts() {
1502+
self.epsilon_closure(
1503+
stack,
1504+
&mut [],
1505+
curr_lookaround,
1506+
lookaround,
1507+
input,
1508+
0,
1509+
*look_behind_start,
1510+
);
1511+
}
1512+
for lb_at in 0..input.start() {
1513+
self.nexts(
1514+
stack,
1515+
curr_lookaround,
1516+
next_lookaround,
1517+
lookaround,
1518+
input,
1519+
lb_at,
1520+
&mut [],
1521+
);
1522+
core::mem::swap(curr_lookaround, next_lookaround);
1523+
next_lookaround.set.clear();
1524+
}
14461525
for at in input.start()..=input.end() {
14471526
let any_matches = !patset.is_empty();
14481527
if curr.set.is_empty() {
@@ -1459,6 +1538,15 @@ impl PikeVM {
14591538
stack, slots, curr, lookaround, input, at, start_id,
14601539
);
14611540
}
1541+
self.nexts(
1542+
stack,
1543+
curr_lookaround,
1544+
next_lookaround,
1545+
lookaround,
1546+
input,
1547+
at,
1548+
&mut [],
1549+
);
14621550
self.nexts_overlapping(
14631551
stack, curr, next, lookaround, input, at, patset,
14641552
);
@@ -1470,7 +1558,9 @@ impl PikeVM {
14701558
break;
14711559
}
14721560
core::mem::swap(curr, next);
1561+
core::mem::swap(curr_lookaround, next_lookaround);
14731562
next.set.clear();
1563+
next_lookaround.set.clear();
14741564
}
14751565
instrument!(|c| c.eprint(&self.nfa));
14761566
}
@@ -1976,6 +2066,10 @@ pub struct Cache {
19762066
/// haystack at which look-around indexed x holds and which is <= to the
19772067
/// current position".
19782068
lookaround: Vec<Option<NonMaxUsize>>,
2069+
/// The current active states for look-behind subexpressions.
2070+
curr_lookaround: ActiveStates,
2071+
/// The next set of states to be explored for look-behind subexpressions.
2072+
next_lookaround: ActiveStates,
19792073
}
19802074

19812075
impl Cache {
@@ -1993,6 +2087,8 @@ impl Cache {
19932087
curr: ActiveStates::new(re),
19942088
next: ActiveStates::new(re),
19952089
lookaround: vec![None; re.lookaround_count()],
2090+
curr_lookaround: ActiveStates::new(re),
2091+
next_lookaround: ActiveStates::new(re),
19962092
}
19972093
}
19982094

@@ -2036,6 +2132,9 @@ impl Cache {
20362132
pub fn reset(&mut self, re: &PikeVM) {
20372133
self.curr.reset(re);
20382134
self.next.reset(re);
2135+
self.curr_lookaround.reset(re);
2136+
self.next_lookaround.reset(re);
2137+
self.lookaround = vec![None; re.lookaround_count()];
20392138
}
20402139

20412140
/// Returns the heap memory usage, in bytes, of this cache.
@@ -2063,6 +2162,10 @@ impl Cache {
20632162
self.stack.clear();
20642163
self.curr.setup_search(captures_slot_len);
20652164
self.next.setup_search(captures_slot_len);
2165+
// Capture groups are not allowed inside look-arounds, so we
2166+
// set the slot-length to zero.
2167+
self.curr_lookaround.setup_search(0);
2168+
self.next_lookaround.setup_search(0);
20662169
}
20672170
}
20682171

0 commit comments

Comments
 (0)