@@ -30,7 +30,7 @@ pub struct Config {
3030 reverse : Option < bool > ,
3131 nfa_size_limit : Option < Option < usize > > ,
3232 shrink : Option < bool > ,
33- captures : Option < bool > ,
33+ which_captures : Option < WhichCaptures > ,
3434 look_matcher : Option < LookMatcher > ,
3535 #[ cfg( test) ]
3636 unanchored_prefix : Option < bool > ,
@@ -178,12 +178,15 @@ impl Config {
178178 /// ```
179179 /// use regex_automata::{
180180 /// dfa::{self, Automaton},
181- /// nfa::thompson::NFA,
181+ /// nfa::thompson::{ NFA, WhichCaptures} ,
182182 /// HalfMatch, Input,
183183 /// };
184184 ///
185185 /// let dfa = dfa::dense::Builder::new()
186- /// .thompson(NFA::config().captures(false).reverse(true))
186+ /// .thompson(NFA::config()
187+ /// .which_captures(WhichCaptures::None)
188+ /// .reverse(true)
189+ /// )
187190 /// .build("baz[0-9]+")?;
188191 /// let expected = Some(HalfMatch::must(0, 3));
189192 /// assert_eq!(
@@ -277,10 +280,12 @@ impl Config {
277280 ///
278281 /// ```
279282 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
280- /// use regex_automata::nfa::thompson::NFA;
283+ /// use regex_automata::nfa::thompson::{ NFA, WhichCaptures} ;
281284 ///
282285 /// // Currently we have to disable captures when enabling reverse NFA.
283- /// let config = NFA::config().captures(false).reverse(true);
286+ /// let config = NFA::config()
287+ /// .which_captures(WhichCaptures::None)
288+ /// .reverse(true);
284289 /// let not_shrunk = NFA::compiler()
285290 /// .configure(config.clone().shrink(false))
286291 /// .build(r"\w")?;
@@ -314,18 +319,70 @@ impl Config {
314319 /// require capturing groups to be present in the NFA. Building a Pike VM
315320 /// with an NFA without capturing groups will result in an error.
316321 ///
322+ /// (Note that since this method is deprecated, the example below uses
323+ /// [`Config::which_captures`] to disable capture states.)
324+ ///
317325 /// ```
318- /// use regex_automata::nfa::thompson::{pikevm::PikeVM, NFA};
326+ /// use regex_automata::nfa::thompson::{
327+ /// pikevm::PikeVM,
328+ /// NFA,
329+ /// WhichCaptures,
330+ /// };
319331 ///
320332 /// let nfa = NFA::compiler()
321- /// .configure(NFA::config().captures(false ))
333+ /// .configure(NFA::config().which_captures(WhichCaptures::None ))
322334 /// .build(r"[a-z]+")?;
323335 /// assert!(PikeVM::new_from_nfa(nfa).is_err());
324336 ///
325337 /// # Ok::<(), Box<dyn std::error::Error>>(())
326338 /// ```
327- pub fn captures ( mut self , yes : bool ) -> Config {
328- self . captures = Some ( yes) ;
339+ #[ deprecated( since = "0.3.5" , note = "use which_captures instead" ) ]
340+ pub fn captures ( self , yes : bool ) -> Config {
341+ self . which_captures ( if yes {
342+ WhichCaptures :: All
343+ } else {
344+ WhichCaptures :: None
345+ } )
346+ }
347+
348+ /// Configures what kinds of capture groups are compiled into
349+ /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a
350+ /// Thompson NFA.
351+ ///
352+ /// Currently, using any option except for [`WhichCaptures::None`] requires
353+ /// disabling the [`reverse`](Config::reverse) setting. If both are
354+ /// enabled, then the compiler will return an error. It is expected that
355+ /// this limitation will be lifted in the future.
356+ ///
357+ /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
358+ /// use [`WhichCaptures::Implicit`] in cases where one wants avoid the
359+ /// overhead of capture states for explicit groups. Usually this occurs
360+ /// when one wants to use the `PikeVM` only for determining the overall
361+ /// match. Otherwise, the `PikeVM` could use much more memory than is
362+ /// necessary.
363+ ///
364+ /// # Example
365+ ///
366+ /// This example demonstrates that some regex engines, like the Pike VM,
367+ /// require capturing groups to be present in the NFA. Building a Pike VM
368+ /// with an NFA without capturing groups will result in an error.
369+ ///
370+ /// ```
371+ /// use regex_automata::nfa::thompson::{
372+ /// pikevm::PikeVM,
373+ /// NFA,
374+ /// WhichCaptures,
375+ /// };
376+ ///
377+ /// let nfa = NFA::compiler()
378+ /// .configure(NFA::config().which_captures(WhichCaptures::None))
379+ /// .build(r"[a-z]+")?;
380+ /// assert!(PikeVM::new_from_nfa(nfa).is_err());
381+ ///
382+ /// # Ok::<(), Box<dyn std::error::Error>>(())
383+ /// ```
384+ pub fn which_captures ( mut self , which_captures : WhichCaptures ) -> Config {
385+ self . which_captures = Some ( which_captures) ;
329386 self
330387 }
331388
@@ -405,8 +462,14 @@ impl Config {
405462 }
406463
407464 /// Return whether NFA compilation is configured to produce capture states.
465+ #[ deprecated( since = "0.3.5" , note = "use get_which_captures instead" ) ]
408466 pub fn get_captures ( & self ) -> bool {
409- self . captures . unwrap_or ( true )
467+ self . get_which_captures ( ) . is_any ( )
468+ }
469+
470+ /// Return what kinds of capture states will be compiled into an NFA.
471+ pub fn get_which_captures ( & self ) -> WhichCaptures {
472+ self . which_captures . unwrap_or ( WhichCaptures :: All )
410473 }
411474
412475 /// Return the look-around matcher for this NFA.
@@ -439,14 +502,65 @@ impl Config {
439502 reverse : o. reverse . or ( self . reverse ) ,
440503 nfa_size_limit : o. nfa_size_limit . or ( self . nfa_size_limit ) ,
441504 shrink : o. shrink . or ( self . shrink ) ,
442- captures : o. captures . or ( self . captures ) ,
505+ which_captures : o. which_captures . or ( self . which_captures ) ,
443506 look_matcher : o. look_matcher . or_else ( || self . look_matcher . clone ( ) ) ,
444507 #[ cfg( test) ]
445508 unanchored_prefix : o. unanchored_prefix . or ( self . unanchored_prefix ) ,
446509 }
447510 }
448511}
449512
/// A configuration indicating which kinds of
/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include.
///
/// This configuration can be used with [`Config::which_captures`] to control
/// which capture states are compiled into a Thompson NFA.
///
/// The default configuration is [`WhichCaptures::All`].
///
/// Values can be compared with `==`, which makes it straightforward to
/// inspect a configuration without resorting to pattern matching.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum WhichCaptures {
    /// All capture states, including those corresponding to both implicit and
    /// explicit capture groups, are included in the Thompson NFA.
    All,
    /// Only capture states corresponding to implicit capture groups are
    /// included. Implicit capture groups appear in every pattern implicitly
    /// and correspond to the overall match of a pattern.
    ///
    /// This is useful when one only cares about the overall match of a
    /// pattern. By excluding capture states from explicit capture groups,
    /// one might be able to reduce the memory usage of a multi-pattern regex
    /// substantially if it was otherwise written to have many explicit capture
    /// groups.
    Implicit,
    /// No capture states are compiled into the Thompson NFA.
    ///
    /// This is useful when capture states are either not needed (for example,
    /// when one is only trying to build a DFA) or not supported (for example,
    /// in a reverse NFA).
    None,
}
542+
543+ impl Default for WhichCaptures {
544+ fn default ( ) -> WhichCaptures {
545+ WhichCaptures :: All
546+ }
547+ }
548+
549+ impl WhichCaptures {
550+ /// Returns true if this configuration indicates that no capture states
551+ /// should be produced in an NFA.
552+ pub fn is_none ( & self ) -> bool {
553+ matches ! ( * self , WhichCaptures :: None )
554+ }
555+
556+ /// Returns true if this configuration indicates that some capture states
557+ /// should be added to an NFA. Note that this might only include capture
558+ /// states for implicit capture groups.
559+ pub fn is_any ( & self ) -> bool {
560+ !self . is_none ( )
561+ }
562+ }
563+
450564/*
451565This compiler below uses Thompson's construction algorithm. The compiler takes
452566a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph
@@ -800,7 +914,9 @@ impl Compiler {
800914 if exprs. len ( ) > PatternID :: LIMIT {
801915 return Err ( BuildError :: too_many_patterns ( exprs. len ( ) ) ) ;
802916 }
803- if self . config . get_reverse ( ) && self . config . get_captures ( ) {
917+ if self . config . get_reverse ( )
918+ && self . config . get_which_captures ( ) . is_any ( )
919+ {
804920 return Err ( BuildError :: unsupported_captures ( ) ) ;
805921 }
806922
@@ -978,7 +1094,7 @@ impl Compiler {
9781094 name : Option < & str > ,
9791095 expr : & Hir ,
9801096 ) -> Result < ThompsonRef , BuildError > {
981- if ! self . config . get_captures ( ) {
1097+ if self . config . get_which_captures ( ) . is_none ( ) {
9821098 return self . c ( expr) ;
9831099 }
9841100
@@ -1728,9 +1844,15 @@ mod tests {
17281844 util:: primitives:: { PatternID , StateID } ,
17291845 } ;
17301846
1847+ use super :: * ;
1848+
17311849 fn build ( pattern : & str ) -> NFA {
17321850 NFA :: compiler ( )
1733- . configure ( NFA :: config ( ) . captures ( false ) . unanchored_prefix ( false ) )
1851+ . configure (
1852+ NFA :: config ( )
1853+ . which_captures ( WhichCaptures :: None )
1854+ . unanchored_prefix ( false ) ,
1855+ )
17341856 . build ( pattern)
17351857 . unwrap ( )
17361858 }
@@ -1794,7 +1916,7 @@ mod tests {
17941916 #[ test]
17951917 fn compile_unanchored_prefix ( ) {
17961918 let nfa = NFA :: compiler ( )
1797- . configure ( NFA :: config ( ) . captures ( false ) )
1919+ . configure ( NFA :: config ( ) . which_captures ( WhichCaptures :: None ) )
17981920 . build ( r"a" )
17991921 . unwrap ( ) ;
18001922 assert_eq ! (
@@ -1827,7 +1949,11 @@ mod tests {
18271949
18281950 // Check that non-UTF-8 literals work.
18291951 let nfa = NFA :: compiler ( )
1830- . configure ( NFA :: config ( ) . captures ( false ) . unanchored_prefix ( false ) )
1952+ . configure (
1953+ NFA :: config ( )
1954+ . which_captures ( WhichCaptures :: None )
1955+ . unanchored_prefix ( false ) ,
1956+ )
18311957 . syntax ( crate :: util:: syntax:: Config :: new ( ) . utf8 ( false ) )
18321958 . build ( r"(?-u)\xFF" )
18331959 . unwrap ( ) ;
@@ -1937,7 +2063,7 @@ mod tests {
19372063 let nfa = NFA :: compiler ( )
19382064 . configure (
19392065 NFA :: config ( )
1940- . captures ( false )
2066+ . which_captures ( WhichCaptures :: None )
19412067 . reverse ( true )
19422068 . shrink ( false )
19432069 . unanchored_prefix ( false ) ,
@@ -1965,7 +2091,11 @@ mod tests {
19652091 #[ test]
19662092 fn compile_many_start_pattern ( ) {
19672093 let nfa = NFA :: compiler ( )
1968- . configure ( NFA :: config ( ) . captures ( false ) . unanchored_prefix ( false ) )
2094+ . configure (
2095+ NFA :: config ( )
2096+ . which_captures ( WhichCaptures :: None )
2097+ . unanchored_prefix ( false ) ,
2098+ )
19692099 . build_many ( & [ "a" , "b" ] )
19702100 . unwrap ( ) ;
19712101 assert_eq ! (
@@ -1993,7 +2123,9 @@ mod tests {
19932123 use regex_syntax:: hir:: { Class , ClassBytes , Hir } ;
19942124
19952125 let hir = Hir :: class ( Class :: Bytes ( ClassBytes :: new ( vec ! [ ] ) ) ) ;
1996- let config = NFA :: config ( ) . captures ( false ) . unanchored_prefix ( false ) ;
2126+ let config = NFA :: config ( )
2127+ . which_captures ( WhichCaptures :: None )
2128+ . unanchored_prefix ( false ) ;
19972129 let nfa =
19982130 NFA :: compiler ( ) . configure ( config) . build_from_hir ( & hir) . unwrap ( ) ;
19992131 assert_eq ! ( nfa. states( ) , & [ s_fail( ) , s_match( 0 ) ] ) ;
@@ -2005,7 +2137,9 @@ mod tests {
20052137 use regex_syntax:: hir:: { Class , ClassUnicode , Hir } ;
20062138
20072139 let hir = Hir :: class ( Class :: Unicode ( ClassUnicode :: new ( vec ! [ ] ) ) ) ;
2008- let config = NFA :: config ( ) . captures ( false ) . unanchored_prefix ( false ) ;
2140+ let config = NFA :: config ( )
2141+ . which_captures ( WhichCaptures :: None )
2142+ . unanchored_prefix ( false ) ;
20092143 let nfa =
20102144 NFA :: compiler ( ) . configure ( config) . build_from_hir ( & hir) . unwrap ( ) ;
20112145 assert_eq ! ( nfa. states( ) , & [ s_fail( ) , s_match( 0 ) ] ) ;
0 commit comments