@@ -3,7 +3,7 @@ use std::error::Error;
33use regex_automata:: {
44 dfa:: { dense, Automaton , OverlappingState } ,
55 nfa:: thompson,
6- HalfMatch , Input , MatchError ,
6+ Anchored , HalfMatch , Input , MatchError ,
77} ;
88
99// Tests that quit bytes in the forward direction work correctly.
@@ -67,3 +67,96 @@ fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
6767 assert_eq ! ( Ok ( Some ( expected) ) , dfa. try_search_fwd( & Input :: new( b" a" ) ) ) ;
6868 Ok ( ( ) )
6969}
70+
// Mirrors the doctest on [`Automaton::is_special_state`], except that the
// hand-rolled search begins from a universal start state instead of a start
// state computed from the haystack.
//
// See: https://github.com/rust-lang/regex/pull/1195
#[test]
fn universal_start_search() -> Result<(), Box<dyn Error>> {
    // A minimal leftmost search loop driven directly through the
    // `Automaton` transition API. Returns the last match observed, if any,
    // or an error when a quit state is hit before any match was seen.
    fn find<A: Automaton>(
        dfa: &A,
        haystack: &[u8],
    ) -> Result<Option<HalfMatch>, MatchError> {
        let mut sid = dfa
            .universal_start_state(Anchored::No)
            .expect("regex should not require lookbehind");
        let mut found: Option<HalfMatch> = None;
        // Feed every haystack byte to the automaton. Only "special" states
        // need any extra inspection; everything else just keeps walking.
        for (at, &byte) in haystack.iter().enumerate() {
            sid = dfa.next_state(sid, byte);
            if !dfa.is_special_state(sid) {
                continue;
            }
            if dfa.is_match_state(sid) {
                found = Some(HalfMatch::new(dfa.match_pattern(sid, 0), at));
                continue;
            }
            if dfa.is_dead_state(sid) {
                // A dead state never transitions anywhere else, so
                // whatever has been found so far is the final answer.
                return Ok(found);
            }
            if dfa.is_quit_state(sid) {
                // A quit state means the search failed, but a match seen
                // before the quit still wins over reporting an error.
                return found
                    .ok_or_else(|| MatchError::quit(byte, at))
                    .map(Some);
            }
            // Start and accel states could be special-cased here for
            // speed, but correctness does not depend on it.
        }
        // Matches are reported one byte late, so the special end-of-input
        // transition has to be taken explicitly once the haystack is
        // exhausted.
        let eoi = dfa.next_eoi_state(sid);
        if dfa.is_match_state(eoi) {
            let pid = dfa.match_pattern(eoi, 0);
            found = Some(HalfMatch::new(pid, haystack.len()));
        }
        Ok(found)
    }

    // Runs `find` on one automaton and asserts the pattern index and end
    // offset of the resulting match. Panics if no match is found.
    fn check_impl(
        dfa: impl Automaton,
        haystack: &str,
        pat: usize,
        offset: usize,
    ) -> Result<(), Box<dyn Error>> {
        let got = find(&dfa, haystack.as_bytes())?.unwrap();
        assert_eq!(got.pattern().as_usize(), pat);
        assert_eq!(got.offset(), offset);
        Ok(())
    }

    // Verifies the expectation against the dense DFA first and then
    // against the sparse DFA derived from it.
    fn check(
        dfa: &dense::DFA<Vec<u32>>,
        haystack: &str,
        pat: usize,
        offset: usize,
    ) -> Result<(), Box<dyn Error>> {
        check_impl(dfa, haystack, pat, offset)?;
        let sparse = dfa.to_sparse()?;
        check_impl(sparse, haystack, pat, offset)
    }

    // Every case below searches the same text (or a suffix of it).
    let haystack = "123 foobar 4567";

    let letters = dense::DFA::new(r"[a-z]+")?;
    check(&letters, haystack, 0, 10)?;

    let four_digits = dense::DFA::new(r"[0-9]{4}")?;
    check(&four_digits, haystack, 0, 15)?;

    let multi = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
    check(&multi, haystack, 1, 3)?;
    check(&multi, &haystack[3..], 0, 7)?;
    check(&multi, &haystack[10..], 1, 5)?;

    Ok(())
}
0 commit comments