@@ -3,7 +3,7 @@ use std::error::Error;
33use regex_automata:: {
44 dfa:: { dense, Automaton , OverlappingState } ,
55 nfa:: thompson,
6- HalfMatch , Input , MatchError ,
6+ Anchored , HalfMatch , Input , MatchError ,
77} ;
88
99// Tests that quit bytes in the forward direction work correctly.
@@ -67,3 +67,94 @@ fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
6767 assert_eq ! ( Ok ( Some ( expected) ) , dfa. try_search_fwd( & Input :: new( b" a" ) ) ) ;
6868 Ok ( ( ) )
6969}
70+
// Mirrors the doctest on [`Automaton::is_special_state`], except that the
// search is seeded from the DFA's universal start state.
#[test]
fn universal_start_search() -> Result<(), Box<dyn Error>> {
    // Drives `dfa` over `haystack` one byte at a time, beginning at the
    // unanchored universal start state, and reports the last match seen
    // (if any) or a quit error.
    fn find<A: Automaton>(
        dfa: &A,
        haystack: &[u8],
    ) -> Result<Option<HalfMatch>, MatchError> {
        let start = dfa.universal_start_state(Anchored::No);
        let mut sid = start.expect("regex should not require lookbehind");
        assert!(dfa.is_start_state(sid));

        let mut found = None;
        for (at, &byte) in haystack.iter().enumerate() {
            sid = dfa.next_state(sid, byte);
            // Only special states need any attention. Start and accel
            // states are special too, and an implementation may treat
            // them differently for speed, but correctness does not
            // depend on it, so they simply fall through below.
            if !dfa.is_special_state(sid) {
                continue;
            }
            if dfa.is_match_state(sid) {
                let pid = dfa.match_pattern(sid, 0);
                found = Some(HalfMatch::new(pid, at));
            } else if dfa.is_dead_state(sid) {
                // A dead state never transitions anywhere else, so
                // whatever has been recorded so far is the answer.
                return Ok(found);
            } else if dfa.is_quit_state(sid) {
                // A quit state means the search failed, but it can be
                // reached after a match was already observed. Prefer
                // reporting that match over reporting the error.
                return match found {
                    Some(_) => Ok(found),
                    None => Err(MatchError::quit(byte, at)),
                };
            }
        }

        // Matches are reported one byte late, so the special end-of-input
        // transition has to be taken explicitly once the haystack is
        // exhausted.
        sid = dfa.next_eoi_state(sid);
        if dfa.is_match_state(sid) {
            let pid = dfa.match_pattern(sid, 0);
            found = Some(HalfMatch::new(pid, haystack.len()));
        }
        Ok(found)
    }

    // Runs `find` against a single automaton and asserts that it reports
    // a match for pattern `pat` ending at `offset`.
    fn check_impl(
        dfa: impl Automaton,
        haystack: &str,
        pat: usize,
        offset: usize,
    ) -> Result<(), Box<dyn Error>> {
        let found = find(&dfa, haystack.as_bytes())?;
        let half = found.unwrap();
        assert_eq!(half.pattern().as_usize(), pat);
        assert_eq!(half.offset(), offset);
        Ok(())
    }

    // Applies the same expectation to the dense DFA and then to the sparse
    // DFA converted from it.
    fn check(
        dfa: &dense::DFA<Vec<u32>>,
        haystack: &str,
        pat: usize,
        offset: usize,
    ) -> Result<(), Box<dyn Error>> {
        check_impl(dfa, haystack, pat, offset)?;
        let sparse = dfa.to_sparse()?;
        check_impl(sparse, haystack, pat, offset)
    }

    // Every case below searches this same text (or a suffix of it).
    let haystack = "123 foobar 4567";

    let letters = dense::DFA::new(r"[a-z]+")?;
    check(&letters, haystack, 0, 10)?;

    let four_digits = dense::DFA::new(r"[0-9]{4}")?;
    check(&four_digits, haystack, 0, 15)?;

    let either = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
    check(&either, haystack, 1, 3)?;
    check(&either, &haystack[3..], 0, 7)?;
    check(&either, &haystack[10..], 1, 5)?;

    Ok(())
}
0 commit comments