@@ -82,7 +82,7 @@ enum UWordBoundsState {
8282 ExtendNumLet ,
8383 Regional ( RegionalState ) ,
8484 FormatExtend ( FormatExtendType ) ,
85- Zwj ( /* tainted */ bool ) ,
85+ Zwj ,
8686 Emoji ,
8787}
8888
@@ -130,9 +130,12 @@ impl<'a> Iterator for UWordBounds<'a> {
130130 let mut cat = wd:: WC_Any ;
131131 let mut savecat = wd:: WC_Any ;
132132
133+ // Whether or not the previous category was ZWJ
134+ // ZWJs get collapsed, so this handles precedence of WB3c over WB4
135+ let mut prev_zwj;
133136 for ( curr, ch) in self . string . char_indices ( ) {
134137 idx = curr;
135-
138+ prev_zwj = cat == wd :: WC_ZWJ ;
136139 // if there's a category cached, grab it
137140 cat = match self . cat {
138141 None => wd:: word_category ( ch) ,
@@ -141,42 +144,49 @@ impl<'a> Iterator for UWordBounds<'a> {
141144 take_cat = true ;
142145
143146 // handle rule WB4
144- // just skip all format and extend chars
147+ // just skip all format, extend, and zwj chars
145148 // note that Start is a special case: if there's a bunch of Format | Extend
146149 // characters at the beginning of a block of text, dump them out as one unit.
147150 //
148151 // (This is not obvious from the wording of UAX#29, but if you look at the
149152 // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
150153 // then the "correct" interpretation of WB4 becomes apparent.)
151- //
154+ if state != Start {
155+ match cat {
156+ wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => {
157+ continue
158+ }
159+ _ => { }
160+ }
161+ }
162+
163+ // rule WB3c
152164 // WB4 makes all ZWJs collapse into the previous state
153165 // but you can still be in a Zwj state if you started with Zwj
154166 //
155167 // This means that Zwj + Extend will collapse into Zwj, which is wrong,
156168 // since Extend has a boundary with following EBG/GAZ chars but ZWJ doesn't,
157169 // and that rule (WB3c) has higher priority
158170 //
159- // Thus, when in the Zwj state, we track if the last collapsed character is also
160- // a ZWJ. If it isn't, we treat that as a "tainted" zwj, which is basically
161- // of the form ZWJ (Format | Extend | ZWJ)* (Format | Extend).
162- if state != Start {
163- match cat {
164- wd:: WC_Extend | wd:: WC_Format => {
165- if let Zwj ( ref mut taint) = state {
166- * taint = true ;
167- }
168- continue
169- }
170- wd:: WC_ZWJ => {
171- if let Zwj ( ref mut taint) = state {
172- * taint = false ;
173- }
174- continue
175- }
176- _ => { }
171+ // Additionally, Emoji_Base+ZWJ+(EBG/GAZ) will collapse into Emoji_Base+EBG/GAZ
172+ // which won't have a boundary even though EB+ZWJ+GAZ should have a boundary.
173+ //
174+ // Thus, we separately keep track of whether or not the last character
175+ // was a ZWJ. This is an additional bit of state tracked outside of the
176+ // state enum; the state enum represents the last non-zwj state encountered.
177+ // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
178+ // however we are in the previous state for the purposes of all other rules.
179+ if prev_zwj {
180+ match cat {
181+ wd:: WC_Glue_After_Zwj => continue ,
182+ wd:: WC_E_Base_GAZ => {
183+ state = Emoji ;
184+ continue ;
185+ } ,
186+ _ => ( )
177187 }
178188 }
179-
189+ // Don't use `continue` in this match without updating `cat`
180190 state = match state {
181191 Start if cat == wd:: WC_CR => {
182192 idx += match self . get_next_cat ( idx) {
@@ -193,7 +203,7 @@ impl<'a> Iterator for UWordBounds<'a> {
193203 wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a, WB13b
194204 wd:: WC_Regional_Indicator => Regional ( RegionalState :: Half ) , // rule WB13c
195205 wd:: WC_LF | wd:: WC_Newline => break , // rule WB3a
196- wd:: WC_ZWJ => Zwj ( false ) , // rule WB3c
206+ wd:: WC_ZWJ => Zwj , // rule WB3c
197207 wd:: WC_E_Base | wd:: WC_E_Base_GAZ => Emoji , // rule WB14
198208 _ => {
199209 if let Some ( ncat) = self . get_next_cat ( idx) { // rule WB4
@@ -206,21 +216,13 @@ impl<'a> Iterator for UWordBounds<'a> {
206216 break ; // rule WB999
207217 }
208218 } ,
209- Zwj ( true ) => {
219+ Zwj => {
220+ // We already handle WB3c above. At this point,
221+ // the current category is not GAZ or EBG,
222+ // or the previous character was not actually a ZWJ
210223 take_curr = false ;
211- break
224+ break ;
212225 }
213- Zwj ( false ) => match cat { // rule WB3c
214- wd:: WC_Glue_After_Zwj => continue ,
215- wd:: WC_E_Base_GAZ => {
216- state = Emoji ;
217- continue ;
218- } ,
219- _ => {
220- take_curr = false ;
221- break ;
222- }
223- } ,
224226 Letter | HLetter => match cat {
225227 wd:: WC_ALetter => Letter , // rule WB5
226228 wd:: WC_Hebrew_Letter => HLetter , // rule WB5
@@ -294,7 +296,7 @@ impl<'a> Iterator for UWordBounds<'a> {
294296 } ,
295297 Regional ( _) => unreachable ! ( "RegionalState::Unknown should not occur on forward iteration" ) ,
296298 Emoji => match cat { // rule WB14
297- wd:: WC_E_Modifier => continue ,
299+ wd:: WC_E_Modifier => state ,
298300 _ => {
299301 take_curr = false ;
300302 break ;
@@ -358,6 +360,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
358360 let mut state = Start ;
359361 let mut savestate = Start ;
360362 let mut cat = wd:: WC_Any ;
363+
361364 for ( curr, ch) in self . string . char_indices ( ) . rev ( ) {
362365 previdx = idx;
363366 idx = curr;
@@ -375,9 +378,11 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
375378 // Hebrew Letter immediately before it.
376379 // (2) Format and Extend char handling takes some gymnastics.
377380
378- if cat == wd:: WC_Extend || cat == wd:: WC_Format || cat == wd:: WC_ZWJ {
381+ if cat == wd:: WC_Extend
382+ || cat == wd:: WC_Format
383+ || ( cat == wd:: WC_ZWJ && state != Zwj ) { // WB3c has higher priority, so we should not
384+ // fold in that case
379385 if match state {
380- Zwj ( _) if cat == wd:: WC_ZWJ => false ,
381386 FormatExtend ( _) | Start => false ,
382387 _ => true
383388 } {
@@ -396,6 +401,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
396401 take_cat = false ;
397402 }
398403
404+ // Don't use `continue` in this match without updating `catb`
399405 state = match state {
400406 Start | FormatExtend ( AcceptAny ) => match cat {
401407 wd:: WC_ALetter => Letter , // rule WB5, WB7, WB10, WB13b
@@ -404,7 +410,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
404410 wd:: WC_Katakana => Katakana , // rule WB13, WB13b
405411 wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a
406412 wd:: WC_Regional_Indicator => Regional ( RegionalState :: Unknown ) , // rule WB13c
407- wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => Zwj ( false ) , // rule WB3c
413+ wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => Zwj , // rule WB3c
408414 // rule WB4:
409415 wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => FormatExtend ( AcceptAny ) ,
410416 wd:: WC_Single_Quote => {
@@ -427,8 +433,10 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
427433 } ,
428434 _ => break // rule WB999
429435 } ,
430- Zwj ( _) => match cat { // rule WB3c
431- wd:: WC_ZWJ => continue ,
436+ Zwj => match cat { // rule WB3c
437+ wd:: WC_ZWJ => {
438+ FormatExtend ( AcceptAny )
439+ }
432440 _ => {
433441 take_curr = false ;
434442 break ;
@@ -515,8 +523,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
515523 } ,
516524 Emoji => match cat { // rule WB14
517525 wd:: WC_E_Base | wd:: WC_E_Base_GAZ => {
518- state = Zwj ( false ) ;
519- continue
526+ Zwj
520527 } ,
521528 _ => {
522529 take_curr = false ;
0 commit comments