@@ -72,7 +72,7 @@ impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
7272}
7373
7474// state machine for word boundary rules
75- #[ derive( Clone , Copy , PartialEq , Eq ) ]
75+ #[ derive( Clone , Copy , PartialEq , Eq , Debug ) ]
7676enum UWordBoundsState {
7777 Start ,
7878 Letter ,
@@ -82,12 +82,12 @@ enum UWordBoundsState {
8282 ExtendNumLet ,
8383 Regional ,
8484 FormatExtend ( FormatExtendType ) ,
85- Zwj ,
85+ Zwj ( bool ) ,
8686 Emoji ,
8787}
8888
8989// subtypes for FormatExtend state in UWordBoundsState
90- #[ derive( Clone , Copy , PartialEq , Eq ) ]
90+ #[ derive( Clone , Copy , PartialEq , Eq , Debug ) ]
9191enum FormatExtendType {
9292 AcceptAny ,
9393 AcceptNone ,
@@ -122,6 +122,7 @@ impl<'a> Iterator for UWordBounds<'a> {
122122 let mut state = Start ;
123123 let mut cat = wd:: WC_Any ;
124124 let mut savecat = wd:: WC_Any ;
125+
125126 for ( curr, ch) in self . string . char_indices ( ) {
126127 idx = curr;
127128
@@ -140,9 +141,31 @@ impl<'a> Iterator for UWordBounds<'a> {
140141 // (This is not obvious from the wording of UAX#29, but if you look at the
141142 // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
142143 // then the "correct" interpretation of WB4 becomes apparent.)
144+ //
145+ // WB4 makes every ZWJ collapse into the previous state,
146+ // but you can still be in the Zwj state if the sequence started with a ZWJ.
147+ //
148+ // This means that Zwj + Extend would collapse into Zwj, which is wrong:
149+ // Extend has a boundary with a following E_Base_GAZ/Glue_After_Zwj char but ZWJ doesn't,
150+ // and that rule (WB3c) has higher priority than WB4.
151+ //
152+ // Thus, when in the Zwj state, we track whether the last collapsed character is
153+ // itself a ZWJ. If it isn't, the state is a "tainted" Zwj, i.e. a sequence of the
154+ // form ZWJ (Format | Extend | ZWJ)* (Format | Extend), which always breaks before the next char.
143155 if state != Start {
144156 match cat {
145- wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => continue ,
157+ wd:: WC_Extend | wd:: WC_Format => {
158+ if let Zwj ( ref mut taint) = state {
159+ * taint = true ;
160+ }
161+ continue
162+ }
163+ wd:: WC_ZWJ => {
164+ if let Zwj ( ref mut taint) = state {
165+ * taint = false ;
166+ }
167+ continue
168+ }
146169 _ => { }
147170 }
148171 }
@@ -163,7 +186,7 @@ impl<'a> Iterator for UWordBounds<'a> {
163186 wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a, WB13b
164187 wd:: WC_Regional_Indicator => Regional , // rule WB13c
165188 wd:: WC_LF | wd:: WC_Newline => break , // rule WB3a
166- wd:: WC_ZWJ => Zwj , // rule WB3c
189+ wd:: WC_ZWJ => Zwj ( false ) , // rule WB3c
167190 wd:: WC_E_Base | wd:: WC_E_Base_GAZ => Emoji , // rule WB14
168191 _ => {
169192 if let Some ( ncat) = self . get_next_cat ( idx) { // rule WB4
@@ -176,8 +199,14 @@ impl<'a> Iterator for UWordBounds<'a> {
176199 break ; // rule WB999
177200 }
178201 } ,
179- Zwj => match cat { // rule WB3c
180- wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => continue ,
202+ Zwj ( true ) => {
203+ take_curr = false ;
204+ break
205+ }
206+ Zwj ( false ) => match cat { // rule WB3c
207+ wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => {
208+ continue ;
209+ } ,
181210 _ => {
182211 take_curr = false ;
183212 break ;
@@ -329,7 +358,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
329358 // Hebrew Letter immediately before it.
330359 // (2) Format and Extend char handling takes some gymnastics.
331360
332- if cat == wd:: WC_Extend || cat == wd:: WC_Format {
361+ if cat == wd:: WC_Extend || cat == wd:: WC_Format || cat == wd :: WC_ZWJ {
333362 if match state {
334363 FormatExtend ( _) | Start => false ,
335364 _ => true
@@ -357,7 +386,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
357386 wd:: WC_Katakana => Katakana , // rule WB13, WB13b
358387 wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a
359388 wd:: WC_Regional_Indicator => Regional , // rule WB13c
360- wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => Zwj , // rule WB3c
389+ wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => Zwj ( false ) , // rule WB3c
361390 // rule WB4:
362391 wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => FormatExtend ( AcceptAny ) ,
363392 wd:: WC_Single_Quote => {
@@ -380,7 +409,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
380409 } ,
381410 _ => break // rule WB999
382411 } ,
383- Zwj => match cat { // rule WB3c
412+ Zwj ( _ ) => match cat { // rule WB3c
384413 wd:: WC_ZWJ => continue ,
385414 _ => {
386415 take_curr = false ;
0 commit comments