@@ -82,6 +82,7 @@ enum UWordBoundsState {
8282 ExtendNumLet ,
8383 Regional ,
8484 FormatExtend ( FormatExtendType ) ,
85+ Zwj ,
8586}
8687
8788// subtypes for FormatExtend state in UWordBoundsState
@@ -138,8 +139,11 @@ impl<'a> Iterator for UWordBounds<'a> {
138139 // (This is not obvious from the wording of UAX#29, but if you look at the
139140 // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
140141 // then the "correct" interpretation of WB4 becomes apparent.)
141- if state != Start && ( cat == wd:: WC_Extend || cat == wd:: WC_Format ) {
142- continue ;
142+ if state != Start {
143+ match cat {
144+ wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => continue ,
145+ _ => { }
146+ }
143147 }
144148
145149 state = match state {
@@ -158,9 +162,10 @@ impl<'a> Iterator for UWordBounds<'a> {
158162 wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a, WB13b
159163 wd:: WC_Regional_Indicator => Regional , // rule WB13c
160164 wd:: WC_LF | wd:: WC_Newline => break , // rule WB3a
165+ wd:: WC_ZWJ => Zwj , // rule WB3c
161166 _ => {
162167 if let Some ( ncat) = self . get_next_cat ( idx) { // rule WB4
163- if ncat == wd:: WC_Format || ncat == wd:: WC_Extend {
168+ if ncat == wd:: WC_Format || ncat == wd:: WC_Extend || ncat == wd :: WC_ZWJ {
164169 state = FormatExtend ( AcceptNone ) ;
165170 self . cat = Some ( ncat) ;
166171 continue ;
@@ -169,6 +174,13 @@ impl<'a> Iterator for UWordBounds<'a> {
169174 break ; // rule WB14
170175 }
171176 } ,
177+ Zwj => match cat { // rule WB3c
178+ wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => continue ,
179+ _ => {
180+ take_curr = false ;
181+ break ;
182+ }
183+ } ,
172184 Letter | HLetter => match cat {
173185 wd:: WC_ALetter => Letter , // rule WB5
174186 wd:: WC_Hebrew_Letter => HLetter , // rule WB5
@@ -336,7 +348,9 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
336348 wd:: WC_Katakana => Katakana , // rule WB13, WB13b
337349 wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a
338350 wd:: WC_Regional_Indicator => Regional , // rule WB13c
339- wd:: WC_Extend | wd:: WC_Format => FormatExtend ( AcceptAny ) , // rule WB4
351+ wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => Zwj , // rule WB3c
352+ // rule WB4:
353+ wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => FormatExtend ( AcceptAny ) ,
340354 wd:: WC_Single_Quote => {
341355 saveidx = idx;
342356 FormatExtend ( AcceptQLetter ) // rule WB7a
@@ -356,6 +370,13 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
356370 } ,
357371 _ => break // rule WB14
358372 } ,
373+ Zwj => match cat { // rule WB3c
374+ wd:: WC_ZWJ => continue ,
375+ _ => {
376+ take_curr = false ;
377+ break ;
378+ }
379+ } ,
359380 Letter | HLetter => match cat {
360381 wd:: WC_ALetter => Letter , // rule WB5
361382 wd:: WC_Hebrew_Letter => HLetter , // rule WB5
0 commit comments