@@ -80,7 +80,7 @@ enum UWordBoundsState {
8080 Numeric ,
8181 Katakana ,
8282 ExtendNumLet ,
83- Regional ( /* half */ bool ) ,
83+ Regional ( RegionalState ) ,
8484 FormatExtend ( FormatExtendType ) ,
8585 Zwj ( /* tainted */ bool ) ,
8686 Emoji ,
@@ -97,6 +97,13 @@ enum FormatExtendType {
9797 RequireNumeric ,
9898}
9999
100+ #[ derive( Clone , Copy , PartialEq , Eq , Debug ) ]
101+ enum RegionalState { // How much of a Regional_Indicator pair (a flag) has been consumed so far; payload of the `Regional` state (rule WB13c).
102+ Half , // One regional indicator seen; a directly following one still joins it and moves the state to `Full`.
103+ Full , // A complete pair seen; a further regional indicator is not taken and ends the current segment.
104+ Unknown , // Starting state for reverse iteration only: resolved to `Half`/`Full` from the parity of the count of regional indicators preceding it (ZWJ/Extend/Format ignored).
105+ }
106+
100107impl < ' a > Iterator for UWordBounds < ' a > {
101108 type Item = & ' a str ;
102109
@@ -184,7 +191,7 @@ impl<'a> Iterator for UWordBounds<'a> {
184191 wd:: WC_Numeric => Numeric , // rule WB8, WB10, WB12, WB13a
185192 wd:: WC_Katakana => Katakana , // rule WB13, WB13a
186193 wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a, WB13b
187- wd:: WC_Regional_Indicator => Regional ( /* half = */ true ) , // rule WB13c
194+ wd:: WC_Regional_Indicator => Regional ( RegionalState :: Half ) , // rule WB13c
188195 wd:: WC_LF | wd:: WC_Newline => break , // rule WB3a
189196 wd:: WC_ZWJ => Zwj ( false ) , // rule WB3c
190197 wd:: WC_E_Base | wd:: WC_E_Base_GAZ => Emoji , // rule WB14
@@ -269,20 +276,21 @@ impl<'a> Iterator for UWordBounds<'a> {
269276 break ;
270277 }
271278 } ,
272- Regional ( false ) => {
279+ Regional ( RegionalState :: Full ) => {
273280 // if it reaches here we've gone too far,
274281 // a full flag can only compose with ZWJ/Extend/Format
275282 // proceeding it.
276283 take_curr = false ;
277284 break ;
278285 }
279- Regional ( /* half */ true ) => match cat {
280- wd:: WC_Regional_Indicator => Regional ( false ) , // rule WB13c
286+ Regional ( RegionalState :: Half ) => match cat {
287+ wd:: WC_Regional_Indicator => Regional ( RegionalState :: Full ) , // rule WB13c
281288 _ => {
282289 take_curr = false ;
283290 break ;
284291 }
285292 } ,
293+ Regional ( _) => unreachable ! ( "RegionalState::Unknown should not occur on forward iteration" ) ,
286294 Emoji => match cat { // rule WB14
287295 wd:: WC_E_Modifier => continue ,
288296 _ => {
@@ -392,7 +400,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
392400 wd:: WC_Numeric => Numeric , // rule WB8, WB9, WB11, WB13b
393401 wd:: WC_Katakana => Katakana , // rule WB13, WB13b
394402 wd:: WC_ExtendNumLet => ExtendNumLet , // rule WB13a
395- wd:: WC_Regional_Indicator => Regional ( true ) , // rule WB13c
403+ wd:: WC_Regional_Indicator => Regional ( RegionalState :: Unknown ) , // rule WB13c
396404 wd:: WC_Glue_After_Zwj | wd:: WC_E_Base_GAZ => Zwj ( false ) , // rule WB3c
397405 // rule WB4:
398406 wd:: WC_Extend | wd:: WC_Format | wd:: WC_ZWJ => FormatExtend ( AcceptAny ) ,
@@ -474,8 +482,31 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
474482 break ;
475483 }
476484 } ,
477- Regional ( _) => match cat {
478- wd:: WC_Regional_Indicator => Regional ( true ) , // rule WB13c
485+ Regional ( mut regional_state) => match cat {
486+ // rule WB13c
487+ wd:: WC_Regional_Indicator => {
488+ if regional_state == RegionalState :: Unknown {
489+ let count = self . string [ ..previdx]
490+ . chars ( ) . rev ( )
491+ . map ( |c| wd:: word_category ( c) )
492+ // Ignore because of WB4
493+ // Combining characters *inside* flag emoji. Yay.
494+ . filter ( |& c| ! ( c == wd:: WC_ZWJ || c == wd:: WC_Extend || c == wd:: WC_Format ) )
495+ . take_while ( |& c| c == wd:: WC_Regional_Indicator )
496+ . count ( ) ;
497+ regional_state = if count % 2 == 0 {
498+ RegionalState :: Full
499+ } else {
500+ RegionalState :: Half
501+ } ;
502+ }
503+ if regional_state == RegionalState :: Full {
504+ take_curr = false ;
505+ break ;
506+ } else {
507+ Regional ( RegionalState :: Full )
508+ }
509+ }
479510 _ => {
480511 take_curr = false ;
481512 break ;
0 commit comments