@@ -51,14 +51,15 @@ pub struct Graphemes<'a> {
5151}
5252
5353// state machine for cluster boundary rules
54- #[ derive( PartialEq , Eq ) ]
54+ #[ derive( Copy , Clone , PartialEq , Eq ) ]
5555enum GraphemeState {
5656 Start ,
5757 FindExtend ,
5858 HangulL ,
5959 HangulLV ,
6060 HangulLVT ,
6161 Regional ,
62+ Emoji ,
6263 Zwj ,
6364}
6465
@@ -94,6 +95,10 @@ impl<'a> Iterator for Graphemes<'a> {
9495 _ => self . cat . take ( ) . unwrap ( )
9596 } ;
9697
98+ if ( state, cat) == ( Emoji , gr:: GC_Extend ) {
99+ continue ; // rule GB10
100+ }
101+
97102 if let Some ( new_state) = match cat {
98103 gr:: GC_Extend => Some ( FindExtend ) , // rule GB9
99104 gr:: GC_SpacingMark if self . extended => Some ( FindExtend ) , // rule GB9a
@@ -119,6 +124,7 @@ impl<'a> Iterator for Graphemes<'a> {
119124 gr:: GC_LV | gr:: GC_V => HangulLV ,
120125 gr:: GC_LVT | gr:: GC_T => HangulLVT ,
121126 gr:: GC_Regional_Indicator => Regional ,
127+ gr:: GC_E_Base | gr:: GC_E_Base_GAZ => Emoji ,
122128 _ => FindExtend
123129 } ,
124130 FindExtend => { // found non-extending when looking for extending
@@ -156,8 +162,16 @@ impl<'a> Iterator for Graphemes<'a> {
156162 break ;
157163 }
158164 } ,
165+ Emoji => match cat { // rule GB10: (E_Base|EBG) Extend* x E_Modifier
166+ gr:: GC_E_Modifier => continue ,
167+ _ => {
168+ take_curr = false ;
169+ break ;
170+ }
171+ } ,
159172 Zwj => match cat { // rule GB11: ZWJ x (GAZ|EBG)
160- gr:: GC_Glue_After_Zwj | gr:: GC_E_Base_GAZ => continue ,
173+ gr:: GC_Glue_After_Zwj => continue ,
174+ gr:: GC_E_Base_GAZ => Emoji ,
161175 _ => {
162176 take_curr = false ;
163177 break ;
@@ -193,7 +207,8 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
193207 let mut previdx = idx;
194208 let mut state = Start ;
195209 let mut cat = gr:: GC_Any ;
196- for ( curr, ch) in self . string . char_indices ( ) . rev ( ) {
210+
211+ ' outer: for ( curr, ch) in self . string . char_indices ( ) . rev ( ) {
197212 previdx = idx;
198213 idx = curr;
199214
@@ -225,6 +240,7 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
225240 gr:: GC_Extend => FindExtend ,
226241 gr:: GC_SpacingMark if self . extended => FindExtend ,
227242 gr:: GC_ZWJ => FindExtend ,
243+ gr:: GC_E_Modifier => Emoji ,
228244 gr:: GC_Glue_After_Zwj | gr:: GC_E_Base_GAZ => Zwj ,
229245 gr:: GC_L | gr:: GC_LV | gr:: GC_LVT => HangulL ,
230246 gr:: GC_V => HangulLV ,
@@ -267,6 +283,38 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
267283 break ;
268284 }
269285 } ,
286+ Emoji => { // char to right is E_Modifier
287+ // In order to decide whether to break before this E_Modifier char, we need to
288+ // scan backward past any Extend chars to look for (E_Base|(ZWJ? EBG)).
289+ let mut ebg_idx = None ;
290+ for ( startidx, prev) in self . string [ ..previdx] . char_indices ( ) . rev ( ) {
291+ match ( ebg_idx, gr:: grapheme_category ( prev) ) {
292+ ( None , gr:: GC_Extend ) => continue ,
293+ ( None , gr:: GC_E_Base ) => { // rule GB10
294+ // Found an Emoji modifier sequence. Return the whole sequence.
295+ idx = startidx;
296+ break ' outer;
297+ }
298+ ( None , gr:: GC_E_Base_GAZ ) => { // rule GB10
299+ // Keep scanning in case this is part of a ZWJ x EBG pair.
300+ ebg_idx = Some ( startidx) ;
301+ }
302+ ( Some ( _) , gr:: GC_ZWJ ) => { // rule GB11
303+ idx = startidx;
304+ break ' outer;
305+ }
306+ _ => break
307+ }
308+ }
309+ if let Some ( ebg_idx) = ebg_idx {
310+ // Found an EBG without a ZWJ before it.
311+ idx = ebg_idx;
312+ break ;
313+ }
314+ // Not part of an Emoji modifier sequence. Break here.
315+ take_curr = false ;
316+ break ;
317+ } ,
270318 Zwj => match cat { // char to right is (GAZ|EBG)
271319 gr:: GC_ZWJ => continue , // rule GB11: ZWJ x (GAZ|EBG)
272320 _ => {
0 commit comments