@@ -48,17 +48,21 @@ pub struct Graphemes<'a> {
4848 extended : bool ,
4949 cat : Option < GraphemeCat > ,
5050 catb : Option < GraphemeCat > ,
51+ regional_count_back : Option < usize > ,
5152}
5253
/// State machine for the grapheme cluster boundary rules.
///
/// The same set of states is shared by forward iteration (`next`) and reverse
/// iteration (`next_back`); both start every cluster in `Start`. Where the two
/// directions enter a state on different categories, both are noted below.
///
/// `Copy` is derived because the forward scanner compares the current state
/// by value inside a tuple (`(state, cat) == (Emoji, GC_Extend)`) and keeps
/// using `state` afterwards.
#[derive(Copy, Clone, PartialEq, Eq)]
enum GraphemeState {
    /// Initial state at the beginning of each cluster, in both directions.
    Start,
    /// Looking for extending characters (rules GB9/GB9a); a non-extending
    /// character found in this state is handled as a potential boundary.
    FindExtend,
    /// Hangul syllable tracking. Forward: entered on `L`.
    /// Reverse: entered on `L`, `LV` or `LVT`.
    HangulL,
    /// Hangul syllable tracking. Forward: entered on `LV` or `V`.
    /// Reverse: entered on `V`.
    HangulLV,
    /// Hangul syllable tracking. Forward: entered on `LVT` or `T`.
    /// Reverse: entered on `T`.
    HangulLVT,
    /// Forward only, and only for extended clusters: a `Prepend` character
    /// was seen (rule GB9b). Reverse iteration never enters this state and
    /// treats it as unreachable; it handles `Prepend` in a separate pass.
    Prepend,
    /// A `Regional_Indicator` was seen (rules GB12/GB13).
    Regional,
    /// Emoji modifier sequence (rule GB10). Forward: entered on `E_Base` or
    /// `E_Base_GAZ`. Reverse: the character to the right is an `E_Modifier`.
    Emoji,
    /// Zero-width-joiner sequence (rule GB11). Forward: a `ZWJ` was seen.
    /// Reverse: the character to the right is `Glue_After_Zwj` or `E_Base_GAZ`.
    Zwj,
}
6367
6468impl < ' a > Iterator for Graphemes < ' a > {
@@ -82,6 +86,11 @@ impl<'a> Iterator for Graphemes<'a> {
8286 let mut idx = 0 ;
8387 let mut state = Start ;
8488 let mut cat = gr:: GC_Any ;
89+
90+ // caching used by next_back() should be invalidated
91+ self . regional_count_back = None ;
92+ self . catb = None ;
93+
8594 for ( curr, ch) in self . string . char_indices ( ) {
8695 idx = curr;
8796
@@ -93,13 +102,18 @@ impl<'a> Iterator for Graphemes<'a> {
93102 _ => self . cat . take ( ) . unwrap ( )
94103 } ;
95104
96- if match cat {
97- gr:: GC_Extend => true ,
98- gr:: GC_SpacingMark if self . extended => true ,
99- _ => false
105+ if ( state, cat) == ( Emoji , gr:: GC_Extend ) {
106+ continue ; // rule GB10
107+ }
108+
109+ if let Some ( new_state) = match cat {
110+ gr:: GC_Extend => Some ( FindExtend ) , // rule GB9
111+ gr:: GC_SpacingMark if self . extended => Some ( FindExtend ) , // rule GB9a
112+ gr:: GC_ZWJ => Some ( Zwj ) , // rule GB9/GB11
113+ _ => None
100114 } {
101- state = FindExtend ; // rule GB9/GB9a
102- continue ;
115+ state = new_state ;
116+ continue ;
103117 }
104118
105119 state = match state {
@@ -116,7 +130,9 @@ impl<'a> Iterator for Graphemes<'a> {
116130 gr:: GC_L => HangulL ,
117131 gr:: GC_LV | gr:: GC_V => HangulLV ,
118132 gr:: GC_LVT | gr:: GC_T => HangulLVT ,
133+ gr:: GC_Prepend if self . extended => Prepend ,
119134 gr:: GC_Regional_Indicator => Regional ,
135+ gr:: GC_E_Base | gr:: GC_E_Base_GAZ => Emoji ,
120136 _ => FindExtend
121137 } ,
122138 FindExtend => { // found non-extending when looking for extending
@@ -147,13 +163,35 @@ impl<'a> Iterator for Graphemes<'a> {
147163 break ;
148164 }
149165 } ,
150- Regional => match cat { // rule GB8a
151- gr:: GC_Regional_Indicator => continue ,
166+ Prepend => match cat { // rule GB9b
167+ gr:: GC_Control => {
168+ take_curr = false ;
169+ break ;
170+ }
171+ _ => continue
172+ } ,
173+ Regional => match cat { // rule GB12/GB13
174+ gr:: GC_Regional_Indicator => FindExtend ,
152175 _ => {
153176 take_curr = false ;
154177 break ;
155178 }
156- }
179+ } ,
180+ Emoji => match cat { // rule GB10: (E_Base|EBG) Extend* x E_Modifier
181+ gr:: GC_E_Modifier => continue ,
182+ _ => {
183+ take_curr = false ;
184+ break ;
185+ }
186+ } ,
187+ Zwj => match cat { // rule GB11: ZWJ x (GAZ|EBG)
188+ gr:: GC_Glue_After_Zwj => continue ,
189+ gr:: GC_E_Base_GAZ => Emoji ,
190+ _ => {
191+ take_curr = false ;
192+ break ;
193+ }
194+ } ,
157195 }
158196 }
159197
@@ -184,7 +222,11 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
184222 let mut previdx = idx;
185223 let mut state = Start ;
186224 let mut cat = gr:: GC_Any ;
187- for ( curr, ch) in self . string . char_indices ( ) . rev ( ) {
225+
226+ // caching used by next() should be invalidated
227+ self . cat = None ;
228+
229+ ' outer: for ( curr, ch) in self . string . char_indices ( ) . rev ( ) {
188230 previdx = idx;
189231 idx = curr;
190232
@@ -215,6 +257,9 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
215257 Start | FindExtend => match cat {
216258 gr:: GC_Extend => FindExtend ,
217259 gr:: GC_SpacingMark if self . extended => FindExtend ,
260+ gr:: GC_ZWJ => FindExtend ,
261+ gr:: GC_E_Modifier => Emoji ,
262+ gr:: GC_Glue_After_Zwj | gr:: GC_E_Base_GAZ => Zwj ,
218263 gr:: GC_L | gr:: GC_LV | gr:: GC_LVT => HangulL ,
219264 gr:: GC_V => HangulLV ,
220265 gr:: GC_T => HangulLVT ,
@@ -249,8 +294,62 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
249294 break ;
250295 }
251296 } ,
252- Regional => match cat { // rule GB8a
253- gr:: GC_Regional_Indicator => continue ,
297+ Prepend => {
298+ // not used in reverse iteration
299+ unreachable ! ( )
300+ } ,
301+ Regional => { // rule GB12/GB13
302+ // Need to scan backward to find if this is preceded by an odd or even number
303+ // of Regional_Indicator characters.
304+ let count = match self . regional_count_back {
305+ Some ( count) => count,
306+ None => self . string [ ..previdx] . chars ( ) . rev ( ) . take_while ( |c| {
307+ gr:: grapheme_category ( * c) == gr:: GC_Regional_Indicator
308+ } ) . count ( )
309+ } ;
310+ // Cache the count to avoid re-scanning the same chars on the next iteration.
311+ self . regional_count_back = count. checked_sub ( 1 ) ;
312+
313+ if count % 2 == 0 {
314+ take_curr = false ;
315+ break ;
316+ }
317+ continue ;
318+ } ,
319+ Emoji => { // char to right is E_Modifier
320+ // In order to decide whether to break before this E_Modifier char, we need to
321+ // scan backward past any Extend chars to look for (E_Base|(ZWJ? EBG)).
322+ let mut ebg_idx = None ;
323+ for ( startidx, prev) in self . string [ ..previdx] . char_indices ( ) . rev ( ) {
324+ match ( ebg_idx, gr:: grapheme_category ( prev) ) {
325+ ( None , gr:: GC_Extend ) => continue ,
326+ ( None , gr:: GC_E_Base ) => { // rule GB10
327+ // Found an Emoji modifier sequence. Return the whole sequence.
328+ idx = startidx;
329+ break ' outer;
330+ }
331+ ( None , gr:: GC_E_Base_GAZ ) => { // rule GB10
332+ // Keep scanning in case this is part of a ZWJ x EBG pair.
333+ ebg_idx = Some ( startidx) ;
334+ }
335+ ( Some ( _) , gr:: GC_ZWJ ) => { // rule GB11
336+ idx = startidx;
337+ break ' outer;
338+ }
339+ _ => break
340+ }
341+ }
342+ if let Some ( ebg_idx) = ebg_idx {
343+ // Found an EBG without a ZWJ before it.
344+ idx = ebg_idx;
345+ break ;
346+ }
347+ // Not part of an Emoji modifier sequence. Break here.
348+ take_curr = false ;
349+ break ;
350+ } ,
351+ Zwj => match cat { // char to right is (GAZ|EBG)
352+ gr:: GC_ZWJ => continue , // rule GB11: ZWJ x (GAZ|EBG)
254353 _ => {
255354 take_curr = false ;
256355 break ;
@@ -266,6 +365,19 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
266365 Some ( cat)
267366 } ;
268367
368+ if self . extended && cat != gr:: GC_Control {
369+ // rule GB9b: include any preceding Prepend characters
370+ for ( i, c) in self . string [ ..idx] . char_indices ( ) . rev ( ) {
371+ match gr:: grapheme_category ( c) {
372+ gr:: GC_Prepend => idx = i,
373+ cat => {
374+ self . catb = Some ( cat) ;
375+ break ;
376+ }
377+ }
378+ }
379+ }
380+
269381 let retstr = & self . string [ idx..] ;
270382 self . string = & self . string [ ..idx] ;
271383 Some ( retstr)
@@ -274,7 +386,13 @@ impl<'a> DoubleEndedIterator for Graphemes<'a> {
274386
275387#[ inline]
276388pub fn new_graphemes < ' b > ( s : & ' b str , is_extended : bool ) -> Graphemes < ' b > {
277- Graphemes { string : s, extended : is_extended, cat : None , catb : None }
389+ Graphemes {
390+ string : s,
391+ extended : is_extended,
392+ cat : None ,
393+ catb : None ,
394+ regional_count_back : None
395+ }
278396}
279397
280398#[ inline]
0 commit comments