@@ -252,13 +252,28 @@ pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {}
/// Searcher state for locating a single `char` inside a string slice.
///
/// Safety invariant: `finger` and `finger_back` must each be a valid UTF-8
/// byte index into `haystack`. `next_match` and `next_match_back` may break
/// this temporarily while they run, but they must restore it (leave both
/// fingers on code point boundaries) before returning.
#[derive(Clone, Debug)]
pub struct CharSearcher<'a> {
    /// The string being searched.
    haystack: &'a str,

    /// Current byte index of the forward search.
    ///
    /// Think of it as sitting just *before* the byte at its index:
    /// `haystack[finger]` is the first byte still to be inspected when
    /// searching forwards.
    finger: usize,

    /// Current byte index of the reverse search.
    ///
    /// Think of it as sitting just *after* the byte preceding its index:
    /// `haystack[finger_back - 1]` is the last byte a forward search has to
    /// inspect, and therefore the first byte inspected by `next_back()`.
    finger_back: usize,

    /// The character being searched for.
    needle: char,

    // Safety invariant: `utf8_size` must be in `1..=4`. It is used to index
    // `utf8_encoded` without bounds checks, and `utf8_size - 1` is computed
    // by the match functions.
    /// Number of bytes `needle` occupies when encoded as UTF-8.
    utf8_size: usize,

    /// UTF-8 encoding of `needle`; only the first `utf8_size` bytes are
    /// meaningful.
    utf8_encoded: [u8; 4],
}
263278
264279unsafe impl < ' a > Searcher < ' a > for CharSearcher < ' a > {
@@ -269,12 +284,12 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
269284 #[ inline]
270285 fn next ( & mut self ) -> SearchStep {
271286 let old_finger = self . finger ;
272- let slice = unsafe { self . haystack . get_unchecked ( old_finger..) } ;
287+ let slice = unsafe { self . haystack . get_unchecked ( old_finger..self . haystack . len ( ) ) } ;
273288 let mut iter = slice. chars ( ) ;
274289 let old_len = iter. iter . len ( ) ;
275290 if let Some ( ch) = iter. next ( ) {
276291 // add byte offset of current character
277- // without recalculating
292+ // without re-encoding as utf-8
278293 self . finger += old_len - iter. iter . len ( ) ;
279294 if ch == self . needle {
280295 SearchStep :: Match ( old_finger, self . finger )
@@ -287,25 +302,44 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
287302 }
288303 #[ inline]
289304 fn next_match ( & mut self ) -> Option < ( usize , usize ) > {
290- if let Some ( byte) = self . single_byte {
291- let old_finger = self . finger ;
292- let slice = unsafe { self . haystack . get_unchecked ( old_finger..) } ;
293- let bytes = slice. as_bytes ( ) ;
294- if let Some ( index) = memchr:: memchr ( byte, bytes) {
295- // index is the index of a valid ASCII byte,
296- // so we can add one to it
297- self . finger += index + 1 ;
298- Some ( ( self . finger - 1 , self . finger ) )
305+ loop {
306+ // get the haystack after the last character found
307+ let bytes = if let Some ( slice) = self . haystack . as_bytes ( ) . get ( self . finger ..) {
308+ slice
299309 } else {
300- None
301- }
302- } else {
303- loop {
304- match self . next ( ) {
305- SearchStep :: Match ( a, b) => break Some ( ( a, b) ) ,
306- SearchStep :: Done => break None ,
307- _ => continue ,
310+ return None ;
311+ } ;
312+ // the last byte of the utf8 encoded needle
313+ let last_byte = unsafe { * self . utf8_encoded . get_unchecked ( self . utf8_size - 1 ) } ;
314+ if let Some ( index) = memchr:: memchr ( last_byte, bytes) {
315+ // The new finger is the index of the byte we found,
316+ // plus one, since we memchr'd for the last byte of the character.
317+ //
318+ // Note that this doesn't always give us a finger on a UTF8 boundary.
319+ // If we *didn't* find our character
320+ // we may have indexed to the non-last byte of a 3-byte or 4-byte character.
321+ // We can't just skip to the next valid starting byte because a character like
322+ // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find
323+ // the second byte when searching for the third.
324+ //
325+ // However, this is totally okay. While we have the invariant that
326+ // self.finger is on a UTF8 boundary, this invariant is not relid upon
327+ // within this method (it is relied upon in CharSearcher::next()).
328+ //
329+ // We only exit this method when we reach the end of the string, or if we
330+ // find something. When we find something the `finger` will be set
331+ // to a UTF8 boundary.
332+ self . finger += index + 1 ;
333+ let found_char = self . finger - self . utf8_size ;
334+ if let Some ( slice) = self . haystack . as_bytes ( ) . get ( found_char..self . finger ) {
335+ if slice == & self . utf8_encoded [ 0 ..self . utf8_size ] {
336+ return Some ( ( found_char, self . finger ) ) ;
337+ }
308338 }
339+ } else {
340+ // found nothing, exit
341+ self . finger = self . haystack . len ( ) ;
342+ return None ;
309343 }
310344 }
311345 }
@@ -322,7 +356,7 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
322356 let old_len = iter. iter . len ( ) ;
323357 if let Some ( ch) = iter. next_back ( ) {
324358 // subtract byte offset of current character
325- // without recalculating
359+ // without re-encoding as utf-8
326360 self . finger_back -= old_len - iter. iter . len ( ) ;
327361 if ch == self . needle {
328362 SearchStep :: Match ( self . finger_back , old_finger)
@@ -335,24 +369,47 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
335369 }
336370 #[ inline]
337371 fn next_match_back ( & mut self ) -> Option < ( usize , usize ) > {
338- if let Some ( byte) = self . single_byte {
339- let old_finger = self . finger_back ;
340- let slice = unsafe { self . haystack . slice_unchecked ( 0 , old_finger) } ;
341- let bytes = slice. as_bytes ( ) ;
342- if let Some ( index) = memchr:: memrchr ( byte, bytes) {
343- // index is the index of a valid ASCII byte
344- self . finger_back = index;
345- Some ( ( self . finger_back , self . finger_back + 1 ) )
372+ let haystack = self . haystack . as_bytes ( ) ;
373+ loop {
374+ // get the haystack up to but not including the last character searched
375+ let bytes = if let Some ( slice) = haystack. get ( ..self . finger_back ) {
376+ slice
346377 } else {
347- None
348- }
349- } else {
350- loop {
351- match self . next_back ( ) {
352- SearchStep :: Match ( a, b) => break Some ( ( a, b) ) ,
353- SearchStep :: Done => break None ,
354- _ => continue ,
378+ return None ;
379+ } ;
380+ // the last byte of the utf8 encoded needle
381+ let last_byte = unsafe { * self . utf8_encoded . get_unchecked ( self . utf8_size - 1 ) } ;
382+ if let Some ( index) = memchr:: memrchr ( last_byte, bytes) {
383+ // memrchr will return the index of the byte we wish to
384+ // find. In case of an ASCII character, this is indeed
385+ // were we wish our new finger to be ("after" the found
386+ // char in the paradigm of reverse iteration). For
387+ // multibyte chars we need to skip down by the number of more
388+ // bytes they have than ASCII
389+ let found_char = index - ( self . utf8_size - 1 ) ;
390+ if let Some ( slice) = haystack. get ( found_char..( found_char + self . utf8_size ) ) {
391+ if slice == & self . utf8_encoded [ 0 ..self . utf8_size ] {
392+ // move finger to before the character found (i.e. at its start index)
393+ self . finger_back = found_char;
394+ return Some ( ( self . finger_back , self . finger_back + self . utf8_size ) ) ;
395+ }
355396 }
397+ // We can't use finger_back = index - size + 1 here. If we found the last char
398+ // of a different-sized character (or the middle byte of a different character)
399+ // we need to bump the finger_back down to `index`. This similarly makes
400+ // `finger_back` have the potential to no longer be on a boundary,
401+ // but this is OK since we only exit this function on a boundary
402+ // or when the haystack has been searched completely.
403+ //
404+ // Unlike next_match this does not
405+ // have the problem of repeated bytes in utf-8 because
406+ // we're searching for the last byte, and we can only have
407+ // found the last byte when searching in reverse.
408+ self . finger_back = index;
409+ } else {
410+ self . finger_back = 0 ;
411+ // found nothing, exit
412+ return None ;
356413 }
357414 }
358415 }
@@ -368,19 +425,16 @@ impl<'a> Pattern<'a> for char {
368425
369426 #[ inline]
370427 fn into_searcher ( self , haystack : & ' a str ) -> Self :: Searcher {
371- let single_byte = if self . len_utf8 ( ) == 1 {
372- let mut storage = [ 0 ] ;
373- self . encode_utf8 ( & mut storage) ;
374- Some ( storage[ 0 ] )
375- } else {
376- None
377- } ;
428+ let mut utf8_encoded = [ 0 ; 4 ] ;
429+ self . encode_utf8 ( & mut utf8_encoded) ;
430+ let utf8_size = self . len_utf8 ( ) ;
378431 CharSearcher {
379432 haystack,
380433 finger : 0 ,
381434 finger_back : haystack. len ( ) ,
382435 needle : self ,
383- single_byte,
436+ utf8_size,
437+ utf8_encoded
384438 }
385439 }
386440
0 commit comments