@@ -7,7 +7,7 @@ pub struct WithUtf16<'object_pool, 'text> {
77 /// line is a string reference
88 pub line : & ' text str ,
99 /// for each UTF-16 code unit of `line` , the byte offset in `line` of the `char` that unit belongs to .
10- pub utf16_byte_indices : OnceCell < Pooled < ' object_pool > > ,
10+ pub utf16_byte_indices : OnceCell < Option < Pooled < ' object_pool > > > ,
1111 object_pool : & ' object_pool ObjectPool ,
1212}
1313
@@ -21,31 +21,68 @@ impl<'object_pool, 'text> WithUtf16<'object_pool, 'text> {
2121 }
2222
2323 /// substring::SubString with cache
24- pub fn substring ( & self , start_index : usize , end_index : usize ) -> & ' text str {
25- if end_index <= start_index {
24+ #[ allow( unsafe_code) ]
25+ pub fn substring (
26+ & self ,
27+ start_utf16_index : usize ,
28+ end_utf16_index : usize ,
29+ ) -> & ' text str {
30+ if end_utf16_index <= start_utf16_index {
2631 return "" ;
2732 }
2833
2934 let utf16_byte_indices = self . utf16_byte_indices . get_or_init ( || {
3035 let mut vec = self . object_pool . pull ( self . line . len ( ) ) ;
31- for ( byte_index, ch) in self . line . char_indices ( ) {
32- match ch. len_utf16 ( ) {
33- 1 => vec. push ( byte_index) ,
34- 2 => {
35- vec. push ( byte_index) ;
36- vec. push ( byte_index) ;
37- }
38- _ => unreachable ! ( ) ,
36+ let bytes = self . line . as_bytes ( ) ;
37+ let mut byte_pos = 0 ;
38+
39+ while byte_pos < bytes. len ( ) {
40+ let byte = unsafe { * bytes. get_unchecked ( byte_pos) } ;
41+
42+ if byte < 0x80 {
43+ // ASCII: 1 byte = 1 UTF-16 unit
44+ vec. push ( byte_pos) ;
45+ byte_pos += 1 ;
46+ } else if byte < 0xE0 {
47+ // 2-byte UTF-8 = 1 UTF-16 unit
48+ vec. push ( byte_pos) ;
49+ byte_pos += 2 ;
50+ } else if byte < 0xF0 {
51+ // 3-byte UTF-8 = 1 UTF-16 unit
52+ vec. push ( byte_pos) ;
53+ byte_pos += 3 ;
54+ } else {
55+ // 4-byte UTF-8 = 2 UTF-16 units (surrogate pair)
56+ vec. push ( byte_pos) ;
57+ vec. push ( byte_pos) ;
58+ byte_pos += 4 ;
3959 }
4060 }
41- vec
61+
62+ if vec. len ( ) == self . line . len ( ) {
63+ // Optimization: the UTF-16 length equals the UTF-8 byte length only when every char is ASCII, so UTF-16 indices are already byte offsets.
64+ // Return None to release the vector back to the object pool for better memory efficiency.
65+ None
66+ } else {
67+ Some ( vec)
68+ }
4269 } ) ;
4370
44- let str_len = self . line . len ( ) ;
45- let start = * utf16_byte_indices. get ( start_index) . unwrap_or ( & str_len) ;
46- let end = * utf16_byte_indices. get ( end_index) . unwrap_or ( & str_len) ;
71+ let utf8_len = self . line . len ( ) ;
72+
73+ let Some ( utf16_byte_indices) = utf16_byte_indices else {
74+ let start_utf16_index = start_utf16_index. min ( utf8_len) ;
75+ let end_utf16_index = end_utf16_index. min ( utf8_len) ;
76+ return unsafe {
77+ self . line . get_unchecked ( start_utf16_index..end_utf16_index)
78+ } ;
79+ } ;
80+
81+ let start = * utf16_byte_indices
82+ . get ( start_utf16_index)
83+ . unwrap_or ( & utf8_len) ;
84+ let end = * utf16_byte_indices. get ( end_utf16_index) . unwrap_or ( & utf8_len) ;
4785
48- #[ allow( unsafe_code) ]
4986 unsafe {
5087 // SAFETY: Since `indices` iterates over the `CharIndices` of `self`, we can guarantee
5188 // that the indices obtained from it will always be within the bounds of `self` and they
0 commit comments