1- use std:: { cell:: Cell , marker:: PhantomData } ;
1+ use std:: { cell:: OnceCell , marker:: PhantomData } ;
22
33use crate :: helpers:: SourceText ;
44
99{
1010 /// line is a string reference
1111 pub line : S ,
12- last_char_index_to_byte_index : Cell < ( u32 , u32 ) > ,
12+ /// Byte offset of each `char` in `line`, indexed by char position; filled in lazily by `substring`.
13+ pub indices_indexes : OnceCell < Vec < usize > > ,
1314 data : PhantomData < & ' a S > ,
1415}
1516
@@ -19,102 +20,32 @@ where
1920{
2021 pub fn new ( line : S ) -> Self {
2122 Self {
23+ indices_indexes : OnceCell :: new ( ) ,
2224 line,
23- last_char_index_to_byte_index : Cell :: new ( ( 0 , 0 ) ) ,
2425 data : PhantomData ,
2526 }
2627 }
2728
28- pub ( crate ) fn substring (
29- & self ,
30- start_char_index : usize ,
31- end_char_index : usize ,
32- ) -> S {
33- if end_char_index <= start_char_index {
29+ /// substring::SubString with cache
30+ pub ( crate ) fn substring ( & self , start_index : usize , end_index : usize ) -> S {
31+ if end_index <= start_index {
3432 return S :: default ( ) ;
3533 }
3634
37- let line_len = self . line . len ( ) ;
38- let mut start_byte_index =
39- if start_char_index == 0 { Some ( 0 ) } else { None } ;
40- let mut end_byte_index = if end_char_index == usize:: MAX {
41- Some ( line_len)
42- } else {
43- None
44- } ;
35+ let indices_indexes = self . indices_indexes . get_or_init ( || {
36+ self . line . char_indices ( ) . map ( |( i, _) | i) . collect :: < Vec < _ > > ( )
37+ } ) ;
4538
46- if start_byte_index. is_some ( ) && end_byte_index. is_some ( ) {
47- return self . line . clone ( ) ;
48- }
49-
50- let ( last_char_index, last_byte_index) =
51- self . last_char_index_to_byte_index . get ( ) ;
52- let byte_index = last_byte_index as usize ;
53-
54- if start_char_index >= last_char_index as usize
55- || end_char_index >= last_char_index as usize
56- {
57- let mut char_index = last_char_index as usize ;
58- #[ allow( unsafe_code) ]
59- let slice = unsafe {
60- // SAFETY: Since `indices` iterates over the `CharIndices` of `self`, we can guarantee
61- // that the indices obtained from it will always be within the bounds of `self` and they
62- // will always lie on UTF-8 sequence boundaries.
63- self . line . byte_slice_unchecked ( byte_index..line_len)
64- } ;
65- for ( byte_offset, _) in slice. char_indices ( ) {
66- if char_index == start_char_index {
67- start_byte_index = Some ( byte_index + byte_offset) ;
68- if end_byte_index. is_some ( ) {
69- break ;
70- }
71- } else if char_index == end_char_index {
72- end_byte_index = Some ( byte_index + byte_offset) ;
73- self
74- . last_char_index_to_byte_index
75- . set ( ( char_index as u32 , ( byte_index + byte_offset) as u32 ) ) ;
76- break ;
77- }
78- char_index += 1 ;
79- }
80- }
81-
82- if start_char_index < last_char_index as usize
83- || end_char_index < last_char_index as usize
84- {
85- let mut char_index = last_char_index as usize ;
86- #[ allow( unsafe_code) ]
87- let slice = unsafe {
88- // SAFETY: Since `indices` iterates over the `CharIndices` of `self`, we can guarantee
89- // that the indices obtained from it will always be within the bounds of `self` and they
90- // will always lie on UTF-8 sequence boundaries.
91- self . line . byte_slice_unchecked ( 0 ..byte_index)
92- } ;
93- for ( byte_index, char) in slice. char_indices ( ) . rev ( ) {
94- if char_index == end_char_index {
95- end_byte_index = Some ( byte_index + char. len_utf8 ( ) ) ;
96- if start_byte_index. is_some ( ) {
97- break ;
98- }
99- } else if char_index == start_char_index {
100- start_byte_index = Some ( byte_index + char. len_utf8 ( ) ) ;
101- break ;
102- }
103- char_index -= 1 ;
104- }
105- }
106-
107- let start_byte_index = start_byte_index. unwrap_or ( line_len) ;
108- let end_byte_index = end_byte_index. unwrap_or ( line_len) ;
39+ let str_len = self . line . len ( ) ;
40+ let start = * indices_indexes. get ( start_index) . unwrap_or ( & str_len) ;
41+ let end = * indices_indexes. get ( end_index) . unwrap_or ( & str_len) ;
10942
11043 #[ allow( unsafe_code) ]
11144 unsafe {
11245 // SAFETY: Since `indices` iterates over the `CharIndices` of `self`, we can guarantee
11346 // that the indices obtained from it will always be within the bounds of `self` and they
11447 // will always lie on UTF-8 sequence boundaries.
115- self
116- . line
117- . byte_slice_unchecked ( start_byte_index..end_byte_index)
48+ self . line . byte_slice_unchecked ( start..end)
11849 }
11950 }
12051}
@@ -159,19 +90,4 @@ mod tests {
15990 "øbα"
16091 ) ;
16192 }
162-
163- #[ test]
164- fn test_last_char_index_to_byte_index ( ) {
165- let rope_with_indices =
166- WithIndices :: new ( Rope :: from ( "hello world 你好世界" ) ) ;
167- assert_eq ! ( rope_with_indices. substring( 10 , 13 ) , "d 你" ) ;
168- assert_eq ! ( rope_with_indices. substring( 13 , 15 ) , "好世" ) ;
169- assert_eq ! ( rope_with_indices. substring( 10 , 13 ) , "d 你" ) ;
170-
171- let rope_with_indices =
172- WithIndices :: new ( Rope :: from ( "export const answer = 42;\n " ) ) ;
173- assert_eq ! ( rope_with_indices. substring( 7 , 13 ) , "const " ) ;
174- assert_eq ! ( rope_with_indices. substring( 13 , 19 ) , "answer" ) ;
175- assert_eq ! ( rope_with_indices. substring( 7 , 22 ) , "const answer = " ) ;
176- }
17793}
0 commit comments