@@ -7,56 +7,72 @@ use syntax::{TextRange, TextSize};
77
88#[ derive( Clone , Debug , PartialEq , Eq ) ]
99pub struct LineIndex {
10- /// Offset the the beginning of each line, zero-based
10+ /// Offset to the beginning of each line, zero-based.
1111 pub ( crate ) newlines : Vec < TextSize > ,
12- /// List of non-ASCII characters on each line
13- pub ( crate ) utf16_lines : NoHashHashMap < u32 , Vec < Utf16Char > > ,
12+ /// List of non-ASCII characters on each line.
13+ pub ( crate ) line_wide_chars : NoHashHashMap < u32 , Vec < WideChar > > ,
1414}
1515
16+ /// Line/Column information in native, utf8 format.
1617#[ derive( Clone , Copy , Debug , PartialEq , Eq , Hash ) ]
17- pub struct LineColUtf16 {
18+ pub struct LineCol {
1819 /// Zero-based
1920 pub line : u32 ,
20- /// Zero-based
21+ /// Zero-based utf8 offset
2122 pub col : u32 ,
2223}
2324
2425#[ derive( Clone , Copy , Debug , PartialEq , Eq , Hash ) ]
25- pub struct LineCol {
26+ pub enum WideEncoding {
27+ Utf16 ,
28+ Utf32 ,
29+ }
30+
31+ /// Line/Column information in legacy encodings.
32+ ///
33+ /// Deliberately not a generic type and different from `LineCol`.
34+ #[ derive( Clone , Copy , Debug , PartialEq , Eq , Hash ) ]
35+ pub struct WideLineCol {
2636 /// Zero-based
2737 pub line : u32 ,
28- /// Zero-based utf8 offset
38+ /// Zero-based
2939 pub col : u32 ,
3040}
3141
3242#[ derive( Clone , Debug , Hash , PartialEq , Eq ) ]
33- pub ( crate ) struct Utf16Char {
43+ pub ( crate ) struct WideChar {
3444 /// Start offset of a character inside a line, zero-based
3545 pub ( crate ) start : TextSize ,
3646 /// End offset of a character inside a line, zero-based
3747 pub ( crate ) end : TextSize ,
3848}
3949
40- impl Utf16Char {
50+ impl WideChar {
4151 /// Returns the length in 8-bit UTF-8 code units.
4252 fn len ( & self ) -> TextSize {
4353 self . end - self . start
4454 }
4555
46- /// Returns the length in 16-bit UTF-16 code units.
47- fn len_utf16 ( & self ) -> usize {
48- if self . len ( ) == TextSize :: from ( 4 ) {
49- 2
50- } else {
51- 1
56+ /// Returns the length in UTF-16 or UTF-32 code units.
57+ fn wide_len ( & self , enc : WideEncoding ) -> usize {
58+ match enc {
59+ WideEncoding :: Utf16 => {
60+ if self . len ( ) == TextSize :: from ( 4 ) {
61+ 2
62+ } else {
63+ 1
64+ }
65+ }
66+
67+ WideEncoding :: Utf32 => 1 ,
5268 }
5369 }
5470}
5571
5672impl LineIndex {
5773 pub fn new ( text : & str ) -> LineIndex {
58- let mut utf16_lines = NoHashHashMap :: default ( ) ;
59- let mut utf16_chars = Vec :: new ( ) ;
74+ let mut line_wide_chars = NoHashHashMap :: default ( ) ;
75+ let mut wide_chars = Vec :: new ( ) ;
6076
6177 let mut newlines = Vec :: with_capacity ( 16 ) ;
6278 newlines. push ( TextSize :: from ( 0 ) ) ;
@@ -71,8 +87,8 @@ impl LineIndex {
7187 newlines. push ( curr_row) ;
7288
7389 // Save any wide (non-ASCII) characters seen in the previous line
74- if !utf16_chars . is_empty ( ) {
75- utf16_lines . insert ( line, mem:: take ( & mut utf16_chars ) ) ;
90+ if !wide_chars . is_empty ( ) {
91+ line_wide_chars . insert ( line, mem:: take ( & mut wide_chars ) ) ;
7692 }
7793
7894 // Prepare for processing the next line
@@ -82,18 +98,18 @@ impl LineIndex {
8298 }
8399
84100 if !c. is_ascii ( ) {
85- utf16_chars . push ( Utf16Char { start : curr_col, end : curr_col + c_len } ) ;
101+ wide_chars . push ( WideChar { start : curr_col, end : curr_col + c_len } ) ;
86102 }
87103
88104 curr_col += c_len;
89105 }
90106
91107 // Save any wide (non-ASCII) characters seen in the last line
92- if !utf16_chars . is_empty ( ) {
93- utf16_lines . insert ( line, utf16_chars ) ;
108+ if !wide_chars . is_empty ( ) {
109+ line_wide_chars . insert ( line, wide_chars ) ;
94110 }
95111
96- LineIndex { newlines, utf16_lines }
112+ LineIndex { newlines, line_wide_chars }
97113 }
98114
99115 pub fn line_col ( & self , offset : TextSize ) -> LineCol {
@@ -109,13 +125,13 @@ impl LineIndex {
109125 . map ( |offset| offset + TextSize :: from ( line_col. col ) )
110126 }
111127
112- pub fn to_utf16 ( & self , line_col : LineCol ) -> LineColUtf16 {
113- let col = self . utf8_to_utf16_col ( line_col. line , line_col. col . into ( ) ) ;
114- LineColUtf16 { line : line_col. line , col : col as u32 }
128+ pub fn to_wide ( & self , enc : WideEncoding , line_col : LineCol ) -> WideLineCol {
129+ let col = self . utf8_to_wide_col ( enc , line_col. line , line_col. col . into ( ) ) ;
130+ WideLineCol { line : line_col. line , col : col as u32 }
115131 }
116132
117- pub fn to_utf8 ( & self , line_col : LineColUtf16 ) -> LineCol {
118- let col = self . utf16_to_utf8_col ( line_col. line , line_col. col ) ;
133+ pub fn to_utf8 ( & self , enc : WideEncoding , line_col : WideLineCol ) -> LineCol {
134+ let col = self . wide_to_utf8_col ( enc , line_col. line , line_col. col ) ;
119135 LineCol { line : line_col. line , col : col. into ( ) }
120136 }
121137
@@ -132,12 +148,12 @@ impl LineIndex {
132148 . filter ( |it| !it. is_empty ( ) )
133149 }
134150
135- fn utf8_to_utf16_col ( & self , line : u32 , col : TextSize ) -> usize {
151+ fn utf8_to_wide_col ( & self , enc : WideEncoding , line : u32 , col : TextSize ) -> usize {
136152 let mut res: usize = col. into ( ) ;
137- if let Some ( utf16_chars ) = self . utf16_lines . get ( & line) {
138- for c in utf16_chars {
153+ if let Some ( wide_chars ) = self . line_wide_chars . get ( & line) {
154+ for c in wide_chars {
139155 if c. end <= col {
140- res -= usize:: from ( c. len ( ) ) - c. len_utf16 ( ) ;
156+ res -= usize:: from ( c. len ( ) ) - c. wide_len ( enc ) ;
141157 } else {
142158 // From here on, all wide characters come *after* the character we are mapping,
143159 // so we don't need to take them into account
@@ -148,11 +164,11 @@ impl LineIndex {
148164 res
149165 }
150166
151- fn utf16_to_utf8_col ( & self , line : u32 , mut col : u32 ) -> TextSize {
152- if let Some ( utf16_chars ) = self . utf16_lines . get ( & line) {
153- for c in utf16_chars {
167+ fn wide_to_utf8_col ( & self , enc : WideEncoding , line : u32 , mut col : u32 ) -> TextSize {
168+ if let Some ( wide_chars ) = self . line_wide_chars . get ( & line) {
169+ for c in wide_chars {
154170 if col > u32:: from ( c. start ) {
155- col += u32:: from ( c. len ( ) ) - c. len_utf16 ( ) as u32 ;
171+ col += u32:: from ( c. len ( ) ) - c. wide_len ( enc ) as u32 ;
156172 } else {
157173 // From here on, all wide characters come *after* the character we are mapping,
158174 // so we don't need to take them into account
@@ -167,6 +183,9 @@ impl LineIndex {
167183
168184#[ cfg( test) ]
169185mod tests {
186+ use test_utils:: skip_slow_tests;
187+
188+ use super :: WideEncoding :: { Utf16 , Utf32 } ;
170189 use super :: * ;
171190
172191 #[ test]
@@ -210,67 +229,59 @@ mod tests {
210229const C: char = 'x';
211230" ,
212231 ) ;
213- assert_eq ! ( col_index. utf16_lines . len( ) , 0 ) ;
232+ assert_eq ! ( col_index. line_wide_chars . len( ) , 0 ) ;
214233 }
215234
216235 #[ test]
217- fn test_single_char ( ) {
218- let col_index = LineIndex :: new (
219- "
220- const C: char = 'メ';
221- " ,
222- ) ;
223-
224- assert_eq ! ( col_index. utf16_lines. len( ) , 1 ) ;
225- assert_eq ! ( col_index. utf16_lines[ & 1 ] . len( ) , 1 ) ;
226- assert_eq ! ( col_index. utf16_lines[ & 1 ] [ 0 ] , Utf16Char { start: 17 . into( ) , end: 20 . into( ) } ) ;
227-
228- // UTF-8 to UTF-16, no changes
229- assert_eq ! ( col_index. utf8_to_utf16_col( 1 , 15 . into( ) ) , 15 ) ;
230-
231- // UTF-8 to UTF-16
232- assert_eq ! ( col_index. utf8_to_utf16_col( 1 , 22 . into( ) ) , 20 ) ;
233-
234- // UTF-16 to UTF-8, no changes
235- assert_eq ! ( col_index. utf16_to_utf8_col( 1 , 15 ) , TextSize :: from( 15 ) ) ;
236-
237- // UTF-16 to UTF-8
238- assert_eq ! ( col_index. utf16_to_utf8_col( 1 , 19 ) , TextSize :: from( 21 ) ) ;
239-
240- let col_index = LineIndex :: new ( "a𐐏b" ) ;
241- assert_eq ! ( col_index. utf16_to_utf8_col( 0 , 3 ) , TextSize :: from( 5 ) ) ;
242- }
243-
244- #[ test]
245- fn test_string ( ) {
246- let col_index = LineIndex :: new (
247- "
248- const C: char = \" メ メ\" ;
249- " ,
250- ) ;
251-
252- assert_eq ! ( col_index. utf16_lines. len( ) , 1 ) ;
253- assert_eq ! ( col_index. utf16_lines[ & 1 ] . len( ) , 2 ) ;
254- assert_eq ! ( col_index. utf16_lines[ & 1 ] [ 0 ] , Utf16Char { start: 17 . into( ) , end: 20 . into( ) } ) ;
255- assert_eq ! ( col_index. utf16_lines[ & 1 ] [ 1 ] , Utf16Char { start: 21 . into( ) , end: 24 . into( ) } ) ;
256-
257- // UTF-8 to UTF-16
258- assert_eq ! ( col_index. utf8_to_utf16_col( 1 , 15 . into( ) ) , 15 ) ;
259-
260- assert_eq ! ( col_index. utf8_to_utf16_col( 1 , 21 . into( ) ) , 19 ) ;
261- assert_eq ! ( col_index. utf8_to_utf16_col( 1 , 25 . into( ) ) , 21 ) ;
262-
263- assert ! ( col_index. utf8_to_utf16_col( 2 , 15 . into( ) ) == 15 ) ;
264-
265- // UTF-16 to UTF-8
266- assert_eq ! ( col_index. utf16_to_utf8_col( 1 , 15 ) , TextSize :: from( 15 ) ) ;
236+ fn test_every_chars ( ) {
237+ if skip_slow_tests ( ) {
238+ return ;
239+ }
267240
268- // メ UTF-8: 0xE3 0x83 0xA1, UTF-16: 0x30E1
269- assert_eq ! ( col_index. utf16_to_utf8_col( 1 , 17 ) , TextSize :: from( 17 ) ) ; // first メ at 17..20
270- assert_eq ! ( col_index. utf16_to_utf8_col( 1 , 18 ) , TextSize :: from( 20 ) ) ; // space
271- assert_eq ! ( col_index. utf16_to_utf8_col( 1 , 19 ) , TextSize :: from( 21 ) ) ; // second メ at 21..24
241+ let text: String = {
242+ let mut chars: Vec < char > = ( ( 0 as char ) ..char:: MAX ) . collect ( ) ; // Neat!
243+ chars. extend ( "\n " . repeat ( chars. len ( ) / 16 ) . chars ( ) ) ;
244+ let mut rng = oorandom:: Rand32 :: new ( stdx:: rand:: seed ( ) ) ;
245+ stdx:: rand:: shuffle ( & mut chars, |i| rng. rand_range ( 0 ..i as u32 ) as usize ) ;
246+ chars. into_iter ( ) . collect ( )
247+ } ;
248+ assert ! ( text. contains( '💩' ) ) ; // Sanity check.
249+
250+ let line_index = LineIndex :: new ( & text) ;
251+
252+ let mut lin_col = LineCol { line : 0 , col : 0 } ;
253+ let mut col_utf16 = 0 ;
254+ let mut col_utf32 = 0 ;
255+ for ( offset, c) in text. char_indices ( ) {
256+ let got_offset = line_index. offset ( lin_col) . unwrap ( ) ;
257+ assert_eq ! ( usize :: from( got_offset) , offset) ;
258+
259+ let got_lin_col = line_index. line_col ( got_offset) ;
260+ assert_eq ! ( got_lin_col, lin_col) ;
261+
262+ for enc in [ Utf16 , Utf32 ] {
263+ let wide_lin_col = line_index. to_wide ( enc, lin_col) ;
264+ let got_lin_col = line_index. to_utf8 ( enc, wide_lin_col) ;
265+ assert_eq ! ( got_lin_col, lin_col) ;
266+
267+ let want_col = match enc {
268+ Utf16 => col_utf16,
269+ Utf32 => col_utf32,
270+ } ;
271+ assert_eq ! ( wide_lin_col. col, want_col)
272+ }
272273
273- assert_eq ! ( col_index. utf16_to_utf8_col( 2 , 15 ) , TextSize :: from( 15 ) ) ;
274+ if c == '\n' {
275+ lin_col. line += 1 ;
276+ lin_col. col = 0 ;
277+ col_utf16 = 0 ;
278+ col_utf32 = 0 ;
279+ } else {
280+ lin_col. col += c. len_utf8 ( ) as u32 ;
281+ col_utf16 += c. len_utf16 ( ) as u32 ;
282+ col_utf32 += 1 ;
283+ }
284+ }
274285 }
275286
276287 #[ test]
0 commit comments