Skip to content

Commit 51c8677

Browse files
authored
revert: substring heuristic algorithm (#155)
1 parent 1c5d9f2 commit 51c8677

File tree

3 files changed

+47
-199
lines changed

3 files changed

+47
-199
lines changed

src/helpers.rs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
use std::{
22
borrow::{BorrowMut, Cow},
33
cell::{OnceCell, RefCell},
4-
fmt::Debug,
54
marker::PhantomData,
65
ops::Range,
76
};
@@ -1244,7 +1243,7 @@ pub fn stream_and_get_source_and_map<'a, S: StreamChunks>(
12441243
}
12451244

12461245
/// Represents a text source that can be manipulated for source mapping purposes.
1247-
pub trait SourceText<'a>: Default + Clone + ToString + Debug {
1246+
pub trait SourceText<'a>: Default + Clone + ToString {
12481247
/// Splits the text into lines, returning an iterator over each line.
12491248
/// Each line includes its line ending character if present.
12501249
fn split_into_lines(&self) -> impl Iterator<Item = Self>;
@@ -1253,7 +1252,7 @@ pub trait SourceText<'a>: Default + Clone + ToString + Debug {
12531252
fn ends_with(&self, value: &str) -> bool;
12541253

12551254
/// Returns an iterator over the char indices in the text.
1256-
fn char_indices(&self) -> impl DoubleEndedIterator<Item = (usize, char)>;
1255+
fn char_indices(&self) -> impl Iterator<Item = (usize, char)>;
12571256

12581257
/// Gets the byte at the specified index, if it exists.
12591258
fn get_byte(&self, byte_index: usize) -> Option<u8>;
@@ -1290,7 +1289,7 @@ impl<'a> SourceText<'a> for Rope<'a> {
12901289
(*self).ends_with(value)
12911290
}
12921291

1293-
fn char_indices(&self) -> impl DoubleEndedIterator<Item = (usize, char)> {
1292+
fn char_indices(&self) -> impl Iterator<Item = (usize, char)> {
12941293
self.char_indices()
12951294
}
12961295

@@ -1332,7 +1331,7 @@ impl<'a> SourceText<'a> for &'a str {
13321331
(*self).ends_with(value)
13331332
}
13341333

1335-
fn char_indices(&self) -> impl DoubleEndedIterator<Item = (usize, char)> {
1334+
fn char_indices(&self) -> impl Iterator<Item = (usize, char)> {
13361335
(*self).char_indices()
13371336
}
13381337

src/rope.rs

Lines changed: 29 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
use std::{
44
borrow::Cow,
5+
collections::VecDeque,
56
hash::Hash,
67
ops::{Bound, RangeBounds},
78
rc::Rc,
@@ -133,24 +134,13 @@ impl<'a> Rope<'a> {
133134
iter: s.char_indices(),
134135
},
135136
},
136-
Repr::Full(vec) => {
137-
let right_byte_offset = vec.iter().map(|(s, _)| s.len() as u32).sum();
138-
139-
CharIndices {
140-
iter: CharIndicesEnum::Full {
141-
iters: vec
142-
.iter()
143-
.map(|(s, _)| s.char_indices())
144-
.collect::<Vec<_>>(),
145-
left_chunk_index: 0,
146-
left_byte_offset: 0,
147-
last_left_indice: None,
148-
right_chunk_index: (vec.len() - 1) as u32,
149-
right_byte_offset,
150-
right_byte_offset_for: vec.len() as u32,
151-
},
152-
}
153-
}
137+
Repr::Full(data) => CharIndices {
138+
iter: CharIndicesEnum::Full {
139+
chunks: data,
140+
char_indices: VecDeque::new(),
141+
chunk_index: 0,
142+
},
143+
},
154144
}
155145
}
156146

@@ -668,13 +658,9 @@ enum CharIndicesEnum<'a, 'b> {
668658
iter: std::str::CharIndices<'b>,
669659
},
670660
Full {
671-
iters: Vec<std::str::CharIndices<'a>>,
672-
left_chunk_index: u32,
673-
left_byte_offset: u32,
674-
last_left_indice: Option<(usize, char)>,
675-
right_chunk_index: u32,
676-
right_byte_offset: u32,
677-
right_byte_offset_for: u32,
661+
chunks: &'a [(&'b str, usize)],
662+
char_indices: VecDeque<(usize, char)>,
663+
chunk_index: usize,
678664
},
679665
}
680666

@@ -689,59 +675,29 @@ impl Iterator for CharIndices<'_, '_> {
689675
match &mut self.iter {
690676
CharIndicesEnum::Light { iter } => iter.next(),
691677
CharIndicesEnum::Full {
692-
iters,
693-
left_chunk_index,
694-
left_byte_offset,
695-
last_left_indice,
696-
..
678+
chunks,
679+
char_indices,
680+
chunk_index,
697681
} => {
698-
if (*left_chunk_index as usize) >= iters.len() {
699-
return None;
682+
if let Some(item) = char_indices.pop_front() {
683+
return Some(item);
700684
}
701-
if let Some((byte_index, char)) =
702-
iters[*left_chunk_index as usize].next()
703-
{
704-
*last_left_indice = Some((byte_index, char));
705-
Some((byte_index + (*left_byte_offset as usize), char))
706-
} else {
707-
*left_chunk_index += 1;
708-
if let Some((byte_index, char)) = last_left_indice.take() {
709-
*left_byte_offset =
710-
*left_byte_offset + byte_index as u32 + char.len_utf8() as u32;
711-
}
712-
self.next()
685+
686+
if *chunk_index >= chunks.len() {
687+
return None;
713688
}
714-
}
715-
}
716-
}
717-
}
718689

719-
impl DoubleEndedIterator for CharIndices<'_, '_> {
720-
fn next_back(&mut self) -> Option<Self::Item> {
721-
match &mut self.iter {
722-
CharIndicesEnum::Light { iter } => iter.next_back(),
723-
CharIndicesEnum::Full {
724-
iters,
725-
right_chunk_index,
726-
right_byte_offset,
727-
right_byte_offset_for,
728-
..
729-
} => {
730-
if let Some((byte_index, char)) =
731-
iters[*right_chunk_index as usize].next_back()
732-
{
733-
if *right_byte_offset_for != *right_chunk_index {
734-
*right_byte_offset =
735-
*right_byte_offset - byte_index as u32 - char.len_utf8() as u32;
736-
*right_byte_offset_for = *right_chunk_index;
737-
}
738-
Some((byte_index + (*right_byte_offset as usize), char))
739-
} else if *right_chunk_index > 0 {
740-
*right_chunk_index -= 1;
741-
self.next_back()
742-
} else {
743-
None
690+
// skip empty chunks
691+
while *chunk_index < chunks.len() && chunks[*chunk_index].0.is_empty() {
692+
*chunk_index += 1;
744693
}
694+
695+
let (chunk, start_pos) = chunks[*chunk_index];
696+
697+
char_indices
698+
.extend(chunk.char_indices().map(|(i, c)| (start_pos + i, c)));
699+
*chunk_index += 1;
700+
char_indices.pop_front()
745701
}
746702
}
747703
}
@@ -1212,29 +1168,6 @@ mod tests {
12121168
);
12131169
}
12141170

1215-
#[test]
1216-
fn reverse_char_indices() {
1217-
let mut a = Rope::new();
1218-
a.add("abc");
1219-
a.add("def");
1220-
assert_eq!(
1221-
a.char_indices().rev().collect::<Vec<_>>(),
1222-
"abcdef".char_indices().rev().collect::<Vec<_>>()
1223-
);
1224-
1225-
let mut a = Rope::new();
1226-
a.add("こんにちは");
1227-
assert_eq!(
1228-
a.char_indices().rev().collect::<Vec<_>>(),
1229-
"こんにちは".char_indices().rev().collect::<Vec<_>>()
1230-
);
1231-
a.add("世界");
1232-
assert_eq!(
1233-
a.char_indices().rev().collect::<Vec<_>>(),
1234-
"こんにちは世界".char_indices().rev().collect::<Vec<_>>()
1235-
);
1236-
}
1237-
12381171
#[test]
12391172
fn lines1() {
12401173
let rope = Rope::from("abc");

src/with_indices.rs

Lines changed: 14 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use std::{cell::Cell, marker::PhantomData};
1+
use std::{cell::OnceCell, marker::PhantomData};
22

33
use crate::helpers::SourceText;
44

@@ -9,7 +9,8 @@ where
99
{
1010
/// line is a string reference
1111
pub line: S,
12-
last_char_index_to_byte_index: Cell<(u32, u32)>,
12+
/// The byte position of each `char` in the `line` string slice.
13+
pub indices_indexes: OnceCell<Vec<usize>>,
1314
data: PhantomData<&'a S>,
1415
}
1516

@@ -19,102 +20,32 @@ where
1920
{
2021
pub fn new(line: S) -> Self {
2122
Self {
23+
indices_indexes: OnceCell::new(),
2224
line,
23-
last_char_index_to_byte_index: Cell::new((0, 0)),
2425
data: PhantomData,
2526
}
2627
}
2728

28-
pub(crate) fn substring(
29-
&self,
30-
start_char_index: usize,
31-
end_char_index: usize,
32-
) -> S {
33-
if end_char_index <= start_char_index {
29+
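/// Returns the substring between the char indices `start_index` (inclusive) and `end_index` (exclusive), like `substring::Substring` but with the char-to-byte index table cached.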
30+
pub(crate) fn substring(&self, start_index: usize, end_index: usize) -> S {
31+
if end_index <= start_index {
3432
return S::default();
3533
}
3634

37-
let line_len = self.line.len();
38-
let mut start_byte_index =
39-
if start_char_index == 0 { Some(0) } else { None };
40-
let mut end_byte_index = if end_char_index == usize::MAX {
41-
Some(line_len)
42-
} else {
43-
None
44-
};
35+
let indices_indexes = self.indices_indexes.get_or_init(|| {
36+
self.line.char_indices().map(|(i, _)| i).collect::<Vec<_>>()
37+
});
4538

46-
if start_byte_index.is_some() && end_byte_index.is_some() {
47-
return self.line.clone();
48-
}
49-
50-
let (last_char_index, last_byte_index) =
51-
self.last_char_index_to_byte_index.get();
52-
let byte_index = last_byte_index as usize;
53-
54-
if start_char_index >= last_char_index as usize
55-
|| end_char_index >= last_char_index as usize
56-
{
57-
let mut char_index = last_char_index as usize;
58-
#[allow(unsafe_code)]
59-
let slice = unsafe {
60-
// SAFETY: Since `indices` iterates over the `CharIndices` of `self`, we can guarantee
61-
// that the indices obtained from it will always be within the bounds of `self` and they
62-
// will always lie on UTF-8 sequence boundaries.
63-
self.line.byte_slice_unchecked(byte_index..line_len)
64-
};
65-
for (byte_offset, _) in slice.char_indices() {
66-
if char_index == start_char_index {
67-
start_byte_index = Some(byte_index + byte_offset);
68-
if end_byte_index.is_some() {
69-
break;
70-
}
71-
} else if char_index == end_char_index {
72-
end_byte_index = Some(byte_index + byte_offset);
73-
self
74-
.last_char_index_to_byte_index
75-
.set((char_index as u32, (byte_index + byte_offset) as u32));
76-
break;
77-
}
78-
char_index += 1;
79-
}
80-
}
81-
82-
if start_char_index < last_char_index as usize
83-
|| end_char_index < last_char_index as usize
84-
{
85-
let mut char_index = last_char_index as usize;
86-
#[allow(unsafe_code)]
87-
let slice = unsafe {
88-
// SAFETY: Since `indices` iterates over the `CharIndices` of `self`, we can guarantee
89-
// that the indices obtained from it will always be within the bounds of `self` and they
90-
// will always lie on UTF-8 sequence boundaries.
91-
self.line.byte_slice_unchecked(0..byte_index)
92-
};
93-
for (byte_index, char) in slice.char_indices().rev() {
94-
if char_index == end_char_index {
95-
end_byte_index = Some(byte_index + char.len_utf8());
96-
if start_byte_index.is_some() {
97-
break;
98-
}
99-
} else if char_index == start_char_index {
100-
start_byte_index = Some(byte_index + char.len_utf8());
101-
break;
102-
}
103-
char_index -= 1;
104-
}
105-
}
106-
107-
let start_byte_index = start_byte_index.unwrap_or(line_len);
108-
let end_byte_index = end_byte_index.unwrap_or(line_len);
39+
let str_len = self.line.len();
40+
let start = *indices_indexes.get(start_index).unwrap_or(&str_len);
41+
let end = *indices_indexes.get(end_index).unwrap_or(&str_len);
10942

11043
#[allow(unsafe_code)]
11144
unsafe {
11245
// SAFETY: Since `indices_indexes` is collected from the `CharIndices` of `self.line`, we can guarantee
11346
// that the indices obtained from it will always be within the bounds of `self` and they
11447
// will always lie on UTF-8 sequence boundaries.
115-
self
116-
.line
117-
.byte_slice_unchecked(start_byte_index..end_byte_index)
48+
self.line.byte_slice_unchecked(start..end)
11849
}
11950
}
12051
}
@@ -159,19 +90,4 @@ mod tests {
15990
"øbα"
16091
);
16192
}
162-
163-
#[test]
164-
fn test_last_char_index_to_byte_index() {
165-
let rope_with_indices =
166-
WithIndices::new(Rope::from("hello world 你好世界"));
167-
assert_eq!(rope_with_indices.substring(10, 13), "d 你");
168-
assert_eq!(rope_with_indices.substring(13, 15), "好世");
169-
assert_eq!(rope_with_indices.substring(10, 13), "d 你");
170-
171-
let rope_with_indices =
172-
WithIndices::new(Rope::from("export const answer = 42;\n"));
173-
assert_eq!(rope_with_indices.substring(7, 13), "const ");
174-
assert_eq!(rope_with_indices.substring(13, 19), "answer");
175-
assert_eq!(rope_with_indices.substring(7, 22), "const answer = ");
176-
}
17793
}

0 commit comments

Comments
 (0)