Skip to content

Commit 083552b

Browse files
committed
perf: WithUtf16
1 parent 746949b commit 083552b

File tree

1 file changed

+53
-16
lines changed

1 file changed

+53
-16
lines changed

src/with_utf16.rs

Lines changed: 53 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ pub struct WithUtf16<'object_pool, 'text> {
77
/// line is a string reference
88
pub line: &'text str,
99
/// The byte offset in `line` of each `char`, indexed by UTF-16 code unit.
10-
pub utf16_byte_indices: OnceCell<Pooled<'object_pool>>,
10+
pub utf16_byte_indices: OnceCell<Option<Pooled<'object_pool>>>,
1111
object_pool: &'object_pool ObjectPool,
1212
}
1313

@@ -21,31 +21,68 @@ impl<'object_pool, 'text> WithUtf16<'object_pool, 'text> {
2121
}
2222

2323
/// `substring::SubString` with a cache; indices are UTF-16 code unit indices.
24-
pub fn substring(&self, start_index: usize, end_index: usize) -> &'text str {
25-
if end_index <= start_index {
24+
#[allow(unsafe_code)]
25+
pub fn substring(
26+
&self,
27+
start_utf16_index: usize,
28+
end_utf16_index: usize,
29+
) -> &'text str {
30+
if end_utf16_index <= start_utf16_index {
2631
return "";
2732
}
2833

2934
let utf16_byte_indices = self.utf16_byte_indices.get_or_init(|| {
3035
let mut vec = self.object_pool.pull(self.line.len());
31-
for (byte_index, ch) in self.line.char_indices() {
32-
match ch.len_utf16() {
33-
1 => vec.push(byte_index),
34-
2 => {
35-
vec.push(byte_index);
36-
vec.push(byte_index);
37-
}
38-
_ => unreachable!(),
36+
let bytes = self.line.as_bytes();
37+
let mut byte_pos = 0;
38+
39+
while byte_pos < bytes.len() {
40+
let byte = unsafe { *bytes.get_unchecked(byte_pos) };
41+
42+
if byte < 0x80 {
43+
// ASCII: 1 byte = 1 UTF-16 unit
44+
vec.push(byte_pos);
45+
byte_pos += 1;
46+
} else if byte < 0xE0 {
47+
// 2-byte UTF-8 = 1 UTF-16 unit
48+
vec.push(byte_pos);
49+
byte_pos += 2;
50+
} else if byte < 0xF0 {
51+
// 3-byte UTF-8 = 1 UTF-16 unit
52+
vec.push(byte_pos);
53+
byte_pos += 3;
54+
} else {
55+
// 4-byte UTF-8 = 2 UTF-16 units (surrogate pair)
56+
vec.push(byte_pos);
57+
vec.push(byte_pos);
58+
byte_pos += 4;
3959
}
4060
}
41-
vec
61+
62+
if vec.len() == self.line.len() {
63+
// Optimization: UTF-16 length equals UTF-8 byte length, so the line is pure ASCII and UTF-16 indices are byte indices.
64+
// Return None to release the vector back to the object pool for better memory efficiency.
65+
None
66+
} else {
67+
Some(vec)
68+
}
4269
});
4370

44-
let str_len = self.line.len();
45-
let start = *utf16_byte_indices.get(start_index).unwrap_or(&str_len);
46-
let end = *utf16_byte_indices.get(end_index).unwrap_or(&str_len);
71+
let utf8_len = self.line.len();
72+
73+
let Some(utf16_byte_indices) = utf16_byte_indices else {
74+
let start_utf16_index = start_utf16_index.min(utf8_len);
75+
let end_utf16_index = end_utf16_index.min(utf8_len);
76+
return unsafe {
77+
self.line.get_unchecked(start_utf16_index..end_utf16_index)
78+
};
79+
};
80+
81+
let start = *utf16_byte_indices
82+
.get(start_utf16_index)
83+
.unwrap_or(&utf8_len);
84+
let end = *utf16_byte_indices.get(end_utf16_index).unwrap_or(&utf8_len);
4785

48-
#[allow(unsafe_code)]
4986
unsafe {
5087
// SAFETY: Since `indices` iterates over the `CharIndices` of `self`, we can guarantee
5188
// that the indices obtained from it will always be within the bounds of `self` and they

0 commit comments

Comments
 (0)