Skip to content

Commit 3d48556

Browse files
committed
perf: WithUtf16
1 parent 746949b commit 3d48556

File tree

1 file changed

+29
-8
lines changed

1 file changed

+29
-8
lines changed

src/with_utf16.rs

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ pub struct WithUtf16<'object_pool, 'text> {
77
/// line is a string reference
88
pub line: &'text str,
99
/// The byte position of each `char` in the `line` string slice.
10-
pub utf16_byte_indices: OnceCell<Pooled<'object_pool>>,
10+
pub utf16_byte_indices: OnceCell<Option<Pooled<'object_pool>>>,
1111
object_pool: &'object_pool ObjectPool,
1212
}
1313

@@ -21,8 +21,13 @@ impl<'object_pool, 'text> WithUtf16<'object_pool, 'text> {
2121
}
2222

2323
/// substring::SubString with cache
24-
pub fn substring(&self, start_index: usize, end_index: usize) -> &'text str {
25-
if end_index <= start_index {
24+
#[allow(unsafe_code)]
25+
pub fn substring(
26+
&self,
27+
start_utf16_index: usize,
28+
end_utf16_index: usize,
29+
) -> &'text str {
30+
if end_utf16_index <= start_utf16_index {
2631
return "";
2732
}
2833

@@ -38,14 +43,30 @@ impl<'object_pool, 'text> WithUtf16<'object_pool, 'text> {
3843
_ => unreachable!(),
3944
}
4045
}
41-
vec
46+
if vec.len() == self.line.len() {
47+
// Optimization: the UTF-16 length equals the UTF-8 byte length, which is only possible when `line` is pure ASCII, so UTF-16 indices are identical to byte indices.
48+
// Return None to release the vector back to the object pool for better memory efficiency.
49+
None
50+
} else {
51+
Some(vec)
52+
}
4253
});
4354

44-
let str_len = self.line.len();
45-
let start = *utf16_byte_indices.get(start_index).unwrap_or(&str_len);
46-
let end = *utf16_byte_indices.get(end_index).unwrap_or(&str_len);
55+
let utf8_len = self.line.len();
56+
57+
let Some(utf16_byte_indices) = utf16_byte_indices else {
58+
let start_utf16_index = start_utf16_index.min(utf8_len);
59+
let end_utf16_index = end_utf16_index.min(utf8_len);
60+
return unsafe {
61+
self.line.get_unchecked(start_utf16_index..end_utf16_index)
62+
};
63+
};
64+
65+
let start = *utf16_byte_indices
66+
.get(start_utf16_index)
67+
.unwrap_or(&utf8_len);
68+
let end = *utf16_byte_indices.get(end_utf16_index).unwrap_or(&utf8_len);
4769

48-
#[allow(unsafe_code)]
4970
unsafe {
5071
// SAFETY: Since `indices` iterates over the `CharIndices` of `self`, we can guarantee
5172
// that the indices obtained from it will always be within the bounds of `self` and they

0 commit comments

Comments
 (0)