Skip to content

Commit bc23974

Browse files
committed
perf: WithUtf16 by bytes
1 parent 045e2c0 commit bc23974

File tree

1 file changed

+23
-8
lines changed

1 file changed

+23
-8
lines changed

src/with_utf16.rs

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,16 +33,31 @@ impl<'object_pool, 'text> WithUtf16<'object_pool, 'text> {
3333

3434
let utf16_byte_indices = self.utf16_byte_indices.get_or_init(|| {
3535
let mut vec = self.object_pool.pull(self.line.len());
36-
for (byte_index, ch) in self.line.char_indices() {
37-
match ch.len_utf16() {
38-
1 => vec.push(byte_index),
39-
2 => {
40-
vec.push(byte_index);
41-
vec.push(byte_index);
42-
}
43-
_ => unreachable!(),
36+
37+
let bytes = self.line.as_bytes();
38+
let mut byte_pos = 0;
39+
while byte_pos < bytes.len() {
40+
let byte = unsafe { *bytes.get_unchecked(byte_pos) };
41+
if byte < 0x80 {
42+
// ASCII: 1 byte = 1 UTF-16 unit
43+
vec.push(byte_pos);
44+
byte_pos += 1;
45+
} else if byte < 0xE0 {
46+
// 2-byte UTF-8 = 1 UTF-16 unit
47+
vec.push(byte_pos);
48+
byte_pos += 2;
49+
} else if byte < 0xF0 {
50+
// 3-byte UTF-8 = 1 UTF-16 unit
51+
vec.push(byte_pos);
52+
byte_pos += 3;
53+
} else {
54+
// 4-byte UTF-8 = 2 UTF-16 units (surrogate pair)
55+
vec.push(byte_pos);
56+
vec.push(byte_pos);
57+
byte_pos += 4;
4458
}
4559
}
60+
4661
if vec.len() == self.line.len() {
4762
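// Optimization: the UTF-16 length equals the UTF-8 byte length only when the line is pure ASCII (no multi-byte characters, hence no surrogate pairs), so byte indices and UTF-16 indices coincide.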
4863
// Return None to release the vector back to the object pool for better memory efficiency.

0 commit comments

Comments
 (0)