Skip to content

Commit ee72991

Browse files
authored
fix: fixes UTF-16 encoding issues (#202)
1 parent 446ceaf commit ee72991

File tree

4 files changed

+429
-143
lines changed

4 files changed

+429
-143
lines changed

src/cached_source.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ use crate::{
2121
struct CachedData {
2222
hash: OnceLock<u64>,
2323
size: OnceLock<usize>,
24-
line_only_map: OnceLock<Option<SourceMap>>,
2524
columns_map: OnceLock<Option<SourceMap>>,
25+
line_only_map: OnceLock<Option<SourceMap>>,
2626
}
2727

2828
/// It tries to reuse cached results from other methods to avoid calculations,

src/helpers.rs

Lines changed: 139 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -91,18 +91,11 @@ pub trait Chunks {
9191
/// [StreamChunks] abstraction, see [webpack-sources source.streamChunks](https://github.com/webpack/webpack-sources/blob/9f98066311d53a153fdc7c633422a1d086528027/lib/helpers/streamChunks.js#L13).
9292
pub trait StreamChunks {
9393
/// [StreamChunks] abstraction
94-
fn stream_chunks<'a>(
95-
&'a self,
96-
// object_pool: &'a ObjectPool,
97-
// options: &MapOptions,
98-
// on_chunk: OnChunk<'_, 'a>,
99-
// on_source: OnSource<'_, 'a>,
100-
// on_name: OnName<'_, 'a>,
101-
) -> Box<dyn Chunks + 'a>;
94+
fn stream_chunks<'a>(&'a self) -> Box<dyn Chunks + 'a>;
10295
}
10396

10497
/// [OnChunk] abstraction, see [webpack-sources onChunk](https://github.com/webpack/webpack-sources/blob/9f98066311d53a153fdc7c633422a1d086528027/lib/helpers/streamChunks.js#L13).
105-
pub type OnChunk<'a, 'b> = &'a mut dyn FnMut(Option<Cow<'b, str>>, Mapping);
98+
pub type OnChunk<'a, 'b> = &'a mut dyn FnMut(Option<&'b str>, Mapping);
10699

107100
/// [OnSource] abstraction, see [webpack-sources onSource](https://github.com/webpack/webpack-sources/blob/9f98066311d53a153fdc7c633422a1d086528027/lib/helpers/streamChunks.js#L13).
108101
///
@@ -160,57 +153,94 @@ pub fn encode_mappings(mappings: impl Iterator<Item = Mapping>) -> String {
160153
encoder.drain()
161154
}
162155

156+
#[derive(Debug, PartialEq, Eq)]
157+
pub struct Token<'a> {
158+
pub text: &'a str,
159+
pub utf16_len: usize,
160+
}
161+
163162
pub struct PotentialTokens<'a> {
164163
bytes: &'a [u8],
165-
source: &'a str,
164+
text: &'a str,
166165
index: usize,
166+
utf16_len: usize,
167167
}
168168

169169
impl<'a> Iterator for PotentialTokens<'a> {
170-
type Item = &'a str;
170+
type Item = Token<'a>;
171171

172+
#[allow(unsafe_code)]
172173
fn next(&mut self) -> Option<Self::Item> {
173-
if let Some(&c) = self.bytes.get(self.index) {
174-
let start = self.index;
175-
let mut c = char::from(c);
176-
while c != '\n' && c != ';' && c != '{' && c != '}' {
174+
if self.index >= self.bytes.len() {
175+
return None;
176+
}
177+
178+
let start = self.index;
179+
let mut c = self.bytes[self.index];
180+
while c != b'\n' && c != b';' && c != b'{' && c != b'}' {
181+
// Determine character boundaries based on UTF-8 bytes and calculate UTF-16 length
182+
if c < 0x80 {
183+
// ASCII character: 1 byte -> 1 UTF-16 code unit
184+
self.utf16_len += 1;
177185
self.index += 1;
178-
if let Some(&ch) = self.bytes.get(self.index) {
179-
c = char::from(ch);
180-
} else {
181-
return Some(&self.source[start..self.index]);
182-
}
186+
} else if c < 0xE0 {
187+
// 2-byte UTF-8 sequence -> 1 UTF-16 code unit
188+
self.utf16_len += 1;
189+
self.index += 2;
190+
} else if c < 0xF0 {
191+
// 3-byte UTF-8 sequence -> 1 UTF-16 code unit
192+
self.utf16_len += 1;
193+
self.index += 3;
194+
} else {
195+
// 4-byte UTF-8 sequence -> 2 UTF-16 code units (surrogate pair)
196+
self.utf16_len += 2;
197+
self.index += 4;
183198
}
184-
while c == ';'
185-
|| c == ' '
186-
|| c == '{'
187-
|| c == '}'
188-
|| c == '\r'
189-
|| c == '\t'
190-
{
191-
self.index += 1;
192-
if let Some(&ch) = self.bytes.get(self.index) {
193-
c = char::from(ch);
194-
} else {
195-
return Some(&self.source[start..self.index]);
196-
}
199+
200+
if self.index < self.bytes.len() {
201+
c = self.bytes[self.index];
202+
} else {
203+
let text = unsafe { self.text.get_unchecked(start..) };
204+
return Some(Token {
205+
text,
206+
utf16_len: self.utf16_len,
207+
});
197208
}
198-
if c == '\n' {
199-
self.index += 1;
209+
}
210+
211+
while self.index < self.bytes.len() {
212+
match self.bytes[self.index] {
213+
b';' | b' ' | b'{' | b'}' | b'\r' | b'\t' => {
214+
self.index += 1;
215+
self.utf16_len += 1;
216+
}
217+
b'\n' => {
218+
self.index += 1;
219+
self.utf16_len += 1;
220+
break;
221+
}
222+
_ => {
223+
break;
224+
}
200225
}
201-
Some(&self.source[start..self.index])
202-
} else {
203-
None
204226
}
227+
let text = unsafe { self.text.get_unchecked(start..self.index) };
228+
let token = Token {
229+
text,
230+
utf16_len: self.utf16_len,
231+
};
232+
self.utf16_len = 0;
233+
Some(token)
205234
}
206235
}
207236

208237
// /[^\n;{}]+[;{} \r\t]*\n?|[;{} \r\t]+\n?|\n/g
209-
pub fn split_into_potential_tokens(source: &str) -> PotentialTokens {
238+
pub fn split_into_potential_tokens(text: &str) -> PotentialTokens {
210239
PotentialTokens {
211-
bytes: source.as_bytes(),
212-
source,
240+
bytes: text.as_bytes(),
241+
text,
213242
index: 0,
243+
utf16_len: 0,
214244
}
215245
}
216246

@@ -283,7 +313,7 @@ pub fn stream_chunks_of_raw_source<'a>(
283313
let mut last_line = None;
284314
for l in split_into_lines(source) {
285315
on_chunk(
286-
Some(Cow::Borrowed(l)),
316+
Some(l),
287317
Mapping {
288318
generated_line: line,
289319
generated_column: 0,
@@ -487,7 +517,7 @@ fn stream_chunks_of_source_map_full<'a>(
487517
}
488518
if !chunk.is_empty() {
489519
on_chunk(
490-
Some(Cow::Borrowed(chunk)),
520+
Some(chunk),
491521
Mapping {
492522
generated_line: mapping_line,
493523
generated_column: mapping_column,
@@ -504,7 +534,7 @@ fn stream_chunks_of_source_map_full<'a>(
504534
let chunk = lines[(current_generated_line - 1) as usize]
505535
.substring(current_generated_column as usize, usize::MAX);
506536
on_chunk(
507-
Some(Cow::Borrowed(chunk)),
537+
Some(chunk),
508538
Mapping {
509539
generated_line: current_generated_line,
510540
generated_column: current_generated_column,
@@ -519,7 +549,7 @@ fn stream_chunks_of_source_map_full<'a>(
519549
if current_generated_line as usize <= lines.len() {
520550
let chunk = &lines[(current_generated_line as usize) - 1].line;
521551
on_chunk(
522-
Some(Cow::Borrowed(chunk)),
552+
Some(chunk),
523553
Mapping {
524554
generated_line: current_generated_line,
525555
generated_column: 0,
@@ -536,7 +566,7 @@ fn stream_chunks_of_source_map_full<'a>(
536566
mapping.generated_column as usize,
537567
);
538568
on_chunk(
539-
Some(Cow::Borrowed(chunk)),
569+
Some(chunk),
540570
Mapping {
541571
generated_line: current_generated_line,
542572
generated_column: current_generated_column,
@@ -648,7 +678,7 @@ fn stream_chunks_of_source_map_lines_full<'a>(
648678
if current_generated_line as usize <= lines.len() {
649679
let chunk = &lines[current_generated_line as usize - 1];
650680
on_chunk(
651-
Some(Cow::Borrowed(chunk)),
681+
Some(chunk),
652682
Mapping {
653683
generated_line: current_generated_line,
654684
generated_column: 0,
@@ -666,7 +696,7 @@ fn stream_chunks_of_source_map_lines_full<'a>(
666696
let chunk = &lines[current_generated_line as usize - 1];
667697
mapping.generated_column = 0;
668698
original.name_index = None;
669-
on_chunk(Some(Cow::Borrowed(chunk)), mapping);
699+
on_chunk(Some(chunk), mapping);
670700
current_generated_line += 1;
671701
}
672702
};
@@ -676,7 +706,7 @@ fn stream_chunks_of_source_map_lines_full<'a>(
676706
while current_generated_line as usize <= lines.len() {
677707
let chunk = &lines[current_generated_line as usize - 1];
678708
on_chunk(
679-
Some(Cow::Borrowed(chunk)),
709+
Some(chunk),
680710
Mapping {
681711
generated_line: current_generated_line,
682712
generated_column: 0,
@@ -706,7 +736,7 @@ fn stream_chunks_of_source_map_lines_full<'a>(
706736
#[derive(Debug)]
707737
struct SourceMapLineData<'a> {
708738
pub mappings_data: Vec<i64>,
709-
pub chunks: Vec<Cow<'a, str>>,
739+
pub chunks: Vec<&'a str>,
710740
}
711741

712742
type InnerSourceIndexValueMapping<'a> =
@@ -1257,11 +1287,13 @@ mod tests {
12571287
use std::sync::LazyLock;
12581288

12591289
use super::{
1260-
stream_chunks_of_source_map_final, stream_chunks_of_source_map_full,
1261-
stream_chunks_of_source_map_lines_final,
1290+
split_into_potential_tokens, stream_chunks_of_source_map_final,
1291+
stream_chunks_of_source_map_full, stream_chunks_of_source_map_lines_final,
12621292
stream_chunks_of_source_map_lines_full, GeneratedInfo,
12631293
};
1264-
use crate::{Mapping, ObjectPool, OriginalLocation, SourceMap};
1294+
use crate::{
1295+
helpers::Token, Mapping, ObjectPool, OriginalLocation, SourceMap,
1296+
};
12651297

12661298
const UTF16_SOURCE: &'static str = "var i18n = JSON.parse('{\"魑魅魍魉\":{\"en-US\":\"Evil spirits\",\"zh-CN\":\"魑魅魍魉\"}}');\nvar __webpack_exports___ = i18n[\"魑魅魍魉\"];\nexport { __webpack_exports___ as 魑魅魍魉 };";
12671299

@@ -1291,22 +1323,22 @@ mod tests {
12911323
assert_eq!(
12921324
chunks,
12931325
vec![
1294-
("var ".into(), Mapping { generated_line: 1, generated_column: 0, original: Some(OriginalLocation { source_index: 0, original_line: 1, original_column: 0, name_index: None }) }),
1295-
("i18n = ".into(), Mapping { generated_line: 1, generated_column: 4, original: Some(OriginalLocation { source_index: 0, original_line: 1, original_column: 4, name_index: Some(0) }) }),
1296-
("JSON.".into(), Mapping { generated_line: 1, generated_column: 11, original: Some(OriginalLocation { source_index: 0, original_line: 1, original_column: 11, name_index: Some(1) }) }),
1297-
("parse".into(), Mapping { generated_line: 1, generated_column: 16, original: Some(OriginalLocation { source_index: 0, original_line: 1, original_column: 16, name_index: None }) }),
1298-
("(".into(), Mapping { generated_line: 1, generated_column: 21, original: Some(OriginalLocation { source_index: 0, original_line: 1, original_column: 21, name_index: None }) }),
1299-
("'{\"魑魅魍魉\":{\"en-US\":\"Evil spirits\",\"zh-CN\":\"魑魅魍魉\"}}');\n".into(), Mapping { generated_line: 1, generated_column: 22, original: Some(OriginalLocation { source_index: 0, original_line: 1, original_column: 22, name_index: None }) }),
1300-
("var ".into(), Mapping { generated_line: 2, generated_column: 0, original: Some(OriginalLocation { source_index: 0, original_line: 2, original_column: 0, name_index: None }) }),
1301-
("__webpack_exports___ = ".into(), Mapping { generated_line: 2, generated_column: 4, original: Some(OriginalLocation { source_index: 0, original_line: 2, original_column: 4, name_index: Some(2) }) }),
1302-
("i18n".into(), Mapping { generated_line: 2, generated_column: 27, original: Some(OriginalLocation { source_index: 0, original_line: 2, original_column: 27, name_index: Some(0) }) }),
1303-
("[".into(), Mapping { generated_line: 2, generated_column: 31, original: Some(OriginalLocation { source_index: 0, original_line: 2, original_column: 31, name_index: None }) }),
1304-
("\"魑魅魍魉\"]".into(), Mapping { generated_line: 2, generated_column: 32, original: Some(OriginalLocation { source_index: 0, original_line: 2, original_column: 32, name_index: None }) }),
1305-
(";\n".into(), Mapping { generated_line: 2, generated_column: 39, original: Some(OriginalLocation { source_index: 0, original_line: 2, original_column: 39, name_index: None }) }),
1306-
("export { ".into(), Mapping { generated_line: 3, generated_column: 0, original: Some(OriginalLocation { source_index: 0, original_line: 3, original_column: 0, name_index: None }) }),
1307-
("__webpack_exports___ as ".into(), Mapping { generated_line: 3, generated_column: 9, original: Some(OriginalLocation { source_index: 0, original_line: 3, original_column: 9, name_index: Some(2) }) }),
1308-
("魑魅魍魉".into(), Mapping { generated_line: 3, generated_column: 33, original: Some(OriginalLocation { source_index: 0, original_line: 3, original_column: 33, name_index: Some(3) }) }),
1309-
(" };".into(), Mapping { generated_line: 3, generated_column: 37, original: Some(OriginalLocation { source_index: 0, original_line: 3, original_column: 37, name_index: None }) })
1326+
("var ", Mapping { generated_line: 1, generated_column: 0, original: Some(OriginalLocation { source_index: 0, original_line: 1, original_column: 0, name_index: None }) }),
1327+
("i18n = ", Mapping { generated_line: 1, generated_column: 4, original: Some(OriginalLocation { source_index: 0, original_line: 1, original_column: 4, name_index: Some(0) }) }),
1328+
("JSON.", Mapping { generated_line: 1, generated_column: 11, original: Some(OriginalLocation { source_index: 0, original_line: 1, original_column: 11, name_index: Some(1) }) }),
1329+
("parse", Mapping { generated_line: 1, generated_column: 16, original: Some(OriginalLocation { source_index: 0, original_line: 1, original_column: 16, name_index: None }) }),
1330+
("(", Mapping { generated_line: 1, generated_column: 21, original: Some(OriginalLocation { source_index: 0, original_line: 1, original_column: 21, name_index: None }) }),
1331+
("'{\"魑魅魍魉\":{\"en-US\":\"Evil spirits\",\"zh-CN\":\"魑魅魍魉\"}}');\n", Mapping { generated_line: 1, generated_column: 22, original: Some(OriginalLocation { source_index: 0, original_line: 1, original_column: 22, name_index: None }) }),
1332+
("var ", Mapping { generated_line: 2, generated_column: 0, original: Some(OriginalLocation { source_index: 0, original_line: 2, original_column: 0, name_index: None }) }),
1333+
("__webpack_exports___ = ", Mapping { generated_line: 2, generated_column: 4, original: Some(OriginalLocation { source_index: 0, original_line: 2, original_column: 4, name_index: Some(2) }) }),
1334+
("i18n", Mapping { generated_line: 2, generated_column: 27, original: Some(OriginalLocation { source_index: 0, original_line: 2, original_column: 27, name_index: Some(0) }) }),
1335+
("[", Mapping { generated_line: 2, generated_column: 31, original: Some(OriginalLocation { source_index: 0, original_line: 2, original_column: 31, name_index: None }) }),
1336+
("\"魑魅魍魉\"]", Mapping { generated_line: 2, generated_column: 32, original: Some(OriginalLocation { source_index: 0, original_line: 2, original_column: 32, name_index: None }) }),
1337+
(";\n", Mapping { generated_line: 2, generated_column: 39, original: Some(OriginalLocation { source_index: 0, original_line: 2, original_column: 39, name_index: None }) }),
1338+
("export { ", Mapping { generated_line: 3, generated_column: 0, original: Some(OriginalLocation { source_index: 0, original_line: 3, original_column: 0, name_index: None }) }),
1339+
("__webpack_exports___ as ", Mapping { generated_line: 3, generated_column: 9, original: Some(OriginalLocation { source_index: 0, original_line: 3, original_column: 9, name_index: Some(2) }) }),
1340+
("魑魅魍魉", Mapping { generated_line: 3, generated_column: 33, original: Some(OriginalLocation { source_index: 0, original_line: 3, original_column: 33, name_index: Some(3) }) }),
1341+
(" };", Mapping { generated_line: 3, generated_column: 37, original: Some(OriginalLocation { source_index: 0, original_line: 3, original_column: 37, name_index: None }) })
13101342
]
13111343
);
13121344

@@ -1384,4 +1416,42 @@ mod tests {
13841416
}
13851417
)
13861418
}
1419+
1420+
#[test]
1421+
fn test_split_into_potential_tokens() {
1422+
let tokens = split_into_potential_tokens("var i18n = JSON.parse('{\"魑魅魍魉\":{\"en-US\":\"Evil spirits\",\"zh-CN\":\"魑魅魍魉\"}}');\nvar __webpack_exports___ = i18n[\"魑魅魍魉\"];\nexport { __webpack_exports___ as 魑魅魍魉 };").collect::<Vec<_>>();
1423+
assert_eq!(
1424+
tokens,
1425+
vec![
1426+
Token {
1427+
text: "var i18n = JSON.parse('{",
1428+
utf16_len: 24,
1429+
},
1430+
Token {
1431+
text: "\"魑魅魍魉\":{",
1432+
utf16_len: 8,
1433+
},
1434+
Token {
1435+
text: "\"en-US\":\"Evil spirits\",\"zh-CN\":\"魑魅魍魉\"}}",
1436+
utf16_len: 39,
1437+
},
1438+
Token {
1439+
text: "');\n",
1440+
utf16_len: 4,
1441+
},
1442+
Token {
1443+
text: "var __webpack_exports___ = i18n[\"魑魅魍魉\"];\n",
1444+
utf16_len: 41,
1445+
},
1446+
Token {
1447+
text: "export { ",
1448+
utf16_len: 9,
1449+
},
1450+
Token {
1451+
text: "__webpack_exports___ as 魑魅魍魉 };",
1452+
utf16_len: 31,
1453+
},
1454+
]
1455+
);
1456+
}
13871457
}

0 commit comments

Comments
 (0)