Skip to content

Commit 1ebb431

Browse files
committed
fix(iconv): separate endianness and buffer filling in utf16, utf32 to UCS-4 format
1 parent 0903ea4 commit 1ebb431

File tree

2 files changed

+47
-46
lines changed

2 files changed

+47
-46
lines changed

i18n/iconv_lib/utf_16.rs

Lines changed: 33 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ pub fn from_ucs4(
128128
suppress_error: bool,
129129
variant: UTF16Variant,
130130
) -> (u32, Vec<u8>) {
131-
let mut utf16: Vec<u8> = Vec::with_capacity(input.len() * 4); // Pre-allocate assuming worst case
131+
let mut utf16: Vec<u16> = Vec::new();
132132
let variant = match variant {
133133
UTF16Variant::UTF16LE => UTF16Variant::UTF16LE,
134134
UTF16Variant::UTF16BE => UTF16Variant::UTF16BE,
@@ -142,49 +142,48 @@ pub fn from_ucs4(
142142
};
143143

144144
for &code_point in input {
145-
if code_point <= 0xFFFF {
146-
if (0xD800..=0xDFFF).contains(&code_point) {
147-
if !suppress_error {
148-
eprintln!("Error: Isolated surrogate code point U+{:04X}", code_point);
149-
}
150-
if omit_invalid {
151-
continue;
152-
} else {
153-
return (1, utf16);
154-
}
145+
if code_point <= 0xD7FF || (0xE000..=0xFFFF).contains(&code_point) {
146+
utf16.push(u16::try_from(code_point).unwrap());
147+
} else if (0xD800..=0xDFFF).contains(&code_point) {
148+
if !suppress_error {
149+
eprintln!("Error: Isolated surrogate code point U+{:04X}", code_point);
155150
}
156-
157-
match variant {
158-
UTF16Variant::UTF16LE => LittleEndian::write_u16(&mut utf16, code_point as u16),
159-
UTF16Variant::UTF16BE => BigEndian::write_u16(&mut utf16, code_point as u16),
160-
_ => unreachable!(),
151+
if !omit_invalid {
152+
return (1, to_bytes(&utf16, variant));
161153
}
162154
} else if code_point <= 0x10FFFF {
163155
let code_point = code_point - 0x10000;
164-
let high_surrogate = (code_point >> 10) as u16 + 0xD800;
165-
let low_surrogate = (code_point & 0x3FF) as u16 + 0xDC00;
166-
match variant {
167-
UTF16Variant::UTF16LE => {
168-
LittleEndian::write_u16(&mut utf16, high_surrogate);
169-
LittleEndian::write_u16(&mut utf16, low_surrogate);
170-
}
171-
UTF16Variant::UTF16BE => {
172-
BigEndian::write_u16(&mut utf16, high_surrogate);
173-
BigEndian::write_u16(&mut utf16, low_surrogate);
174-
}
175-
_ => unreachable!(),
176-
}
156+
let high_surrogate = ((code_point >> 10) as u16) + 0xD800;
157+
let low_surrogate = ((code_point & 0x3FF) as u16) + 0xDC00;
158+
utf16.push(high_surrogate);
159+
utf16.push(low_surrogate);
177160
} else {
178161
if !suppress_error {
179162
eprintln!("Error: Invalid Unicode code point U+{:X}", code_point);
180163
}
181-
if omit_invalid {
182-
continue;
183-
} else {
184-
return (1, utf16);
164+
if !omit_invalid {
165+
return (1, to_bytes(&utf16, variant));
185166
}
186167
}
187168
}
188169

189-
(0, utf16)
170+
(0, to_bytes(&utf16, variant))
171+
}
172+
173+
fn to_bytes(utf16: &[u16], variant: UTF16Variant) -> Vec<u8> {
174+
let mut bytes = Vec::with_capacity(utf16.len() * 2);
175+
match variant {
176+
UTF16Variant::UTF16LE => {
177+
for &code_unit in utf16 {
178+
bytes.extend_from_slice(&code_unit.to_le_bytes());
179+
}
180+
}
181+
UTF16Variant::UTF16BE => {
182+
for &code_unit in utf16 {
183+
bytes.extend_from_slice(&code_unit.to_be_bytes());
184+
}
185+
}
186+
_ => unreachable!(),
187+
}
188+
bytes
190189
}

i18n/iconv_lib/utf_32.rs

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -102,30 +102,32 @@ pub fn from_ucs4(
102102
};
103103

104104
if include_bom {
105-
match variant {
106-
UTF32Variant::UTF32LE => LittleEndian::write_u32(&mut utf32, BOM),
107-
UTF32Variant::UTF32BE => BigEndian::write_u32(&mut utf32, BOM),
108-
_ => unreachable!(),
109-
}
105+
write_u32(&mut utf32, BOM, variant);
110106
}
111107

112108
for &code_point in input {
113-
if code_point >= 0x110000 {
109+
if code_point > 0x10FFFF {
114110
if !suppress_error {
115-
eprintln!("Error: Invalid code point U+{:X}", code_point);
111+
eprintln!("Error: Invalid Unicode code point U+{:X}", code_point);
116112
}
117113
if omit_invalid {
118114
continue;
119115
} else {
120116
return (1, utf32);
121117
}
122118
}
123-
match variant {
124-
UTF32Variant::UTF32LE => LittleEndian::write_u32(&mut utf32, code_point),
125-
UTF32Variant::UTF32BE => BigEndian::write_u32(&mut utf32, code_point),
126-
_ => unreachable!(),
127-
}
119+
write_u32(&mut utf32, code_point, variant);
128120
}
129121

130122
(0, utf32)
131123
}
124+
125+
fn write_u32(buffer: &mut Vec<u8>, value: u32, variant: UTF32Variant) {
126+
let mut temp = [0u8; 4];
127+
match variant {
128+
UTF32Variant::UTF32LE => LittleEndian::write_u32(&mut temp, value),
129+
UTF32Variant::UTF32BE => BigEndian::write_u32(&mut temp, value),
130+
_ => unreachable!(),
131+
}
132+
buffer.extend_from_slice(&temp);
133+
}

0 commit comments

Comments
 (0)