Skip to content

Commit 8bd8f76

Browse files
committed
feat(iconv): support streaming by working directly with iterators
1 parent 614034f commit 8bd8f76

File tree

5 files changed

+654
-449
lines changed

5 files changed

+654
-449
lines changed

i18n/iconv.rs

Lines changed: 175 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,101 @@ struct Args {
5858
files: Option<Vec<PathBuf>>,
5959
}
6060

61+
/// Fixed-capacity ring buffer that pulls bytes from `reader` on demand.
///
/// `read_pos` is the index of the next unread byte, `write_pos` the index
/// where the next incoming byte is stored, and `length` the number of unread
/// bytes currently held. Both positions wrap around at `capacity`.
struct CircularBuffer<R: Read> {
    reader: R,
    buffer: [u8; 10000],
    capacity: usize,
    read_pos: usize,
    write_pos: usize,
    length: usize,
}

impl<R: Read> CircularBuffer<R> {
    /// Creates an empty buffer that will read from `reader`.
    fn new(reader: R) -> Self {
        CircularBuffer {
            reader,
            buffer: [0; 10000],
            capacity: 10000,
            read_pos: 0,
            write_pos: 0,
            length: 0,
        }
    }

    /// Number of bytes that can still be stored before the buffer is full.
    fn available_space(&self) -> usize {
        self.capacity - self.length
    }

    /// Pulls more data from the reader with a single successful `read`.
    ///
    /// The data is read directly into the contiguous free region that starts
    /// at `write_pos`, so no temporary allocation or second copy is needed.
    /// Returning after one read (instead of looping until the buffer is full)
    /// lets bytes from a pipe or terminal be consumed as soon as they arrive.
    ///
    /// A full buffer is left untouched. If the reader reports end of input
    /// (`Ok(0)`) nothing is added; callers detect EOF by `length` staying 0.
    ///
    /// # Errors
    ///
    /// Returns any I/O error from the reader other than
    /// `ErrorKind::Interrupted`, which is retried.
    fn fill_buffer(&mut self) -> io::Result<()> {
        if self.length == 0 {
            // Nothing unread: rewind so the whole array is one free region.
            self.read_pos = 0;
            self.write_pos = 0;
        }
        if self.available_space() == 0 {
            return Ok(());
        }

        // The free region starting at `write_pos` ends either at the unread
        // data (when the writer has wrapped behind the reader) or at the end
        // of the backing array.
        let end = if self.write_pos < self.read_pos {
            self.read_pos
        } else {
            self.capacity
        };

        loop {
            match self.reader.read(&mut self.buffer[self.write_pos..end]) {
                Ok(n) => {
                    self.write_pos = (self.write_pos + n) % self.capacity;
                    self.length += n;
                    return Ok(());
                }
                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                Err(e) => return Err(e),
            }
        }
    }

    /// Copies as much of `data` as fits into the buffer and returns the
    /// number of bytes stored; bytes beyond the free space are ignored.
    // `fill_buffer` reads straight into the ring, so this method has no
    // caller in this block any more; it is kept so existing users of the
    // buffer API keep working.
    #[allow(dead_code)]
    fn write(&mut self, data: &[u8]) -> usize {
        let count = data.len().min(self.available_space());

        // At most two contiguous copies: up to the end of the array, then
        // the wrapped remainder at the front.
        let first = count.min(self.capacity - self.write_pos);
        self.buffer[self.write_pos..self.write_pos + first].copy_from_slice(&data[..first]);
        let wrapped = count - first;
        self.buffer[..wrapped].copy_from_slice(&data[first..count]);

        self.write_pos = (self.write_pos + count) % self.capacity;
        self.length += count;
        count
    }

    /// Consumes the buffer and returns an iterator over its bytes.
    fn iter(self) -> CircularBufferIterator<R> {
        CircularBufferIterator { buffer: self }
    }
}

/// Byte iterator that drains a [`CircularBuffer`], refilling it from the
/// underlying reader whenever it runs empty.
struct CircularBufferIterator<R: Read> {
    buffer: CircularBuffer<R>,
}

impl<R: Read> Iterator for CircularBufferIterator<R> {
    type Item = u8;

    /// Returns the next byte, or `None` once the reader is exhausted.
    ///
    /// A read error is printed to stderr and terminates the process with
    /// exit status 1.
    fn next(&mut self) -> Option<Self::Item> {
        let ring = &mut self.buffer;

        if ring.length == 0 {
            if let Err(e) = ring.fill_buffer() {
                eprintln!("Error: {}", e);
                exit(1);
            }
            if ring.length == 0 {
                return None; // EOF reached
            }
        }

        let item = ring.buffer[ring.read_pos];
        ring.read_pos = (ring.read_pos + 1) % ring.capacity;
        ring.length -= 1;
        Some(item)
    }
}

impl<R: Read> IntoIterator for CircularBuffer<R> {
    type Item = u8;
    type IntoIter = CircularBufferIterator<R>;

    fn into_iter(self) -> Self::IntoIter {
        self.iter()
    }
}
155+
61156
#[derive(EnumString, EnumIter, Debug, PartialEq, Display)]
62157
#[strum(serialize_all = "SCREAMING-KEBAB-CASE")]
63158
#[allow(non_camel_case_types)]
@@ -267,122 +362,127 @@ fn parse_codeset(codeset: &str) -> Result<CodesetType, Box<dyn std::error::Error
267362
fn encoding_conversion(
268363
from: &Encodings,
269364
to: &Encodings,
270-
input: &[u8],
365+
input: CircularBuffer<Box<dyn Read>>,
271366
omit_invalid: bool,
272367
supress_error: bool,
273-
) -> (Vec<u8>, u32) {
274-
let (input_exit_code, input) = match from {
275-
Encodings::UTF_8 => utf_8::to_ucs4(input, omit_invalid, supress_error),
368+
) {
369+
let iter = input.into_iter();
370+
let ucs4 = match from {
371+
Encodings::UTF_8 => utf_8::to_ucs4(iter, omit_invalid, supress_error),
276372
Encodings::UTF_16 => {
277-
utf_16::to_ucs4(input, omit_invalid, supress_error, UTF16Variant::UTF16)
373+
utf_16::to_ucs4(iter, omit_invalid, supress_error, UTF16Variant::UTF16)
278374
}
279375
Encodings::UTF_16LE => {
280-
utf_16::to_ucs4(input, omit_invalid, supress_error, UTF16Variant::UTF16LE)
376+
utf_16::to_ucs4(iter, omit_invalid, supress_error, UTF16Variant::UTF16LE)
281377
}
282378
Encodings::UTF_16BE => {
283-
utf_16::to_ucs4(input, omit_invalid, supress_error, UTF16Variant::UTF16BE)
379+
utf_16::to_ucs4(iter, omit_invalid, supress_error, UTF16Variant::UTF16BE)
284380
}
285381
Encodings::UTF_32 => {
286-
utf_32::to_ucs4(input, omit_invalid, supress_error, UTF32Variant::UTF32)
382+
utf_32::to_ucs4(iter, omit_invalid, supress_error, UTF32Variant::UTF32)
287383
}
288384
Encodings::UTF_32LE => {
289-
utf_32::to_ucs4(input, omit_invalid, supress_error, UTF32Variant::UTF32LE)
385+
utf_32::to_ucs4(iter, omit_invalid, supress_error, UTF32Variant::UTF32LE)
290386
}
291387
Encodings::UTF_32BE => {
292-
utf_32::to_ucs4(input, omit_invalid, supress_error, UTF32Variant::UTF32BE)
388+
utf_32::to_ucs4(iter, omit_invalid, supress_error, UTF32Variant::UTF32BE)
293389
}
294-
Encodings::ASCII => ascii::to_ucs4(input, omit_invalid, supress_error),
390+
Encodings::ASCII => ascii::to_ucs4(iter, omit_invalid, supress_error),
295391
};
296392

297-
let (output_exit_code, output) = match to {
298-
Encodings::UTF_8 => utf_8::from_ucs4(input.as_slice(), omit_invalid, supress_error),
299-
Encodings::UTF_16 => utf_16::from_ucs4(
300-
input.as_slice(),
301-
omit_invalid,
302-
supress_error,
303-
UTF16Variant::UTF16,
304-
),
305-
Encodings::UTF_16LE => utf_16::from_ucs4(
306-
input.as_slice(),
307-
omit_invalid,
308-
supress_error,
309-
UTF16Variant::UTF16LE,
310-
),
311-
Encodings::UTF_16BE => utf_16::from_ucs4(
312-
input.as_slice(),
313-
omit_invalid,
314-
supress_error,
315-
UTF16Variant::UTF16BE,
316-
),
317-
Encodings::UTF_32 => utf_32::from_ucs4(
318-
input.as_slice(),
319-
omit_invalid,
320-
supress_error,
321-
UTF32Variant::UTF32,
322-
),
323-
Encodings::UTF_32LE => utf_32::from_ucs4(
324-
input.as_slice(),
325-
omit_invalid,
326-
supress_error,
327-
UTF32Variant::UTF32LE,
328-
),
329-
Encodings::UTF_32BE => utf_32::from_ucs4(
330-
input.as_slice(),
331-
omit_invalid,
332-
supress_error,
333-
UTF32Variant::UTF32BE,
334-
),
335-
Encodings::ASCII => ascii::from_ucs4(input.as_slice(), omit_invalid, supress_error),
393+
let expected = match to {
394+
Encodings::UTF_8 => utf_8::from_ucs4(ucs4, omit_invalid, supress_error),
395+
Encodings::UTF_16 => {
396+
utf_16::from_ucs4(ucs4, omit_invalid, supress_error, UTF16Variant::UTF16)
397+
}
398+
Encodings::UTF_16BE => {
399+
utf_16::from_ucs4(ucs4, omit_invalid, supress_error, UTF16Variant::UTF16BE)
400+
}
401+
Encodings::UTF_16LE => {
402+
utf_16::from_ucs4(ucs4, omit_invalid, supress_error, UTF16Variant::UTF16LE)
403+
}
404+
Encodings::UTF_32 => {
405+
utf_32::from_ucs4(ucs4, omit_invalid, supress_error, UTF32Variant::UTF32)
406+
}
407+
Encodings::UTF_32LE => {
408+
utf_32::from_ucs4(ucs4, omit_invalid, supress_error, UTF32Variant::UTF32LE)
409+
}
410+
Encodings::UTF_32BE => {
411+
utf_32::from_ucs4(ucs4, omit_invalid, supress_error, UTF32Variant::UTF32BE)
412+
}
413+
Encodings::ASCII => ascii::from_ucs4(ucs4, omit_invalid, supress_error),
336414
};
337415

338-
let exit_code = input_exit_code.max(output_exit_code);
339-
340-
(output, exit_code)
416+
expected.for_each(|byte| {
417+
io::stdout().write_all(&[byte]).unwrap();
418+
io::stdout().flush().unwrap();
419+
});
341420
}
421+
342422
fn charmap_conversion(
343423
from: &Charmap,
344424
to: &Charmap,
345-
input: &[u8],
425+
input: CircularBuffer<Box<dyn Read>>,
346426
omit_invalid: bool,
347427
suppress_error: bool,
348-
) -> (Vec<u8>, u32) {
349-
let mut output = Vec::new();
350-
let mut error_count = 0;
428+
) {
429+
let mut buffer = Vec::new();
430+
let stdout = io::stdout();
431+
let mut stdout = stdout.lock();
351432

352-
let mut i = 0;
353-
while i < input.len() {
433+
for byte in input {
434+
buffer.push(byte);
354435
let mut found = false;
355436
for (_, entry) in &from.entries {
356-
if input[i..].starts_with(&entry.encoding) {
437+
if buffer.starts_with(&entry.encoding) {
357438
if let Some(to_entry) = to
358439
.entries
359440
.values()
360441
.find(|e| e.symbolic_name == entry.symbolic_name)
361442
{
362-
output.extend_from_slice(&to_entry.encoding);
363-
i += entry.encoding.len();
443+
if let Err(e) = stdout.write_all(&to_entry.encoding) {
444+
eprintln!("Error writing to stdout: {}", e);
445+
}
446+
if let Err(e) = stdout.flush() {
447+
eprintln!("Error flushing stdout: {}", e);
448+
}
449+
buffer.clear();
364450
found = true;
365451
break;
366452
}
367453
}
368454
}
369-
370-
if !found {
455+
if !found && buffer.len() >= from.header.mb_cur_max {
371456
if !suppress_error {
372-
eprintln!("Error: Invalid or unmapped character at position {}", i);
457+
eprintln!("Error: Invalid or unmapped character");
373458
}
374-
error_count += 1;
375459
if omit_invalid {
376-
i += 1;
460+
buffer.clear();
377461
} else {
378-
output.push(input[i]);
379-
i += 1;
462+
if let Err(e) = stdout.write_all(&[buffer[0]]) {
463+
eprintln!("Error writing to stdout: {}", e);
464+
}
465+
if let Err(e) = stdout.flush() {
466+
eprintln!("Error flushing stdout: {}", e);
467+
}
468+
buffer.remove(0);
380469
}
381470
}
382471
}
383472

384-
let exit_code = if error_count > 0 { 1 } else { 0 };
385-
(output, exit_code)
473+
for &byte in &buffer {
474+
if !omit_invalid {
475+
if let Err(e) = stdout.write_all(&[byte]) {
476+
eprintln!("Error writing to stdout: {}", e);
477+
}
478+
if let Err(e) = stdout.flush() {
479+
eprintln!("Error flushing stdout: {}", e);
480+
}
481+
}
482+
if !suppress_error {
483+
eprintln!("Error: Invalid or unmapped character at end of input");
484+
}
485+
}
386486
}
387487

388488
fn main() -> Result<(), Box<dyn std::error::Error>> {
@@ -428,35 +528,14 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
428528
None => vec![Box::new(io::stdin().lock())],
429529
};
430530

431-
for mut input in inputs {
432-
let mut inp_buf = Vec::new();
433-
input.read_to_end(&mut inp_buf)?;
434-
531+
for input in inputs {
532+
let buf = CircularBuffer::new(input);
435533
match (&from_codeset, &to_codeset) {
436534
(CodesetType::Encoding(from), CodesetType::Encoding(to)) => {
437-
let (output, exit_code) = encoding_conversion(
438-
from,
439-
to,
440-
&inp_buf,
441-
args.omit_invalid,
442-
args.suppress_messages,
443-
);
444-
445-
io::stdout().write_all(&output)?;
446-
exit(exit_code as i32);
535+
encoding_conversion(from, to, buf, args.omit_invalid, args.suppress_messages);
447536
}
448-
449537
(CodesetType::Charmap(from), CodesetType::Charmap(to)) => {
450-
let (output, exit_code) = charmap_conversion(
451-
from,
452-
to,
453-
&inp_buf,
454-
args.omit_invalid,
455-
args.suppress_messages,
456-
);
457-
458-
io::stdout().write_all(&output)?;
459-
exit(exit_code as i32);
538+
charmap_conversion(from, to, buf, args.omit_invalid, args.suppress_messages);
460539
}
461540
_ => {
462541
eprintln!(

0 commit comments

Comments
 (0)