Skip to content

Commit 9c59d53

Browse files
committed
Add Utf8BytesReader infrastructure
When the source bytes aren't UTF-8 (or aren't known to be UTF-8), they need to be decoded into UTF-8 first, or at least validated as UTF-8. Wrap 'Read'ers with Utf8BytesReader to ensure this happens. The validation part is deferred for now.
1 parent 4b6f0c0 commit 9c59d53

File tree

10 files changed

+102
-87
lines changed

10 files changed

+102
-87
lines changed

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ include = ["src/*", "LICENSE-MIT.md", "README.md"]
1616
[dependencies]
1717
document-features = { version = "0.2", optional = true }
1818
encoding_rs = { version = "0.8", optional = true }
19+
encoding_rs_io = { version = "0.1", optional = true }
1920
serde = { version = "1.0.100", optional = true }
2021
tokio = { version = "1.10", optional = true, default-features = false, features = ["io-util"] }
2122
memchr = "2.1"
@@ -109,7 +110,7 @@ async-tokio = ["tokio"]
109110
## [UTF-16LE]: encoding_rs::UTF_16LE
110111
## [ISO-2022-JP]: encoding_rs::ISO_2022_JP
111112
## [#158]: https://github.com/tafia/quick-xml/issues/158
112-
encoding = ["encoding_rs"]
113+
encoding = ["encoding_rs", "encoding_rs_io"]
113114

114115
## Enables support for recognizing all [HTML 5 entities] in [`unescape`] and
115116
## [`unescape_with`] functions. The full list of entities also can be found in

src/de/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1931,7 +1931,7 @@ pub use crate::errors::serialize::DeError;
19311931
pub use resolver::{EntityResolver, NoEntityResolver};
19321932

19331933
use crate::{
1934-
encoding::Decoder,
1934+
encoding::{Decoder, Utf8BytesReader},
19351935
errors::Error,
19361936
events::{BytesCData, BytesEnd, BytesStart, BytesText, Event},
19371937
name::QName,
@@ -2677,7 +2677,7 @@ where
26772677
}
26782678
}
26792679

2680-
impl<'de, R> Deserializer<'de, IoReader<R>>
2680+
impl<'de, R> Deserializer<'de, IoReader<Utf8BytesReader<R>>>
26812681
where
26822682
R: BufRead,
26832683
{

src/encoding.rs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
//! A module for wrappers that encode / decode data.
22
33
use std::borrow::Cow;
4+
use std::io;
45

56
#[cfg(feature = "encoding")]
67
use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
8+
#[cfg(feature = "encoding")]
9+
use encoding_rs_io::{DecodeReaderBytes, DecodeReaderBytesBuilder};
710

811
#[cfg(feature = "encoding")]
912
use crate::Error;
@@ -21,6 +24,57 @@ pub(crate) const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
2124
#[cfg(feature = "encoding")]
2225
pub(crate) const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
2326

27+
/// A struct for transparently decoding / validating bytes as UTF-8.
28+
#[derive(Debug)]
29+
pub struct Utf8BytesReader<R> {
30+
#[cfg(feature = "encoding")]
31+
reader: io::BufReader<DecodeReaderBytes<R, Vec<u8>>>,
32+
#[cfg(not(feature = "encoding"))]
33+
reader: io::BufReader<R>,
34+
}
35+
36+
impl<R: io::Read> Utf8BytesReader<R> {
37+
/// Build a new reader which decodes a stream of bytes in an unknown encoding into UTF-8.
38+
/// Note: The consumer is responsible for finding the correct character boundaries when
39+
/// treating a given range of bytes as UTF-8.
40+
#[cfg(feature = "encoding")]
41+
pub fn new(reader: R) -> Self {
42+
let decoder = DecodeReaderBytesBuilder::new()
43+
.bom_override(true)
44+
.build(reader);
45+
46+
Self {
47+
reader: io::BufReader::new(decoder),
48+
}
49+
}
50+
51+
/// Build a new reader which (will eventually) validate UTF-8.
52+
/// Note: The consumer is responsible for finding the correct character boundaries when
53+
/// treating a given range of bytes as UTF-8.
54+
#[cfg(not(feature = "encoding"))]
55+
pub fn new(reader: R) -> Self {
56+
Self {
57+
reader: io::BufReader::new(reader),
58+
}
59+
}
60+
}
61+
62+
impl<R: io::Read> io::Read for Utf8BytesReader<R> {
63+
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
64+
self.reader.read(buf)
65+
}
66+
}
67+
68+
impl<R: io::Read> io::BufRead for Utf8BytesReader<R> {
69+
fn fill_buf(&mut self) -> io::Result<&[u8]> {
70+
self.reader.fill_buf()
71+
}
72+
73+
fn consume(&mut self, amt: usize) {
74+
self.reader.consume(amt)
75+
}
76+
}
77+
2478
/// Decoder of byte slices into strings.
2579
///
2680
/// If feature `encoding` is enabled, this encoding taken from the `"encoding"`

src/reader/buffered_reader.rs

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22
//! underlying byte stream.
33
44
use std::fs::File;
5-
use std::io::{self, BufRead, BufReader};
5+
use std::io::{self, BufRead};
66
use std::path::Path;
77

88
use memchr;
99

10+
use crate::encoding::Utf8BytesReader;
1011
use crate::errors::{Error, Result};
1112
use crate::events::Event;
1213
use crate::name::QName;
@@ -34,6 +35,7 @@ macro_rules! impl_buffered_source {
3435

3536
#[cfg(feature = "encoding")]
3637
$($async)? fn detect_encoding(&mut self) -> Result<Option<&'static encoding_rs::Encoding>> {
38+
// TODO: broken because decoder sends UTF-8
3739
loop {
3840
break match self $(.$reader)? .fill_buf() $(.$await)? {
3941
Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) {
@@ -399,15 +401,12 @@ impl<R: BufRead> Reader<R> {
399401
}
400402
}
401403

402-
impl Reader<BufReader<File>> {
404+
impl Reader<Utf8BytesReader<File>> {
403405
/// Creates an XML reader from a file path.
404406
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
405-
let file = File::open(path)?;
406-
let reader = BufReader::new(file);
407-
Ok(Self::from_reader(reader))
407+
Ok(Self::from_reader(File::open(path)?))
408408
}
409409
}
410-
411410
#[cfg(test)]
412411
mod test {
413412
use crate::reader::test::{check, small_buffers};

src/reader/mod.rs

Lines changed: 12 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
//! Contains high-level interface for a pull-based XML parser.
22
3+
use std::io::Read;
4+
use std::ops::Range;
5+
36
#[cfg(feature = "encoding")]
47
use encoding_rs::Encoding;
5-
use std::ops::Range;
68

7-
use crate::encoding::Decoder;
9+
use crate::encoding::{Decoder, Utf8BytesReader};
810
use crate::errors::{Error, Result};
911
use crate::events::Event;
1012
use crate::reader::parser::Parser;
@@ -428,7 +430,7 @@ enum ParseState {
428430
/// BomDetected -- "encoding=..." --> XmlDetected
429431
/// ```
430432
#[cfg(feature = "encoding")]
431-
#[derive(Clone, Copy)]
433+
#[derive(Clone, Copy, Debug)]
432434
enum EncodingRef {
433435
/// Encoding was implicitly assumed to have a specified value. It can be refined
434436
/// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
@@ -528,73 +530,22 @@ pub struct Reader<R> {
528530
}
529531

530532
/// Builder methods
531-
impl<R> Reader<R> {
533+
impl<R: Read> Reader<Utf8BytesReader<R>> {
532534
/// Creates a `Reader` that reads from a given reader.
533535
pub fn from_reader(reader: R) -> Self {
534536
Self {
535-
reader,
537+
reader: Utf8BytesReader::new(reader),
536538
parser: Parser::default(),
537539
}
538540
}
539-
540-
configure_methods!();
541541
}
542542

543-
/// Getters
543+
/// Public implementation-independent functionality
544544
impl<R> Reader<R> {
545-
/// Consumes `Reader` returning the underlying reader
546-
///
547-
/// Can be used to compute line and column of a parsing error position
548-
///
549-
/// # Examples
550-
///
551-
/// ```
552-
/// # use pretty_assertions::assert_eq;
553-
/// use std::{str, io::Cursor};
554-
/// use quick_xml::events::Event;
555-
/// use quick_xml::reader::Reader;
556-
///
557-
/// let xml = r#"<tag1 att1 = "test">
558-
/// <tag2><!--Test comment-->Test</tag2>
559-
/// <tag3>Test 2</tag3>
560-
/// </tag1>"#;
561-
/// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
562-
/// let mut buf = Vec::new();
563-
///
564-
/// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
565-
/// let end_pos = reader.buffer_position();
566-
/// let mut cursor = reader.into_inner();
567-
/// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
568-
/// .expect("can't make a string");
569-
/// let mut line = 1;
570-
/// let mut column = 0;
571-
/// for c in s.chars() {
572-
/// if c == '\n' {
573-
/// line += 1;
574-
/// column = 0;
575-
/// } else {
576-
/// column += 1;
577-
/// }
578-
/// }
579-
/// (line, column)
580-
/// }
581-
///
582-
/// loop {
583-
/// match reader.read_event_into(&mut buf) {
584-
/// Ok(Event::Start(ref e)) => match e.name().as_ref() {
585-
/// b"tag1" | b"tag2" => (),
586-
/// tag => {
587-
/// assert_eq!(b"tag3", tag);
588-
/// assert_eq!((3, 22), into_line_and_column(reader));
589-
/// break;
590-
/// }
591-
/// },
592-
/// Ok(Event::Eof) => unreachable!(),
593-
/// _ => (),
594-
/// }
595-
/// buf.clear();
596-
/// }
597-
/// ```
545+
// Configuration setters
546+
configure_methods!();
547+
548+
/// Consumes `Reader` returning the underlying reader.
598549
pub fn into_inner(self) -> R {
599550
self.reader
600551
}

src/reader/ns_reader.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,15 @@
66
77
use std::borrow::Cow;
88
use std::fs::File;
9-
use std::io::{BufRead, BufReader};
9+
use std::io::{BufRead, Read};
1010
use std::ops::Deref;
1111
use std::path::Path;
1212

13+
use crate::encoding::Utf8BytesReader;
1314
use crate::errors::Result;
1415
use crate::events::Event;
1516
use crate::name::{LocalName, NamespaceResolver, QName, ResolveResult};
1617
use crate::reader::{Reader, Span, XmlSource};
17-
1818
/// A low level encoding-agnostic XML event reader that performs namespace resolution.
1919
///
2020
/// Consumes a [`BufRead`] and streams XML `Event`s.
@@ -33,14 +33,12 @@ pub struct NsReader<R> {
3333
}
3434

3535
/// Builder methods
36-
impl<R> NsReader<R> {
36+
impl<R: Read> NsReader<Utf8BytesReader<R>> {
3737
/// Creates a `NsReader` that reads from a reader.
3838
#[inline]
3939
pub fn from_reader(reader: R) -> Self {
4040
Self::new(Reader::from_reader(reader))
4141
}
42-
43-
configure_methods!(reader);
4442
}
4543

4644
/// Private methods
@@ -118,8 +116,11 @@ impl<R> NsReader<R> {
118116
}
119117
}
120118

121-
/// Getters
119+
/// Public implementation-independent functionality
122120
impl<R> NsReader<R> {
121+
// Configuration setters
122+
configure_methods!(reader);
123+
123124
/// Consumes `NsReader` returning the underlying reader
124125
///
125126
/// See the [`Reader::into_inner`] for examples
@@ -528,7 +529,7 @@ impl<R: BufRead> NsReader<R> {
528529
}
529530
}
530531

531-
impl NsReader<BufReader<File>> {
532+
impl NsReader<Utf8BytesReader<File>> {
532533
/// Creates an XML reader from a file path.
533534
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
534535
Ok(Self::new(Reader::from_file(path)?))

src/reader/parser.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,6 @@ impl Parser {
187187
if len > 2 && buf[len - 1] == b'?' {
188188
if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) {
189189
let event = BytesDecl::from_start(BytesStart::wrap(&buf[1..len - 1], 3));
190-
191190
// Try getting encoding from the declaration event
192191
#[cfg(feature = "encoding")]
193192
if self.encoding.can_be_refined() {

src/reader/slice_reader.rs

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,23 +16,33 @@ use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, Xml
1616

1717
use memchr;
1818

19-
/// This is an implementation for reading from a `&[u8]` as underlying byte stream.
20-
/// This implementation supports not using an intermediate buffer as the byte slice
21-
/// itself can be used to borrow from.
19+
use super::parser::Parser;
20+
21+
/// This is an implementation of [`Reader`] for reading from a `&[u8]` as
22+
/// underlying byte stream. This implementation supports not using an
23+
/// intermediate buffer as the byte slice itself can be used to borrow from.
2224
impl<'a> Reader<&'a [u8]> {
2325
/// Creates an XML reader from a string slice.
2426
#[allow(clippy::should_implement_trait)]
2527
pub fn from_str(s: &'a str) -> Self {
2628
// Rust strings are guaranteed to be UTF-8, so lock the encoding
2729
#[cfg(feature = "encoding")]
2830
{
29-
let mut reader = Self::from_reader(s.as_bytes());
30-
reader.parser.encoding = EncodingRef::Explicit(UTF_8);
31-
reader
31+
let mut parser = Parser::default();
32+
parser.encoding = EncodingRef::Explicit(UTF_8);
33+
Self {
34+
reader: s.as_bytes(),
35+
parser: parser,
36+
}
3237
}
3338

3439
#[cfg(not(feature = "encoding"))]
35-
Self::from_reader(s.as_bytes())
40+
{
41+
Self {
42+
reader: s.as_bytes(),
43+
parser: Parser::default(),
44+
}
45+
}
3646
}
3747

3848
/// Read an event that borrows from the input rather than a buffer.

tests/test.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ fn test_issue94() {
9797
let mut reader = Reader::from_reader(&data[..]);
9898
reader.trim_text(true);
9999
loop {
100-
match reader.read_event() {
100+
match reader.read_event_into(&mut Vec::new()) {
101101
Ok(Eof) | Err(..) => break,
102102
_ => (),
103103
}

tests/xmlrs_reader_tests.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -392,7 +392,7 @@ fn test_bytes(input: &[u8], output: &[u8], trim: bool) {
392392

393393
let mut decoder = reader.decoder();
394394
loop {
395-
let line = match reader.read_resolved_event() {
395+
let line = match reader.read_resolved_event_into(&mut Vec::new()) {
396396
Ok((_, Event::Decl(e))) => {
397397
// Declaration could change decoder
398398
decoder = reader.decoder();

0 commit comments

Comments
 (0)