Skip to content

Commit 8c4e0eb

Browse files
committed
Add support for undecodable binary text for serde
1 parent cdff285 commit 8c4e0eb

File tree

8 files changed

+2214
-9
lines changed

8 files changed

+2214
-9
lines changed

src/de/map.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -247,15 +247,15 @@ where
247247
// We shouldn't have both `$value` and `$text` fields in the same
248248
// struct, so if we have `$value` field, then we should deserialize
249249
// text content to `$value`
250-
DeEvent::Text(_) if self.has_value_field => {
250+
DeEvent::Text(_) | DeEvent::Binary(_) if self.has_value_field => {
251251
self.source = ValueSource::Content;
252252
// Deserialize `key` from special attribute name which means
253253
// that value should be taken from the text content of the
254254
// XML node
255255
let de = BorrowedStrDeserializer::<DeError>::new(VALUE_KEY);
256256
seed.deserialize(de).map(Some)
257257
}
258-
DeEvent::Text(_) => {
258+
DeEvent::Text(_) | DeEvent::Binary(_) => {
259259
self.source = ValueSource::Text;
260260
// Deserialize `key` from special attribute name which means
261261
// that value should be taken from the text content of the
@@ -943,6 +943,9 @@ where
943943
// SAFETY: we just checked that the next event is Text
944944
_ => unreachable!(),
945945
},
946+
DeEvent::Binary(_) => Err(Self::Error::Unsupported(
947+
"undecodable binary data among a sequence of xml elements".into(),
948+
)),
946949
DeEvent::Start(_) => match self.map.de.next()? {
947950
DeEvent::Start(start) => seed
948951
.deserialize(ElementDeserializer {

src/de/mod.rs

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2056,6 +2056,31 @@ impl<'a> From<&'a str> for Text<'a> {
20562056
}
20572057
}
20582058

2059+
/// Text content that could not be unescaped and is therefore kept as raw, undecoded bytes.
2060+
#[derive(Clone, Debug, PartialEq, Eq)]
2061+
pub struct Binary<'a> {
2062+
/// Raw bytes of the text content, borrowed from the input when possible.
2063+
pub text: Cow<'a, [u8]>,
2064+
}
2065+
2066+
impl<'a> Deref for Binary<'a> {
2067+
type Target = [u8];
2068+
2069+
#[inline]
2070+
fn deref(&self) -> &Self::Target {
2071+
self.text.deref()
2072+
}
2073+
}
2074+
2075+
impl<'a> From<&'a [u8]> for Binary<'a> {
2076+
#[inline]
2077+
fn from(text: &'a [u8]) -> Self {
2078+
Self {
2079+
text: Cow::Borrowed(text),
2080+
}
2081+
}
2082+
}
2083+
20592084
////////////////////////////////////////////////////////////////////////////////////////////////////
20602085

20612086
/// Simplified event which contains only these variants that used by deserializer
@@ -2074,6 +2099,8 @@ pub enum DeEvent<'a> {
20742099
/// [`Comment`]: Event::Comment
20752100
/// [`PI`]: Event::PI
20762101
Text(Text<'a>),
2102+
/// Text content that failed to unescape, carried as raw undecoded bytes.
2103+
Binary(Binary<'a>),
20772104
/// End of XML document.
20782105
Eof,
20792106
}
@@ -2217,7 +2244,16 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
22172244
// FIXME: Actually, we should trim after decoding text, but now we trim before
22182245
continue;
22192246
}
2220-
self.drain_text(e.unescape_with(|entity| self.entity_resolver.resolve(entity))?)
2247+
match e
2248+
.unescape_with(|entity| self.entity_resolver.resolve(entity))
2249+
.map(|res| self.drain_text(res))
2250+
{
2251+
Ok(x) => x,
2252+
// Failed to unescape: treat the content as a binary blob.
2253+
Err(_) => Ok(DeEvent::Binary(Binary {
2254+
text: e.into_inner(),
2255+
})),
2256+
}
22212257
}
22222258
PayloadEvent::CData(e) => self.drain_text(e.decode()?),
22232259
PayloadEvent::DocType(e) => {
@@ -2687,6 +2723,8 @@ where
26872723
fn read_string_impl(&mut self, allow_start: bool) -> Result<Cow<'de, str>, DeError> {
26882724
match self.next()? {
26892725
DeEvent::Text(e) => Ok(e.text),
2726+
// SAFETY: Binary event should never be emitted for decoded strings.
2727+
DeEvent::Binary(e) => unreachable!("{:?}", e),
26902728
// allow one nested level
26912729
DeEvent::Start(e) if allow_start => self.read_text(e.name()),
26922730
DeEvent::Start(e) => Err(DeError::UnexpectedStart(e.name().as_ref().to_owned())),
@@ -2708,10 +2746,12 @@ where
27082746
// The matching tag name is guaranteed by the reader
27092747
DeEvent::End(_) => Ok(e.text),
27102748
// SAFETY: Cannot be two consecutive Text events, they would be merged into one
2711-
DeEvent::Text(_) => unreachable!(),
2749+
DeEvent::Text(_) | DeEvent::Binary(_) => unreachable!(),
27122750
DeEvent::Start(e) => Err(DeError::UnexpectedStart(e.name().as_ref().to_owned())),
27132751
DeEvent::Eof => Err(Error::missed_end(name, self.reader.decoder()).into()),
27142752
},
2753+
// SAFETY: Binary event should never be emitted for decoded strings.
2754+
DeEvent::Binary(e) => unreachable!("{:?}", e),
27152755
// We can get End event in case of `<tag></tag>` or `<tag/>` input
27162756
// Return empty text in that case
27172757
// The matching tag name is guaranteed by the reader
@@ -2827,6 +2867,30 @@ where
28272867
}
28282868
}
28292869

2870+
impl<'de, R> Deserializer<'de, IoReader<R>>
2871+
where
2872+
R: BufRead,
2873+
{
2874+
/// Create new deserializer that will copy data from the specified reader
2875+
/// into internal buffer.
2876+
///
2877+
/// If you already have a string use [`Self::from_str`] instead, because it
2878+
/// will borrow instead of copy. If you have `&[u8]` which is known to represent
2879+
/// UTF-8, you can decode it first before using [`from_str`].
2880+
///
2881+
/// Deserializer created with this method will not resolve custom entities.
2882+
pub fn from_custom_reader(reader: Reader<R>) -> Self {
2883+
Self::new(
2884+
IoReader {
2885+
reader,
2886+
start_trimmer: StartTrimmer::default(),
2887+
buf: Vec::new(),
2888+
},
2889+
PredefinedEntityResolver,
2890+
)
2891+
}
2892+
}
2893+
28302894
impl<'de, R, E> Deserializer<'de, IoReader<R>, E>
28312895
where
28322896
R: BufRead,
@@ -2884,6 +2948,10 @@ where
28842948
Cow::Borrowed(s) => visitor.visit_borrowed_str(s),
28852949
Cow::Owned(s) => visitor.visit_string(s),
28862950
},
2951+
DeEvent::Binary(e) => match e.text {
2952+
Cow::Borrowed(s) => visitor.visit_borrowed_bytes(s),
2953+
Cow::Owned(s) => visitor.visit_byte_buf(s),
2954+
},
28872955
DeEvent::Eof => Err(DeError::UnexpectedEof),
28882956
}
28892957
}
@@ -2914,7 +2982,7 @@ where
29142982
self.read_to_end(s.name())?;
29152983
visitor.visit_unit()
29162984
}
2917-
DeEvent::Text(_) => visitor.visit_unit(),
2985+
DeEvent::Text(_) | DeEvent::Binary(_) => visitor.visit_unit(),
29182986
// SAFETY: The reader is guaranteed that we don't have unmatched tags
29192987
// If we are here, then our deserializer has a bug
29202988
DeEvent::End(e) => unreachable!("{:?}", e),

src/de/var.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ where
4646
seed.deserialize(QNameDeserializer::from_elem(e.raw_name(), decoder)?)?,
4747
false,
4848
),
49-
DeEvent::Text(_) => (
49+
DeEvent::Text(_) | DeEvent::Binary(_) => (
5050
seed.deserialize(BorrowedStrDeserializer::<DeError>::new(TEXT_KEY))?,
5151
true,
5252
),

src/errors.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,4 +464,10 @@ pub mod serialize {
464464
Self::Custom(e.to_string())
465465
}
466466
}
467+
impl From<std::io::Error> for DeError {
468+
#[inline]
469+
fn from(e: std::io::Error) -> Self {
470+
Self::Custom(e.to_string())
471+
}
472+
}
467473
}

0 commit comments

Comments
 (0)