Skip to content

Commit 4820820

Browse files
committed
add serializer for binary xml text + tweak binary deserializer
Added reading the config to determine trimming options.
1 parent 8c4e0eb commit 4820820

File tree

11 files changed

+4125
-1770
lines changed

11 files changed

+4125
-1770
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ serde = { version = ">=1.0.139", optional = true }
2525
tokio = { version = "1.10", optional = true, default-features = false, features = ["io-util"] }
2626
memchr = "2.1"
2727
arbitrary = { version = "1", features = ["derive"], optional = true }
28+
ref-cast = "1"
2829

2930
[dev-dependencies]
3031
criterion = "0.4"

src/de/mod.rs

Lines changed: 61 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2005,7 +2005,7 @@ use crate::{
20052005
errors::Error,
20062006
events::{BytesCData, BytesEnd, BytesStart, BytesText, Event},
20072007
name::QName,
2008-
reader::Reader,
2008+
reader::{Config, Reader},
20092009
};
20102010
use serde::de::{self, Deserialize, DeserializeOwned, DeserializeSeed, SeqAccess, Visitor};
20112011
use std::borrow::Cow;
@@ -2168,6 +2168,31 @@ struct XmlReader<'i, R: XmlRead<'i>, E: EntityResolver = PredefinedEntityResolve
21682168
entity_resolver: E,
21692169
}
21702170

2171+
fn trim_cow<'a, F>(value: Cow<'a, str>, trim: F) -> Cow<'a, str>
2172+
where
2173+
F: FnOnce(&str) -> &str,
2174+
{
2175+
match value {
2176+
Cow::Borrowed(bytes) => Cow::Borrowed(trim(bytes)),
2177+
Cow::Owned(mut bytes) => {
2178+
let trimmed = trim(&bytes);
2179+
if trimmed.len() != bytes.len() {
2180+
bytes = trimmed.to_string();
2181+
}
2182+
Cow::Owned(bytes)
2183+
}
2184+
}
2185+
}
2186+
2187+
/// Removes trailing whitespace (as defined by [`str::trim_end`]) from text content.
2188+
///
2189+
/// Returns `true` if the content is empty after trimming.
2190+
fn inplace_trim_end(mut s: &mut Cow<str>) -> bool {
2191+
let c: Cow<str> = replace(&mut s, Cow::Borrowed(""));
2192+
*s = trim_cow(c, str::trim_end);
2193+
s.is_empty()
2194+
}
2195+
21712196
impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
21722197
fn new(mut reader: R, entity_resolver: E) -> Self {
21732198
// Lookahead by one event immediately, so we do not need to check in the
@@ -2206,19 +2231,22 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
22062231
/// occurs. Content of all events would be appended to `result` and returned
22072232
/// as [`DeEvent::Text`].
22082233
///
2234+
/// If the resulting text is empty, this function returns `None` to avoid creating an empty event.
2235+
///
22092236
/// [`Text`]: PayloadEvent::Text
22102237
/// [`CData`]: PayloadEvent::CData
2211-
fn drain_text(&mut self, mut result: Cow<'i, str>) -> Result<DeEvent<'i>, DeError> {
2238+
fn drain_text(&mut self, mut result: Cow<'i, str>) -> Result<Option<DeEvent<'i>>, DeError> {
22122239
loop {
22132240
if self.current_event_is_last_text() {
22142241
break;
22152242
}
2216-
22172243
match self.next_impl()? {
22182244
PayloadEvent::Text(mut e) => {
22192245
if self.current_event_is_last_text() {
22202246
// FIXME: Actually, we should trim after decoding text, but now we trim before
2221-
e.inplace_trim_end();
2247+
if self.reader.config().trim_text_end {
2248+
e.inplace_trim_end();
2249+
}
22222250
}
22232251
result
22242252
.to_mut()
@@ -2227,10 +2255,12 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
22272255
PayloadEvent::CData(e) => result.to_mut().push_str(&e.decode()?),
22282256

22292257
// SAFETY: current_event_is_last_text checks that event is Text or CData
2230-
_ => unreachable!("Only `Text` and `CData` events can come here"),
2258+
e => {
2259+
unreachable!("Only `Text` and `CData` events can come here: {:?}", &e);
2260+
}
22312261
}
22322262
}
2233-
Ok(DeEvent::Text(Text { text: result }))
2263+
Ok(Some(DeEvent::Text(Text { text: result })))
22342264
}
22352265

22362266
/// Return an input-borrowing event.
@@ -2240,22 +2270,29 @@ impl<'i, R: XmlRead<'i>, E: EntityResolver> XmlReader<'i, R, E> {
22402270
PayloadEvent::Start(e) => Ok(DeEvent::Start(e)),
22412271
PayloadEvent::End(e) => Ok(DeEvent::End(e)),
22422272
PayloadEvent::Text(mut e) => {
2243-
if self.current_event_is_last_text() && e.inplace_trim_end() {
2244-
// FIXME: Actually, we should trim after decoding text, but now we trim before
2245-
continue;
2273+
if self.current_event_is_last_text() {
2274+
if self.reader.config().trim_text_end && e.inplace_trim_end() {
2275+
continue;
2276+
}
22462277
}
2278+
22472279
match e
22482280
.unescape_with(|entity| self.entity_resolver.resolve(entity))
22492281
.map(|res| self.drain_text(res))
22502282
{
2251-
Ok(x) => x,
2283+
Ok(Ok(None)) => continue,
2284+
Ok(Ok(Some(x))) => Ok(x),
2285+
Ok(Err(x)) => Err(x),
22522286
// Failed to unescape the text; treat it as a binary blob.
22532287
Err(_) => Ok(DeEvent::Binary(Binary {
22542288
text: e.into_inner(),
22552289
})),
22562290
}
22572291
}
2258-
PayloadEvent::CData(e) => self.drain_text(e.decode()?),
2292+
PayloadEvent::CData(e) => match self.drain_text(e.decode()?).transpose() {
2293+
None => continue,
2294+
Some(x) => x,
2295+
},
22592296
PayloadEvent::DocType(e) => {
22602297
self.entity_resolver
22612298
.capture(e)
@@ -2838,6 +2875,8 @@ where
28382875
pub fn from_str_with_resolver(source: &'de str, entity_resolver: E) -> Self {
28392876
let mut reader = Reader::from_str(source);
28402877
let config = reader.config_mut();
2878+
config.trim_text_start = true;
2879+
config.trim_text_end = true;
28412880
config.expand_empty_elements = true;
28422881

28432882
Self::new(
@@ -3139,6 +3178,9 @@ pub trait XmlRead<'i> {
31393178

31403179
/// A copy of the reader's decoder used to decode strings.
31413180
fn decoder(&self) -> Decoder;
3181+
3182+
/// Returns a reference to the reader config.
3183+
fn config(&self) -> &Config;
31423184
}
31433185

31443186
/// XML input source that reads from a std::io input stream.
@@ -3208,6 +3250,10 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader<R> {
32083250
fn decoder(&self) -> Decoder {
32093251
self.reader.decoder()
32103252
}
3253+
3254+
fn config(&self) -> &Config {
3255+
self.reader.config()
3256+
}
32113257
}
32123258

32133259
/// XML input source that reads from a slice of bytes and can borrow from it.
@@ -3273,6 +3319,10 @@ impl<'de> XmlRead<'de> for SliceReader<'de> {
32733319
fn decoder(&self) -> Decoder {
32743320
self.reader.decoder()
32753321
}
3322+
3323+
fn config(&self) -> &Config {
3324+
self.reader.config()
3325+
}
32763326
}
32773327

32783328
#[cfg(test)]

src/se/content.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ macro_rules! write_primitive {
5858
/// with indent, sequence of strings become one big string with additional content
5959
/// and it would be impossible to distinguish between content of the original
6060
/// strings and inserted indent characters.
61-
pub struct ContentSerializer<'w, 'i, W: Write> {
61+
pub struct ContentSerializer<'w, 'i, W> {
6262
pub writer: &'w mut W,
6363
/// Defines which XML characters need to be escaped in text content
6464
pub level: QuoteLevel,

src/se/element.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ macro_rules! write_primitive {
5656
/// - other variants are not supported ([`DeError::Unsupported`] is returned);
5757
///
5858
/// Usage of empty tags depends on the [`ContentSerializer::expand_empty_elements`] setting.
59-
pub struct ElementSerializer<'w, 'k, W: Write> {
59+
pub struct ElementSerializer<'w, 'k, W> {
6060
/// The inner serializer that contains the settings and mostly do the actual work
6161
pub ser: ContentSerializer<'w, 'k, W>,
6262
/// Tag name used to wrap serialized types except enum variants which uses the variant name

0 commit comments

Comments
 (0)