Skip to content

Commit 72647ef

Browse files
committed
Implement EOL normalization procedure as described in "2.11 End-of-Line Handling" section of XML 1.1 spec
https://www.w3.org/TR/xml11/#sec-line-ends
1 parent 166376d commit 72647ef

File tree

1 file changed

+165
-1
lines changed

1 file changed

+165
-1
lines changed

src/escape.rs

Lines changed: 165 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
//! Manage xml character escapes
22
3-
use memchr::memchr2_iter;
3+
use memchr::{memchr2_iter, memchr3};
44
use std::borrow::Cow;
55
use std::num::ParseIntError;
66
use std::ops::Range;
@@ -302,6 +302,108 @@ where
302302
}
303303
}
304304

305+
////////////////////////////////////////////////////////////////////////////////////////////////////
306+
307+
// TODO: It would be better to reuse buffer after decoding if possible
308+
pub(crate) fn normalize_eols<'input>(text: &'input str) -> Cow<'input, str> {
309+
let bytes = text.as_bytes();
310+
311+
// The following sequences of UTF-8 encoded input should be translated into
312+
// a single `\n` (U+000a) character to normalize EOLs:
313+
//
314+
// |UTF-8 |String|
315+
// |--------|------|
316+
// |0d 0a |\r\n |
317+
// |0d c2 85|\r\x85|
318+
// |0d |\r |
319+
// |c2 85 |\x85 |
320+
// |e2 80 a8|\u2028|
321+
if let Some(i) = memchr3(b'\r', 0xC2, 0xE2, bytes) {
322+
// We found a character that requires normalization, so create new normalized
323+
// string, put the prefix as is and then put normalized character
324+
let mut normalized = String::with_capacity(text.len());
325+
// NOTE: unsafe { text.get_unchecked(0..i) } could be used because
326+
// we are sure that index within string
327+
normalized.push_str(&text[0..i]);
328+
329+
let mut pos = normalize_eol_step(&mut normalized, bytes, i, '\n');
330+
while let Some(i) = memchr3(b'\r', 0xC2, 0xE2, &bytes[pos..]) {
331+
let index = pos + i;
332+
// NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
333+
// we are sure that index within string
334+
normalized.push_str(&text[pos..index]);
335+
pos = normalize_eol_step(&mut normalized, bytes, index, '\n');
336+
}
337+
if let Some(rest) = text.get(pos..) {
338+
normalized.push_str(rest);
339+
}
340+
return normalized.into();
341+
}
342+
Cow::Borrowed(text)
343+
}
344+
345+
/// Performs one step of end-of-line normalization at `input[index]`.
///
/// The byte at `index` must be one of `\r`, `\n`, `0xC2` or `0xE2`, i.e. a byte
/// that *may* start an end-of-line sequence. If such a sequence really starts
/// there, `ch` is appended to `normalized` in place of the whole sequence.
/// If the byte merely starts another character that shares the same UTF-8 lead
/// byte (for example `©` = `c2 a9` or `€` = `e2 82 ac`), that character is
/// copied to `normalized` unchanged.
///
/// # Parameters
///
/// - `normalized`: output buffer that receives the result of this step
/// - `input`: bytes of the text being normalized; expected to be valid UTF-8
/// - `index`: position of the candidate byte; expected to be a char boundary
/// - `ch`: replacement written for a recognized end-of-line sequence
///
/// # Returns
///
/// The index of the first byte of `input` that was not consumed by this step.
///
/// # Panics
///
/// Panics if `index` is out of bounds or if `input[index]` is not one of the
/// four bytes listed above.
///
/// # Specification
///
/// All line breaks MUST have been normalized on input to #xA as described
/// in [2.11 End-of-Line Handling][eof], so the rest of this algorithm operates
/// on text normalized in this way.
///
/// To simplify the tasks of applications, the XML processor MUST behave
/// as if it normalized all line breaks in external parsed entities
/// (including the document entity) on input, before parsing, by translating
/// all of the following to a single #xA character (_which attribute normalization
/// routine will replace by #x20 character_):
///
/// 1. the two-character sequence #xD #xA
/// 2. the two-character sequence #xD #x85
/// 3. the single character #x85
/// 4. the single character #x2028
/// 5. any #xD character that is not immediately followed by #xA or #x85.
///
/// The characters #x85 and #x2028 cannot be reliably recognized and translated
/// until an entity's encoding declaration (if present) has been read.
/// Therefore, it is a fatal error to use them within the XML declaration or text declaration.
///
/// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends
fn normalize_eol_step(normalized: &mut String, input: &[u8], index: usize, ch: char) -> usize {
    match input[index] {
        b'\r' => {
            // A carriage return is always replaced; the only question is how
            // many of the following bytes belong to the same line break.
            normalized.push(ch);
            match input.get(index + 1..) {
                Some([b'\n', ..]) => index + 2, // skip \r\n
                // 0xC2 is the lead byte of every character in U+0080..=U+00BF,
                // so the continuation byte must be checked as well
                Some([0xC2, 0x85, ..]) => index + 3, // skip #xD #x85 (0d c2 85)
                // Anything else is left for the next step
                _ => index + 1, // skip \r
            }
        }
        b'\n' => {
            normalized.push(ch);
            index + 1 // skip \n
        }
        // Possible start of UTF-8 encoding of #x85 character (c2 85)
        0xC2 => {
            if input.get(index + 1) == Some(&0x85) {
                normalized.push(ch);
                index + 2 // skip UTF-8 encoding of #x85 character (c2 85)
            } else {
                // Some other character from U+0080..=U+00BF: keep it
                copy_utf8_sequence(normalized, input, index, 2)
            }
        }
        // Possible start of UTF-8 encoding of #x2028 character (e2 80 a8).
        // 0xE2 is the lead byte of every character in U+2000..=U+2FFF
        0xE2 => {
            if matches!(input.get(index + 1..index + 3), Some([0x80, 0xA8])) {
                normalized.push(ch);
                index + 3 // skip UTF-8 encoding of #x2028 character (e2 80 a8)
            } else {
                // Some other character from U+2000..=U+2FFF: keep it
                copy_utf8_sequence(normalized, input, index, 3)
            }
        }

        x => unreachable!(
            "at {}: expected ''\\n', '\\r', '\\xC2', or '\\xE2', found '{}' / {} / `0x{:X}`",
            index, x as char, x, x
        ),
    }
}

/// Copies the `len`-byte UTF-8 sequence starting at `input[index]` to `normalized`
/// unchanged and returns the index of the byte that follows it.
///
/// The range is clamped to the end of `input`, and bytes that do not form valid
/// UTF-8 are replaced by U+FFFD instead of causing a panic.
fn copy_utf8_sequence(normalized: &mut String, input: &[u8], index: usize, len: usize) -> usize {
    let end = input.len().min(index + len);
    // For valid UTF-8 this borrows the bytes and does not allocate
    normalized.push_str(&String::from_utf8_lossy(&input[index..end]));
    end
}
404+
405+
////////////////////////////////////////////////////////////////////////////////////////////////////
406+
305407
/// Resolves predefined XML entities or all HTML5 entities depending on the feature
306408
/// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html).
307409
///
@@ -1844,3 +1946,65 @@ fn from_str_radix(src: &str, radix: u32) -> Result<u32, ParseCharRefError> {
18441946
_ => u32::from_str_radix(src, radix).map_err(ParseCharRefError::InvalidNumber),
18451947
}
18461948
}
1949+
1950+
////////////////////////////////////////////////////////////////////////////////////////////////////
1951+
1952+
#[cfg(test)]
mod normalization {
    use super::*;

    /// Tests of [`normalize_eols`]: one test per kind of line break plus
    /// a test with all kinds mixed together.
    mod eol {
        use super::*;
        use pretty_assertions::assert_eq;

        /// Empty input stays empty
        #[test]
        fn empty() {
            let normalized = normalize_eols("");
            assert_eq!(normalized, "");
        }

        /// Text that contains only `\n` line breaks is returned unchanged
        #[test]
        fn already_normalized() {
            let input = "\nalready \n\n normalized\n";
            assert_eq!(normalize_eols(input), "\nalready \n\n normalized\n");
        }

        /// Every `\r\n` pair becomes a single `\n`
        #[test]
        fn cr_lf() {
            let input = "\r\nsome\r\n\r\ntext";
            let expected = "\nsome\n\ntext";
            assert_eq!(normalize_eols(input), expected);
        }

        /// Every `\r` + U+0085 pair becomes a single `\n`
        #[test]
        fn cr_u0085() {
            let input = "\r\u{0085}some\r\u{0085}\r\u{0085}text";
            let expected = "\nsome\n\ntext";
            assert_eq!(normalize_eols(input), expected);
        }

        /// Every standalone U+0085 becomes `\n`
        #[test]
        fn u0085() {
            let input = "\u{0085}some\u{0085}\u{0085}text";
            let expected = "\nsome\n\ntext";
            assert_eq!(normalize_eols(input), expected);
        }

        /// Every U+2028 becomes `\n`
        #[test]
        fn u2028() {
            let input = "\u{2028}some\u{2028}\u{2028}text";
            let expected = "\nsome\n\ntext";
            assert_eq!(normalize_eols(input), expected);
        }

        /// All kinds of line breaks next to each other
        #[test]
        fn mixed() {
            let input = "\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text";
            let expected = "\n\n\n\n\n\nsome\n\n\ntext";
            assert_eq!(normalize_eols(input), expected);
        }
    }
}

0 commit comments

Comments
 (0)