|
1 | 1 | //! Manage xml character escapes |
2 | 2 |
|
3 | | -use memchr::memchr2_iter; |
| 3 | +use memchr::{memchr2_iter, memchr3}; |
4 | 4 | use std::borrow::Cow; |
5 | 5 | use std::num::ParseIntError; |
6 | 6 | use std::ops::Range; |
@@ -302,6 +302,108 @@ where |
302 | 302 | } |
303 | 303 | } |
304 | 304 |
|
| 305 | +//////////////////////////////////////////////////////////////////////////////////////////////////// |
| 306 | + |
| 307 | +// TODO: It would be better to reuse buffer after decoding if possible |
| 308 | +pub(crate) fn normalize_eols<'input>(text: &'input str) -> Cow<'input, str> { |
| 309 | + let bytes = text.as_bytes(); |
| 310 | + |
| 311 | + // The following sequences of UTF-8 encoded input should be translated into |
| 312 | + // a single `\n` (U+000a) character to normalize EOLs: |
| 313 | + // |
| 314 | + // |UTF-8 |String| |
| 315 | + // |--------|------| |
| 316 | + // |0d 0a |\r\n | |
| 317 | + // |0d c2 85|\r\x85| |
| 318 | + // |0d |\r | |
| 319 | + // |c2 85 |\x85 | |
| 320 | + // |e2 80 a8|\u2028| |
| 321 | + if let Some(i) = memchr3(b'\r', 0xC2, 0xE2, bytes) { |
| 322 | + // We found a character that requires normalization, so create new normalized |
| 323 | + // string, put the prefix as is and then put normalized character |
| 324 | + let mut normalized = String::with_capacity(text.len()); |
| 325 | + // NOTE: unsafe { text.get_unchecked(0..i) } could be used because |
| 326 | + // we are sure that index within string |
| 327 | + normalized.push_str(&text[0..i]); |
| 328 | + |
| 329 | + let mut pos = normalize_eol_step(&mut normalized, bytes, i, '\n'); |
| 330 | + while let Some(i) = memchr3(b'\r', 0xC2, 0xE2, &bytes[pos..]) { |
| 331 | + let index = pos + i; |
| 332 | + // NOTE: unsafe { text.get_unchecked(pos..index) } could be used because |
| 333 | + // we are sure that index within string |
| 334 | + normalized.push_str(&text[pos..index]); |
| 335 | + pos = normalize_eol_step(&mut normalized, bytes, index, '\n'); |
| 336 | + } |
| 337 | + if let Some(rest) = text.get(pos..) { |
| 338 | + normalized.push_str(rest); |
| 339 | + } |
| 340 | + return normalized.into(); |
| 341 | + } |
| 342 | + Cow::Borrowed(text) |
| 343 | +} |
| 344 | + |
/// Processes the character that starts at `input[index]`: if it begins an
/// end-of-line sequence, pushes `ch` to `normalized` instead of the whole
/// sequence; otherwise copies the character to `normalized` unchanged.
///
/// Returns the index of the first byte of `input` after the processed bytes.
///
/// `input` is expected to be valid UTF-8 and `input[index]` must be one of
/// `\r`, `\n`, `0xC2` or `0xE2`.
///
/// # Panics
///
/// Panics if `index` is out of bounds of `input` or if `input[index]` is not
/// one of the bytes listed above.
///
/// # Specification
///
/// All line breaks MUST have been normalized on input to #xA as described
/// in [2.11 End-of-Line Handling][eof], so the rest of this algorithm operates
/// on text normalized in this way.
///
/// To simplify the tasks of applications, the XML processor MUST behave
/// as if it normalized all line breaks in external parsed entities
/// (including the document entity) on input, before parsing, by translating
/// all of the following to a single #xA character (_which attribute normalization
/// routine will replace by #x20 character_):
///
/// 1. the two-character sequence #xD #xA
/// 2. the two-character sequence #xD #x85
/// 3. the single character #x85
/// 4. the single character #x2028
/// 5. any #xD character that is not immediately followed by #xA or #x85.
///
/// The characters #x85 and #x2028 cannot be reliably recognized and translated
/// until an entity's encoding declaration (if present) has been read.
/// Therefore, it is a fatal error to use them within the XML declaration or text declaration.
///
/// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends
fn normalize_eol_step(normalized: &mut String, input: &[u8], index: usize, ch: char) -> usize {
    // UTF-8 encoding of #x85 character
    const NEL: &[u8] = &[0xC2, 0x85];
    // UTF-8 encoding of #x2028 character
    const LS: &[u8] = &[0xE2, 0x80, 0xA8];

    match input[index] {
        b'\r' => {
            normalized.push(ch);
            let tail = &input[index + 1..];
            if tail.starts_with(b"\n") {
                index + 2 // skip \r\n
            } else if tail.starts_with(NEL) {
                // Byte C2 is a prefix of all characters in range #x80..=#xBF,
                // so the second byte must be checked too, otherwise an unrelated
                // character that follows \r would be swallowed
                index + 1 + NEL.len() // skip #xD #x85 (0d c2 85)
            } else {
                index + 1 // skip \r
            }
        }
        b'\n' => {
            normalized.push(ch);
            index + 1 // skip \n
        }
        // Possible start of UTF-8 encoding of #x85 (c2 85) or #x2028 (e2 80 a8)
        0xC2 | 0xE2 => {
            let tail = &input[index..];
            if tail.starts_with(NEL) {
                normalized.push(ch);
                index + NEL.len() // skip UTF-8 encoding of #x85 character
            } else if tail.starts_with(LS) {
                normalized.push(ch);
                index + LS.len() // skip UTF-8 encoding of #x2028 character
            } else {
                // Some other character with the same lead byte: it is not a line
                // break, so copy it as is. Lead byte C2 starts a 2-byte and
                // E2 starts a 3-byte UTF-8 sequence
                let width = if tail[0] == 0xC2 { 2 } else { 3 };
                // `min` and the lossy conversion protect against a truncated
                // sequence; for valid UTF-8 input the conversion just borrows
                let end = input.len().min(index + width);
                normalized.push_str(&String::from_utf8_lossy(&input[index..end]));
                end
            }
        }

        x => unreachable!(
            "at {}: expected ''\\n', '\\r', '\\xC2', or '\\xE2', found '{}' / {} / `0x{:X}`",
            index, x as char, x, x
        ),
    }
}
| 404 | + |
| 405 | +//////////////////////////////////////////////////////////////////////////////////////////////////// |
| 406 | + |
305 | 407 | /// Resolves predefined XML entities or all HTML5 entities depending on the feature |
306 | 408 | /// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html). |
307 | 409 | /// |
@@ -1844,3 +1946,65 @@ fn from_str_radix(src: &str, radix: u32) -> Result<u32, ParseCharRefError> { |
1844 | 1946 | _ => u32::from_str_radix(src, radix).map_err(ParseCharRefError::InvalidNumber), |
1845 | 1947 | } |
1846 | 1948 | } |
| 1949 | + |
| 1950 | +//////////////////////////////////////////////////////////////////////////////////////////////////// |
| 1951 | + |
#[cfg(test)]
mod normalization {
    use super::*;

    /// Checks that `normalize_eols` turns each kind of line break into `\n`
    mod eol {
        use super::*;
        use pretty_assertions::assert_eq;

        /// Empty input stays empty
        #[test]
        fn empty() {
            let normalized = normalize_eols("");
            assert_eq!(normalized, "");
        }

        /// Text that contains only `\n` line breaks is returned unchanged
        #[test]
        fn already_normalized() {
            let input = "\nalready \n\n normalized\n";
            let expected = "\nalready \n\n normalized\n";
            assert_eq!(normalize_eols(input), expected);
        }

        /// `\r\n` pairs are collapsed into one `\n`
        #[test]
        fn cr_lf() {
            let input = "\r\nsome\r\n\r\ntext";
            let expected = "\nsome\n\ntext";
            assert_eq!(normalize_eols(input), expected);
        }

        /// `\r` followed by U+0085 is collapsed into one `\n`
        #[test]
        fn cr_u0085() {
            let input = "\r\u{0085}some\r\u{0085}\r\u{0085}text";
            let expected = "\nsome\n\ntext";
            assert_eq!(normalize_eols(input), expected);
        }

        /// Standalone U+0085 is replaced by `\n`
        #[test]
        fn u0085() {
            let input = "\u{0085}some\u{0085}\u{0085}text";
            let expected = "\nsome\n\ntext";
            assert_eq!(normalize_eols(input), expected);
        }

        /// U+2028 is replaced by `\n`
        #[test]
        fn u2028() {
            let input = "\u{2028}some\u{2028}\u{2028}text";
            let expected = "\nsome\n\ntext";
            assert_eq!(normalize_eols(input), expected);
        }

        /// All kinds of line breaks mixed in one input
        #[test]
        fn mixed() {
            let input = "\r\r\r\u{2028}\n\r\nsome\n\u{0085}\r\u{0085}text";
            let expected = "\n\n\n\n\n\nsome\n\n\ntext";
            assert_eq!(normalize_eols(input), expected);
        }
    }
}
0 commit comments