Skip to content

Commit df7df69

Browse files
committed
Implement an attribute normalization routine as described in "3.3.3 Attribute-Value Normalization" section of XML 1.1. spec
https://www.w3.org/TR/xml11/#AVNormalize
1 parent 8cfcbb5 commit df7df69

File tree

1 file changed

+339
-0
lines changed

1 file changed

+339
-0
lines changed

src/escape.rs

Lines changed: 339 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use memchr::{memchr2_iter, memchr3};
44
use std::borrow::Cow;
55
use std::num::ParseIntError;
66
use std::ops::Range;
7+
use std::slice::Iter;
78

89
/// Error of parsing character reference (`&#<dec-number>;` or `&#x<hex-number>;`).
910
#[derive(Clone, Debug, PartialEq)]
@@ -50,6 +51,12 @@ pub enum EscapeError {
5051
/// Attempt to parse character reference (`&#<dec-number>;` or `&#x<hex-number>;`)
5152
/// was unsuccessful, not all characters are decimal or hexadecimal numbers.
5253
InvalidCharRef(ParseCharRefError),
54+
/// Expanded more than maximum possible entities during attribute normalization.
55+
///
56+
/// Attribute normalization includes expansion of general entities (`&entity;`)
57+
/// whose replacement text can itself contain entities, which must also be expanded.
58+
/// If more than 128 entities would be expanded, this error is returned.
59+
TooManyNestedEntities,
5360
}
5461

5562
impl std::fmt::Display for EscapeError {
@@ -66,6 +73,9 @@ impl std::fmt::Display for EscapeError {
6673
Self::InvalidCharRef(e) => {
6774
write!(f, "invalid character reference: {}", e)
6875
}
76+
Self::TooManyNestedEntities => {
77+
f.write_str("too many nested entities in an attribute value")
78+
}
6979
}
7080
}
7181
}
@@ -404,6 +414,218 @@ fn normalize_eol_step(normalized: &mut String, input: &[u8], index: usize, ch: c
404414

405415
////////////////////////////////////////////////////////////////////////////////////////////////////
406416

417+
/// Returns `true` if `b` is a byte on which a scan of a raw attribute value
/// has to stop, because the byte may start a sequence that normalization rewrites.
///
/// The byte is taken by reference so that the function can be passed directly
/// to `Iterator::position` of a byte slice iterator.
const fn is_normalization_char(b: &u8) -> bool {
    // Bytes of interest:
    //
    // - `\t` is a white space character that is replaced with a space;
    // - `&` starts a character reference or an entity reference;
    // - the first bytes of the end-of-line sequences listed below:
    //
    // |UTF-8   |String  |
    // |--------|--------|
    // |0d 0a   |\r\n    |
    // |0d c2 85|\r\u{85}|
    // |0d      |\r      |
    // |0a      |\n      |
    // |c2 85   |\u{85}  |
    // |e2 80 a8|\u{2028}|
    //
    // `0xC2` and `0xE2` are also the first bytes of other multi-byte characters,
    // so a match only means that the following bytes must be inspected.
    matches!(*b, b'\t' | b'\r' | b'\n' | 0xC2 | 0xE2 | b'&')
}
430+
431+
/// Returns the attribute value normalized as per [the XML specification],
432+
/// using a custom entity resolver.
433+
///
434+
/// Do not use this method with HTML attributes.
435+
///
436+
/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
437+
/// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
438+
/// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
439+
/// take precedence.
440+
///
441+
/// This will allocate unless the raw attribute value does not require normalization.
442+
///
443+
/// # Parameters
444+
///
445+
/// - `value`: unnormalized attribute value
446+
/// - `depth`: maximum number of nested entities that can be expanded. If expansion
447+
/// chain will be more that this value, the function will return [`EscapeError::TooManyNestedEntities`]
448+
/// - `resolve_entity`: a function to resolve entity. This function could be called
449+
/// multiple times on the same input and can return different values in each case
450+
/// for the same input, although it is not recommended
451+
///
452+
/// # Lifetimes
453+
///
454+
/// - `'input`: lifetime of the unnormalized attribute. If normalization is not requred,
455+
/// the input returned unchanged with the same lifetime
456+
/// - `'entity`: lifetime of all entities that is returned by the entity resolution routine
457+
///
458+
/// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
459+
pub(crate) fn normalize_attribute_value<'input, 'entity, F>(
460+
value: &'input str,
461+
depth: usize,
462+
mut resolve_entity: F,
463+
) -> Result<Cow<'input, str>, EscapeError>
464+
where
465+
// the lifetime of the output comes from a capture or is `'static`
466+
F: FnMut(&str) -> Option<&'entity str>,
467+
{
468+
let mut iter = value.as_bytes().iter();
469+
470+
// If we found the charater that requires normalization, create a normalized
471+
// version of the attribute, otherwise return the value unchanged
472+
if let Some(i) = iter.position(is_normalization_char) {
473+
let mut normalized = String::with_capacity(value.len());
474+
let pos = normalize_step(
475+
&mut normalized,
476+
&mut iter,
477+
value,
478+
0,
479+
i,
480+
depth,
481+
&mut resolve_entity,
482+
)?;
483+
484+
normalize_steps(
485+
&mut normalized,
486+
&mut iter,
487+
value,
488+
pos,
489+
depth,
490+
&mut resolve_entity,
491+
)?;
492+
return Ok(normalized.into());
493+
}
494+
Ok(Cow::Borrowed(value))
495+
}
496+
497+
fn normalize_steps<'entity, F>(
498+
normalized: &mut String,
499+
iter: &mut Iter<u8>,
500+
input: &str,
501+
mut pos: usize,
502+
depth: usize,
503+
resolve_entity: &mut F,
504+
) -> Result<(), EscapeError>
505+
where
506+
// the lifetime of the output comes from a capture or is `'static`
507+
F: FnMut(&str) -> Option<&'entity str>,
508+
{
509+
while let Some(i) = iter.position(is_normalization_char) {
510+
pos = normalize_step(normalized, iter, input, pos, pos + i, depth, resolve_entity)?;
511+
}
512+
if let Some(rest) = input.get(pos..) {
513+
normalized.push_str(rest);
514+
}
515+
Ok(())
516+
}
517+
518+
/// Performs one step of the [normalization algorithm] (but with recursive part):
519+
///
520+
/// 1. For a character reference, append the referenced character
521+
/// to the normalized value.
522+
/// 2. For an entity reference, recursively apply this algorithm
523+
/// to the replacement text of the entity.
524+
/// 3. For a white space character (#x20, #xD, #xA, #x9), append
525+
/// a space character (#x20) to the normalized value.
526+
/// 4. For another character, append the character to the normalized value.
527+
///
528+
/// Because [according to the specification], XML parser should parse line-of-end
529+
/// normalized input, but quick-xml does not do that, this function also performs
530+
/// normalization of EOL characters. That should be done before expanding entities
531+
/// and character references, so cannot be processed later.
532+
///
533+
/// This function could be used also just to normalize line ends if the iterator
534+
/// won't be stop on `&` characters.
535+
///
536+
/// # Parameters
537+
///
538+
/// - `normalized`: Output of the algorithm. Normalized value will be placed here
539+
/// - `iter`: Iterator over bytes of `input`
540+
/// - `input`: Original non-normalized value
541+
/// - `last_pos`: Index of the last byte in `input` that was processed
542+
/// - `index`: Index of the byte in `input` that should be processed now
543+
/// - `seen_cr`: `\r\n` and `\r\x85` sequences should be normalized into one space
544+
/// so this parameter tracks if we seen the `\r` before processing the current byte
545+
/// - `depth`: Current recursion depth. Too deep recursion will interrupt the algorithm
546+
/// - `resolve_entity`: Resolver of entities. Returns `None` for unknown entities
547+
///
548+
/// [normalization algorithm]: https://www.w3.org/TR/xml11/#AVNormalize
549+
/// [according to the specification]: https://www.w3.org/TR/xml11/#sec-line-ends
550+
fn normalize_step<'entity, F>(
551+
normalized: &mut String,
552+
iter: &mut Iter<u8>,
553+
input: &str,
554+
last_pos: usize,
555+
index: usize,
556+
depth: usize,
557+
resolve_entity: &mut F,
558+
) -> Result<usize, EscapeError>
559+
where
560+
// the lifetime of the output comes from a capture or is `'static`
561+
F: FnMut(&str) -> Option<&'entity str>,
562+
{
563+
if depth == 0 {
564+
return Err(EscapeError::TooManyNestedEntities);
565+
}
566+
// 4. For another character, append the character to the normalized value.
567+
normalized.push_str(&input[last_pos..index]);
568+
569+
match input.as_bytes()[index] {
570+
b'&' => {
571+
let start = index + 1; // +1 - skip `&`
572+
let end = start
573+
+ match iter.position(|&b| b == b';') {
574+
Some(end) => end,
575+
None => return Err(EscapeError::UnterminatedEntity(index..input.len())),
576+
};
577+
578+
// Content between & and ; - &pat;
579+
// Note, that this content have non-normalized EOLs as required by the specification,
580+
// but because numbers in any case cannot have spaces inside, this is not the problem.
581+
// Normalization of spaces in entity references and checking that they corresponds to
582+
// [`Name`] production on conscience `resolve_entity`.
583+
//
584+
// [`Name`]: https://www.w3.org/TR/xml11/#NT-Name
585+
let pat = &input[start..end];
586+
// 1. For a character reference, append the referenced character
587+
// to the normalized value.
588+
if pat.starts_with('#') {
589+
let entity = &pat[1..]; // starts after the #
590+
let codepoint = parse_number(entity).map_err(EscapeError::InvalidCharRef)?;
591+
normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
592+
} else
593+
// 2. For an entity reference, recursively apply this algorithm
594+
// to the replacement text of the entity.
595+
if let Some(value) = resolve_entity(pat) {
596+
normalize_steps(
597+
normalized,
598+
&mut value.as_bytes().iter(),
599+
value,
600+
0,
601+
depth.saturating_sub(1),
602+
resolve_entity,
603+
)?;
604+
} else {
605+
return Err(EscapeError::UnrecognizedEntity(start..end, pat.to_string()));
606+
}
607+
Ok(end + 1) // +1 - skip `;`
608+
}
609+
// 3. For a white space character (#x20, #xD, #xA, #x9), append
610+
// a space character (#x20) to the normalized value.
611+
// Space character has no special meaning, so it is handled on step 4
612+
b'\t' => {
613+
normalized.push(' ');
614+
Ok(index + 1) // +1 - skip \t
615+
}
616+
_ => {
617+
let pos = normalize_eol_step(normalized, input.as_bytes(), index, ' ');
618+
// We should advance iterator because we may skip several characters
619+
for _ in 0..pos - index - 1 {
620+
iter.next();
621+
}
622+
Ok(pos)
623+
}
624+
}
625+
}
626+
627+
////////////////////////////////////////////////////////////////////////////////////////////////////
628+
407629
/// Resolves predefined XML entities or all HTML5 entities depending on the feature
408630
/// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html).
409631
///
@@ -2007,4 +2229,121 @@ mod normalization {
20072229
);
20082230
}
20092231
}
2232+
2233+
mod attribute {
2234+
use super::*;
2235+
use pretty_assertions::assert_eq;
2236+
2237+
#[test]
2238+
fn empty() {
2239+
assert_eq!(
2240+
normalize_attribute_value("", 5, |_| { None }),
2241+
Ok("".into())
2242+
);
2243+
}
2244+
2245+
#[test]
2246+
fn only_spaces() {
2247+
assert_eq!(
2248+
normalize_attribute_value(" ", 5, |_| { None }),
2249+
Ok(" ".into())
2250+
);
2251+
assert_eq!(
2252+
normalize_attribute_value("\t\t\t", 5, |_| { None }),
2253+
Ok(" ".into())
2254+
);
2255+
assert_eq!(
2256+
normalize_attribute_value("\r\r\r", 5, |_| { None }),
2257+
Ok(" ".into())
2258+
);
2259+
assert_eq!(
2260+
normalize_attribute_value("\n\n\n", 5, |_| { None }),
2261+
Ok(" ".into())
2262+
);
2263+
}
2264+
2265+
#[test]
2266+
fn already_normalized() {
2267+
assert_eq!(
2268+
normalize_attribute_value("already normalized", 5, |_| { None }),
2269+
Ok("already normalized".into())
2270+
);
2271+
}
2272+
2273+
#[test]
2274+
fn characters() {
2275+
assert_eq!(
2276+
normalize_attribute_value("string with &#32; character", 5, |_| { None }),
2277+
Ok("string with character".into())
2278+
);
2279+
assert_eq!(
2280+
normalize_attribute_value("string with &#x20; character", 5, |_| { None }),
2281+
Ok("string with character".into())
2282+
);
2283+
}
2284+
2285+
#[test]
2286+
fn entities() {
2287+
assert_eq!(
2288+
normalize_attribute_value("string with &entity; reference", 5, |_| {
2289+
Some("replacement")
2290+
}),
2291+
Ok("string with replacement reference".into())
2292+
);
2293+
assert_eq!(
2294+
normalize_attribute_value("string with &entity-1; reference", 5, |entity| {
2295+
match entity {
2296+
"entity-1" => Some("recursive &entity-2;"),
2297+
"entity-2" => Some("entity&#32;2"),
2298+
_ => None,
2299+
}
2300+
}),
2301+
Ok("string with recursive entity 2 reference".into())
2302+
);
2303+
}
2304+
2305+
#[test]
2306+
fn unclosed_entity() {
2307+
assert_eq!(
2308+
normalize_attribute_value("string with unclosed &entity reference", 5, |_| {
2309+
// 0 ^ = 21 ^ = 38
2310+
Some("replacement")
2311+
}),
2312+
Err(EscapeError::UnterminatedEntity(21..38))
2313+
);
2314+
assert_eq!(
2315+
normalize_attribute_value(
2316+
"string with unclosed &#32 (character) reference",
2317+
5,
2318+
|_| {
2319+
// 0 ^ = 21 ^ = 47
2320+
None
2321+
}
2322+
),
2323+
Err(EscapeError::UnterminatedEntity(21..47))
2324+
);
2325+
}
2326+
2327+
#[test]
2328+
fn unknown_entity() {
2329+
assert_eq!(
2330+
normalize_attribute_value("string with unknown &entity; reference", 5, |_| {
2331+
None
2332+
}),
2333+
// 0 ^ ^ = 21..27
2334+
Err(EscapeError::UnrecognizedEntity(
2335+
21..27,
2336+
"entity".to_string(),
2337+
))
2338+
);
2339+
}
2340+
2341+
#[test]
2342+
fn recursive_entity() {
2343+
assert_eq!(
2344+
normalize_attribute_value("&entity; reference", 5, |_| Some("recursive &entity;")),
2345+
Err(EscapeError::TooManyNestedEntities),
2346+
);
2347+
}
2348+
}
20102349
}

0 commit comments

Comments
 (0)