@@ -4,6 +4,7 @@ use memchr::{memchr2_iter, memchr3};
44use std:: borrow:: Cow ;
55use std:: num:: ParseIntError ;
66use std:: ops:: Range ;
7+ use std:: slice:: Iter ;
78
89/// Error of parsing character reference (`&#<dec-number>;` or `&#x<hex-number>;`).
910#[ derive( Clone , Debug , PartialEq ) ]
@@ -50,6 +51,12 @@ pub enum EscapeError {
5051 /// Attempt to parse character reference (`&#<dec-number>;` or `&#x<hex-number>;`)
5152 /// was unsuccessful, not all characters are decimal or hexadecimal numbers.
5253 InvalidCharRef ( ParseCharRefError ) ,
54+ /// Expanded more than maximum possible entities during attribute normalization.
55+ ///
56+ /// Attribute normalization includes expanding of general entities (`&entity;`)
57+ /// whose replacement text could also contain entities, which must be expanded as well.
58+ /// If more than 128 entities would be expanded, this error is returned.
59+ TooManyNestedEntities ,
5360}
5461
5562impl std:: fmt:: Display for EscapeError {
@@ -66,6 +73,9 @@ impl std::fmt::Display for EscapeError {
6673 Self :: InvalidCharRef ( e) => {
6774 write ! ( f, "invalid character reference: {}" , e)
6875 }
76+ Self :: TooManyNestedEntities => {
77+ f. write_str ( "too many nested entities in an attribute value" )
78+ }
6979 }
7080 }
7181}
@@ -404,6 +414,218 @@ fn normalize_eol_step(normalized: &mut String, input: &[u8], index: usize, ch: c
404414
405415////////////////////////////////////////////////////////////////////////////////////////////////////
406416
/// Returns `true` if the byte `b` forces the attribute value scanner to stop
/// and run a normalization step.
///
/// The scanner stops on:
///
/// - `\t`, `\r` and `\n`, which are whitespace / end-of-line bytes;
/// - `&`, which starts a character or an entity reference;
/// - `0xC2` and `0xE2`, the lead bytes of the UTF-8 encodings of the
///   remaining end-of-line characters.
///
/// The following sequences should be translated into a single character
/// to normalize EOLs:
///
/// |UTF-8   |String    |
/// |--------|----------|
/// |0d 0a   |\r\n      |
/// |0d c2 85|\r\u{85}  |
/// |0d      |\r        |
/// |c2 85   |\u{85}    |
/// |e2 80 a8|\u{2028}  |
///
/// Only the first byte is inspected here, so `0xC2` and `0xE2` are reported
/// even when they start some other character.
const fn is_normalization_char(b: &u8) -> bool {
    let byte = *b;
    // ASCII triggers: whitespace that becomes a space and the start of a reference
    let ascii_trigger = matches!(byte, b'&' | b'\t' | b'\n' | b'\r');
    // Lead bytes of the multi-byte EOL characters U+0085 and U+2028
    let utf8_eol_lead = matches!(byte, 0xC2 | 0xE2);

    ascii_trigger || utf8_eol_lead
}
430+
431+ /// Returns the attribute value normalized as per [the XML specification],
432+ /// using a custom entity resolver.
433+ ///
434+ /// Do not use this method with HTML attributes.
435+ ///
436+ /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>`
437+ /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
438+ /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
439+ /// take precedence.
440+ ///
441+ /// This will allocate unless the raw attribute value does not require normalization.
442+ ///
443+ /// # Parameters
444+ ///
445+ /// - `value`: unnormalized attribute value
446+ /// - `depth`: maximum number of nested entities that can be expanded. If expansion
447+ /// chain will be more that this value, the function will return [`EscapeError::TooManyNestedEntities`]
448+ /// - `resolve_entity`: a function to resolve entity. This function could be called
449+ /// multiple times on the same input and can return different values in each case
450+ /// for the same input, although it is not recommended
451+ ///
452+ /// # Lifetimes
453+ ///
454+ /// - `'input`: lifetime of the unnormalized attribute. If normalization is not requred,
455+ /// the input returned unchanged with the same lifetime
456+ /// - `'entity`: lifetime of all entities that is returned by the entity resolution routine
457+ ///
458+ /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
459+ pub ( crate ) fn normalize_attribute_value < ' input , ' entity , F > (
460+ value : & ' input str ,
461+ depth : usize ,
462+ mut resolve_entity : F ,
463+ ) -> Result < Cow < ' input , str > , EscapeError >
464+ where
465+ // the lifetime of the output comes from a capture or is `'static`
466+ F : FnMut ( & str ) -> Option < & ' entity str > ,
467+ {
468+ let mut iter = value. as_bytes ( ) . iter ( ) ;
469+
470+ // If we found the charater that requires normalization, create a normalized
471+ // version of the attribute, otherwise return the value unchanged
472+ if let Some ( i) = iter. position ( is_normalization_char) {
473+ let mut normalized = String :: with_capacity ( value. len ( ) ) ;
474+ let pos = normalize_step (
475+ & mut normalized,
476+ & mut iter,
477+ value,
478+ 0 ,
479+ i,
480+ depth,
481+ & mut resolve_entity,
482+ ) ?;
483+
484+ normalize_steps (
485+ & mut normalized,
486+ & mut iter,
487+ value,
488+ pos,
489+ depth,
490+ & mut resolve_entity,
491+ ) ?;
492+ return Ok ( normalized. into ( ) ) ;
493+ }
494+ Ok ( Cow :: Borrowed ( value) )
495+ }
496+
497+ fn normalize_steps < ' entity , F > (
498+ normalized : & mut String ,
499+ iter : & mut Iter < u8 > ,
500+ input : & str ,
501+ mut pos : usize ,
502+ depth : usize ,
503+ resolve_entity : & mut F ,
504+ ) -> Result < ( ) , EscapeError >
505+ where
506+ // the lifetime of the output comes from a capture or is `'static`
507+ F : FnMut ( & str ) -> Option < & ' entity str > ,
508+ {
509+ while let Some ( i) = iter. position ( is_normalization_char) {
510+ pos = normalize_step ( normalized, iter, input, pos, pos + i, depth, resolve_entity) ?;
511+ }
512+ if let Some ( rest) = input. get ( pos..) {
513+ normalized. push_str ( rest) ;
514+ }
515+ Ok ( ( ) )
516+ }
517+
518+ /// Performs one step of the [normalization algorithm] (but with recursive part):
519+ ///
520+ /// 1. For a character reference, append the referenced character
521+ /// to the normalized value.
522+ /// 2. For an entity reference, recursively apply this algorithm
523+ /// to the replacement text of the entity.
524+ /// 3. For a white space character (#x20, #xD, #xA, #x9), append
525+ /// a space character (#x20) to the normalized value.
526+ /// 4. For another character, append the character to the normalized value.
527+ ///
528+ /// Because [according to the specification], XML parser should parse line-of-end
529+ /// normalized input, but quick-xml does not do that, this function also performs
530+ /// normalization of EOL characters. That should be done before expanding entities
531+ /// and character references, so cannot be processed later.
532+ ///
533+ /// This function could be used also just to normalize line ends if the iterator
534+ /// won't be stop on `&` characters.
535+ ///
536+ /// # Parameters
537+ ///
538+ /// - `normalized`: Output of the algorithm. Normalized value will be placed here
539+ /// - `iter`: Iterator over bytes of `input`
540+ /// - `input`: Original non-normalized value
541+ /// - `last_pos`: Index of the last byte in `input` that was processed
542+ /// - `index`: Index of the byte in `input` that should be processed now
543+ /// - `seen_cr`: `\r\n` and `\r\x85` sequences should be normalized into one space
544+ /// so this parameter tracks if we seen the `\r` before processing the current byte
545+ /// - `depth`: Current recursion depth. Too deep recursion will interrupt the algorithm
546+ /// - `resolve_entity`: Resolver of entities. Returns `None` for unknown entities
547+ ///
548+ /// [normalization algorithm]: https://www.w3.org/TR/xml11/#AVNormalize
549+ /// [according to the specification]: https://www.w3.org/TR/xml11/#sec-line-ends
550+ fn normalize_step < ' entity , F > (
551+ normalized : & mut String ,
552+ iter : & mut Iter < u8 > ,
553+ input : & str ,
554+ last_pos : usize ,
555+ index : usize ,
556+ depth : usize ,
557+ resolve_entity : & mut F ,
558+ ) -> Result < usize , EscapeError >
559+ where
560+ // the lifetime of the output comes from a capture or is `'static`
561+ F : FnMut ( & str ) -> Option < & ' entity str > ,
562+ {
563+ if depth == 0 {
564+ return Err ( EscapeError :: TooManyNestedEntities ) ;
565+ }
566+ // 4. For another character, append the character to the normalized value.
567+ normalized. push_str ( & input[ last_pos..index] ) ;
568+
569+ match input. as_bytes ( ) [ index] {
570+ b'&' => {
571+ let start = index + 1 ; // +1 - skip `&`
572+ let end = start
573+ + match iter. position ( |& b| b == b';' ) {
574+ Some ( end) => end,
575+ None => return Err ( EscapeError :: UnterminatedEntity ( index..input. len ( ) ) ) ,
576+ } ;
577+
578+ // Content between & and ; - &pat;
579+ // Note, that this content have non-normalized EOLs as required by the specification,
580+ // but because numbers in any case cannot have spaces inside, this is not the problem.
581+ // Normalization of spaces in entity references and checking that they corresponds to
582+ // [`Name`] production on conscience `resolve_entity`.
583+ //
584+ // [`Name`]: https://www.w3.org/TR/xml11/#NT-Name
585+ let pat = & input[ start..end] ;
586+ // 1. For a character reference, append the referenced character
587+ // to the normalized value.
588+ if pat. starts_with ( '#' ) {
589+ let entity = & pat[ 1 ..] ; // starts after the #
590+ let codepoint = parse_number ( entity) . map_err ( EscapeError :: InvalidCharRef ) ?;
591+ normalized. push_str ( codepoint. encode_utf8 ( & mut [ 0u8 ; 4 ] ) ) ;
592+ } else
593+ // 2. For an entity reference, recursively apply this algorithm
594+ // to the replacement text of the entity.
595+ if let Some ( value) = resolve_entity ( pat) {
596+ normalize_steps (
597+ normalized,
598+ & mut value. as_bytes ( ) . iter ( ) ,
599+ value,
600+ 0 ,
601+ depth. saturating_sub ( 1 ) ,
602+ resolve_entity,
603+ ) ?;
604+ } else {
605+ return Err ( EscapeError :: UnrecognizedEntity ( start..end, pat. to_string ( ) ) ) ;
606+ }
607+ Ok ( end + 1 ) // +1 - skip `;`
608+ }
609+ // 3. For a white space character (#x20, #xD, #xA, #x9), append
610+ // a space character (#x20) to the normalized value.
611+ // Space character has no special meaning, so it is handled on step 4
612+ b'\t' => {
613+ normalized. push ( ' ' ) ;
614+ Ok ( index + 1 ) // +1 - skip \t
615+ }
616+ _ => {
617+ let pos = normalize_eol_step ( normalized, input. as_bytes ( ) , index, ' ' ) ;
618+ // We should advance iterator because we may skip several characters
619+ for _ in 0 ..pos - index - 1 {
620+ iter. next ( ) ;
621+ }
622+ Ok ( pos)
623+ }
624+ }
625+ }
626+
627+ ////////////////////////////////////////////////////////////////////////////////////////////////////
628+
407629/// Resolves predefined XML entities or all HTML5 entities depending on the feature
408630/// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html).
409631///
@@ -2007,4 +2229,121 @@ mod normalization {
20072229 ) ;
20082230 }
20092231 }
2232+
2233+ mod attribute {
2234+ use super :: * ;
2235+ use pretty_assertions:: assert_eq;
2236+
2237+ #[ test]
2238+ fn empty ( ) {
2239+ assert_eq ! (
2240+ normalize_attribute_value( "" , 5 , |_| { None } ) ,
2241+ Ok ( "" . into( ) )
2242+ ) ;
2243+ }
2244+
2245+ #[ test]
2246+ fn only_spaces ( ) {
2247+ assert_eq ! (
2248+ normalize_attribute_value( " " , 5 , |_| { None } ) ,
2249+ Ok ( " " . into( ) )
2250+ ) ;
2251+ assert_eq ! (
2252+ normalize_attribute_value( "\t \t \t " , 5 , |_| { None } ) ,
2253+ Ok ( " " . into( ) )
2254+ ) ;
2255+ assert_eq ! (
2256+ normalize_attribute_value( "\r \r \r " , 5 , |_| { None } ) ,
2257+ Ok ( " " . into( ) )
2258+ ) ;
2259+ assert_eq ! (
2260+ normalize_attribute_value( "\n \n \n " , 5 , |_| { None } ) ,
2261+ Ok ( " " . into( ) )
2262+ ) ;
2263+ }
2264+
2265+ #[ test]
2266+ fn already_normalized ( ) {
2267+ assert_eq ! (
2268+ normalize_attribute_value( "already normalized" , 5 , |_| { None } ) ,
2269+ Ok ( "already normalized" . into( ) )
2270+ ) ;
2271+ }
2272+
2273+ #[ test]
2274+ fn characters ( ) {
2275+ assert_eq ! (
2276+ normalize_attribute_value( "string with   character" , 5 , |_| { None } ) ,
2277+ Ok ( "string with character" . into( ) )
2278+ ) ;
2279+ assert_eq ! (
2280+ normalize_attribute_value( "string with   character" , 5 , |_| { None } ) ,
2281+ Ok ( "string with character" . into( ) )
2282+ ) ;
2283+ }
2284+
2285+ #[ test]
2286+ fn entities ( ) {
2287+ assert_eq ! (
2288+ normalize_attribute_value( "string with &entity; reference" , 5 , |_| {
2289+ Some ( "replacement" )
2290+ } ) ,
2291+ Ok ( "string with replacement reference" . into( ) )
2292+ ) ;
2293+ assert_eq ! (
2294+ normalize_attribute_value( "string with &entity-1; reference" , 5 , |entity| {
2295+ match entity {
2296+ "entity-1" => Some ( "recursive &entity-2;" ) ,
2297+ "entity-2" => Some ( "entity 2" ) ,
2298+ _ => None ,
2299+ }
2300+ } ) ,
2301+ Ok ( "string with recursive entity 2 reference" . into( ) )
2302+ ) ;
2303+ }
2304+
2305+ #[ test]
2306+ fn unclosed_entity ( ) {
2307+ assert_eq ! (
2308+ normalize_attribute_value( "string with unclosed &entity reference" , 5 , |_| {
2309+ // 0 ^ = 21 ^ = 38
2310+ Some ( "replacement" )
2311+ } ) ,
2312+ Err ( EscapeError :: UnterminatedEntity ( 21 ..38 ) )
2313+ ) ;
2314+ assert_eq ! (
2315+ normalize_attribute_value(
2316+ "string with unclosed   (character) reference" ,
2317+ 5 ,
2318+ |_| {
2319+ // 0 ^ = 21 ^ = 47
2320+ None
2321+ }
2322+ ) ,
2323+ Err ( EscapeError :: UnterminatedEntity ( 21 ..47 ) )
2324+ ) ;
2325+ }
2326+
2327+ #[ test]
2328+ fn unknown_entity ( ) {
2329+ assert_eq ! (
2330+ normalize_attribute_value( "string with unknown &entity; reference" , 5 , |_| {
2331+ None
2332+ } ) ,
2333+ // 0 ^ ^ = 21..27
2334+ Err ( EscapeError :: UnrecognizedEntity (
2335+ 21 ..27 ,
2336+ "entity" . to_string( ) ,
2337+ ) )
2338+ ) ;
2339+ }
2340+
2341+ #[ test]
2342+ fn recursive_entity ( ) {
2343+ assert_eq ! (
2344+ normalize_attribute_value( "&entity; reference" , 5 , |_| Some ( "recursive &entity;" ) ) ,
2345+ Err ( EscapeError :: TooManyNestedEntities ) ,
2346+ ) ;
2347+ }
2348+ }
20102349}
0 commit comments