@@ -338,8 +338,23 @@ impl<'a> Tokenizer<'a> {
// Returns true when `self.position + n` is a valid byte index into `self.input`,
// i.e. when more than `n` bytes remain from the current position (the comparison
// is a strict `<` against the input length).
338338 #[ inline]
339339 fn has_at_least ( & self , n : usize ) -> bool { self . position + n < self . input . len ( ) }
340340
341+ // Advance the cursor over the next `n` bytes of input. In debug builds each skipped
342+ // byte is checked first: it must be ASCII other than `\r`, `\n` or `\x0C`, or a
343+ // non-ASCII byte in 0xC0..=0xEF (a UTF-8 leader of a 2- or 3-byte sequence).
// When `debug_assertions` is off, only `self.position += n` runs.
341344 #[ inline]
342- pub fn advance ( & mut self , n : usize ) { self . position += n }
345+ pub fn advance ( & mut self , n : usize ) {
346+ if cfg ! ( debug_assertions) {
347+ // First assert: `b & 0xF0 != 0xF0` rejects 4-byte leaders (0xF0..=0xFF) and
348+ // `b & 0xC0 != 0x80` rejects continuation bytes (0x80..=0xBF). Second assert:
349+ // rejects the newline bytes `\r`, `\n` and form feed `\x0C`.
// NOTE(review): `byte_at(i)` presumably reads the byte `i` positions past the
// cursor; its definition is not fully visible here — confirm.
350+ for i in 0 ..n {
351+ let b = self . byte_at ( i) ;
352+ debug_assert ! ( b. is_ascii( ) || ( b & 0xF0 != 0xF0 && b & 0xC0 != 0x80 ) ) ;
353+ debug_assert ! ( b != b'\r' && b != b'\n' && b != b'\x0C' ) ;
354+ }
355+ }
356+ self . position += n
357+ }
343358
344359 // Assumes non-EOF
345360 #[ inline]
@@ -350,10 +365,27 @@ impl<'a> Tokenizer<'a> {
350365 self . input . as_bytes ( ) [ self . position + offset]
351366 }
352367
368+ // Advance over a single byte, which must be the leader of a 4-byte UTF-8
369+ // sequence. Debug builds assert its high nibble is 0xF (byte in 0xF0..=0xFF).
370+ #[ inline]
371+ fn consume_4byte_intro ( & mut self ) {
372+ debug_assert ! ( self . next_byte_unchecked( ) & 0xF0 == 0xF0 ) ;
373+ self . position += 1 ;
374+ }
375+
376+ // Advance over a single byte, which must be a UTF-8 continuation byte.
377+ // Debug builds assert its top two bits are `10` (byte in 0x80..=0xBF).
353378 #[ inline]
354- fn consume_byte ( & mut self ) -> u8 {
379+ fn consume_continuation_byte ( & mut self ) {
380+ debug_assert ! ( self . next_byte_unchecked( ) & 0xC0 == 0x80 ) ;
381+ self . position += 1 ;
382+ }
383+
384+ // Advance over one byte of any kind except the newline bytes `\r`, `\n` and `\x0C`.
// `byte` is used only by the debug assertion; the cursor always moves by exactly one.
// NOTE(review): callers presumably pass the byte at the current position — this
// method does not verify that.
385+ #[ inline( never) ]
386+ fn consume_known_byte ( & mut self , byte : u8 ) {
387+ debug_assert ! ( byte != b'\r' && byte != b'\n' && byte != b'\x0C' ) ;
355388 self . position += 1 ;
356- self . input . as_bytes ( ) [ self . position - 1 ]
357389 }
358390
359391 #[ inline]
@@ -667,7 +699,10 @@ fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
667699 b'\n' | b'\x0C' | b'\r' => {
668700 tokenizer. consume_newline( ) ;
669701 }
702+ b'\x80' ...b'\xBF' => { tokenizer. consume_continuation_byte( ) ; }
703+ b'\xF0' ...b'\xFF' => { tokenizer. consume_4byte_intro( ) ; }
670704 _ => {
705+ // ASCII or other leading byte.
671706 tokenizer. advance( 1 ) ;
672707 }
673708 }
@@ -703,13 +738,15 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
703738 tokenizer. advance( 1 ) ;
704739 return Ok ( value. into( ) )
705740 }
741+ tokenizer. advance( 1 ) ;
706742 }
707743 b'\'' => {
708744 if single_quote {
709745 let value = tokenizer. slice_from( start_pos) ;
710746 tokenizer. advance( 1 ) ;
711747 return Ok ( value. into( ) )
712748 }
749+ tokenizer. advance( 1 ) ;
713750 }
714751 b'\\' | b'\0' => {
715752 // * The tokenizer’s input is UTF-8 since it’s `&str`.
@@ -723,33 +760,40 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
723760 b'\n' | b'\r' | b'\x0C' => {
724761 return Err ( tokenizer. slice_from( start_pos) . into( ) )
725762 } ,
726- _ => { }
763+ b'\x80' ...b'\xBF' => { tokenizer. consume_continuation_byte( ) ; }
764+ b'\xF0' ...b'\xFF' => { tokenizer. consume_4byte_intro( ) ; }
765+ _ => {
766+ // ASCII or other leading byte.
767+ tokenizer. advance( 1 ) ;
768+ }
727769 }
728- tokenizer. consume_byte ( ) ;
729770 }
730771
731772 while !tokenizer. is_eof ( ) {
732- if matches ! ( tokenizer. next_byte_unchecked( ) , b'\n' | b'\r' | b'\x0C' ) {
733- return Err (
734- // string_bytes is well-formed UTF-8, see other comments.
735- unsafe {
736- from_utf8_release_unchecked ( string_bytes)
737- } . into ( )
738- ) ;
739- }
740- let b = tokenizer. consume_byte ( ) ;
773+ let b = tokenizer. next_byte_unchecked ( ) ;
741774 match_byte ! { b,
775+ b'\n' | b'\r' | b'\x0C' => {
776+ return Err (
777+ // string_bytes is well-formed UTF-8, see other comments.
778+ unsafe {
779+ from_utf8_release_unchecked( string_bytes)
780+ } . into( )
781+ ) ;
782+ }
742783 b'"' => {
784+ tokenizer. advance( 1 ) ;
743785 if !single_quote {
744786 break ;
745787 }
746788 }
747789 b'\'' => {
790+ tokenizer. advance( 1 ) ;
748791 if single_quote {
749792 break ;
750793 }
751794 }
752795 b'\\' => {
796+ tokenizer. advance( 1 ) ;
753797 if !tokenizer. is_eof( ) {
754798 match tokenizer. next_byte_unchecked( ) {
755799 // Escaped newline
@@ -764,10 +808,16 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
764808 continue ;
765809 }
766810 b'\0' => {
811+ tokenizer. advance( 1 ) ;
767812 string_bytes. extend( "\u{FFFD} " . as_bytes( ) ) ;
768813 continue ;
769814 }
770- _ => { } ,
815+ b'\x80' ...b'\xBF' => { tokenizer. consume_continuation_byte( ) ; }
816+ b'\xF0' ...b'\xFF' => { tokenizer. consume_4byte_intro( ) ; }
817+ _ => {
818+ // ASCII or other leading byte.
819+ tokenizer. advance( 1 ) ;
820+ } ,
771821 }
772822
773823 // If this byte is part of a multi-byte code point,
@@ -835,11 +885,11 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
835885 value_bytes = tokenizer. slice_from( start_pos) . as_bytes( ) . to_owned( ) ;
836886 break
837887 }
888+ b'\x80' ...b'\xBF' => { tokenizer. consume_continuation_byte( ) ; }
889+ b'\xC0' ...b'\xEF' => { tokenizer. advance( 1 ) ; }
890+ b'\xF0' ...b'\xFF' => { tokenizer. consume_4byte_intro( ) ; }
838891 b => {
839- if b. is_ascii( ) {
840- return tokenizer. slice_from( start_pos) . into( ) ;
841- }
842- tokenizer. advance( 1 ) ;
892+ return tokenizer. slice_from( start_pos) . into( ) ;
843893 }
844894 }
845895 }
@@ -861,15 +911,26 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
861911 tokenizer. advance( 1 ) ;
862912 value_bytes. extend( "\u{FFFD} " . as_bytes( ) ) ;
863913 } ,
864- _ => {
865- if b. is_ascii( ) {
866- break ;
867- }
868- tokenizer. advance( 1 ) ;
914+ b'\x80' ...b'\xBF' => {
915+ // This byte *is* part of a multi-byte code point,
916+ // we’ll end up copying the whole code point before this loop does something else.
917+ tokenizer. consume_continuation_byte( ) ;
918+ value_bytes. push( b)
919+ }
920+ b'\xC0' ...b'\xEF' => {
869921 // This byte *is* part of a multi-byte code point,
870922 // we’ll end up copying the whole code point before this loop does something else.
923+ tokenizer. advance( 1 ) ;
871924 value_bytes. push( b)
872925 }
926+ b'\xF0' ...b'\xFF' => {
927+ tokenizer. consume_4byte_intro( ) ;
928+ value_bytes. push( b)
929+ }
930+ _ => {
931+ // ASCII
932+ break ;
933+ }
873934 }
874935 }
875936 // string_bytes is well-formed UTF-8, see other comments.
@@ -1048,11 +1109,15 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
10481109 }
10491110 b'"' | b'\'' => { return Err ( ( ) ) } , // Do not advance
10501111 b')' => {
1051- tokenizer. advance( offset + 1 ) ;
1112+ // Don't use advance, because we may be skipping
1113+ // newlines here, and we want to avoid the assert.
1114+ tokenizer. position += offset + 1 ;
10521115 break
10531116 }
10541117 _ => {
1055- tokenizer. advance( offset) ;
1118+ // Don't use advance, because we may be skipping
1119+ // newlines here, and we want to avoid the assert.
1120+ tokenizer. position += offset;
10561121 found_printable_char = true ;
10571122 break
10581123 }
@@ -1104,27 +1169,33 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
11041169 string_bytes = tokenizer. slice_from( start_pos) . as_bytes( ) . to_owned( ) ;
11051170 break
11061171 }
1172+ b'\x80' ...b'\xBF' => { tokenizer. consume_continuation_byte( ) ; }
1173+ b'\xF0' ...b'\xFF' => { tokenizer. consume_4byte_intro( ) ; }
11071174 _ => {
1175+ // ASCII or other leading byte.
11081176 tokenizer. advance( 1 ) ;
11091177 }
11101178 }
11111179 }
11121180 while !tokenizer. is_eof ( ) {
1113- match_byte ! { tokenizer. consume_byte( ) ,
1181+ let b = tokenizer. next_byte_unchecked ( ) ;
1182+ match_byte ! { b,
11141183 b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
11151184 // string_bytes is well-formed UTF-8, see other comments.
11161185 let string = unsafe { from_utf8_release_unchecked( string_bytes) } . into( ) ;
1117- tokenizer. position -= 1 ;
11181186 return consume_url_end( tokenizer, start_pos, string)
11191187 }
11201188 b')' => {
1189+ tokenizer. advance( 1 ) ;
11211190 break ;
11221191 }
11231192 b'\x01' ...b'\x08' | b'\x0B' | b'\x0E' ...b'\x1F' | b'\x7F' // non-printable
11241193 | b'"' | b'\'' | b'(' => {
1194+ tokenizer. advance( 1 ) ;
11251195 return consume_bad_url( tokenizer, start_pos) ;
11261196 }
11271197 b'\\' => {
1198+ tokenizer. advance( 1 ) ;
11281199 if tokenizer. has_newline_at( 0 ) {
11291200 return consume_bad_url( tokenizer, start_pos)
11301201 }
@@ -1133,11 +1204,28 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
11331204 consume_escape_and_write( tokenizer, & mut string_bytes)
11341205 } ,
11351206 b'\0' => {
1207+ tokenizer. advance( 1 ) ;
11361208 string_bytes. extend( "\u{FFFD} " . as_bytes( ) ) ;
11371209 }
1210+ b'\x80' ...b'\xBF' => {
1211+ // We’ll end up copying the whole code point
1212+ // before this loop does something else.
1213+ tokenizer. consume_continuation_byte( ) ;
1214+ string_bytes. push( b) ;
1215+ }
1216+ b'\xF0' ...b'\xFF' => {
1217+ // We’ll end up copying the whole code point
1218+ // before this loop does something else.
1219+ tokenizer. consume_4byte_intro( ) ;
1220+ string_bytes. push( b) ;
1221+ }
11381222 // If this byte is part of a multi-byte code point,
11391223 // we’ll end up copying the whole code point before this loop does something else.
1140- b => { string_bytes. push( b) }
1224+ b => {
1225+ // ASCII or other leading byte.
1226+ tokenizer. advance( 1 ) ;
1227+ string_bytes. push( b)
1228+ }
11411229 }
11421230 }
11431231 UnquotedUrl (
@@ -1160,8 +1248,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
11601248 b'\n' | b'\x0C' | b'\r' => {
11611249 tokenizer. consume_newline( ) ;
11621250 }
1163- _ => {
1164- tokenizer. advance ( 1 ) ;
1251+ b => {
1252+ tokenizer. consume_known_byte ( b ) ;
11651253 return consume_bad_url( tokenizer, start_pos) ;
11661254 }
11671255 }
@@ -1186,8 +1274,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
11861274 b'\n' | b'\x0C' | b'\r' => {
11871275 tokenizer. consume_newline( ) ;
11881276 }
1189- _ => {
1190- tokenizer. advance ( 1 ) ;
1277+ b => {
1278+ tokenizer. consume_known_byte ( b ) ;
11911279 }
11921280 }
11931281 }
0 commit comments