@@ -379,6 +379,14 @@ impl<'a> Tokenizer<'a> {
         self.input[self.position..].chars().next().unwrap()
     }
 
+    fn seen_newline(&mut self, is_cr: bool) {
+        if is_cr && self.next_byte() == Some(/* LF */ b'\n') {
+            return
+        }
+        self.current_line_start_position = self.position;
+        self.current_line_number += 1;
+    }
+
     #[inline]
     fn has_newline_at(&self, offset: usize) -> bool {
         self.position + offset < self.input.len() &&
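
Aside (not part of the patch): `seen_newline(is_cr)` treats a CRLF pair as one newline. When the byte just consumed was `\r` and the next byte is `\n`, it returns early and lets the upcoming `\n` bump the counters instead. A minimal standalone sketch of that rule:

    fn count_lines(input: &str) -> u32 {
        let bytes = input.as_bytes();
        let mut lines = 1;
        for (i, &b) in bytes.iter().enumerate() {
            match b {
                // LF and form feed each end a line.
                b'\n' | b'\x0C' => lines += 1,
                // CR ends a line only when not followed by LF;
                // otherwise the LF arm counts the pair once.
                b'\r' if bytes.get(i + 1) != Some(&b'\n') => lines += 1,
                _ => {}
            }
        }
        lines
    }

For example, `count_lines("a\r\nb")` is 2, matching what repeated `seen_newline` calls produce.
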
@@ -420,16 +428,14 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
     }
     let b = tokenizer.next_byte_unchecked();
     let token = match_byte! { b,
-        b'\t' | b'\n' | b' ' | b'\r' | b'\x0C' => {
-            let start_position = tokenizer.position();
-            tokenizer.advance(1);
-            while !tokenizer.is_eof() {
-                match tokenizer.next_byte_unchecked() {
-                    b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => tokenizer.advance(1),
-                    _ => break,
-                }
-            }
-            WhiteSpace(tokenizer.slice_from(start_position))
+        b' ' | b'\t' => {
+            consume_whitespace(tokenizer, false, false)
+        },
+        b'\n' | b'\x0C' => {
+            consume_whitespace(tokenizer, true, false)
+        },
+        b'\r' => {
+            consume_whitespace(tokenizer, true, true)
         },
         b'"' => { consume_string(tokenizer, false) },
         b'#' => {
@@ -501,21 +507,7 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
         }
         b'/' => {
             if tokenizer.starts_with(b"/*") {
-                tokenizer.advance(2); // consume "/*"
-                let start_position = tokenizer.position();
-                let content;
-                match tokenizer.input[tokenizer.position..].find("*/") {
-                    Some(offset) => {
-                        tokenizer.advance(offset);
-                        content = tokenizer.slice_from(start_position);
-                        tokenizer.advance(2);
-                    }
-                    None => {
-                        tokenizer.position = tokenizer.input.len();
-                        content = tokenizer.slice_from(start_position);
-                    }
-                }
-                Comment(content)
+                Comment(consume_comment(tokenizer))
             } else {
                 tokenizer.advance(1);
                 Delim('/')
@@ -573,6 +565,64 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
 }
 
 
+fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool, is_cr: bool) -> Token<'a> {
+    let start_position = tokenizer.position();
+    tokenizer.advance(1);
+    if newline {
+        tokenizer.seen_newline(is_cr)
+    }
+    while !tokenizer.is_eof() {
+        let b = tokenizer.next_byte_unchecked();
+        match_byte! { b,
+            b' ' | b'\t' => {
+                tokenizer.advance(1);
+            }
+            b'\n' | b'\x0C' => {
+                tokenizer.advance(1);
+                tokenizer.seen_newline(false);
+            }
+            b'\r' => {
+                tokenizer.advance(1);
+                tokenizer.seen_newline(true);
+            }
+            _ => {
+                break
+            }
+        }
+    }
+    WhiteSpace(tokenizer.slice_from(start_position))
+}
+
+
+fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
+    tokenizer.advance(2); // consume "/*"
+    let start_position = tokenizer.position();
+    while !tokenizer.is_eof() {
+        match_byte! { tokenizer.next_byte_unchecked(),
+            b'*' => {
+                let end_position = tokenizer.position();
+                tokenizer.advance(1);
+                if tokenizer.next_byte() == Some(b'/') {
+                    tokenizer.advance(1);
+                    return tokenizer.slice(start_position..end_position)
+                }
+            }
+            b'\n' | b'\x0C' => {
+                tokenizer.advance(1);
+                tokenizer.seen_newline(false);
+            }
+            b'\r' => {
+                tokenizer.advance(1);
+                tokenizer.seen_newline(true);
+            }
+            _ => {
+                tokenizer.advance(1);
+            }
+        }
+    }
+    tokenizer.slice_from(start_position)
+}
+
 fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
     match consume_quoted_string(tokenizer, single_quote) {
         Ok(value) => QuotedString(value),
@@ -649,12 +699,19 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                 if !tokenizer.is_eof() {
                     match tokenizer.next_byte_unchecked() {
                         // Escaped newline
-                        b'\n' | b'\x0C' => tokenizer.advance(1),
+                        b'\n' | b'\x0C' => {
+                            tokenizer.advance(1);
+                            tokenizer.seen_newline(false);
+                        }
                         b'\r' => {
                             tokenizer.advance(1);
                             if tokenizer.next_byte() == Some(b'\n') {
                                 tokenizer.advance(1);
                             }
+                            // `is_cr = true` only matters when the next loop iteration
+                            // would call `seen_newline` again for the following \n.
+                            // Here both bytes are consumed at once, so we pass `false`.
+                            tokenizer.seen_newline(false);
                         }
                         // This pushes one well-formed code point
                         _ => consume_escape_and_write(tokenizer, &mut string_bytes)
@@ -921,24 +978,57 @@ unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
 
 fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
     // This is only called after "url(", so the current position is a code point boundary.
-    for (offset, c) in tokenizer.input[tokenizer.position..].bytes().enumerate() {
-        match_byte! { c,
-            b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {},
+    let start_position = tokenizer.position;
+    let from_start = &tokenizer.input[tokenizer.position..];
+    let mut newlines = 0;
+    let mut last_newline = 0;
+    let mut found_printable_char = false;
+    let mut iter = from_start.bytes().enumerate();
+    loop {
+        let (offset, b) = match iter.next() {
+            Some(item) => item,
+            None => {
+                tokenizer.position = tokenizer.input.len();
+                break
+            }
+        };
+        match_byte! { b,
+            b' ' | b'\t' => {},
+            b'\n' | b'\x0C' => {
+                newlines += 1;
+                last_newline = offset;
+            }
+            b'\r' => {
+                if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
+                    newlines += 1;
+                    last_newline = offset;
+                }
+            }
             b'"' | b'\'' => { return Err(()) },  // Do not advance
             b')' => {
                 tokenizer.advance(offset + 1);
-                return Ok(UnquotedUrl("".into()));
+                break
            }
            _ => {
                tokenizer.advance(offset);
-                // This function only consumed ASCII (whitespace) bytes,
-                // so the current position is a code point boundary.
-                return Ok(consume_unquoted_url_internal(tokenizer))
+                found_printable_char = true;
+                break
            }
        }
    }
-    tokenizer.position = tokenizer.input.len();
-    return Ok(UnquotedUrl("".into()));
+
+    if newlines > 0 {
+        tokenizer.current_line_number += newlines;
+        tokenizer.current_line_start_position = start_position + last_newline + 1;
+    }
+
+    if found_printable_char {
+        // This function only consumed ASCII (whitespace) bytes,
+        // so the current position is a code point boundary.
+        return Ok(consume_unquoted_url_internal(tokenizer))
+    } else {
+        return Ok(UnquotedUrl("".into()))
+    }
 
     fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
         // This function is only called with start_pos at a code point boundary.
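
Aside (not part of the patch): the loop above batches its line bookkeeping. Instead of calling `seen_newline` per byte, it counts newlines and remembers the offset of the last one, then patches `current_line_number` and `current_line_start_position` once after the scan; the `\r` arm skips a CR directly followed by LF so a CRLF pair is still counted once. A self-contained sketch of the same pattern, with assumed names:

    /// Count newlines in a run of CSS whitespace, remembering where the
    /// last one is so line bookkeeping can be fixed up in one step.
    fn count_newlines(ws: &[u8]) -> (u32, Option<usize>) {
        let mut newlines = 0;
        let mut last_newline = None;
        for (offset, &b) in ws.iter().enumerate() {
            match b {
                b'\n' | b'\x0C' => {
                    newlines += 1;
                    last_newline = Some(offset);
                }
                // A lone CR is a newline; CRLF is counted once, by the LF arm.
                b'\r' if ws.get(offset + 1) != Some(&b'\n') => {
                    newlines += 1;
                    last_newline = Some(offset);
                }
                _ => {}
            }
        }
        (newlines, last_newline)
    }

Given a result of `(n, Some(i))`, a caller adds `n` to the line number and sets the line start to `start_position + i + 1`, which is exactly the fix-up the patch performs after its loop.
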
@@ -951,7 +1041,6 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             match_byte! { tokenizer.next_byte_unchecked(),
                 b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                     let value = tokenizer.slice_from(start_pos);
-                    tokenizer.advance(1);
                     return consume_url_end(tokenizer, start_pos, value.into())
                 }
                 b')' => {
@@ -974,7 +1063,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                     break
                 }
                 _ => {
-                    tokenizer.consume_byte();
+                    tokenizer.advance(1);
                 }
             }
         }
@@ -983,6 +1072,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                 b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                     // string_bytes is well-formed UTF-8, see other comments.
                     let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
+                    tokenizer.position -= 1;
                     return consume_url_end(tokenizer, start_pos, string)
                 }
                 b')' => {
@@ -1020,8 +1110,16 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                         -> Token<'a> {
         while !tokenizer.is_eof() {
             match_byte! { tokenizer.consume_byte(),
-                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {},
-                b')' => { break },
+                b')' => {
+                    break
+                }
+                b' ' | b'\t' => {}
+                b'\n' | b'\x0C' => {
+                    tokenizer.seen_newline(false);
+                }
+                b'\r' => {
+                    tokenizer.seen_newline(true);
+                }
                 _ => {
                     return consume_bad_url(tokenizer, start_pos);
                 }
@@ -1034,12 +1132,20 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
         // Consume up to the closing )
         while !tokenizer.is_eof() {
             match_byte! { tokenizer.consume_byte(),
-                b')' => { break },
+                b')' => {
+                    break
+                }
                 b'\\' => {
                     if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
                         tokenizer.advance(1); // Skip an escaped ')' or '\'
                     }
                 }
+                b'\n' | b'\x0C' => {
+                    tokenizer.seen_newline(false);
+                }
+                b'\r' => {
+                    tokenizer.seen_newline(true);
+                }
                 _ => {},
             }
         }
@@ -1080,15 +1186,22 @@ fn consume_escape(tokenizer: &mut Tokenizer) -> char {
         b'0'...b'9' | b'A'...b'F' | b'a'...b'f' => {
             let (c, _) = consume_hex_digits(tokenizer);
             if !tokenizer.is_eof() {
-                match tokenizer.next_byte_unchecked() {
-                    b' ' | b'\t' | b'\n' | b'\x0C' => tokenizer.advance(1),
+                match_byte! { tokenizer.next_byte_unchecked(),
+                    b' ' | b'\t' => {
+                        tokenizer.advance(1)
+                    }
+                    b'\n' | b'\x0C' => {
+                        tokenizer.advance(1);
+                        tokenizer.seen_newline(false)
+                    }
                     b'\r' => {
                         tokenizer.advance(1);
                         if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'\n' {
                             tokenizer.advance(1);
                         }
+                        tokenizer.seen_newline(false)
                     }
-                    _ => ()
+                    _ => {}
                 }
             }
             static REPLACEMENT_CHAR: char = '\u{FFFD}';
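
For context: the reason the tokenizer maintains `current_line_number` and `current_line_start_position` at every newline is that a source location can then be produced in O(1) at any point, with no rescan of the input. A minimal sketch of that derivation, using assumed field and type names rather than the crate's actual API:

    struct Tokenizer<'a> {
        input: &'a str,
        position: usize,                     // byte offset into `input`
        current_line_number: u32,            // newlines seen so far
        current_line_start_position: usize,  // byte offset where the current line starts
    }

    struct SourceLocation {
        line: u32,
        column: usize,
    }

    impl<'a> Tokenizer<'a> {
        fn current_source_location(&self) -> SourceLocation {
            SourceLocation {
                line: self.current_line_number,
                // Column as a byte offset from the line start; a real
                // implementation might count code points or UTF-16 units instead.
                column: self.position - self.current_line_start_position,
            }
        }
    }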