Introduce specialized advancing functions and use them universally

tromey · tromey · commit 94bdfdcde2c8 · 2017-08-31T12:19:55.000-06:00
diff --git a/src/parser.rs b/src/parser.rs
@@ -811,6 +811,7 @@ pub fn parse_until_after<'i: 't, 't, F, T, E>(parser: &mut Parser<'i, 't>,
     let next_byte = (parser.input.tokenizer).next_byte();
     if next_byte.is_some() && !parser.stop_before.contains(Delimiters::from_byte(next_byte)) {
         debug_assert!(delimiters.contains(Delimiters::from_byte(next_byte)));
+        // We know this byte is ASCII.
         (parser.input.tokenizer).advance(1);
         if next_byte == Some(b'{') {
             consume_until_end_of_block(BlockType::CurlyBracket, &mut parser.input.tokenizer);
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
@@ -338,8 +338,23 @@ impl<'a> Tokenizer<'a> {
     #[inline]
     fn has_at_least(&self, n: usize) -> bool { self.position + n < self.input.len() }
 
+    // Advance over N bytes in the input.  This function can advance
+    // over ASCII bytes (excluding newlines), or UTF-8 sequence
+    // leaders (excluding leaders for 4-byte sequences).
     #[inline]
-    pub fn advance(&mut self, n: usize) { self.position += n }
+    pub fn advance(&mut self, n: usize) {
+        if cfg!(debug_assertions) {
+            // Each byte must either be an ASCII byte or a sequence
+            // leader, but not a 4-byte leader; also newlines are
+            // rejected.
+            for i in 0..n {
+                let b = self.byte_at(i);
+                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
+                debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
+            }
+        }
+        self.position += n
+    }
 
     // Assumes non-EOF
     #[inline]
@@ -350,10 +365,27 @@ impl<'a> Tokenizer<'a> {
         self.input.as_bytes()[self.position + offset]
     }
 
+    // Advance over a single byte; the byte must be a UTF-8 sequence
+    // leader for a 4-byte sequence.
+    #[inline]
+    fn consume_4byte_intro(&mut self) {
+        debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
+        self.position += 1;
+    }
+
+    // Advance over a single byte; the byte must be a UTF-8
+    // continuation byte.
     #[inline]
-    fn consume_byte(&mut self) -> u8 {
+    fn consume_continuation_byte(&mut self) {
+        debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
+        self.position += 1;
+    }
+
+    // Advance over any kind of byte, excluding newlines.
+    #[inline(never)]
+    fn consume_known_byte(&mut self, byte: u8) {
+        debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
         self.position += 1;
-        self.input.as_bytes()[self.position - 1]
     }
 
     #[inline]
@@ -667,7 +699,10 @@ fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
             b'\n' | b'\x0C' | b'\r' => {
                 tokenizer.consume_newline();
             }
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
             _ => {
+                // ASCII or other leading byte.
                 tokenizer.advance(1);
             }
         }
@@ -703,13 +738,15 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                     tokenizer.advance(1);
                     return Ok(value.into())
                 }
+                tokenizer.advance(1);
             }
             b'\'' => {
                 if single_quote {
                     let value = tokenizer.slice_from(start_pos);
                     tokenizer.advance(1);
                     return Ok(value.into())
                 }
+                tokenizer.advance(1);
             }
             b'\\' | b'\0' => {
                 // * The tokenizer’s input is UTF-8 since it’s `&str`.
@@ -723,33 +760,40 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
             b'\n' | b'\r' | b'\x0C' => {
                 return Err(tokenizer.slice_from(start_pos).into())
             },
-            _ => {}
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
+            _ => {
+                // ASCII or other leading byte.
+                tokenizer.advance(1);
+            }
         }
-        tokenizer.consume_byte();
     }
 
     while !tokenizer.is_eof() {
-        if matches!(tokenizer.next_byte_unchecked(), b'\n' | b'\r' | b'\x0C') {
-            return Err(
-                // string_bytes is well-formed UTF-8, see other comments.
-                unsafe {
-                    from_utf8_release_unchecked(string_bytes)
-                }.into()
-            );
-        }
-        let b = tokenizer.consume_byte();
+        let b = tokenizer.next_byte_unchecked();
         match_byte! { b,
+            b'\n' | b'\r' | b'\x0C' => {
+                return Err(
+                    // string_bytes is well-formed UTF-8, see other comments.
+                    unsafe {
+                        from_utf8_release_unchecked(string_bytes)
+                    }.into()
+                );
+            }
             b'"' => {
+                tokenizer.advance(1);
                 if !single_quote {
                     break;
                 }
             }
             b'\'' => {
+                tokenizer.advance(1);
                 if single_quote {
                     break;
                 }
             }
             b'\\' => {
+                tokenizer.advance(1);
                 if !tokenizer.is_eof() {
                     match tokenizer.next_byte_unchecked() {
                         // Escaped newline
@@ -764,10 +808,16 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
                 continue;
             }
             b'\0' => {
+                tokenizer.advance(1);
                 string_bytes.extend("\u{FFFD}".as_bytes());
                 continue;
             }
-            _ => {},
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
+            _ => {
+                // ASCII or other leading byte.
+                tokenizer.advance(1);
+            },
         }
 
         // If this byte is part of a multi-byte code point,
@@ -835,11 +885,11 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
                 value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                 break
             }
+            b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+            b'\xC0'...b'\xEF' => { tokenizer.advance(1); }
+            b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
             b => {
-                if b.is_ascii() {
-                    return tokenizer.slice_from(start_pos).into();
-                }
-                tokenizer.advance(1);
+                return tokenizer.slice_from(start_pos).into();
             }
         }
     }
@@ -861,15 +911,26 @@ fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
                 tokenizer.advance(1);
                 value_bytes.extend("\u{FFFD}".as_bytes());
             },
-            _ => {
-                if b.is_ascii() {
-                    break;
-                }
-                tokenizer.advance(1);
+            b'\x80'...b'\xBF' => {
+                // This byte *is* part of a multi-byte code point,
+                // we’ll end up copying the whole code point before this loop does something else.
+                tokenizer.consume_continuation_byte();
+                value_bytes.push(b)
+            }
+            b'\xC0'...b'\xEF' => {
                 // This byte *is* part of a multi-byte code point,
                 // we’ll end up copying the whole code point before this loop does something else.
+                tokenizer.advance(1);
                 value_bytes.push(b)
             }
+            b'\xF0'...b'\xFF' => {
+                tokenizer.consume_4byte_intro();
+                value_bytes.push(b)
+            }
+            _ => {
+                // ASCII
+                break;
+            }
         }
     }
     // string_bytes is well-formed UTF-8, see other comments.
@@ -1048,11 +1109,15 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
             }
             b'"' | b'\'' => { return Err(()) },  // Do not advance
             b')' => {
-                tokenizer.advance(offset + 1);
+                // Don't use advance, because we may be skipping
+                // newlines here, and we want to avoid the assert.
+                tokenizer.position += offset + 1;
                 break
             }
             _ => {
-                tokenizer.advance(offset);
+                // Don't use advance, because we may be skipping
+                // newlines here, and we want to avoid the assert.
+                tokenizer.position += offset;
                 found_printable_char = true;
                 break
             }
@@ -1104,27 +1169,33 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                     string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                     break
                 }
+                b'\x80'...b'\xBF' => { tokenizer.consume_continuation_byte(); }
+                b'\xF0'...b'\xFF' => { tokenizer.consume_4byte_intro(); }
                 _ => {
+                    // ASCII or other leading byte.
                     tokenizer.advance(1);
                 }
             }
         }
         while !tokenizer.is_eof() {
-            match_byte! { tokenizer.consume_byte(),
+            let b = tokenizer.next_byte_unchecked();
+            match_byte! { b,
                 b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                     // string_bytes is well-formed UTF-8, see other comments.
                     let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
-                    tokenizer.position -= 1;
                     return consume_url_end(tokenizer, start_pos, string)
                 }
                 b')' => {
+                    tokenizer.advance(1);
                     break;
                 }
                 b'\x01'...b'\x08' | b'\x0B' | b'\x0E'...b'\x1F' | b'\x7F'  // non-printable
                     | b'"' | b'\'' | b'(' => {
+                    tokenizer.advance(1);
                     return consume_bad_url(tokenizer, start_pos);
                 }
                 b'\\' => {
+                    tokenizer.advance(1);
                     if tokenizer.has_newline_at(0) {
                         return consume_bad_url(tokenizer, start_pos)
                     }
@@ -1133,11 +1204,28 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                     consume_escape_and_write(tokenizer, &mut string_bytes)
                 },
                 b'\0' => {
+                    tokenizer.advance(1);
                     string_bytes.extend("\u{FFFD}".as_bytes());
                 }
+                b'\x80'...b'\xBF' => {
+                    // We’ll end up copying the whole code point
+                    // before this loop does something else.
+                    tokenizer.consume_continuation_byte();
+                    string_bytes.push(b);
+                }
+                b'\xF0'...b'\xFF' => {
+                    // We’ll end up copying the whole code point
+                    // before this loop does something else.
+                    tokenizer.consume_4byte_intro();
+                    string_bytes.push(b);
+                }
                 // If this byte is part of a multi-byte code point,
                 // we’ll end up copying the whole code point before this loop does something else.
-                b => { string_bytes.push(b) }
+                b => {
+                    // ASCII or other leading byte.
+                    tokenizer.advance(1);
+                    string_bytes.push(b)
+                }
             }
         }
         UnquotedUrl(
@@ -1160,8 +1248,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                 b'\n' | b'\x0C' | b'\r' => {
                     tokenizer.consume_newline();
                 }
-                _ => {
-                    tokenizer.advance(1);
+                b => {
+                    tokenizer.consume_known_byte(b);
                     return consume_bad_url(tokenizer, start_pos);
                 }
             }
@@ -1186,8 +1274,8 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
                 b'\n' | b'\x0C' | b'\r' => {
                     tokenizer.consume_newline();
                 }
-                _ => {
-                    tokenizer.advance(1);
+                b => {
+                    tokenizer.consume_known_byte(b);
                 }
             }
         }