Auto merge of #192 - tromey:utf-16-columns, r=SimonSapin

bors-servo · web-flow · commit 7560c3a5d081 · 2017-08-31T23:28:43.000-05:00
UTF-16 columns. This series changes column numbers to be reported in units of UTF-16 code units.  --- This change is [<img src="https://reviewable.io/review_button.svg" height="34" align="absmiddle" alt="Reviewable"/>](https://reviewable.io/reviews/servo/rust-cssparser/192)
diff --git a/src/parser.rs b/src/parser.rs
@@ -811,6 +811,7 @@ pub fn parse_until_after<'i: 't, 't, F, T, E>(parser: &mut Parser<'i, 't>,
     let next_byte = (parser.input.tokenizer).next_byte();
     if next_byte.is_some() && !parser.stop_before.contains(Delimiters::from_byte(next_byte)) {
         debug_assert!(delimiters.contains(Delimiters::from_byte(next_byte)));
+        // We know this byte is ASCII.
         (parser.input.tokenizer).advance(1);
         if next_byte == Some(b'{') {
             consume_until_end_of_block(BlockType::CurlyBracket, &mut parser.input.tokenizer);
diff --git a/src/tests.rs b/src/tests.rs
@@ -1049,3 +1049,53 @@ fn roundtrip_percentage_token() {
         }
     }
 }
+
+#[test]
+fn utf16_columns() {
+    // This test serves two purposes.  First, it checks that the
+    // column number computations are correct.  Second, it checks
+    // that the tokenizer code paths correctly distinguish between
+    // the different kinds of UTF-8 bytes.  In particular, different
+    // leader bytes and continuation bytes are treated differently,
+    // so the tests cover every sequence length (1 to 4 bytes) by
+    // using the string "QΡ✈🆒".  Also, because columns are counted
+    // in UTF-16 code units, the 4-byte sequence "🆒" advances the
+    // column by two.
+    let tests = vec![  // Each entry: (input, expected column once all tokens are read).
+        ("", 0),
+        ("ascii", 5),
+        ("/*QΡ✈🆒*/", 9),
+        ("'QΡ✈🆒*'", 8),
+        ("\"\\\"'QΡ✈🆒*'", 11),
+        ("\\Q\\Ρ\\✈\\🆒", 9),
+        ("QΡ✈🆒", 5),
+        ("QΡ✈🆒\\Q\\Ρ\\✈\\🆒", 14),
+        ("newline\r\nQΡ✈🆒", 5),  // Only the text after the line break is counted.
+        ("url(QΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 19),
+        ("url(QΡ✈🆒)", 10),
+        ("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒)", 15),
+        ("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒", 14),
+        ("url(\r\nQΡ✈🆒\\Q\\Ρ\\✈\\🆒 x", 16),
+        ("QΡ✈🆒()", 7),
+        // Test that underflow/overflow of current_line_start_position
+        // is handled properly; see the special case in consume_4byte_intro.
+        ("🆒", 2),
+    ];
+
+    for test in tests {
+        let mut input = ParserInput::new(test.0);
+        let mut parser = Parser::new(&mut input);
+
+        // Consume every token until the parser reports end of input.
+        loop {
+            match parser.next() {
+                Err(BasicParseError::EndOfInput) => { break; }
+                Err(_) => { assert!(false); }  // Any other error fails the test.
+                Ok(_) => {}
+            };
+        }
+
+        // The column at end of input must equal the expected value.
+        assert_eq!(parser.current_source_location().column, test.1);
+    }
+}
diff --git a/src/tokenizer.rs b/src/tokenizer.rs