@@ -157,12 +157,12 @@ pub enum Token<'a> {
157157 /// A `<bad-url-token>`
158158 ///
159159 /// This token always indicates a parse error.
160- BadUrl ,
160+ BadUrl ( CompactCowStr < ' a > ) ,
161161
162162 /// A `<bad-string-token>`
163163 ///
164164 /// This token always indicates a parse error.
165- BadString ,
165+ BadString ( CompactCowStr < ' a > ) ,
166166
167167 /// A `<)-token>`
168168 ///
@@ -194,7 +194,7 @@ impl<'a> Token<'a> {
194194 pub fn is_parse_error ( & self ) -> bool {
195195 matches ! (
196196 * self ,
197- BadUrl | BadString | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
197+ BadUrl ( _ ) | BadString ( _ ) | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
198198 )
199199 }
200200}
@@ -226,7 +226,7 @@ impl<'a> Tokenizer<'a> {
226226 input : input,
227227 position : 0 ,
228228 last_known_source_location : Cell :: new ( ( SourcePosition ( 0 ) ,
229- SourceLocation { line : 1 , column : 1 } ) ) ,
229+ SourceLocation { line : 0 , column : 0 } ) ) ,
230230 var_functions : SeenStatus :: DontCare ,
231231 viewport_percentages : SeenStatus :: DontCare ,
232232 }
@@ -287,6 +287,17 @@ impl<'a> Tokenizer<'a> {
287287 self . source_location ( position)
288288 }
289289
290+ pub fn current_source_line ( & self ) -> & ' a str {
291+ let current = self . position ;
292+ let start = self . input [ 0 ..current]
293+ . rfind ( |c| matches ! ( c, '\r' | '\n' | '\x0C' ) )
294+ . map_or ( 0 , |start| start + 1 ) ;
295+ let end = self . input [ current..]
296+ . find ( |c| matches ! ( c, '\r' | '\n' | '\x0C' ) )
297+ . map_or ( self . input . len ( ) , |end| current + end) ;
298+ & self . input [ start..end]
299+ }
300+
290301 pub fn source_location ( & self , position : SourcePosition ) -> SourceLocation {
291302 let target = position. 0 ;
292303 let mut location;
@@ -301,7 +312,7 @@ impl<'a> Tokenizer<'a> {
301312 // So if the requested position is before the last known one,
302313 // start over from the beginning.
303314 position = 0 ;
304- location = SourceLocation { line : 1 , column : 1 } ;
315+ location = SourceLocation { line : 0 , column : 0 } ;
305316 }
306317 let mut source = & self . input [ position..target] ;
307318 while let Some ( newline_position) = source. find ( |c| matches ! ( c, '\n' | '\r' | '\x0C' ) ) {
@@ -310,7 +321,7 @@ impl<'a> Tokenizer<'a> {
310321 source = & source[ offset..] ;
311322 position += offset;
312323 location. line += 1 ;
313- location. column = 1 ;
324+ location. column = 0 ;
314325 }
315326 debug_assert ! ( position <= target) ;
316327 location. column += ( target - position) as u32 ;
@@ -386,10 +397,10 @@ pub struct SourcePosition(usize);
/// The line and column number for a given position within the input.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct SourceLocation {
    /// The line number, starting at 0 for the first line.
    pub line: u32,

    /// The column number within a line, starting at 0 for the first character of the line.
    pub column: u32,
}
395406
@@ -556,14 +567,14 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
556567fn consume_string < ' a > ( tokenizer : & mut Tokenizer < ' a > , single_quote : bool ) -> Token < ' a > {
557568 match consume_quoted_string ( tokenizer, single_quote) {
558569 Ok ( value) => QuotedString ( value) ,
559- Err ( ( ) ) => BadString
570+ Err ( value ) => BadString ( value )
560571 }
561572}
562573
563574
564575/// Return `Err(())` on syntax error (ie. unescaped newline)
565576fn consume_quoted_string < ' a > ( tokenizer : & mut Tokenizer < ' a > , single_quote : bool )
566- -> Result < CompactCowStr < ' a > , ( ) > {
577+ -> Result < CompactCowStr < ' a > , CompactCowStr < ' a > > {
567578 tokenizer. advance ( 1 ) ; // Skip the initial quote
568579 // start_pos is at code point boundary, after " or '
569580 let start_pos = tokenizer. position ( ) ;
@@ -596,15 +607,22 @@ fn consume_quoted_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool)
596607 string_bytes = tokenizer. slice_from( start_pos) . as_bytes( ) . to_owned( ) ;
597608 break
598609 }
599- b'\n' | b'\r' | b'\x0C' => { return Err ( ( ) ) } ,
610+ b'\n' | b'\r' | b'\x0C' => {
611+ return Err ( tokenizer. slice_from( start_pos) . into( ) )
612+ } ,
600613 _ => { }
601614 }
602615 tokenizer. consume_byte ( ) ;
603616 }
604617
605618 while !tokenizer. is_eof ( ) {
606619 if matches ! ( tokenizer. next_byte_unchecked( ) , b'\n' | b'\r' | b'\x0C' ) {
607- return Err ( ( ) ) ;
620+ return Err (
621+ // string_bytes is well-formed UTF-8, see other comments.
622+ unsafe {
623+ from_utf8_release_unchecked ( string_bytes)
624+ } . into ( )
625+ ) ;
608626 }
609627 let b = tokenizer. consume_byte ( ) ;
610628 match_byte ! { b,
@@ -1013,6 +1031,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
10131031 }
10141032
10151033 fn consume_bad_url < ' a > ( tokenizer : & mut Tokenizer < ' a > ) -> Token < ' a > {
1034+ let start_pos = tokenizer. position ( ) ;
10161035 // Consume up to the closing )
10171036 while !tokenizer. is_eof ( ) {
10181037 match_byte ! { tokenizer. consume_byte( ) ,
@@ -1023,7 +1042,7 @@ fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>,
10231042 _ => { } ,
10241043 }
10251044 }
1026- BadUrl
1045+ BadUrl ( tokenizer . slice_from ( start_pos ) . into ( ) )
10271046 }
10281047}
10291048
0 commit comments