@@ -57,11 +57,10 @@ impl Token {
5757/// Enum representing common lexeme types.
5858#[ derive( Clone , Copy , Debug , PartialEq , Eq ) ]
5959pub enum TokenKind {
60- // Multi-char tokens:
61- /// "// comment"
60+ /// A line comment, e.g. `// comment`.
6261 LineComment { doc_style : Option < DocStyle > } ,
6362
64- /// `/* block comment */`
63+ /// A block comment, e.g. `/* block comment */`.
6564 ///
6665 /// Block comments can be recursive, so a sequence like `/* /* */`
6766 /// will not be considered terminated and will result in a parsing error.
@@ -70,18 +69,17 @@ pub enum TokenKind {
7069 /// Any whitespace character sequence.
7170 Whitespace ,
7271
73- /// "ident" or "continue"
74- ///
75- /// At this step, keywords are also considered identifiers.
72+ /// An identifier or keyword, e.g. `ident` or `continue`.
7673 Ident ,
7774
78- /// Like the above, but containing invalid unicode codepoints .
75+ /// An identifier that is invalid because it contains emoji.
7976 InvalidIdent ,
8077
81- /// "r#ident"
78+ /// A raw identifier, e.g. `r#ident`.
8279 RawIdent ,
8380
84- /// An unknown prefix, like `foo#`, `foo'`, `foo"`.
81+ /// An unknown literal prefix, like `foo#`, `foo'`, `foo"`. Excludes
82+ /// literal prefixes that contain emoji, which are considered "invalid".
8583 ///
8684 /// Note that only the
8785 /// prefix (`foo`) is included in the token, not the separator (which is
@@ -93,87 +91,83 @@ pub enum TokenKind {
9391
9492 /// An unknown prefix in a lifetime, like `'foo#`.
9593 ///
96- /// Note that like above , only the `'` and prefix are included in the token
94+ /// Like `UnknownPrefix`, only the `'` and prefix are included in the token
9795 /// and not the separator.
9896 UnknownPrefixLifetime ,
9997
100- /// `'r#lt`, which in edition < 2021 is split into several tokens: `'r # lt`.
98+ /// A raw lifetime, e.g. `'r#foo`. In edition < 2021 it will be split into
99+ /// several tokens: `'r` and `#` and `foo`.
101100 RawLifetime ,
102101
103- /// Similar to the above, but *always* an error on every edition. This is used
104- /// for emoji identifier recovery, as those are not meant to be ever accepted.
105- InvalidPrefix ,
106-
107102 /// Guarded string literal prefix: `#"` or `##`.
108103 ///
109104 /// Used for reserving "guarded strings" (RFC 3598) in edition 2024.
110105 /// Split into the component tokens on older editions.
111106 GuardedStrPrefix ,
112107
113- /// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
108+ /// Literals, e.g. `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
114109 /// suffix, but may be present here on string and float literals. Users of
115110 /// this type will need to check for and reject that case.
116111 ///
117112 /// See [LiteralKind] for more details.
118113 Literal { kind : LiteralKind , suffix_start : u32 } ,
119114
120- /// "'a"
115+ /// A lifetime, e.g. `'a`.
121116 Lifetime { starts_with_number : bool } ,
122117
123- // One-char tokens:
124- /// ";"
118+ /// `;`
125119 Semi ,
126- /// ","
120+ /// `,`
127121 Comma ,
128- /// "."
122+ /// `.`
129123 Dot ,
130- /// "("
124+ /// `(`
131125 OpenParen ,
132- /// ")"
126+ /// `)`
133127 CloseParen ,
134- /// "{"
128+ /// `{`
135129 OpenBrace ,
136- /// "}"
130+ /// `}`
137131 CloseBrace ,
138- /// "["
132+ /// `[`
139133 OpenBracket ,
140- /// "]"
134+ /// `]`
141135 CloseBracket ,
142- /// "@"
136+ /// `@`
143137 At ,
144- /// "#"
138+ /// `#`
145139 Pound ,
146- /// "~"
140+ /// `~`
147141 Tilde ,
148- /// "?"
142+ /// `?`
149143 Question ,
150- /// ":"
144+ /// `:`
151145 Colon ,
152- /// "$"
146+ /// `$`
153147 Dollar ,
154- /// "="
148+ /// `=`
155149 Eq ,
156- /// "!"
150+ /// `!`
157151 Bang ,
158- /// "<"
152+ /// `<`
159153 Lt ,
160- /// ">"
154+ /// `>`
161155 Gt ,
162- /// "-"
156+ /// `-`
163157 Minus ,
164- /// "&"
158+ /// `&`
165159 And ,
166- /// "|"
160+ /// `|`
167161 Or ,
168- /// "+"
162+ /// `+`
169163 Plus ,
170- /// "*"
164+ /// `*`
171165 Star ,
172- /// "/"
166+ /// `/`
173167 Slash ,
174- /// "^"
168+ /// `^`
175169 Caret ,
176- /// "%"
170+ /// `%`
177171 Percent ,
178172
179173 /// Unknown token, not expected by the lexer, e.g. "№"
@@ -468,7 +462,7 @@ impl Cursor<'_> {
468462 Literal { kind, suffix_start }
469463 }
470464 // Identifier starting with an emoji. Only lexed for graceful error recovery.
471- c if !c. is_ascii ( ) && c. is_emoji_char ( ) => self . fake_ident_or_unknown_prefix ( ) ,
465+ c if !c. is_ascii ( ) && c. is_emoji_char ( ) => self . invalid_ident ( ) ,
472466 _ => Unknown ,
473467 } ;
474468 let res = Token :: new ( token_kind, self . pos_within_token ( ) ) ;
@@ -552,24 +546,22 @@ impl Cursor<'_> {
552546 // we see a prefix here, it is definitely an unknown prefix.
553547 match self . first ( ) {
554548 '#' | '"' | '\'' => UnknownPrefix ,
555- c if !c. is_ascii ( ) && c. is_emoji_char ( ) => self . fake_ident_or_unknown_prefix ( ) ,
549+ c if !c. is_ascii ( ) && c. is_emoji_char ( ) => self . invalid_ident ( ) ,
556550 _ => Ident ,
557551 }
558552 }
559553
560- fn fake_ident_or_unknown_prefix ( & mut self ) -> TokenKind {
554+ fn invalid_ident ( & mut self ) -> TokenKind {
561555 // Start is already eaten, eat the rest of identifier.
562556 self . eat_while ( |c| {
563- unicode_xid:: UnicodeXID :: is_xid_continue ( c)
564- || ( !c. is_ascii ( ) && c. is_emoji_char ( ) )
565- || c == '\u{200d}'
557+ const ZERO_WIDTH_JOINER : char = '\u{200d}' ;
558+ is_id_continue ( c) || ( !c. is_ascii ( ) && c. is_emoji_char ( ) ) || c == ZERO_WIDTH_JOINER
566559 } ) ;
567- // Known prefixes must have been handled earlier. So if
568- // we see a prefix here, it is definitely an unknown prefix.
569- match self . first ( ) {
570- '#' | '"' | '\'' => InvalidPrefix ,
571- _ => InvalidIdent ,
572- }
560+ // An invalid identifier followed by '#' or '"' or '\'' could be
561+ // interpreted as an invalid literal prefix. We don't bother doing that
562+ // because the treatment of invalid identifiers and invalid prefixes
563+ // would be the same.
564+ InvalidIdent
573565 }
574566
575567 fn c_or_byte_string (
0 commit comments