@@ -86,13 +86,182 @@ impl<'a> StringReader<'a> {
 
             debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));
 
-            match self.cook_lexer_token(token.kind, start) {
-                Some(kind) => {
+            // Now "cook" the token, converting the simple `rustc_lexer::TokenKind` enum into a
+            // rich `rustc_ast::TokenKind`. This turns strings into interned symbols and runs
+            // additional validation.
+            let kind = match token.kind {
+                rustc_lexer::TokenKind::LineComment { doc_style } => {
+                    // Skip non-doc comments
+                    let Some(doc_style) = doc_style else {
+                        self.lint_unicode_text_flow(start);
+                        preceded_by_whitespace = true;
+                        continue;
+                    };
+
+                    // Opening delimiter of the length 3 is not included into the symbol.
+                    let content_start = start + BytePos(3);
+                    let content = self.str_from(content_start);
+                    self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
+                }
+                rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
+                    if !terminated {
+                        self.report_unterminated_block_comment(start, doc_style);
+                    }
+
+                    // Skip non-doc comments
+                    let Some(doc_style) = doc_style else {
+                        self.lint_unicode_text_flow(start);
+                        preceded_by_whitespace = true;
+                        continue;
+                    };
+
+                    // Opening delimiter of the length 3 and closing delimiter of the length 2
+                    // are not included into the symbol.
+                    let content_start = start + BytePos(3);
+                    let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
+                    let content = self.str_from_to(content_start, content_end);
+                    self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
+                }
+                rustc_lexer::TokenKind::Whitespace => {
+                    preceded_by_whitespace = true;
+                    continue;
+                }
+                rustc_lexer::TokenKind::Ident => {
+                    let sym = nfc_normalize(self.str_from(start));
                     let span = self.mk_sp(start, self.pos);
-                    return (Token::new(kind, span), preceded_by_whitespace);
+                    self.sess.symbol_gallery.insert(sym, span);
+                    token::Ident(sym, false)
                 }
-                None => preceded_by_whitespace = true,
-            }
+                rustc_lexer::TokenKind::RawIdent => {
+                    let sym = nfc_normalize(self.str_from(start + BytePos(2)));
+                    let span = self.mk_sp(start, self.pos);
+                    self.sess.symbol_gallery.insert(sym, span);
+                    if !sym.can_be_raw() {
+                        self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
+                    }
+                    self.sess.raw_identifier_spans.borrow_mut().push(span);
+                    token::Ident(sym, true)
+                }
+                rustc_lexer::TokenKind::UnknownPrefix => {
+                    self.report_unknown_prefix(start);
+                    let sym = nfc_normalize(self.str_from(start));
+                    let span = self.mk_sp(start, self.pos);
+                    self.sess.symbol_gallery.insert(sym, span);
+                    token::Ident(sym, false)
+                }
+                rustc_lexer::TokenKind::InvalidIdent
+                    // Do not recover an identifier with emoji if the codepoint is a confusable
+                    // with a recoverable substitution token, like `➖`.
+                    if !UNICODE_ARRAY
+                        .iter()
+                        .any(|&(c, _, _)| {
+                            let sym = self.str_from(start);
+                            sym.chars().count() == 1 && c == sym.chars().next().unwrap()
+                        }) =>
+                {
+                    let sym = nfc_normalize(self.str_from(start));
+                    let span = self.mk_sp(start, self.pos);
+                    self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default()
+                        .push(span);
+                    token::Ident(sym, false)
+                }
+                rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
+                    let suffix_start = start + BytePos(suffix_start);
+                    let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
+                    let suffix = if suffix_start < self.pos {
+                        let string = self.str_from(suffix_start);
+                        if string == "_" {
+                            self.sess
+                                .span_diagnostic
+                                .struct_span_warn(
+                                    self.mk_sp(suffix_start, self.pos),
+                                    "underscore literal suffix is not allowed",
+                                )
+                                .warn(
+                                    "this was previously accepted by the compiler but is \
+                                     being phased out; it will become a hard error in \
+                                     a future release!",
+                                )
+                                .note(
+                                    "see issue #42326 \
+                                     <https://github.com/rust-lang/rust/issues/42326> \
+                                     for more information",
+                                )
+                                .emit();
+                            None
+                        } else {
+                            Some(Symbol::intern(string))
+                        }
+                    } else {
+                        None
+                    };
+                    token::Literal(token::Lit { kind, symbol, suffix })
+                }
+                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
+                    // Include the leading `'` in the real identifier, for macro
+                    // expansion purposes. See #12512 for the gory details of why
+                    // this is necessary.
+                    let lifetime_name = self.str_from(start);
+                    if starts_with_number {
+                        self.err_span_(start, self.pos, "lifetimes cannot start with a number");
+                    }
+                    let ident = Symbol::intern(lifetime_name);
+                    token::Lifetime(ident)
+                }
+                rustc_lexer::TokenKind::Semi => token::Semi,
+                rustc_lexer::TokenKind::Comma => token::Comma,
+                rustc_lexer::TokenKind::Dot => token::Dot,
+                rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
+                rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
+                rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
+                rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
+                rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
+                rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
+                rustc_lexer::TokenKind::At => token::At,
+                rustc_lexer::TokenKind::Pound => token::Pound,
+                rustc_lexer::TokenKind::Tilde => token::Tilde,
+                rustc_lexer::TokenKind::Question => token::Question,
+                rustc_lexer::TokenKind::Colon => token::Colon,
+                rustc_lexer::TokenKind::Dollar => token::Dollar,
+                rustc_lexer::TokenKind::Eq => token::Eq,
+                rustc_lexer::TokenKind::Bang => token::Not,
+                rustc_lexer::TokenKind::Lt => token::Lt,
+                rustc_lexer::TokenKind::Gt => token::Gt,
+                rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
+                rustc_lexer::TokenKind::And => token::BinOp(token::And),
+                rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
+                rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
+                rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
+                rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
+                rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
+                rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
+
+                rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
+                    let c = self.str_from(start).chars().next().unwrap();
+                    let mut err =
+                        self.struct_err_span_char(start, self.pos, "unknown start of token", c);
+                    // FIXME: the lexer could be used to turn the ASCII version of unicode
+                    // homoglyphs, instead of keeping a table in `check_for_substitution`into the
+                    // token. Ideally, this should be inside `rustc_lexer`. However, we should
+                    // first remove compound tokens like `<<` from `rustc_lexer`, and then add
+                    // fancier error recovery to it, as there will be less overall work to do this
+                    // way.
+                    let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
+                    if c == '\x00' {
+                        err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
+                    }
+                    err.emit();
+                    if let Some(token) = token {
+                        token
+                    } else {
+                        preceded_by_whitespace = true;
+                        continue;
+                    }
+                }
+                rustc_lexer::TokenKind::Eof => token::Eof,
+            };
+            let span = self.mk_sp(start, self.pos);
+            return (Token::new(kind, span), preceded_by_whitespace);
         }
     }
 
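The net effect of the hunk above: trivia (plain comments and whitespace), which previously made `cook_lexer_token` return `None`, now sets `preceded_by_whitespace` and `continue`s the loop in place, so every other match arm can evaluate directly to a cooked token kind and the `Option` wrapper disappears. Below is a minimal standalone sketch of that control-flow shape; the `RawKind`/`Cooked` enums and the simplified `next_token` signature are made-up stand-ins, not the compiler's real types.

    // Simplified stand-ins for rustc_lexer::TokenKind and the cooked token kinds
    // (hypothetical types, for illustration only).
    #[derive(Debug)]
    enum RawKind {
        Whitespace,
        Comment,
        Ident(String),
        Semi,
    }

    #[derive(Debug)]
    enum Cooked {
        Ident(String),
        Semi,
    }

    // Mirrors the inlined `next_token` loop: trivia records a flag and loops,
    // everything else is "cooked" into the value the match evaluates to.
    fn next_token(raw: &mut impl Iterator<Item = RawKind>) -> (Cooked, bool) {
        let mut preceded_by_whitespace = false;
        loop {
            let token = raw.next().expect("sketch assumes a non-trivia token eventually");
            let kind = match token {
                RawKind::Whitespace | RawKind::Comment => {
                    preceded_by_whitespace = true;
                    continue;
                }
                RawKind::Ident(name) => Cooked::Ident(name),
                RawKind::Semi => Cooked::Semi,
            };
            return (kind, preceded_by_whitespace);
        }
    }

    fn main() {
        let mut raw =
            vec![RawKind::Comment, RawKind::Whitespace, RawKind::Ident("x".into())].into_iter();
        // Prints: (Ident("x"), true)
        println!("{:?}", next_token(&mut raw));
    }

(The real code, shown in the diff, works on `rustc_lexer::TokenKind` and produces `rustc_ast` tokens with spans.)
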
@@ -158,172 +327,6 @@ impl<'a> StringReader<'a> {
         }
     }
 
-    /// Turns simple `rustc_lexer::TokenKind` enum into a rich
-    /// `rustc_ast::TokenKind`. This turns strings into interned
-    /// symbols and runs additional validation.
-    fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> Option<TokenKind> {
-        Some(match token {
-            rustc_lexer::TokenKind::LineComment { doc_style } => {
-                // Skip non-doc comments
-                let Some(doc_style) = doc_style else {
-                    self.lint_unicode_text_flow(start);
-                    return None;
-                };
-
-                // Opening delimiter of the length 3 is not included into the symbol.
-                let content_start = start + BytePos(3);
-                let content = self.str_from(content_start);
-                self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
-            }
-            rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
-                if !terminated {
-                    self.report_unterminated_block_comment(start, doc_style);
-                }
-
-                // Skip non-doc comments
-                let Some(doc_style) = doc_style else {
-                    self.lint_unicode_text_flow(start);
-                    return None;
-                };
-
-                // Opening delimiter of the length 3 and closing delimiter of the length 2
-                // are not included into the symbol.
-                let content_start = start + BytePos(3);
-                let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
-                let content = self.str_from_to(content_start, content_end);
-                self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
-            }
-            rustc_lexer::TokenKind::Whitespace => return None,
-            rustc_lexer::TokenKind::Ident => {
-                let sym = nfc_normalize(self.str_from(start));
-                let span = self.mk_sp(start, self.pos);
-                self.sess.symbol_gallery.insert(sym, span);
-                token::Ident(sym, false)
-            }
-            rustc_lexer::TokenKind::RawIdent => {
-                let sym = nfc_normalize(self.str_from(start + BytePos(2)));
-                let span = self.mk_sp(start, self.pos);
-                self.sess.symbol_gallery.insert(sym, span);
-                if !sym.can_be_raw() {
-                    self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
-                }
-                self.sess.raw_identifier_spans.borrow_mut().push(span);
-                token::Ident(sym, true)
-            }
-            rustc_lexer::TokenKind::UnknownPrefix => {
-                self.report_unknown_prefix(start);
-                let sym = nfc_normalize(self.str_from(start));
-                let span = self.mk_sp(start, self.pos);
-                self.sess.symbol_gallery.insert(sym, span);
-                token::Ident(sym, false)
-            }
-            rustc_lexer::TokenKind::InvalidIdent
-                // Do not recover an identifier with emoji if the codepoint is a confusable
-                // with a recoverable substitution token, like `➖`.
-                if !UNICODE_ARRAY
-                    .iter()
-                    .any(|&(c, _, _)| {
-                        let sym = self.str_from(start);
-                        sym.chars().count() == 1 && c == sym.chars().next().unwrap()
-                    })
-                =>
-            {
-                let sym = nfc_normalize(self.str_from(start));
-                let span = self.mk_sp(start, self.pos);
-                self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
-                token::Ident(sym, false)
-            }
-            rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
-                let suffix_start = start + BytePos(suffix_start);
-                let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
-                let suffix = if suffix_start < self.pos {
-                    let string = self.str_from(suffix_start);
-                    if string == "_" {
-                        self.sess
-                            .span_diagnostic
-                            .struct_span_warn(
-                                self.mk_sp(suffix_start, self.pos),
-                                "underscore literal suffix is not allowed",
-                            )
-                            .warn(
-                                "this was previously accepted by the compiler but is \
-                                 being phased out; it will become a hard error in \
-                                 a future release!",
-                            )
-                            .note(
-                                "see issue #42326 \
-                                 <https://github.com/rust-lang/rust/issues/42326> \
-                                 for more information",
-                            )
-                            .emit();
-                        None
-                    } else {
-                        Some(Symbol::intern(string))
-                    }
-                } else {
-                    None
-                };
-                token::Literal(token::Lit { kind, symbol, suffix })
-            }
-            rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
-                // Include the leading `'` in the real identifier, for macro
-                // expansion purposes. See #12512 for the gory details of why
-                // this is necessary.
-                let lifetime_name = self.str_from(start);
-                if starts_with_number {
-                    self.err_span_(start, self.pos, "lifetimes cannot start with a number");
-                }
-                let ident = Symbol::intern(lifetime_name);
-                token::Lifetime(ident)
-            }
-            rustc_lexer::TokenKind::Semi => token::Semi,
-            rustc_lexer::TokenKind::Comma => token::Comma,
-            rustc_lexer::TokenKind::Dot => token::Dot,
-            rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
-            rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
-            rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
-            rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
-            rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
-            rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
-            rustc_lexer::TokenKind::At => token::At,
-            rustc_lexer::TokenKind::Pound => token::Pound,
-            rustc_lexer::TokenKind::Tilde => token::Tilde,
-            rustc_lexer::TokenKind::Question => token::Question,
-            rustc_lexer::TokenKind::Colon => token::Colon,
-            rustc_lexer::TokenKind::Dollar => token::Dollar,
-            rustc_lexer::TokenKind::Eq => token::Eq,
-            rustc_lexer::TokenKind::Bang => token::Not,
-            rustc_lexer::TokenKind::Lt => token::Lt,
-            rustc_lexer::TokenKind::Gt => token::Gt,
-            rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
-            rustc_lexer::TokenKind::And => token::BinOp(token::And),
-            rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
-            rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
-            rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
-            rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
-            rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
-            rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
-
-            rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
-                let c = self.str_from(start).chars().next().unwrap();
-                let mut err =
-                    self.struct_err_span_char(start, self.pos, "unknown start of token", c);
-                // FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs,
-                // instead of keeping a table in `check_for_substitution`into the token. Ideally,
-                // this should be inside `rustc_lexer`. However, we should first remove compound
-                // tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
-                // as there will be less overall work to do this way.
-                let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
-                if c == '\x00' {
-                    err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
-                }
-                err.emit();
-                token?
-            }
-            rustc_lexer::TokenKind::Eof => token::Eof,
-        })
-    }
-
     fn cook_doc_comment(
         &self,
         content_start: BytePos,