@@ -18,7 +18,6 @@ use log::debug;
1818
1919use rustc_data_structures:: fx:: FxHashSet ;
2020use std:: borrow:: Cow ;
21- use std:: iter;
2221use std:: path:: { Path , PathBuf } ;
2322use std:: str;
2423
@@ -34,6 +33,11 @@ pub mod diagnostics;
3433
3534pub mod classify;
3635
36+ pub ( crate ) mod unescape;
37+ use unescape:: { unescape_str, unescape_char, unescape_byte_str, unescape_byte} ;
38+
39+ pub ( crate ) mod unescape_error_reporting;
40+
3741/// Info about a parsing session.
3842pub struct ParseSess {
3943 pub span_diagnostic : Handler ,
@@ -307,133 +311,6 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser<'_> {
307311 Parser :: new ( sess, stream, None , true , false )
308312}
309313
310- /// Parses a string representing a character literal into its final form.
311- /// Rather than just accepting/rejecting a given literal, unescapes it as
312- /// well. Can take any slice prefixed by a character escape. Returns the
313- /// character and the number of characters consumed. An invalid `\u{...}` value is reported via `diag` (if given) and replaced by U+FFFD.
314- fn char_lit ( lit : & str , diag : Option < ( Span , & Handler ) > ) -> ( char , isize ) {
315- use std:: char;
316-
317- // Handle non-escaped chars first.
318- if lit. as_bytes ( ) [ 0 ] != b'\\' {
319- // If the first byte isn't '\\' it might be part of a multi-byte char, so
320- // get the char with chars().
321- let c = lit. chars ( ) . next ( ) . unwrap ( ) ;
322- return ( c, 1 ) ;
323- }
324-
325- // Handle escaped chars; the byte after the backslash selects the escape kind.
326- match lit. as_bytes ( ) [ 1 ] as char {
327- '"' => ( '"' , 2 ) ,
328- 'n' => ( '\n' , 2 ) ,
329- 'r' => ( '\r' , 2 ) ,
330- 't' => ( '\t' , 2 ) ,
331- '\\' => ( '\\' , 2 ) ,
332- '\'' => ( '\'' , 2 ) ,
333- '0' => ( '\0' , 2 ) ,
334- 'x' => {
335- let v = u32:: from_str_radix ( & lit[ 2 ..4 ] , 16 ) . unwrap ( ) ;
336- let c = char:: from_u32 ( v) . unwrap ( ) ;
337- ( c, 4 )
338- }
339- 'u' => {
340- assert_eq ! ( lit. as_bytes( ) [ 2 ] , b'{' ) ;
341- let idx = lit. find ( '}' ) . unwrap ( ) ;
342-
343- // All digits and '_' are ASCII, so treat each byte as a char.
344- let mut v: u32 = 0 ;
345- for c in lit[ 3 ..idx] . bytes ( ) {
346- let c = char:: from ( c) ;
347- if c != '_' {
348- let x = c. to_digit ( 16 ) . unwrap ( ) ;
349- v = v. checked_mul ( 16 ) . unwrap ( ) . checked_add ( x) . unwrap ( ) ;
350- }
351- }
352- let c = char:: from_u32 ( v) . unwrap_or_else ( || {
353- if let Some ( ( span, diag) ) = diag {
354- let mut diag = diag. struct_span_err ( span, "invalid unicode character escape" ) ;
355- if v > 0x10FFFF {
356- diag. help ( "unicode escape must be at most 10FFFF" ) . emit ( ) ;
357- } else {
358- diag. help ( "unicode escape must not be a surrogate" ) . emit ( ) ;
359- }
360- }
361- '\u{FFFD}'
362- } ) ;
363- ( c, ( idx + 1 ) as isize )
364- }
365- _ => panic ! ( "lexer should have rejected a bad character escape {}" , lit)
366- }
367- }
368-
369- /// Parses a string representing a string literal into its final form. Does unescaping, converts CRLF into LF, and drops a backslash-newline together with the whitespace that follows it.
370- fn str_lit ( lit : & str , diag : Option < ( Span , & Handler ) > ) -> String {
371- debug ! ( "str_lit: given {}" , lit. escape_default( ) ) ;
372- let mut res = String :: with_capacity ( lit. len ( ) ) ;
373-
374- let error = |i| format ! ( "lexer should have rejected {} at {}" , lit, i) ;
375-
376- /// Advances `it` past any run of spaces, newlines, carriage returns, and tabs.
377- fn eat < ' a > ( it : & mut iter:: Peekable < str:: CharIndices < ' a > > ) {
378- loop {
379- match it. peek ( ) . map ( |x| x. 1 ) {
380- Some ( ' ' ) | Some ( '\n' ) | Some ( '\r' ) | Some ( '\t' ) => {
381- it. next ( ) ;
382- } ,
383- _ => { break ; }
384- }
385- }
386- }
387-
388- let mut chars = lit. char_indices ( ) . peekable ( ) ;
389- while let Some ( ( i, c) ) = chars. next ( ) {
390- match c {
391- '\\' => {
392- let ch = chars. peek ( ) . unwrap_or_else ( || {
393- panic ! ( "{}" , error( i) )
394- } ) . 1 ;
395-
396- if ch == '\n' {
397- eat ( & mut chars) ;
398- } else if ch == '\r' {
399- chars. next ( ) ;
400- let ch = chars. peek ( ) . unwrap_or_else ( || {
401- panic ! ( "{}" , error( i) )
402- } ) . 1 ;
403-
404- if ch != '\n' {
405- panic ! ( "lexer accepted bare CR" ) ;
406- }
407- eat ( & mut chars) ;
408- } else {
409- // Otherwise, a normal escape: decode it with `char_lit`.
410- let ( c, n) = char_lit ( & lit[ i..] , diag) ;
411- for _ in 0 ..n - 1 { // `chars.next()` already consumed the leading `\`, so skip n - 1 more
412- chars. next ( ) ;
413- }
414- res. push ( c) ;
415- }
416- } ,
417- '\r' => {
418- let ch = chars. peek ( ) . unwrap_or_else ( || {
419- panic ! ( "{}" , error( i) )
420- } ) . 1 ;
421-
422- if ch != '\n' {
423- panic ! ( "lexer accepted bare CR" ) ;
424- }
425- chars. next ( ) ;
426- res. push ( '\n' ) ;
427- }
428- c => res. push ( c) ,
429- }
430- }
431-
432- res. shrink_to_fit ( ) ; // probably not going to do anything, unless there was an escape.
433- debug ! ( "parse_str_lit: returning {}" , res) ;
434- res
435- }
436-
437314/// Parses a string representing a raw string literal into its final form. The
438315/// only operation this does is convert embedded CRLF into a single LF.
439316fn raw_str_lit ( lit : & str ) -> String {
@@ -476,9 +353,21 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
476353 use ast:: LitKind ;
477354
478355 match lit {
479- token:: Byte ( i) => ( true , Some ( LitKind :: Byte ( byte_lit ( & i. as_str ( ) ) . 0 ) ) ) ,
480- token:: Char ( i) => ( true , Some ( LitKind :: Char ( char_lit ( & i. as_str ( ) , diag) . 0 ) ) ) ,
481- token:: Err ( i) => ( true , Some ( LitKind :: Err ( i) ) ) ,
356+ token:: Byte ( i) => {
357+ let lit_kind = match unescape_byte ( & i. as_str ( ) ) {
358+ Ok ( c) => LitKind :: Byte ( c) ,
359+ Err ( _) => LitKind :: Err ( i) ,
360+ } ;
361+ ( true , Some ( lit_kind) )
362+ } ,
363+ token:: Char ( i) => {
364+ let lit_kind = match unescape_char ( & i. as_str ( ) ) {
365+ Ok ( c) => LitKind :: Char ( c) ,
366+ Err ( _) => LitKind :: Err ( i) ,
367+ } ;
368+ ( true , Some ( lit_kind) )
369+ } ,
370+ token:: Err ( i) => ( true , Some ( LitKind :: Err ( i) ) ) ,
482371
483372 // There are some valid suffixes for integer and float literals,
484373 // so all the handling is done internally.
@@ -490,10 +379,22 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
490379 // reuse the symbol from the Token. Otherwise, we must generate a
491380 // new symbol because the string in the LitKind is different to the
492381 // string in the Token.
382+ let mut has_error = false ;
493383 let s = & sym. as_str ( ) ;
494384 if s. as_bytes ( ) . iter ( ) . any ( |& c| c == b'\\' || c == b'\r' ) {
495- sym = Symbol :: intern ( & str_lit ( s, diag) ) ;
385+ let mut buf = String :: with_capacity ( s. len ( ) ) ;
386+ unescape_str ( s, & mut |_, unescaped_char| {
387+ match unescaped_char {
388+ Ok ( c) => buf. push ( c) ,
389+ Err ( _) => has_error = true ,
390+ }
391+ } ) ;
392+ if has_error {
393+ return ( true , Some ( LitKind :: Err ( sym) ) ) ;
394+ }
395+ sym = Symbol :: intern ( & buf)
496396 }
397+
497398 ( true , Some ( LitKind :: Str ( sym, ast:: StrStyle :: Cooked ) ) )
498399 }
499400 token:: StrRaw ( mut sym, n) => {
@@ -505,7 +406,20 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
505406 ( true , Some ( LitKind :: Str ( sym, ast:: StrStyle :: Raw ( n) ) ) )
506407 }
507408 token:: ByteStr ( i) => {
508- ( true , Some ( LitKind :: ByteStr ( byte_str_lit ( & i. as_str ( ) ) ) ) )
409+ let s = & i. as_str ( ) ;
410+ let mut buf = Vec :: with_capacity ( s. len ( ) ) ;
411+ let mut has_error = false ;
412+ unescape_byte_str ( s, & mut |_, unescaped_byte| {
413+ match unescaped_byte {
414+ Ok ( c) => buf. push ( c) ,
415+ Err ( _) => has_error = true ,
416+ }
417+ } ) ;
418+ if has_error {
419+ return ( true , Some ( LitKind :: Err ( i) ) ) ;
420+ }
421+ buf. shrink_to_fit ( ) ;
422+ ( true , Some ( LitKind :: ByteStr ( Lrc :: new ( buf) ) ) )
509423 }
510424 token:: ByteStrRaw ( i, _) => {
511425 ( true , Some ( LitKind :: ByteStr ( Lrc :: new ( i. to_string ( ) . into_bytes ( ) ) ) ) )
@@ -560,95 +474,6 @@ fn float_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>)
560474 filtered_float_lit ( Symbol :: intern ( s) , suffix, diag)
561475}
562476
563- /// Parses a string representing a byte literal into its final form. Similar to `char_lit`: returns the byte and the number of bytes of `lit` consumed.
564- fn byte_lit ( lit : & str ) -> ( u8 , usize ) {
565- let err = |i| format ! ( "lexer accepted invalid byte literal {} step {}" , lit, i) ;
566-
567- if lit. len ( ) == 1 {
568- ( lit. as_bytes ( ) [ 0 ] , 1 )
569- } else {
570- assert_eq ! ( lit. as_bytes( ) [ 0 ] , b'\\' , "{}" , err( 0 ) ) ;
571- let b = match lit. as_bytes ( ) [ 1 ] {
572- b'"' => b'"' ,
573- b'n' => b'\n' ,
574- b'r' => b'\r' ,
575- b't' => b'\t' ,
576- b'\\' => b'\\' ,
577- b'\'' => b'\'' ,
578- b'0' => b'\0' ,
579- _ => {
580- match u64:: from_str_radix ( & lit[ 2 ..4 ] , 16 ) . ok ( ) {
581- Some ( c) =>
582- if c > 0xFF {
583- panic ! ( err( 2 ) )
584- } else {
585- return ( c as u8 , 4 )
586- } ,
587- None => panic ! ( err( 3 ) )
588- }
589- }
590- } ;
591- ( b, 2 )
592- }
593- }
594-
595- fn byte_str_lit ( lit : & str ) -> Lrc < Vec < u8 > > {
596- let mut res = Vec :: with_capacity ( lit. len ( ) ) ;
597-
598- let error = |i| panic ! ( "lexer should have rejected {} at {}" , lit, i) ;
599-
600- /// Advances `it` past any run of spaces, newlines, carriage returns, and tabs.
601- fn eat < I : Iterator < Item =( usize , u8 ) > > ( it : & mut iter:: Peekable < I > ) {
602- loop {
603- match it. peek ( ) . map ( |x| x. 1 ) {
604- Some ( b' ' ) | Some ( b'\n' ) | Some ( b'\r' ) | Some ( b'\t' ) => {
605- it. next ( ) ;
606- } ,
607- _ => { break ; }
608- }
609- }
610- }
611-
612- // Byte string literals *must* be ASCII in source, but escapes may produce non-ASCII bytes.
613- let mut chars = lit. bytes ( ) . enumerate ( ) . peekable ( ) ;
614- loop {
615- match chars. next ( ) {
616- Some ( ( i, b'\\' ) ) => {
617- match chars. peek ( ) . unwrap_or_else ( || error ( i) ) . 1 {
618- b'\n' => eat ( & mut chars) ,
619- b'\r' => {
620- chars. next ( ) ;
621- if chars. peek ( ) . unwrap_or_else ( || error ( i) ) . 1 != b'\n' {
622- panic ! ( "lexer accepted bare CR" ) ;
623- }
624- eat ( & mut chars) ;
625- }
626- _ => {
627- // Otherwise, a normal escape: decode it with `byte_lit`.
628- let ( c, n) = byte_lit ( & lit[ i..] ) ;
629- // `chars.next()` already consumed the leading `\`, so skip only n - 1 more bytes
630- for _ in 0 ..n - 1 {
631- chars. next ( ) ;
632- }
633- res. push ( c) ;
634- }
635- }
636- } ,
637- Some ( ( i, b'\r' ) ) => {
638- if chars. peek ( ) . unwrap_or_else ( || error ( i) ) . 1 != b'\n' {
639- panic ! ( "lexer accepted bare CR" ) ;
640- }
641- chars. next ( ) ;
642- res. push ( b'\n' ) ;
643- }
644- Some ( ( _, c) ) => res. push ( c) ,
645- None => break ,
646- }
647- }
648-
649- Lrc :: new ( res)
650- }
651-
652477fn integer_lit ( s : & str , suffix : Option < Symbol > , diag : Option < ( Span , & Handler ) > )
653478 -> Option < ast:: LitKind > {
654479 // s can only be ascii, byte indexing is fine
0 commit comments