@@ -9,6 +9,9 @@ use Mode::*;
99#[ cfg( test) ]
1010mod tests;
1111
12+ // njn: need to add tests in tests/ui/mixed-utf8-literals/; see
13+ // tests/ui/try-block/ for an example to follow
14+
1215/// Errors and warnings that can occur during string unescaping. They mostly
1316/// relate to malformed escape sequences, but there are a few that are about
1417/// other problems.
@@ -85,7 +88,7 @@ impl EscapeError {
8588///
8689/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
8790/// the callback will be called exactly once.
88- pub fn unescape_unicode < F > ( src : & str , mode : Mode , callback : & mut F )
91+ pub fn unescape_unicode < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
8992where
9093 F : FnMut ( Range < usize > , Result < char , EscapeError > ) ,
9194{
@@ -94,16 +97,17 @@ where
9497 let mut chars = src. chars ( ) ;
9598 let res = unescape_char_or_byte ( & mut chars, mode) ;
9699 callback ( 0 ..( src. len ( ) - chars. as_str ( ) . len ( ) ) , res) ;
100+ Rfc3349 :: Unused // rfc3349 is not relevant for char or byte literals
97101 }
98- Str | ByteStr => unescape_non_raw_common ( src, mode, callback) ,
102+ Str => unescape_non_raw_common ( src, mode, callback) ,
99103 RawStr | RawByteStr => check_raw_common ( src, mode, callback) ,
100104 RawCStr => check_raw_common ( src, mode, & mut |r, mut result| {
101105 if let Ok ( '\0' ) = result {
102106 result = Err ( EscapeError :: NulInCStr ) ;
103107 }
104108 callback ( r, result)
105109 } ) ,
106- CStr => unreachable ! ( ) ,
110+ ByteStr | CStr => unreachable ! ( ) ,
107111 }
108112}
109113
@@ -142,18 +146,19 @@ impl From<u8> for MixedUnit {
142146/// a sequence of escaped characters or errors.
143147///
144148/// Values are returned by invoking `callback`.
145- pub fn unescape_mixed < F > ( src : & str , mode : Mode , callback : & mut F )
149+ pub fn unescape_mixed < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
146150where
147151 F : FnMut ( Range < usize > , Result < MixedUnit , EscapeError > ) ,
148152{
149153 match mode {
154+ ByteStr => unescape_non_raw_common ( src, mode, callback) ,
150155 CStr => unescape_non_raw_common ( src, mode, & mut |r, mut result| {
151156 if let Ok ( MixedUnit :: Char ( '\0' ) ) = result {
152157 result = Err ( EscapeError :: NulInCStr ) ;
153158 }
154159 callback ( r, result)
155160 } ) ,
156- Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable ! ( ) ,
161+ Char | Byte | Str | RawStr | RawByteStr | RawCStr => unreachable ! ( ) ,
157162 }
158163}
159164
@@ -169,6 +174,15 @@ pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
169174 unescape_char_or_byte ( & mut src. chars ( ) , Byte ) . map ( byte_from_char)
170175}
171176
177+ /// Indicates whether RFC 3349 (mixed-utf8-literals) was required for the
178+ /// literal to be valid. Once RFC 3349 is stabilized, this type can be removed.
179+ #[ derive( Debug , PartialEq ) ]
180+ #[ must_use]
181+ pub enum Rfc3349 {
182+ Used ,
183+ Unused ,
184+ }
185+
172186/// What kind of literal do we parse.
173187#[ derive( Debug , Clone , Copy , PartialEq ) ]
174188pub enum Mode {
@@ -205,17 +219,25 @@ impl Mode {
205219
206220 /// Are unicode (non-ASCII) chars allowed?
207221 #[ inline]
208- fn allow_unicode_chars ( self ) -> bool {
222+ fn allow_unicode_chars ( self , rfc3349 : & mut Rfc3349 ) -> bool {
209223 match self {
210- Byte | ByteStr | RawByteStr => false ,
224+ Byte => false ,
225+ ByteStr | RawByteStr => {
226+ * rfc3349 = Rfc3349 :: Used ;
227+ true
228+ }
211229 Char | Str | RawStr | CStr | RawCStr => true ,
212230 }
213231 }
214232
215233 /// Are unicode escapes (`\u`) allowed?
216- fn allow_unicode_escapes ( self ) -> bool {
234+ fn allow_unicode_escapes ( self , rfc3349 : & mut Rfc3349 ) -> bool {
217235 match self {
218- Byte | ByteStr => false ,
236+ Byte => false ,
237+ ByteStr => {
238+ * rfc3349 = Rfc3349 :: Used ;
239+ true
240+ }
219241 Char | Str | CStr => true ,
220242 RawByteStr | RawStr | RawCStr => unreachable ! ( ) ,
221243 }
@@ -233,6 +255,7 @@ impl Mode {
233255fn scan_escape < T : From < char > + From < u8 > > (
234256 chars : & mut Chars < ' _ > ,
235257 mode : Mode ,
258+ rfc3349 : & mut Rfc3349 ,
236259) -> Result < T , EscapeError > {
237260 // Previous character was '\\', unescape what follows.
238261 let res: char = match chars. next ( ) . ok_or ( EscapeError :: LoneSlash ) ? {
@@ -262,13 +285,17 @@ fn scan_escape<T: From<char> + From<u8>>(
262285 Ok ( T :: from ( value as u8 ) )
263286 } ;
264287 }
265- 'u' => return scan_unicode ( chars, mode. allow_unicode_escapes ( ) ) . map ( T :: from) ,
288+ 'u' => return scan_unicode ( chars, mode, rfc3349 ) . map ( T :: from) ,
266289 _ => return Err ( EscapeError :: InvalidEscape ) ,
267290 } ;
268291 Ok ( T :: from ( res) )
269292}
270293
271- fn scan_unicode ( chars : & mut Chars < ' _ > , allow_unicode_escapes : bool ) -> Result < char , EscapeError > {
294+ fn scan_unicode (
295+ chars : & mut Chars < ' _ > ,
296+ mode : Mode ,
297+ rfc3349 : & mut Rfc3349 ,
298+ ) -> Result < char , EscapeError > {
272299 // We've parsed '\u', now we have to parse '{..}'.
273300
274301 if chars. next ( ) != Some ( '{' ) {
@@ -296,7 +323,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
296323
297324 // Incorrect syntax has higher priority for error reporting
298325 // than a disallowed value for a literal.
299- if !allow_unicode_escapes {
326+ if !mode . allow_unicode_escapes ( rfc3349 ) {
300327 return Err ( EscapeError :: UnicodeEscapeInByte ) ;
301328 }
302329
@@ -322,18 +349,28 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
322349}
323350
324351#[ inline]
325- fn ascii_check ( c : char , allow_unicode_chars : bool ) -> Result < char , EscapeError > {
326- if allow_unicode_chars || c. is_ascii ( ) { Ok ( c) } else { Err ( EscapeError :: NonAsciiCharInByte ) }
352+ fn ascii_check ( c : char , mode : Mode , rfc3349 : & mut Rfc3349 ) -> Result < char , EscapeError > {
353+ // We must check `is_ascii` first, to avoid setting `rfc3349` unnecessarily.
354+ if c. is_ascii ( ) || mode. allow_unicode_chars ( rfc3349) {
355+ Ok ( c)
356+ } else {
357+ Err ( EscapeError :: NonAsciiCharInByte )
358+ }
327359}
328360
329361fn unescape_char_or_byte ( chars : & mut Chars < ' _ > , mode : Mode ) -> Result < char , EscapeError > {
330362 let c = chars. next ( ) . ok_or ( EscapeError :: ZeroChars ) ?;
363+ let mut rfc3349 = Rfc3349 :: Unused ;
331364 let res = match c {
332- '\\' => scan_escape ( chars, mode) ,
365+ '\\' => scan_escape ( chars, mode, & mut rfc3349 ) ,
333366 '\n' | '\t' | '\'' => Err ( EscapeError :: EscapeOnlyChar ) ,
334367 '\r' => Err ( EscapeError :: BareCarriageReturn ) ,
335- _ => ascii_check ( c, mode. allow_unicode_chars ( ) ) ,
368+ _ => ascii_check ( c, mode, & mut rfc3349 ) ,
336369 } ?;
370+
371+ // rfc3349 is not relevant for char or byte literals.
372+ assert_eq ! ( rfc3349, Rfc3349 :: Unused ) ;
373+
337374 if chars. next ( ) . is_some ( ) {
338375 return Err ( EscapeError :: MoreThanOneChar ) ;
339376 }
@@ -342,12 +379,16 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
342379
343380/// Takes the contents of a string literal (without quotes) and produces a
344381/// sequence of escaped characters or errors.
345- fn unescape_non_raw_common < F , T : From < char > + From < u8 > > ( src : & str , mode : Mode , callback : & mut F )
382+ fn unescape_non_raw_common < F , T : From < char > + From < u8 > > (
383+ src : & str ,
384+ mode : Mode ,
385+ callback : & mut F ,
386+ ) -> Rfc3349
346387where
347388 F : FnMut ( Range < usize > , Result < T , EscapeError > ) ,
348389{
349390 let mut chars = src. chars ( ) ;
350- let allow_unicode_chars = mode . allow_unicode_chars ( ) ; // get this outside the loop
391+ let mut rfc3349 = Rfc3349 :: Unused ;
351392
352393 // The `start` and `end` computation here is complicated because
353394 // `skip_ascii_whitespace` makes us skip over chars without counting
@@ -367,16 +408,17 @@ where
367408 } ) ;
368409 continue ;
369410 }
370- _ => scan_escape :: < T > ( & mut chars, mode) ,
411+ _ => scan_escape :: < T > ( & mut chars, mode, & mut rfc3349 ) ,
371412 }
372413 }
373414 '"' => Err ( EscapeError :: EscapeOnlyChar ) ,
374415 '\r' => Err ( EscapeError :: BareCarriageReturn ) ,
375- _ => ascii_check ( c, allow_unicode_chars ) . map ( T :: from) ,
416+ _ => ascii_check ( c, mode , & mut rfc3349 ) . map ( T :: from) ,
376417 } ;
377418 let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
378419 callback ( start..end, res) ;
379420 }
421+ rfc3349
380422}
381423
382424fn skip_ascii_whitespace < F > ( chars : & mut Chars < ' _ > , start : usize , callback : & mut F )
@@ -409,12 +451,12 @@ where
409451/// sequence of characters or errors.
410452/// NOTE: Raw strings do not perform any explicit character escaping; here we
411453/// only produce errors on bare CR.
412- fn check_raw_common < F > ( src : & str , mode : Mode , callback : & mut F )
454+ fn check_raw_common < F > ( src : & str , mode : Mode , callback : & mut F ) -> Rfc3349
413455where
414456 F : FnMut ( Range < usize > , Result < char , EscapeError > ) ,
415457{
416458 let mut chars = src. chars ( ) ;
417- let allow_unicode_chars = mode . allow_unicode_chars ( ) ; // get this outside the loop
459+ let mut rfc3349 = Rfc3349 :: Unused ;
418460
419461 // The `start` and `end` computation here matches the one in
420462 // `unescape_non_raw_common` for consistency, even though this function
@@ -423,16 +465,17 @@ where
423465 let start = src. len ( ) - chars. as_str ( ) . len ( ) - c. len_utf8 ( ) ;
424466 let res = match c {
425467 '\r' => Err ( EscapeError :: BareCarriageReturnInRawString ) ,
426- _ => ascii_check ( c, allow_unicode_chars ) ,
468+ _ => ascii_check ( c, mode , & mut rfc3349 ) ,
427469 } ;
428470 let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
429471 callback ( start..end, res) ;
430472 }
473+ rfc3349
431474}
432475
433476#[ inline]
434- pub fn byte_from_char ( c : char ) -> u8 {
477+ pub ( crate ) fn byte_from_char ( c : char ) -> u8 {
435478 let res = c as u32 ;
436- debug_assert ! ( res <= u8 :: MAX as u32 , "guaranteed because of ByteStr " ) ;
479+ debug_assert ! ( res <= u8 :: MAX as u32 , "guaranteed because of Byte " ) ;
437480 res as u8
438481}
0 commit comments