@@ -9,6 +9,9 @@ use Mode::*;
99#[ cfg( test) ]
1010mod tests;
1111
12+ // njn: need to add tests in tests/ui/mixed-utf8-literals/; see
13+ // tests/ui/try-block/ for an example to follow
14+
1215/// Errors and warnings that can occur during string unescaping. They mostly
1316/// relate to malformed escape sequences, but there are a few that are about
1417/// other problems.
@@ -80,12 +83,12 @@ impl EscapeError {
8083 }
8184}
8285
83- /// Takes a contents of a literal (without quotes) and produces a sequence of
84- /// escaped characters or errors.
86+ /// Takes the contents of a non-mixed-utf8 literal (without quotes) and produces
87+ /// a sequence of escaped characters or errors.
8588///
8689/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
8790/// the callback will be called exactly once.
88- pub fn unescape_literal < F > ( src : & str , mode : Mode , callback : & mut F )
91+ pub fn unescape_non_mixed < F > ( src : & str , mode : Mode , callback : & mut F )
8992where
9093 F : FnMut ( Range < usize > , Result < char , EscapeError > ) ,
9194{
9598 let res = unescape_char_or_byte ( & mut chars, mode) ;
9699 callback ( 0 ..( src. len ( ) - chars. as_str ( ) . len ( ) ) , res) ;
97100 }
98- Str | ByteStr => unescape_non_raw_common ( src, mode, callback) ,
99- RawStr | RawByteStr => check_raw_common ( src, mode, callback) ,
100- CStr | RawCStr => unreachable ! ( ) ,
101+ Str => unescape_non_raw_common ( src, mode, callback) ,
102+ RawStr => check_raw_common ( src, mode, callback) ,
103+ RawByteStr { .. } => check_raw_common ( src, mode, & mut |r, result| callback ( r, result) ) ,
104+ RawCStr => {
105+ check_raw_common ( src, mode, & mut |r, mut result| {
106+ if let Ok ( '\0' ) = result {
107+ result = Err ( EscapeError :: NulInCStr ) ;
108+ }
109+ callback ( r, result)
110+ } ) ;
111+ }
112+ ByteStr { .. } | CStr => unreachable ! ( ) ,
101113 }
102114}
103115
@@ -132,11 +144,16 @@ impl From<u8> for MixedUnit {
132144 }
133145}
134146
135- pub fn unescape_c_string < F > ( src : & str , mode : Mode , callback : & mut F )
147+ /// Takes the contents of a mixed-utf8 literal (without quotes) and produces
148+ /// a sequence of escaped characters or errors.
149+ ///
150+ /// Values are returned by invoking `callback`.
151+ pub fn unescape_mixed < F > ( src : & str , mode : Mode , callback : & mut F )
136152where
137153 F : FnMut ( Range < usize > , Result < MixedUnit , EscapeError > ) ,
138154{
139155 match mode {
156+ ByteStr { .. } => unescape_non_raw_common ( src, mode, & mut |r, result| callback ( r, result) ) ,
140157 CStr => {
141158 unescape_non_raw_common ( src, mode, & mut |r, mut result| {
142159 if let Ok ( MixedUnit :: Char ( '\0' ) ) = result {
@@ -145,16 +162,7 @@ where
145162 callback ( r, result)
146163 } ) ;
147164 }
148- RawCStr => {
149- check_raw_common ( src, mode, & mut |r, mut result| {
150- if let Ok ( '\0' ) = result {
151- result = Err ( EscapeError :: NulInCStr ) ;
152- }
153- // High bytes aren't possible in raw strings.
154- callback ( r, result. map ( MixedUnit :: Char ) )
155- } ) ;
156- }
157- Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable ! ( ) ,
165+ Char | Byte | Str | RawStr | RawByteStr { .. } | RawCStr => unreachable ! ( ) ,
158166 }
159167}
160168
@@ -180,8 +188,8 @@ pub enum Mode {
180188 Str ,
181189 RawStr ,
182190
183- ByteStr ,
184- RawByteStr ,
191+ ByteStr { rfc3349 : bool } ,
192+ RawByteStr { rfc3349 : bool } ,
185193
186194 CStr ,
187195 RawCStr ,
@@ -190,7 +198,7 @@ pub enum Mode {
190198impl Mode {
191199 pub fn in_double_quotes ( self ) -> bool {
192200 match self {
193- Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true ,
201+ Str | RawStr | ByteStr { .. } | RawByteStr { .. } | CStr | RawCStr => true ,
194202 Char | Byte => false ,
195203 }
196204 }
@@ -199,33 +207,39 @@ impl Mode {
199207 fn allow_high_bytes ( self ) -> bool {
200208 match self {
201209 Char | Str => false ,
202- Byte | ByteStr | CStr => true ,
203- RawStr | RawByteStr | RawCStr => unreachable ! ( ) ,
210+ Byte | ByteStr { .. } | CStr => true ,
211+ RawStr | RawByteStr { .. } | RawCStr => unreachable ! ( ) ,
204212 }
205213 }
206214
207215 /// Are unicode (non-ASCII) chars allowed?
208216 #[ inline]
209217 fn allow_unicode_chars ( self ) -> bool {
210218 match self {
211- Byte | ByteStr | RawByteStr => false ,
212- Char | Str | RawStr | CStr | RawCStr => true ,
219+ Byte | ByteStr { rfc3349 : false } | RawByteStr { rfc3349 : false } => false ,
220+ Char
221+ | Str
222+ | RawStr
223+ | ByteStr { rfc3349 : true }
224+ | RawByteStr { rfc3349 : true }
225+ | CStr
226+ | RawCStr => true ,
213227 }
214228 }
215229
216230 /// Are unicode escapes (`\u`) allowed?
217231 fn allow_unicode_escapes ( self ) -> bool {
218232 match self {
219- Byte | ByteStr => false ,
220- Char | Str | CStr => true ,
221- RawByteStr | RawStr | RawCStr => unreachable ! ( ) ,
233+ Byte | ByteStr { rfc3349 : false } => false ,
234+ Char | Str | ByteStr { rfc3349 : true } | CStr => true ,
235+ RawByteStr { .. } | RawStr | RawCStr => unreachable ! ( ) ,
222236 }
223237 }
224238
225239 pub fn prefix_noraw ( self ) -> & ' static str {
226240 match self {
227241 Char | Str | RawStr => "" ,
228- Byte | ByteStr | RawByteStr => "b" ,
242+ Byte | ByteStr { .. } | RawByteStr { .. } => "b" ,
229243 CStr | RawCStr => "c" ,
230244 }
231245 }
@@ -263,12 +277,14 @@ fn scan_escape<T: From<char> + From<u8>>(
263277 Ok ( T :: from ( value as u8 ) )
264278 } ;
265279 }
280+ // njn: gate: is it a ByteStr?
266281 'u' => return scan_unicode ( chars, mode. allow_unicode_escapes ( ) ) . map ( T :: from) ,
267282 _ => return Err ( EscapeError :: InvalidEscape ) ,
268283 } ;
269284 Ok ( T :: from ( res) )
270285}
271286
287+ // njn: change arg to mode in precursor?
272288fn scan_unicode ( chars : & mut Chars < ' _ > , allow_unicode_escapes : bool ) -> Result < char , EscapeError > {
273289 // We've parsed '\u', now we have to parse '{..}'.
274290
@@ -333,6 +349,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
333349 '\\' => scan_escape ( chars, mode) ,
334350 '\n' | '\t' | '\'' => Err ( EscapeError :: EscapeOnlyChar ) ,
335351 '\r' => Err ( EscapeError :: BareCarriageReturn ) ,
352+ // njn: this is the only ascii_check that will remain
336353 _ => ascii_check ( c, mode. allow_unicode_chars ( ) ) ,
337354 } ?;
338355 if chars. next ( ) . is_some ( ) {
@@ -373,6 +390,10 @@ where
373390 }
374391 '"' => Err ( EscapeError :: EscapeOnlyChar ) ,
375392 '\r' => Err ( EscapeError :: BareCarriageReturn ) ,
393+
394+ // njn: gate, similar to check_raw_common, check:
395+ // - is it a ByteStr AND does it contain a unicode char
396+
376397 _ => ascii_check ( c, allow_unicode_chars) . map ( T :: from) ,
377398 } ;
378399 let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
@@ -424,6 +445,15 @@ where
424445 let start = src. len ( ) - chars. as_str ( ) . len ( ) - c. len_utf8 ( ) ;
425446 let res = match c {
426447 '\r' => Err ( EscapeError :: BareCarriageReturnInRawString ) ,
448+
449+ // njn: gate: need to somehow return an indication of whether
450+ // rfc3349 unicode char allowance was required for this literal,
451+ // i.e. check
452+ // - is it a RawByteStr AND does it contain a unicode char
453+ //
454+ // njn: but the ascii_check itself isn't necessary
455+ // - or make it return three values? ok, ok-with-3349, bad?
456+
427457 _ => ascii_check ( c, allow_unicode_chars) ,
428458 } ;
429459 let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
@@ -432,8 +462,8 @@ where
432462}
433463
434464#[ inline]
435- pub fn byte_from_char ( c : char ) -> u8 {
465+ pub ( crate ) fn byte_from_char ( c : char ) -> u8 {
436466 let res = c as u32 ;
437- debug_assert ! ( res <= u8 :: MAX as u32 , "guaranteed because of ByteStr " ) ;
467+ debug_assert ! ( res <= u8 :: MAX as u32 , "guaranteed because of Byte " ) ;
438468 res as u8
439469}
0 commit comments