@@ -101,32 +101,45 @@ where
101101 }
102102}
103103
/// One unit of a "mixed" UTF-8 string literal, i.e. a literal that allows
/// both Unicode chars and high bytes.
///
/// The two variants are kept apart because they are appended to the
/// resulting byte string differently: a `Char` contributes its UTF-8
/// encoding, while a `HighByte` contributes exactly one raw byte.
pub enum MixedUnit {
    /// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes)
    /// and Unicode chars (written directly or via `\u` escapes).
    ///
    /// For example, if '¥' appears in a string it is represented here as
    /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte
    /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]`.
    Char(char),

    /// Used for high bytes (`\x80`..`\xff`).
    ///
    /// For example, if `\xa5` appears in a string it is represented here as
    /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant
    /// byte string as the single byte `0xa5`.
    HighByte(u8),
}
109122
110- impl From < u8 > for CStrUnit {
111- fn from ( value : u8 ) -> Self {
112- CStrUnit :: Byte ( value )
123+ impl From < char > for MixedUnit {
124+ fn from ( c : char ) -> Self {
125+ MixedUnit :: Char ( c )
113126 }
114127}
115128
116- impl From < char > for CStrUnit {
117- fn from ( value : char ) -> Self {
118- CStrUnit :: Char ( value )
129+ impl From < u8 > for MixedUnit {
130+ fn from ( n : u8 ) -> Self {
131+ if n . is_ascii ( ) { MixedUnit :: Char ( n as char ) } else { MixedUnit :: HighByte ( n ) }
119132 }
120133}
121134
122135pub fn unescape_c_string < F > ( src : & str , mode : Mode , callback : & mut F )
123136where
124- F : FnMut ( Range < usize > , Result < CStrUnit , EscapeError > ) ,
137+ F : FnMut ( Range < usize > , Result < MixedUnit , EscapeError > ) ,
125138{
126139 match mode {
127140 CStr => {
128141 unescape_non_raw_common ( src, mode, & mut |r, mut result| {
129- if let Ok ( CStrUnit :: Byte ( 0 ) | CStrUnit :: Char ( '\0' ) ) = result {
142+ if let Ok ( MixedUnit :: Char ( '\0' ) ) = result {
130143 result = Err ( EscapeError :: NulInCStr ) ;
131144 }
132145 callback ( r, result)
@@ -137,7 +150,8 @@ where
137150 if let Ok ( '\0' ) = result {
138151 result = Err ( EscapeError :: NulInCStr ) ;
139152 }
140- callback ( r, result. map ( CStrUnit :: Char ) )
153+ // High bytes aren't possible in raw strings.
154+ callback ( r, result. map ( MixedUnit :: Char ) )
141155 } ) ;
142156 }
143157 Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable ! ( ) ,
@@ -217,20 +231,19 @@ impl Mode {
217231 }
218232}
219233
220- fn scan_escape < T : From < u8 > + From < char > > (
234+ fn scan_escape < T : From < char > + From < u8 > > (
221235 chars : & mut Chars < ' _ > ,
222236 mode : Mode ,
223237) -> Result < T , EscapeError > {
224238 // Previous character was '\\', unescape what follows.
225- let res: u8 = match chars. next ( ) . ok_or ( EscapeError :: LoneSlash ) ? {
226- '"' => b'"' ,
227- 'n' => b'\n' ,
228- 'r' => b'\r' ,
229- 't' => b'\t' ,
230- '\\' => b'\\' ,
231- '\'' => b'\'' ,
232- '0' => b'\0' ,
233-
239+ let res: char = match chars. next ( ) . ok_or ( EscapeError :: LoneSlash ) ? {
240+ '"' => '"' ,
241+ 'n' => '\n' ,
242+ 'r' => '\r' ,
243+ 't' => '\t' ,
244+ '\\' => '\\' ,
245+ '\'' => '\'' ,
246+ '0' => '\0' ,
234247 'x' => {
235248 // Parse hexadecimal character code.
236249
@@ -240,15 +253,17 @@ fn scan_escape<T: From<u8> + From<char>>(
240253 let lo = chars. next ( ) . ok_or ( EscapeError :: TooShortHexEscape ) ?;
241254 let lo = lo. to_digit ( 16 ) . ok_or ( EscapeError :: InvalidCharInHexEscape ) ?;
242255
243- let value = hi * 16 + lo;
244-
245- if mode. ascii_escapes_should_be_ascii ( ) && !is_ascii ( value) {
246- return Err ( EscapeError :: OutOfRangeHexEscape ) ;
247- }
256+ let value = ( hi * 16 + lo) as u8 ;
248257
249- value as u8
258+ return if mode. ascii_escapes_should_be_ascii ( ) && !value. is_ascii ( ) {
259+ Err ( EscapeError :: OutOfRangeHexEscape )
260+ } else {
261+ // This may be a high byte, but that will only happen if `T` is
262+ // `MixedUnit`, because of the `ascii_escapes_should_be_ascii`
263+ // check above.
264+ Ok ( T :: from ( value as u8 ) )
265+ } ;
250266 }
251-
252267 'u' => return scan_unicode ( chars, mode. is_unicode_escape_disallowed ( ) ) . map ( T :: from) ,
253268 _ => return Err ( EscapeError :: InvalidEscape ) ,
254269 } ;
@@ -336,7 +351,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
336351
337352/// Takes a contents of a string literal (without quotes) and produces a
338353/// sequence of escaped characters or errors.
339- fn unescape_non_raw_common < F , T : From < u8 > + From < char > > ( src : & str , mode : Mode , callback : & mut F )
354+ fn unescape_non_raw_common < F , T : From < char > + From < u8 > > ( src : & str , mode : Mode , callback : & mut F )
340355where
341356 F : FnMut ( Range < usize > , Result < T , EscapeError > ) ,
342357{
@@ -430,7 +445,3 @@ pub fn byte_from_char(c: char) -> u8 {
430445 debug_assert ! ( res <= u8 :: MAX as u32 , "guaranteed because of ByteStr" ) ;
431446 res as u8
432447}
433-
434- fn is_ascii ( x : u32 ) -> bool {
435- x <= 0x7F
436- }
0 commit comments