11#![ unstable( issue = "none" , feature = "windows_stdio" ) ]
22
3- use core:: str:: utf8_char_width;
4-
53use super :: api:: { self , WinError } ;
64use crate :: mem:: MaybeUninit ;
75use crate :: os:: windows:: io:: { FromRawHandle , IntoRawHandle } ;
86use crate :: sys:: handle:: Handle ;
97use crate :: sys:: { c, cvt} ;
10- use crate :: { cmp, io, ptr, str } ;
8+ use crate :: { cmp, io, ptr} ;
119
1210#[ cfg( test) ]
1311mod tests;
@@ -19,13 +17,9 @@ pub struct Stdin {
1917 incomplete_utf8 : IncompleteUtf8 ,
2018}
2119
22- pub struct Stdout {
23- incomplete_utf8 : IncompleteUtf8 ,
24- }
20+ pub struct Stdout { }
2521
26- pub struct Stderr {
27- incomplete_utf8 : IncompleteUtf8 ,
28- }
22+ pub struct Stderr { }
2923
3024struct IncompleteUtf8 {
3125 bytes : [ u8 ; 4 ] ,
@@ -98,7 +92,7 @@ fn is_utf8_console() -> bool {
9892 false
9993}
10094
101- fn write ( handle_id : u32 , data : & [ u8 ] , incomplete_utf8 : & mut IncompleteUtf8 ) -> io:: Result < usize > {
95+ fn write ( handle_id : u32 , data : & [ u8 ] ) -> io:: Result < usize > {
10296 if data. is_empty ( ) {
10397 return Ok ( 0 ) ;
10498 }
@@ -112,134 +106,41 @@ fn write(handle_id: u32, data: &[u8], incomplete_utf8: &mut IncompleteUtf8) -> i
112106 return ret;
113107 }
114108 } else {
115- write_console_utf16 ( data, incomplete_utf8 , handle)
109+ write_console_utf16 ( data, handle)
116110 }
117111}
118112
119- fn write_console_utf16 (
120- data : & [ u8 ] ,
121- incomplete_utf8 : & mut IncompleteUtf8 ,
122- handle : c:: HANDLE ,
123- ) -> io:: Result < usize > {
124- if incomplete_utf8. len > 0 {
125- assert ! (
126- incomplete_utf8. len < 4 ,
127- "Unexpected number of bytes for incomplete UTF-8 codepoint."
128- ) ;
129- if data[ 0 ] >> 6 != 0b10 {
130- // not a continuation byte - reject
131- incomplete_utf8. len = 0 ;
132- return Err ( io:: const_error!(
133- io:: ErrorKind :: InvalidData ,
134- "Windows stdio in console mode does not support writing non-UTF-8 byte sequences" ,
135- ) ) ;
136- }
137- incomplete_utf8. bytes [ incomplete_utf8. len as usize ] = data[ 0 ] ;
138- incomplete_utf8. len += 1 ;
139- let char_width = utf8_char_width ( incomplete_utf8. bytes [ 0 ] ) ;
140- if ( incomplete_utf8. len as usize ) < char_width {
141- // more bytes needed
142- return Ok ( 1 ) ;
143- }
144- let s = str:: from_utf8 ( & incomplete_utf8. bytes [ 0 ..incomplete_utf8. len as usize ] ) ;
145- incomplete_utf8. len = 0 ;
146- match s {
147- Ok ( s) => {
148- assert_eq ! ( char_width, s. len( ) ) ;
149- let written = write_valid_utf8_to_console ( handle, s) ?;
150- assert_eq ! ( written, s. len( ) ) ; // guaranteed by write_valid_utf8_to_console() for single codepoint writes
151- return Ok ( 1 ) ;
152- }
153- Err ( _) => {
154- return Err ( io:: const_error!(
155- io:: ErrorKind :: InvalidData ,
156- "Windows stdio in console mode does not support writing non-UTF-8 byte sequences" ,
157- ) ) ;
158- }
159- }
160- }
161-
162- // As the console is meant for presenting text, we assume bytes of `data` are encoded as UTF-8,
163- // which needs to be encoded as UTF-16.
164- //
165- // If the data is not valid UTF-8 we write out as many bytes as are valid.
166- // If the first byte is invalid it is either first byte of a multi-byte sequence but the
167- // provided byte slice is too short or it is the first byte of an invalid multi-byte sequence.
168- let len = cmp:: min ( data. len ( ) , MAX_BUFFER_SIZE / 2 ) ;
169- let utf8 = match str:: from_utf8 ( & data[ ..len] ) {
170- Ok ( s) => s,
171- Err ( ref e) if e. valid_up_to ( ) == 0 => {
172- let first_byte_char_width = utf8_char_width ( data[ 0 ] ) ;
173- if first_byte_char_width > 1 && data. len ( ) < first_byte_char_width {
174- incomplete_utf8. bytes [ 0 ] = data[ 0 ] ;
175- incomplete_utf8. len = 1 ;
176- return Ok ( 1 ) ;
177- } else {
178- return Err ( io:: const_error!(
179- io:: ErrorKind :: InvalidData ,
180- "Windows stdio in console mode does not support writing non-UTF-8 byte sequences" ,
181- ) ) ;
182- }
183- }
184- Err ( e) => str:: from_utf8 ( & data[ ..e. valid_up_to ( ) ] ) . unwrap ( ) ,
185- } ;
186-
187- write_valid_utf8_to_console ( handle, utf8)
113+ fn write_console_utf16 ( data : & [ u8 ] , handle : c:: HANDLE ) -> io:: Result < usize > {
114+ let mut buffer = [ MaybeUninit :: < u16 > :: uninit ( ) ; MAX_BUFFER_SIZE / 2 ] ;
115+ let data = & data[ ..data. len ( ) . min ( buffer. len ( ) ) ] ;
116+
117+ // Split off any trailing incomplete UTF-8 from the end of the input.
118+ let utf8 = trim_last_char_boundary ( data) ;
119+ let utf16 = utf8_to_utf16_lossy ( utf8, & mut buffer) ;
120+ debug_assert ! ( !utf16. is_empty( ) ) ;
121+
122+ // Write the UTF-16 chars to the console.
123+ // This will succeed in one write so long as our [u16] slice is smaller than the console's buffer,
124+ // which we've ensured by truncating the input (see `MAX_BUFFER_SIZE`).
125+ let written = write_u16s ( handle, & utf16) ?;
126+ debug_assert_eq ! ( written, utf16. len( ) ) ;
127+ Ok ( utf8. len ( ) )
188128}
189129
190- fn write_valid_utf8_to_console ( handle : c:: HANDLE , utf8 : & str ) -> io:: Result < usize > {
191- debug_assert ! ( !utf8. is_empty( ) ) ;
192-
193- let mut utf16 = [ MaybeUninit :: < u16 > :: uninit ( ) ; MAX_BUFFER_SIZE / 2 ] ;
194- let utf8 = & utf8[ ..utf8. floor_char_boundary ( utf16. len ( ) ) ] ;
195-
196- let utf16: & [ u16 ] = unsafe {
197- // Note that this theoretically checks validity twice in the (most common) case
198- // where the underlying byte sequence is valid utf-8 (given the check in `write()`).
130+ fn utf8_to_utf16_lossy < ' a > ( utf8 : & [ u8 ] , utf16 : & ' a mut [ MaybeUninit < u16 > ] ) -> & ' a [ u16 ] {
131+ unsafe {
199132 let result = c:: MultiByteToWideChar (
200133 c:: CP_UTF8 , // CodePage
201- c :: MB_ERR_INVALID_CHARS , // dwFlags
134+ 0 , // dwFlags
202135 utf8. as_ptr ( ) , // lpMultiByteStr
203136 utf8. len ( ) as i32 , // cbMultiByte
204137 utf16. as_mut_ptr ( ) as * mut c:: WCHAR , // lpWideCharStr
205138 utf16. len ( ) as i32 , // cchWideChar
206139 ) ;
207- assert ! ( result != 0 , "Unexpected error in MultiByteToWideChar" ) ;
208-
140+ // The only way an error can happen here is if we've messed up.
141+ debug_assert ! ( result != 0 , "Unexpected error in MultiByteToWideChar" ) ;
209142 // Safety: MultiByteToWideChar initializes `result` values.
210143 MaybeUninit :: slice_assume_init_ref ( & utf16[ ..result as usize ] )
211- } ;
212-
213- let mut written = write_u16s ( handle, utf16) ?;
214-
215- // Figure out how many bytes of as UTF-8 were written away as UTF-16.
216- if written == utf16. len ( ) {
217- Ok ( utf8. len ( ) )
218- } else {
219- // Make sure we didn't end up writing only half of a surrogate pair (even though the chance
220- // is tiny). Because it is not possible for user code to re-slice `data` in such a way that
221- // a missing surrogate can be produced (and also because of the UTF-8 validation above),
222- // write the missing surrogate out now.
223- // Buffering it would mean we have to lie about the number of bytes written.
224- let first_code_unit_remaining = utf16[ written] ;
225- if matches ! ( first_code_unit_remaining, 0xDCEE ..=0xDFFF ) {
226- // low surrogate
227- // We just hope this works, and give up otherwise
228- let _ = write_u16s ( handle, & utf16[ written..written + 1 ] ) ;
229- written += 1 ;
230- }
231- // Calculate the number of bytes of `utf8` that were actually written.
232- let mut count = 0 ;
233- for ch in utf16[ ..written] . iter ( ) {
234- count += match ch {
235- 0x0000 ..=0x007F => 1 ,
236- 0x0080 ..=0x07FF => 2 ,
237- 0xDCEE ..=0xDFFF => 1 , // Low surrogate. We already counted 3 bytes for the other.
238- _ => 3 ,
239- } ;
240- }
241- debug_assert ! ( String :: from_utf16( & utf16[ ..written] ) . unwrap( ) == utf8[ ..count] ) ;
242- Ok ( count)
243144 }
244145}
245146
@@ -432,13 +333,13 @@ impl IncompleteUtf8 {
432333
433334impl Stdout {
434335 pub const fn new ( ) -> Stdout {
435- Stdout { incomplete_utf8 : IncompleteUtf8 :: new ( ) }
336+ Stdout { }
436337 }
437338}
438339
439340impl io:: Write for Stdout {
440341 fn write ( & mut self , buf : & [ u8 ] ) -> io:: Result < usize > {
441- write ( c:: STD_OUTPUT_HANDLE , buf, & mut self . incomplete_utf8 )
342+ write ( c:: STD_OUTPUT_HANDLE , buf)
442343 }
443344
444345 fn flush ( & mut self ) -> io:: Result < ( ) > {
@@ -448,13 +349,13 @@ impl io::Write for Stdout {
448349
449350impl Stderr {
450351 pub const fn new ( ) -> Stderr {
451- Stderr { incomplete_utf8 : IncompleteUtf8 :: new ( ) }
352+ Stderr { }
452353 }
453354}
454355
455356impl io:: Write for Stderr {
456357 fn write ( & mut self , buf : & [ u8 ] ) -> io:: Result < usize > {
457- write ( c:: STD_ERROR_HANDLE , buf, & mut self . incomplete_utf8 )
358+ write ( c:: STD_ERROR_HANDLE , buf)
458359 }
459360
460361 fn flush ( & mut self ) -> io:: Result < ( ) > {
@@ -469,3 +370,50 @@ pub fn is_ebadf(err: &io::Error) -> bool {
469370pub fn panic_output ( ) -> Option < impl io:: Write > {
470371 Some ( Stderr :: new ( ) )
471372}
373+
/// Drops a single incomplete UTF-8 sequence from the end of `bytes`, if present.
///
/// Only the final one to three bytes are inspected; the rest of the slice is
/// not validated. When removing the partial sequence would leave nothing, the
/// input is returned unchanged.
pub fn trim_last_char_boundary(bytes: &[u8]) -> &[u8] {
    // Bit layout of multi-byte UTF-8 sequences (`n` is any bit):
    //
    //   11110nnn 10nnnnnn 10nnnnnn 10nnnnnn
    //   1110nnnn 10nnnnnn 10nnnnnn
    //   110nnnnn 10nnnnnn
    //
    // It follows that a truncated sequence is a lead byte followed by fewer
    // continuation bytes than the lead byte announces.
    const fn is_continuation(b: u8) -> bool {
        b & 0b1100_0000 == 0b1000_0000
    }
    const fn leads_four(b: u8) -> bool {
        b & 0b1111_1000 == 0b1111_0000
    }
    const fn leads_three(b: u8) -> bool {
        b & 0b1111_0000 == 0b1110_0000
    }
    const fn leads_two(b: u8) -> bool {
        b & 0b1110_0000 == 0b1100_0000
    }

    // The leading `_` in every pattern guarantees that at least one byte is
    // left over after trimming, so the result is never empty for non-empty
    // input. Arms are tried longest partial sequence first.
    let dangling = match *bytes {
        // 4-byte lead + two continuations: one continuation is missing.
        [_, .., lead, c1, c2] if leads_four(lead) && is_continuation(c1) && is_continuation(c2) => {
            3
        }
        // 3- or 4-byte lead + one continuation.
        [_, .., lead, c1] if (leads_four(lead) || leads_three(lead)) && is_continuation(c1) => 2,
        // A bare lead byte of any multi-byte sequence.
        [_, .., lead] if leads_four(lead) || leads_three(lead) || leads_two(lead) => 1,
        // Nothing to trim, or trimming would leave an empty slice.
        _ => 0,
    };

    &bytes[..bytes.len() - dangling]
}
0 commit comments