@@ -34,6 +34,100 @@ use core::ptr::addr_of;
3434
3535use super :: c;
3636
37+ /// Creates a null-terminated UTF-16 string from a str.
38+ pub macro wide_str ( $str: literal) { {
39+ const _: ( ) = {
40+ if core:: slice:: memchr:: memchr ( 0 , $str. as_bytes ( ) ) . is_some ( ) {
41+ panic ! ( "null terminated strings cannot contain interior nulls" ) ;
42+ }
43+ } ;
44+ crate :: sys:: pal:: windows:: api:: utf16!( concat!( $str , '\0' ) )
45+ } }
46+
47+ /// Creates a UTF-16 string from a str without null termination.
48+ pub macro utf16 ( $str: expr) { {
49+ const UTF8 : & str = $str;
50+ const UTF16_LEN : usize = crate :: sys:: pal:: windows:: api:: utf16_len ( UTF8 ) ;
51+ const UTF16 : [ u16 ; UTF16_LEN ] = crate :: sys:: pal:: windows:: api:: to_utf16 ( UTF8 ) ;
52+ & UTF16
53+ } }
54+
// Declares the out-of-line `tests` module; it is only compiled when the
// `test` cfg is set.
#[cfg(test)]
mod tests;
57+
/// Gets the UTF-16 length of a UTF-8 string, for use in the `utf16!` macro.
///
/// Returns the number of `u16` code units needed to encode `s` as UTF-16:
/// one unit for every code point encoded in fewer than four UTF-8 bytes and
/// two units (a surrogate pair) for every four-byte code point.
pub const fn utf16_len(s: &str) -> usize {
    let s = s.as_bytes();
    let mut i = 0;
    let mut len = 0;
    while i < s.len() {
        // the length of a UTF-8 encoded code-point is given by the number of
        // leading ones, except in the case of ASCII.
        let utf8_len = match s[i].leading_ones() {
            0 => 1,
            n => n as usize,
        };
        // Skip the whole sequence so the next iteration lands on a lead byte.
        i += utf8_len;
        // Only four-byte sequences need two UTF-16 code units.
        len += if utf8_len < 4 { 1 } else { 2 };
    }
    len
}
75+
/// Const convert UTF-8 to UTF-16, for use in the `utf16!` macro.
///
/// `UTF16_LEN` is the length of the returned array and is expected to be the
/// number of UTF-16 code units in `s`. If it is larger, the unused tail of the
/// array stays zero; if it is smaller, the out-of-bounds write panics.
///
/// Note that this is designed for use in const contexts so is not optimized.
pub const fn to_utf16<const UTF16_LEN: usize>(s: &str) -> [u16; UTF16_LEN] {
    let mut output = [0_u16; UTF16_LEN];
    // `pos` indexes the UTF-16 output, `i` indexes the UTF-8 input.
    let mut pos = 0;
    let s = s.as_bytes();
    let mut i = 0;
    while i < s.len() {
        match s[i].leading_ones() {
            // Decode UTF-8 based on its length.
            // See https://en.wikipedia.org/wiki/UTF-8
            0 => {
                // ASCII is the same in both encodings
                output[pos] = s[i] as u16;
                i += 1;
                pos += 1;
            }
            2 => {
                // Bits: 110xxxxx 10xxxxxx
                output[pos] = ((s[i] as u16 & 0b11111) << 6) | (s[i + 1] as u16 & 0b111111);
                i += 2;
                pos += 1;
            }
            3 => {
                // Bits: 1110xxxx 10xxxxxx 10xxxxxx
                output[pos] = ((s[i] as u16 & 0b1111) << 12)
                    | ((s[i + 1] as u16 & 0b111111) << 6)
                    | (s[i + 2] as u16 & 0b111111);
                i += 3;
                pos += 1;
            }
            4 => {
                // Bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                let mut c = ((s[i] as u32 & 0b111) << 18)
                    | ((s[i + 1] as u32 & 0b111111) << 12)
                    | ((s[i + 2] as u32 & 0b111111) << 6)
                    | (s[i + 3] as u32 & 0b111111);
                // re-encode as UTF-16 (see https://en.wikipedia.org/wiki/UTF-16)
                // - Subtract 0x10000 from the code point
                // - For the high surrogate, shift right by 10 then add 0xD800
                // - For the low surrogate, take the low 10 bits then add 0xDC00
                c -= 0x10000;
                output[pos] = ((c >> 10) + 0xD800) as u16;
                output[pos + 1] = ((c & 0b1111111111) + 0xDC00) as u16;
                i += 4;
                pos += 2;
            }
            // valid UTF-8 cannot have any other values
            _ => unreachable!(),
        }
    }
    output
}
130+
37131/// Helper method for getting the size of `T` as a u32.
38132/// Errors at compile time if the size would overflow.
39133///
0 commit comments