1010//
// Original authors: alexcrichton, bluss
1212
13+ use std:: ptr;
14+
// UTF-8 ranges and tags for encoding characters

/// Tag bits OR-ed into every continuation (non-leading) byte of a sequence.
const TAG_CONT: u8 = 0b1000_0000;
/// Tag bits OR-ed into the leading byte of a two-byte sequence.
const TAG_TWO_B: u8 = 0b1100_0000;
@@ -22,33 +24,75 @@ const MAX_THREE_B: u32 = 0x10000;
/// Error returned by `encode_utf8` when the output buffer is too short to
/// hold the UTF-8 encoding of the character.
///
/// The type carries no payload; it only signals the failure.
#[derive(Debug)]
pub struct EncodeUtf8Error;
2426
/// Write `byte` to the location `index` bytes past `ptr`.
///
/// # Safety
///
/// `ptr.add(index)` must stay inside the same allocation as `ptr` and be
/// valid for a one-byte write.
#[inline]
unsafe fn write(ptr: *mut u8, index: usize, byte: u8) {
    // `ptr::` names the `std::ptr` module here; the `ptr` parameter lives in
    // the value namespace, so the two do not clash.
    ptr::write(ptr.add(index), byte)
}
31+
2532/// Encode a char into buf using UTF-8.
2633///
2734/// On success, return the byte length of the encoding (1, 2, 3 or 4).<br>
2835/// On error, return `EncodeUtf8Error` if the buffer was too short for the char.
36+ ///
37+ /// Safety: `ptr` must be writable for `len` bytes.
2938#[ inline]
30- pub fn encode_utf8 ( ch : char , buf : & mut [ u8 ] ) -> Result < usize , EncodeUtf8Error >
39+ pub unsafe fn encode_utf8 ( ch : char , ptr : * mut u8 , len : usize ) -> Result < usize , EncodeUtf8Error >
3140{
3241 let code = ch as u32 ;
33- if code < MAX_ONE_B && buf . len ( ) >= 1 {
34- buf [ 0 ] = code as u8 ;
42+ if code < MAX_ONE_B && len >= 1 {
43+ write ( ptr , 0 , code as u8 ) ;
3544 return Ok ( 1 ) ;
36- } else if code < MAX_TWO_B && buf . len ( ) >= 2 {
37- buf [ 0 ] = ( code >> 6 & 0x1F ) as u8 | TAG_TWO_B ;
38- buf [ 1 ] = ( code & 0x3F ) as u8 | TAG_CONT ;
45+ } else if code < MAX_TWO_B && len >= 2 {
46+ write ( ptr , 0 , ( code >> 6 & 0x1F ) as u8 | TAG_TWO_B ) ;
47+ write ( ptr , 1 , ( code & 0x3F ) as u8 | TAG_CONT ) ;
3948 return Ok ( 2 ) ;
40- } else if code < MAX_THREE_B && buf . len ( ) >= 3 {
41- buf [ 0 ] = ( code >> 12 & 0x0F ) as u8 | TAG_THREE_B ;
42- buf [ 1 ] = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
43- buf [ 2 ] = ( code & 0x3F ) as u8 | TAG_CONT ;
49+ } else if code < MAX_THREE_B && len >= 3 {
50+ write ( ptr , 0 , ( code >> 12 & 0x0F ) as u8 | TAG_THREE_B ) ;
51+ write ( ptr , 1 , ( code >> 6 & 0x3F ) as u8 | TAG_CONT ) ;
52+ write ( ptr , 2 , ( code & 0x3F ) as u8 | TAG_CONT ) ;
4453 return Ok ( 3 ) ;
45- } else if buf . len ( ) >= 4 {
46- buf [ 0 ] = ( code >> 18 & 0x07 ) as u8 | TAG_FOUR_B ;
47- buf [ 1 ] = ( code >> 12 & 0x3F ) as u8 | TAG_CONT ;
48- buf [ 2 ] = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
49- buf [ 3 ] = ( code & 0x3F ) as u8 | TAG_CONT ;
54+ } else if len >= 4 {
55+ write ( ptr , 0 , ( code >> 18 & 0x07 ) as u8 | TAG_FOUR_B ) ;
56+ write ( ptr , 1 , ( code >> 12 & 0x3F ) as u8 | TAG_CONT ) ;
57+ write ( ptr , 2 , ( code >> 6 & 0x3F ) as u8 | TAG_CONT ) ;
58+ write ( ptr , 3 , ( code & 0x3F ) as u8 | TAG_CONT ) ;
5059 return Ok ( 4 ) ;
5160 } ;
5261 Err ( EncodeUtf8Error )
5362}
5463
64+
#[test]
fn test_encode_utf8() {
    // Test that all codepoints are encoded correctly
    for codepoint in 0..=(std::char::MAX as u32) {
        // `from_u32` yields `None` for values that are not valid `char`s; skip those.
        if let Some(ch) = std::char::from_u32(codepoint) {
            // Fresh zeroed buffer per character so earlier output cannot leak in.
            let mut data = [0u8; 16];
            let ptr = data.as_mut_ptr();
            let len = data.len();
            // SAFETY: `ptr` points at `data`, which is writable for `len` bytes.
            // `.ok()` is used because `unwrap` on the `Result` would need the
            // error type to implement `Debug`.
            let res = unsafe { encode_utf8(ch, ptr, len) }.ok().unwrap();
            assert_eq!(res, ch.len_utf8());
            // Only the bytes reported as written must form the character.
            let string = std::str::from_utf8(&data[..res]).unwrap();
            assert_eq!(string.chars().next(), Some(ch));
        }
    }
}
83+
#[test]
fn test_encode_utf8_oob() {
    // test that we report oob if the buffer is too short
    let mut data = [0u8; 16];
    // One character per encoded length 1..=4. The third is U+FFFD REPLACEMENT
    // CHARACTER, written as an escape so it is not mistaken for mojibake.
    let chars = ['a', 'α', '\u{FFFD}', '𐍈'];
    for (len, &ch) in (1..=4).zip(&chars) {
        assert_eq!(len, ch.len_utf8(), "Len of ch={}", ch);
        let ptr = data.as_mut_ptr();
        // SAFETY: `ptr` points at `data` (16 bytes) and the claimed length
        // never exceeds 4, so every permitted write is in bounds.
        let too_short = unsafe { encode_utf8(ch, ptr, len - 1) };
        let exact_fit = unsafe { encode_utf8(ch, ptr, len) };
        assert!(too_short.is_err());
        assert!(exact_fit.is_ok());
    }
}
98+
0 commit comments