@@ -661,7 +661,8 @@ impl char {
661661 #[ stable( feature = "unicode_encode_char" , since = "1.15.0" ) ]
662662 #[ inline]
663663 pub fn encode_utf8 ( self , dst : & mut [ u8 ] ) -> & mut str {
664- encode_utf8_raw ( self as u32 , dst)
664+ // SAFETY: `char` is not a surrogate, so this is valid UTF-8.
665+ unsafe { from_utf8_unchecked_mut ( encode_utf8_raw ( self as u32 , dst) ) }
665666 }
666667
667668 /// Encodes this character as UTF-16 into the provided `u16` buffer,
@@ -1631,7 +1632,11 @@ fn len_utf8(code: u32) -> usize {
16311632/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
16321633/// and then returns the subslice of the buffer that contains the encoded character.
16331634///
1634- /// Unlike `char::encode_utf8`, this method can be called on codepoints in the surrogate range.
1635+ /// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
1636+ /// (Creating a `char` in the surrogate range is UB.)
1637+ /// For codepoints in the surrogate range, the result is valid [generalized UTF-8] but not valid UTF-8.
1638+ ///
1639+ /// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
16351640///
16361641/// # Panics
16371642///
@@ -1640,7 +1645,7 @@ fn len_utf8(code: u32) -> usize {
16401645#[ unstable( feature = "char_internals" , reason = "exposed only for libstd" , issue = "none" ) ]
16411646#[ doc( hidden) ]
16421647#[ inline]
1643- pub fn encode_utf8_raw ( code : u32 , dst : & mut [ u8 ] ) -> & mut str {
1648+ pub fn encode_utf8_raw ( code : u32 , dst : & mut [ u8 ] ) -> & mut [ u8 ] {
16441649 let len = len_utf8 ( code) ;
16451650 match ( len, & mut dst[ ..] ) {
16461651 ( 1 , [ a, ..] ) => {
@@ -1668,14 +1673,14 @@ pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut str {
16681673 dst. len( ) ,
16691674 ) ,
16701675 } ;
1671- // SAFETY: We just wrote UTF-8 content in, so converting to str is fine.
1672- unsafe { from_utf8_unchecked_mut ( & mut dst[ ..len] ) }
1676+ & mut dst[ ..len]
16731677}
16741678
16751679/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
16761680/// and then returns the subslice of the buffer that contains the encoded character.
16771681///
1678- /// Unlike `char::encode_utf16`, this method can be called on codepoints in the surrogate range.
1682+ /// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range.
1683+ /// (Creating a `char` in the surrogate range is UB.)
16791684///
16801685/// # Panics
16811686///
@@ -1688,7 +1693,7 @@ pub fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] {
16881693 // SAFETY: each arm checks that `dst` has enough elements before writing into it
16891694 unsafe {
16901695 if ( code & 0xFFFF ) == code && !dst. is_empty ( ) {
1691- // The BMP falls through (assuming non-surrogate, as it should)
1696+ // The BMP falls through
16921697 * dst. get_unchecked_mut ( 0 ) = code as u16 ;
16931698 slice:: from_raw_parts_mut ( dst. as_mut_ptr ( ) , 1 )
16941699 } else if dst. len ( ) >= 2 {
0 commit comments