@@ -441,6 +441,42 @@ impl Wtf8Buf {
441441 }
442442 }
443443
444+ /// Consumes the WTF-8 string and converts it to a (UTF-8, WTF-8) pair.
445+ ///
446+ /// This does not copy the data.
447+ ///
448+ /// The first element of the return value is the longest prefix of valid
449+ /// UTF-8, with the second element being the remainder.
450+ pub fn into_string_split ( self ) -> ( String , Wtf8Buf ) {
451+ if self . is_known_utf8 {
452+ // SAFETY: The inner value is known to be UTF-8.
453+ let utf8 = unsafe { String :: from_utf8_unchecked ( self . bytes ) } ;
454+ return ( utf8, Wtf8Buf :: new ( ) ) ;
455+ }
456+
457+ let surrogate_pos = match self . next_surrogate ( 0 ) {
458+ None => {
459+ // SAFETY: Well-formed WTF-8 that contains no surrogates is
460+ // also well-formed UTF-8.
461+ let utf8 = unsafe { String :: from_utf8_unchecked ( self . bytes ) } ;
462+ return ( utf8, Wtf8Buf :: new ( ) ) ;
463+ }
464+ Some ( ( surrogate_pos, _) ) => surrogate_pos,
465+ } ;
466+
467+ if surrogate_pos == 0 {
468+ return ( String :: new ( ) , self ) ;
469+ }
470+
471+ let mut utf8_bytes = self . bytes ;
472+ let wtf8_bytes = utf8_bytes. split_off ( surrogate_pos) ;
473+ // SAFETY: `utf8_bytes` is a prefix of a WTF-8 value that contains no
474+ // surrogates, and well-formed WTF-8 that contains no surrogates is
475+ // also well-formed UTF-8.
476+ let utf8 = unsafe { String :: from_utf8_unchecked ( utf8_bytes) } ;
477+ ( utf8, Wtf8Buf { bytes : wtf8_bytes, is_known_utf8 : false } )
478+ }
479+
444480 /// Converts this `Wtf8Buf` into a boxed `Wtf8`.
445481 #[ inline]
446482 pub fn into_box ( self ) -> Box < Wtf8 > {
@@ -664,6 +700,38 @@ impl Wtf8 {
664700 }
665701 }
666702
703+ /// Losslessly split a WTF-8 string into to a (UTF-8, WTF-8) pair.
704+ ///
705+ /// This does not copy the data.
706+ ///
707+ /// The first element of the return value is the longest prefix of valid
708+ /// UTF-8, with the second element being the remainder.
709+ pub fn to_str_split ( & self ) -> ( & str , & Wtf8 ) {
710+ let surrogate_pos = match self . next_surrogate ( 0 ) {
711+ None => {
712+ // SAFETY: Well-formed WTF-8 that contains no surrogates is
713+ // also well-formed UTF-8.
714+ let utf8 = unsafe { str:: from_utf8_unchecked ( & self . bytes ) } ;
715+ return ( utf8, Wtf8 :: from_str ( "" ) ) ;
716+ }
717+ Some ( ( surrogate_pos, _) ) => surrogate_pos,
718+ } ;
719+
720+ if surrogate_pos == 0 {
721+ return ( "" , self ) ;
722+ }
723+
724+ let ( utf8_bytes, wtf8_bytes) = self . bytes . split_at ( surrogate_pos) ;
725+ // SAFETY: `utf8_bytes` is a prefix of a WTF-8 value that contains no
726+ // surrogates, and well-formed WTF-8 that contains no surrogates is
727+ // also well-formed UTF-8.
728+ unsafe {
729+ let utf8 = str:: from_utf8_unchecked ( utf8_bytes) ;
730+ let wtf8 = Wtf8 :: from_bytes_unchecked ( wtf8_bytes) ;
731+ ( utf8, wtf8)
732+ }
733+ }
734+
667735 /// Converts the WTF-8 string to potentially ill-formed UTF-16
668736 /// and return an iterator of 16-bit code units.
669737 ///
0 commit comments