@@ -26,8 +26,7 @@ pub fn validate_utf8_compat(input: &[u8]) -> Result<(), crate::compat::Utf8Error
2626/// available.
#[cfg(feature = "public_imp")]
pub struct Utf8ValidatorImp {
    /// Carry-over state between `update` calls: zero when the bytes seen so
    /// far end on a code point boundary, non-zero while a multi-byte sequence
    /// is still waiting for bytes from a later chunk.
    expected_cont_bytes: u8,
    /// Set once invalid input has been detected; `update` returns immediately
    /// when it is set.
    err: bool,
}
3332
@@ -39,65 +38,59 @@ pub use Utf8ValidatorImp as ChunkedUtf8ValidatorImp;
3938#[ cfg( feature = "public_imp" ) ]
4039impl Utf8ValidatorImp {
4140 #[ inline]
42- #[ expect( clippy:: cast_possible_truncation) ]
4341 fn update ( & mut self , mut input : & [ u8 ] ) {
4442 if self . err {
4543 return ;
4644 }
47- if self . incomplete_len > 0 {
48- let total_bytes_needed: usize = match self . incomplete_data [ 0 ] {
49- 0 ..0b1000_0000 => {
50- panic ! ( "ASCII data should never be incomplete" ) ;
51- }
52- 0b1000_0000 ..0b1100_0000 => {
53- // first byte cannot be a continuation byte
45+ if self . expected_cont_bytes > 0 {
46+ let to_check = ( self . expected_cont_bytes as usize ) . min ( input. len ( ) ) ;
47+ for b in & input[ ..to_check] {
48+ if b & 0b1100_0000 != 0b1000_0000 {
49+ // not a continuation byte
5450 self . err = true ;
5551 return ;
5652 }
57- 0b1100_0000 ..0b1110_0000 => 2 ,
58- 0b1110_0000 ..0b1111_0000 => 3 ,
59- 0b1111_0000 ..0b1111_1000 => 4 ,
53+ self . expected_cont_bytes -= 1 ;
54+ }
55+ if self . expected_cont_bytes > 0 {
56+ // not enough continuation bytes
57+ return ;
58+ }
59+ input = & input[ to_check..] ;
60+ }
61+ if let Err ( e) = core:: str:: from_utf8 ( input) {
62+ // cannot wrap, since there is at least one byte left which is not valid UTF-8
63+ // by itself
64+ self . expected_cont_bytes = match input[ e. valid_up_to ( ) ] {
65+ 0b1100_0000 ..0b1110_0000 => 1 ,
66+ 0b1110_0000 ..0b1111_0000 => 2 ,
67+ 0b1111_0000 ..0b1111_1000 => 3 ,
6068 _ => {
6169 // invalid byte for starting sequence
6270 self . err = true ;
6371 return ;
6472 }
6573 } ;
66- if self . incomplete_len as usize >= total_bytes_needed {
67- // actually errored on previous update
68- self . err = true ;
69- return ;
70- }
71- let bytes_needed = total_bytes_needed - self . incomplete_len as usize ;
72- let to_copy = core:: cmp:: min ( bytes_needed, input. len ( ) ) ;
73- self . incomplete_data
74- [ self . incomplete_len as usize ..self . incomplete_len as usize + to_copy]
75- . copy_from_slice ( & input[ ..to_copy] ) ;
76- if to_copy < bytes_needed {
77- self . incomplete_len += to_copy as u8 ;
78- return ;
79- }
80- if core:: str:: from_utf8 ( & self . incomplete_data [ ..total_bytes_needed] ) . is_err ( ) {
74+ let rem_input = input. len ( ) - e. valid_up_to ( ) - 1 ;
75+ if rem_input >= self . expected_cont_bytes as usize {
76+ // too many continuation bytes so they are not valid
8177 self . err = true ;
8278 return ;
8379 }
84- self . incomplete_len = 0 ;
85- input = & input[ to_copy.. ] ;
86- }
87- if let Err ( e ) = core :: str :: from_utf8 ( input ) {
88- if input . len ( ) - e . valid_up_to ( ) > 3 {
89- self . err = true ;
90- return ;
80+ for i in 0 ..rem_input {
81+ if input[ e . valid_up_to ( ) + i + 1 ] & 0b1100_0000 != 0b1000_0000 {
82+ // not a continuation byte
83+ self . err = true ;
84+ return ;
85+ }
86+ self . expected_cont_bytes -= 1 ;
9187 }
92- self . incomplete_len = ( input. len ( ) - e. valid_up_to ( ) ) as u8 ;
93- self . incomplete_data [ ..self . incomplete_len as usize ]
94- . copy_from_slice ( & input[ e. valid_up_to ( ) ..] ) ;
9588 }
9689 }
9790
9891 #[ inline]
9992 const fn finalize ( self ) -> core:: result:: Result < ( ) , crate :: basic:: Utf8Error > {
100- if self . err || self . incomplete_len > 0 {
93+ if self . err || self . expected_cont_bytes > 0 {
10194 Err ( crate :: basic:: Utf8Error { } )
10295 } else {
10396 Ok ( ( ) )
@@ -111,8 +104,7 @@ impl crate::basic::imp::Utf8Validator for Utf8ValidatorImp {
111104 #[ must_use]
112105 fn new ( ) -> Self {
113106 Self {
114- incomplete_data : [ 0 ; 4 ] ,
115- incomplete_len : 0 ,
107+ expected_cont_bytes : 0 ,
116108 err : false ,
117109 }
118110 }
@@ -137,8 +129,7 @@ impl crate::basic::imp::ChunkedUtf8Validator for Utf8ValidatorImp {
137129 #[ must_use]
138130 fn new ( ) -> Self {
139131 Self {
140- incomplete_data : [ 0 ; 4 ] ,
141- incomplete_len : 0 ,
132+ expected_cont_bytes : 0 ,
142133 err : false ,
143134 }
144135 }
0 commit comments