@@ -125,13 +125,14 @@ Section: Creating a string
125125#[ stable( feature = "rust1" , since = "1.0.0" ) ]
126126pub struct Utf8Error {
    // Index in the input up to which valid UTF-8 was verified;
    // exposed through `valid_up_to()`.
127127 valid_up_to : usize ,
    // Length in bytes of the invalid sequence starting at `valid_up_to`,
    // or `None` when the input ended in the middle of a byte sequence.
    // Stored as `u8` and widened to `usize` by `error_len()`.
128+ error_len : Option < u8 > ,
128129}
129130
130131impl Utf8Error {
131132 /// Returns the index in the given string up to which valid UTF-8 was
132133 /// verified.
133134 ///
134- /// It is the maximum index such that `from_utf8(input[..index])`
135+ /// It is the maximum index such that `from_utf8(& input[..index])`
135136 /// would return `Ok(_)`.
136137 ///
137138 /// # Examples
@@ -152,6 +153,23 @@ impl Utf8Error {
152153 /// ```
153154 #[ stable( feature = "utf8_error" , since = "1.5.0" ) ]
154155 pub fn valid_up_to ( & self ) -> usize { self . valid_up_to }
156+
157+ /// Provide more information about the failure:
158+ ///
159+ /// * `None`: the end of the input was reached unexpectedly.
160+ /// `self.valid_up_to()` is 1 to 3 bytes from the end of the input.
161+ /// If a byte stream (such as a file or a network socket) is being decoded incrementally,
162+ /// this could be a valid `char` whose UTF-8 byte sequence is spanning multiple chunks.
163+ ///
164+ /// * `Some(len)`: an unexpected byte was encountered.
165+ /// The length provided is that of the invalid byte sequence
166+ /// that starts at the index given by `valid_up_to()`.
167+ /// Decoding should resume after that sequence
168+ /// (after inserting a U+FFFD REPLACEMENT CHARACTER) in case of lossy decoding.
169+ #[ unstable( feature = "utf8_error_error_len" , reason ="new" , issue = "40494" ) ]
170+ pub fn error_len ( & self ) -> Option < usize > {
171+ self . error_len . map ( |len| len as usize )
172+ }
155173}
156174
157175/// Converts a slice of bytes to a string slice.
@@ -300,7 +318,12 @@ pub unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
300318#[ stable( feature = "rust1" , since = "1.0.0" ) ]
301319impl fmt:: Display for Utf8Error {
302320 fn fmt ( & self , f : & mut fmt:: Formatter ) -> fmt:: Result {
303- write ! ( f, "invalid utf-8: invalid byte near index {}" , self . valid_up_to)
321+ if let Some ( error_len) = self . error_len {
322+ write ! ( f, "invalid utf-8 sequence of {} bytes from index {}" ,
323+ error_len, self . valid_up_to)
324+ } else {
325+ write ! ( f, "incomplete utf-8 byte sequence from index {}" , self . valid_up_to)
326+ }
304327 }
305328}
306329
@@ -1241,25 +1264,27 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
12411264
12421265 while index < len {
12431266 let old_offset = index;
1244- macro_rules! err { ( ) => { {
1245- return Err ( Utf8Error {
1246- valid_up_to: old_offset
1247- } )
1248- } } }
1267+ macro_rules! err {
1268+ ( $error_len: expr) => {
1269+ return Err ( Utf8Error {
1270+ valid_up_to: old_offset,
1271+ error_len: $error_len,
1272+ } )
1273+ }
1274+ }
12491275
12501276 macro_rules! next { ( ) => { {
12511277 index += 1 ;
12521278 // we needed data, but there was none: error!
12531279 if index >= len {
1254- err!( )
1280+ err!( None )
12551281 }
12561282 v[ index]
12571283 } } }
12581284
12591285 let first = v[ index] ;
12601286 if first >= 128 {
12611287 let w = UTF8_CHAR_WIDTH [ first as usize ] ;
1262- let second = next ! ( ) ;
12631288 // 2-byte encoding is for codepoints \u{0080} to \u{07ff}
12641289 // first C2 80 last DF BF
12651290 // 3-byte encoding is for codepoints \u{0800} to \u{ffff}
@@ -1279,25 +1304,36 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
12791304 // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
12801305 // %xF4 %x80-8F 2( UTF8-tail )
12811306 match w {
1282- 2 => if second & !CONT_MASK != TAG_CONT_U8 { err ! ( ) } ,
1307+ 2 => if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
1308+ err ! ( Some ( 1 ) )
1309+ } ,
12831310 3 => {
1284- match ( first, second, next ! ( ) & !CONT_MASK ) {
1285- ( 0xE0 , 0xA0 ... 0xBF , TAG_CONT_U8 ) |
1286- ( 0xE1 ... 0xEC , 0x80 ... 0xBF , TAG_CONT_U8 ) |
1287- ( 0xED , 0x80 ... 0x9F , TAG_CONT_U8 ) |
1288- ( 0xEE ... 0xEF , 0x80 ... 0xBF , TAG_CONT_U8 ) => { }
1289- _ => err ! ( )
1311+ match ( first, next ! ( ) ) {
1312+ ( 0xE0 , 0xA0 ... 0xBF ) |
1313+ ( 0xE1 ... 0xEC , 0x80 ... 0xBF ) |
1314+ ( 0xED , 0x80 ... 0x9F ) |
1315+ ( 0xEE ... 0xEF , 0x80 ... 0xBF ) => { }
1316+ _ => err ! ( Some ( 1 ) )
1317+ }
1318+ if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
1319+ err ! ( Some ( 2 ) )
12901320 }
12911321 }
12921322 4 => {
1293- match ( first, second, next ! ( ) & !CONT_MASK , next ! ( ) & !CONT_MASK ) {
1294- ( 0xF0 , 0x90 ... 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) |
1295- ( 0xF1 ... 0xF3 , 0x80 ... 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) |
1296- ( 0xF4 , 0x80 ... 0x8F , TAG_CONT_U8 , TAG_CONT_U8 ) => { }
1297- _ => err ! ( )
1323+ match ( first, next ! ( ) ) {
1324+ ( 0xF0 , 0x90 ... 0xBF ) |
1325+ ( 0xF1 ... 0xF3 , 0x80 ... 0xBF ) |
1326+ ( 0xF4 , 0x80 ... 0x8F ) => { }
1327+ _ => err ! ( Some ( 1 ) )
1328+ }
1329+ if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
1330+ err ! ( Some ( 2 ) )
1331+ }
1332+ if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
1333+ err ! ( Some ( 3 ) )
12981334 }
12991335 }
1300- _ => err ! ( )
1336+ _ => err ! ( Some ( 1 ) )
13011337 }
13021338 index += 1 ;
13031339 } else {
0 commit comments