|
1 | 1 | use super::from_utf8_unchecked; |
2 | | -use super::validations::utf8_char_width; |
3 | 2 | use crate::fmt; |
4 | 3 | use crate::fmt::{Formatter, Write}; |
5 | 4 | use crate::iter::FusedIterator; |
@@ -197,93 +196,27 @@ impl<'a> Iterator for Utf8Chunks<'a> { |
197 | 196 | return None; |
198 | 197 | } |
199 | 198 |
|
200 | | - const TAG_CONT_U8: u8 = 128; |
201 | | - fn safe_get(xs: &[u8], i: usize) -> u8 { |
202 | | - *xs.get(i).unwrap_or(&0) |
203 | | - } |
204 | | - |
205 | | - let mut i = 0; |
206 | | - let mut valid_up_to = 0; |
207 | | - while i < self.source.len() { |
208 | | - // SAFETY: `i < self.source.len()` per previous line. |
209 | | - // For some reason the following are both significantly slower: |
210 | | - // while let Some(&byte) = self.source.get(i) { |
211 | | - // while let Some(byte) = self.source.get(i).copied() { |
212 | | - let byte = unsafe { *self.source.get_unchecked(i) }; |
213 | | - i += 1; |
214 | | - |
215 | | - if byte < 128 { |
216 | | - // This could be a `1 => ...` case in the match below, but for |
217 | | - // the common case of all-ASCII inputs, we bypass loading the |
218 | | - // sizeable UTF8_CHAR_WIDTH table into cache. |
219 | | - } else { |
220 | | - let w = utf8_char_width(byte); |
221 | | - |
222 | | - match w { |
223 | | - 2 => { |
224 | | - if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
225 | | - break; |
226 | | - } |
227 | | - i += 1; |
228 | | - } |
229 | | - 3 => { |
230 | | - match (byte, safe_get(self.source, i)) { |
231 | | - (0xE0, 0xA0..=0xBF) => (), |
232 | | - (0xE1..=0xEC, 0x80..=0xBF) => (), |
233 | | - (0xED, 0x80..=0x9F) => (), |
234 | | - (0xEE..=0xEF, 0x80..=0xBF) => (), |
235 | | - _ => break, |
236 | | - } |
237 | | - i += 1; |
238 | | - if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
239 | | - break; |
240 | | - } |
241 | | - i += 1; |
242 | | - } |
243 | | - 4 => { |
244 | | - match (byte, safe_get(self.source, i)) { |
245 | | - (0xF0, 0x90..=0xBF) => (), |
246 | | - (0xF1..=0xF3, 0x80..=0xBF) => (), |
247 | | - (0xF4, 0x80..=0x8F) => (), |
248 | | - _ => break, |
249 | | - } |
250 | | - i += 1; |
251 | | - if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
252 | | - break; |
253 | | - } |
254 | | - i += 1; |
255 | | - if safe_get(self.source, i) & 192 != TAG_CONT_U8 { |
256 | | - break; |
257 | | - } |
258 | | - i += 1; |
259 | | - } |
260 | | - _ => break, |
261 | | - } |
| 199 | + match super::from_utf8(self.source) { |
| 200 | + Ok(valid) => { |
| 201 | + // Truncate the slice, no need to touch the pointer. |
| 202 | + self.source = &self.source[..0]; |
| 203 | + Some(Utf8Chunk { valid, invalid: &[] }) |
| 204 | + } |
| 205 | + Err(err) => { |
| 206 | + let valid_up_to = err.valid_up_to(); |
| 207 | + let error_len = err.error_len().unwrap_or(self.source.len() - valid_up_to); |
 | 208 | + // SAFETY: `valid_up_to` is the length of the valid UTF-8 prefix, so it is in bounds. |
| 209 | + let (valid, remaining) = unsafe { self.source.split_at_unchecked(valid_up_to) }; |
 | 210 | + // SAFETY: `error_len` is the erroneous byte sequence length (or the length of all remaining bytes), so it is in bounds. |
| 211 | + let (invalid, after_invalid) = unsafe { remaining.split_at_unchecked(error_len) }; |
| 212 | + self.source = after_invalid; |
| 213 | + Some(Utf8Chunk { |
| 214 | + // SAFETY: All bytes up to `valid_up_to` are valid UTF-8. |
| 215 | + valid: unsafe { from_utf8_unchecked(valid) }, |
| 216 | + invalid, |
| 217 | + }) |
262 | 218 | } |
263 | | - |
264 | | - valid_up_to = i; |
265 | 219 | } |
266 | | - |
267 | | - // SAFETY: `i <= self.source.len()` because it is only ever incremented |
268 | | - // via `i += 1` and in between every single one of those increments, `i` |
269 | | - // is compared against `self.source.len()`. That happens either |
270 | | - // literally by `i < self.source.len()` in the while-loop's condition, |
271 | | - // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The |
272 | | - // loop is terminated as soon as the latest `i += 1` has made `i` no |
273 | | - // longer less than `self.source.len()`, which means it'll be at most |
274 | | - // equal to `self.source.len()`. |
275 | | - let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) }; |
276 | | - self.source = remaining; |
277 | | - |
278 | | - // SAFETY: `valid_up_to <= i` because it is only ever assigned via |
279 | | - // `valid_up_to = i` and `i` only increases. |
280 | | - let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) }; |
281 | | - |
282 | | - Some(Utf8Chunk { |
283 | | - // SAFETY: All bytes up to `valid_up_to` are valid UTF-8. |
284 | | - valid: unsafe { from_utf8_unchecked(valid) }, |
285 | | - invalid, |
286 | | - }) |
287 | 220 | } |
288 | 221 | } |
289 | 222 |
|
|
0 commit comments