@@ -2399,14 +2399,16 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
23992399
24002400#ifndef EBCDIC /* The below relies on the bit patterns of UTF-8 */
24012401
2402- /* There is some start-up/tear-down overhead with this, so no real gain
2402+ /* Do a first pass through the string to see if it actually is translatable
2403+ * into bytes. On long strings this is
2404+ * done a word at a time, so it is relatively quick. (There is some
2405+ * start-up/tear-down overhead with the per-word algorithm, so no real gain
24032406 * unless the remaining portion of the string is long enough. The current
2404- * value is just a guess. */
2407+ * value is just a guess.) On EBCDIC, it's always per-byte. */
24052408 if ((send - s ) > (ptrdiff_t ) (5 * PERL_WORDSIZE )) {
24062409
2407- /* First, go through the string a word at-a-time to verify that it is
2408- * downgradable. If it contains any start byte besides C2 and C3, then
2409- * it isn't. */
2410+ /* If the string contains any start byte besides C2 and C3, then it
2411+ * isn't translatable into bytes */
24102412
24112413 const PERL_UINTMAX_T C0_mask = PERL_COUNT_MULTIPLIER * 0xC0 ;
24122414 const PERL_UINTMAX_T C2_mask = PERL_COUNT_MULTIPLIER * 0xC2 ;
@@ -2490,9 +2492,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
24902492 }
24912493
24922494#endif
2493-
2494- /* Do the straggler bytes beyond the final word boundary (or all bytes
2495- * in the case of EBCDIC) */
2495+ /* Do the straggler bytes beyond what the per-word loop above did (or all bytes if that loop wasn't run) */
24962496 while (s < send ) {
24972497 if (! UTF8_IS_INVARIANT (* s )) {
24982498 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE (s , send )) {
@@ -2504,19 +2504,18 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
25042504 s ++ ;
25052505 }
25062506
2507- /* Here, we passed the tests above. For the EBCDIC case, everything
2508- * was well-formed and can be downgraded to non-UTF8. For non-EBCDIC,
2509- * it means only that all start bytes were C2 or C3, hence any
2510- * well-formed sequences are downgradable. But we didn't test, for
2511- * example, that there weren't two C2's in a row. That means that in
2512- * the loop below, we have to be sure things are well-formed. Because
2513- * this is very very likely, and we don't care about having speedy
2514- * handling of malformed input, the loop proceeds as if well formed,
2515- * and should a malformed one come along, it undoes what it already has
2516- * done */
2517-
25182507 U8 * d = s = first_variant ;
25192508
2509+ /* For the cases where the per-word algorithm wasn't used, everything is
2510+ * well-formed and can definitely be translated. When the per-word
2511+ * algorithm was used, it found that all start bytes in the string were C2
2512+ * or C3, hence any well-formed sequences are convertible to bytes. But we
2513+ * didn't test, for example, that there weren't two C2's in a row. That
2514+ * means that in the loop below, we have to be sure things are well-formed.
2515+ * Because it is very very unlikely that we got this far for something
2516+ * malformed, and because we prioritize speed in the normal case over the
2517+ * malformed one, we go ahead and do the translation, and undo it if found
2518+ * to be necessary. */
25202519 while (s < send ) {
25212520 U8 c = * s ++ ;
25222521 if (! UVCHR_IS_INVARIANT (c )) {
@@ -2548,12 +2547,11 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
25482547
25492548 cant_convert : ;
25502549
2551- /* Here, it is malformed. This shouldn't happen on EBCDIC, and on ASCII
2552- * platforms, we know that the only start bytes in the text are C2 and C3,
2553- * and the code above has made sure that it doesn't end with a start byte.
2554- * That means the only malformations that are possible are a start byte
2555- * without a continuation (either followed by another start byte or an
2556- * invariant) or an unexpected continuation.
2550+ /* Here, we found a malformation in the input. This won't happen except
2551+ * when the per-word algorithm was used in the first pass, because that may
2552+ * miss some malformations. It determined that the only start bytes in the
2553+ * text are C2 and C3, but didn't examine it to make sure each of those was
2554+ * followed by precisely one continuation, for example.
25572555 *
25582556 * We have to undo all we've done before, back down to the first UTF-8
25592557 * variant. Note that each 2-byte variant we've done so far (converted to
0 commit comments