@@ -2378,12 +2378,16 @@ If you need a copy of the string, see L</bytes_from_utf8>.
23782378*/
23792379
23802380bool
2381- Perl_utf8_to_bytes_ (pTHX_ U8 * * s_ptr , STRLEN * lenp ,
2381+ Perl_utf8_to_bytes_ (pTHX_ U8 * * s_ptr , STRLEN * lenp , U8 * * free_me ,
23822382 Perl_utf8_to_bytes_arg result_as )
23832383{
23842384 PERL_ARGS_ASSERT_UTF8_TO_BYTES_ ;
23852385 PERL_UNUSED_CONTEXT ;
23862386
2387+ if (result_as == PL_utf8_to_bytes_new_memory ) {
2388+ * free_me = NULL ;
2389+ }
2390+
23872391 U8 * first_variant ;
23882392
23892393 /* This is a no-op if no variants at all in the input */
@@ -2505,7 +2509,15 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp,
25052509 s ++ ;
25062510 }
25072511
2508- U8 * d0 = s0 ;
2512+ U8 * d0 ;
2513+ if (result_as == PL_utf8_to_bytes_overwrite ) {
2514+ d0 = s0 ;
2515+ }
2516+ else {
2517+ Newx (d0 , * lenp + 1 , U8 );
2518+ Copy (s0 , d0 , invariant_length , U8 );
2519+ }
2520+
25092521 U8 * d = d0 + invariant_length ;
25102522
25112523 /* For the cases where the per-word algorithm wasn't used, everything is
@@ -2546,6 +2558,10 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp,
25462558 * d = '\0' ;
25472559 * lenp = d - d0 ;
25482560
2561+ if (result_as != PL_utf8_to_bytes_overwrite ) {
2562+ * s_ptr = * free_me = d0 ;
2563+ }
2564+
25492565 return true;
25502566
25512567 cant_convert : ;
@@ -2556,10 +2572,16 @@ Perl_utf8_to_bytes_(pTHX_ U8 **s_ptr, STRLEN *lenp,
25562572 * text are C2 and C3, but didn't examine it to make sure each of those was
25572573 * followed by precisely one continuation, for example.
25582574 *
2559- * We have to undo all we've done before, back down to the first UTF-8
2560- * variant. Note that each 2-byte variant we've done so far (converted to
2561- * single byte) slides things to the left one byte, and so we have bytes
2562- * that haven't been written over.
2575+ * If the result is in newly allocated memory, free that and return failure */
2576+ if (result_as != PL_utf8_to_bytes_overwrite ) {
2577+ Safefree (d0 );
2578+ return false;
2579+ }
2580+
2581+ /* Otherwise, we have to undo all we've done before, back down to the first
2582+ * UTF-8 variant. Note that each 2-byte variant we've done so far
2583+ * (converted to single byte) slides things to the left one byte, and so we
2584+ * have bytes that haven't been written over.
25632585 *
25642586 * Here, 'd' points to the next position to overwrite, and 's' points to
25652587 * the first invalid byte. That means 'd's contents haven't been changed
@@ -2641,57 +2663,25 @@ U8 *
26412663Perl_bytes_from_utf8 (pTHX_ const U8 * s , STRLEN * lenp , bool * is_utf8p )
26422664{
26432665 PERL_ARGS_ASSERT_BYTES_FROM_UTF8 ;
2644- PERL_UNUSED_CONTEXT ;
2645-
2646- if (! * is_utf8p ) {
2647- return (U8 * ) s ;
2648- }
2649-
2650- const U8 * const s0 = s ;
2651- const U8 * const send = s + * lenp ;
2652- const U8 * first_variant ;
2653-
2654- /* The initial portion of 's' that consists of invariants can be Copied
2655- * as-is. If it is entirely invariant, the whole thing can be Copied. */
2656- if (is_utf8_invariant_string_loc (s , * lenp , & first_variant )) {
2657- first_variant = send ;
2658- }
2659-
2660- U8 * d ;
2661- Newx (d , (* lenp ) + 1 , U8 );
2662- Copy (s , d , first_variant - s , U8 );
2663-
2664- U8 * converted_start = d ;
2665- d += first_variant - s ;
2666- s = first_variant ;
2667-
2668- while (s < send ) {
2669- U8 c = * s ++ ;
2670- if (! UTF8_IS_INVARIANT (c )) {
26712666
2672- /* Then it is multi-byte encoded. If the code point is above 0xFF,
2673- * have to stop now */
2674- if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE (s - 1 , send ))) {
2675- Safefree (converted_start );
2676- return (U8 * ) s0 ;
2667+ if (* is_utf8p ) {
2668+ U8 * new_memory = NULL ;
2669+ if (utf8_to_bytes_new_pv (& s , lenp , & new_memory )) {
2670+ * is_utf8p = false;
2671+
2672+ /* Callers always expect newly allocated memory on success. If none
2673+ * was allocated for us, give them a NUL-terminated copy of 's' */
2674+ if (new_memory == NULL ) {
2675+ U8 * new_s ;
2676+ Newx (new_s , * lenp + 1 , U8 );
2677+ Copy (s , new_s , * lenp , U8 );
2678+ new_s [* lenp ] = '\0' ;
2679+ s = new_s ;
26772680 }
2678-
2679- c = EIGHT_BIT_UTF8_TO_NATIVE (c , * s );
2680- s ++ ;
26812681 }
2682- * d ++ = c ;
26832682 }
26842683
2685- /* Here, converted the whole of the input */
2686- * is_utf8p = FALSE;
2687-
2688- * d = '\0' ;
2689- * lenp = d - converted_start ;
2690-
2691- /* Trim unused space */
2692- Renew (converted_start , * lenp + 1 , U8 );
2693-
2694- return converted_start ;
2684+ return (U8 * ) s ;
26952685}
26962686
26972687/*
0 commit comments