@@ -2361,18 +2361,152 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
23612361}
23622362
23632363/*
2364- =for apidoc utf8_to_bytes
2364+ =for apidoc utf8_to_bytes_overwrite
2365+ =for apidoc_item utf8_to_bytes_new_pv
2366+ =for apidoc_item utf8_to_bytes_temp_pv
2367+ =for apidoc_item utf8_to_bytes
2368+ =for apidoc_item bytes_from_utf8
23652369
2366- Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
2367- Unlike L</bytes_to_utf8>, this over-writes the original string, and
2368- updates C<*lenp> to contain the new length.
2369- Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1 .
2370+ These each convert a string encoded as UTF-8 into the equivalent native byte
2371+ representation, if possible. The first three forms are preferred; their API is
2372+ more convenient to use, and each returns C<true> if the result is in bytes;
2373+ C<false> if the conversion failed.
23702374
2371- Upon successful return, the number of variants in the string can be computed by
2372- having saved the value of C<*lenp> before the call, and subtracting the
2373- after-call value of C<*lenp> from it.
2375+ =over 4
2376+
2377+ =item * C<utf8_to_bytes_overwrite>
2378+
2379+ =item * C<utf8_to_bytes_new_pv>
2380+
2381+ =item * C<utf8_to_bytes_temp_pv>
2382+
2383+ These differ primarily in the form of the returned string and the allowed
2384+ constness of the input string. In each, if the input string was already in
2385+ native bytes or was not convertible, the input isn't changed.
2386+
2387+ In each of these three functions, the input C<*s_ptr> is a pointer to the string
2388+ to be converted and C<*lenp> is its length (so that the first byte will be at
2389+ C<(*s_ptr)[0]>).
2390+
2391+ C<utf8_to_bytes_overwrite> overwrites the input string with the bytes
2392+ conversion. Hence, the input string should not be C<const>. (Converting the
2393+ multi-byte UTF-8 encoding to single bytes never expands the result, so
2394+ overwriting is always feasible.)
2395+
2396+ Both C<utf8_to_bytes_new_pv> and C<utf8_to_bytes_temp_pv> allocate new memory
2397+ to hold the converted string, never changing the input. Hence the input string
2398+ may be C<const>. They differ in that C<utf8_to_bytes_temp_pv> arranges for the
2399+ new memory to automatically be freed. With C<utf8_to_bytes_new_pv>, the caller
2400+ is responsible for freeing the memory. As explained below, not all successful
2401+ calls result in new memory being allocated. Hence C<utf8_to_bytes_new_pv>
2402+ also returns to the caller (via an extra parameter, C<*free_me>) a pointer to
2403+ any new memory, or C<NULL> if none was allocated.
2404+
2405+ The functions return C<false> when the input is not well-formed UTF-8 or contains
2406+ at least one UTF-8 sequence that represents a code point that can't be
2407+ expressed as a byte. The contents of C<*s_ptr> and C<*lenp> are not changed.
2408+ C<utf8_to_bytes_new_pv> sets C<*free_me> to C<NULL>.
2409+
2410+ They all return C<true> when either:
2411+
2412+ =over 4
2413+
2414+ =item The input turned out to already be in bytes form
2415+
2416+ The contents of C<*s_ptr> and C<*lenp> are not changed.
2417+ C<utf8_to_bytes_new_pv> sets C<*free_me> to C<NULL>.
2418+
2419+ =item The input was successfully converted
2420+
2421+ =over 4
2422+
2423+ =item For C<utf8_to_bytes_overwrite>
2424+
2425+ The input string C<*s_ptr> was overwritten with the native bytes, including a
2426+ NUL terminator. C<*lenp> has been updated with the new length.
2427+
2428+ =item For C<utf8_to_bytes_new_pv> and C<utf8_to_bytes_temp_pv>
2429+
2430+ The input string was not changed. Instead, new memory has been allocated
2431+ containing the translation of the input into native bytes, with a NUL
2432+ terminator byte. C<*s_ptr> now points to that new memory, and C<*lenp>
2433+ contains its length.
2434+
2435+ For C<utf8_to_bytes_temp_pv>, the new memory has been arranged to be
2436+ automatically freed, via a call to C<L</SAVEFREEPV>>.
2437+
2438+ For C<utf8_to_bytes_new_pv>, C<*free_me> has been set to C<*s_ptr>, and it is
2439+ the caller's responsibility to free the new memory when done using it.
2440+ The value of this parameter can simply be passed to C<L</Safefree>> when
2441+ done, as that handles a C<NULL> parameter, and/or it can be used as a boolean
2442+ (non-NULL meaning C<true>) to indicate that the input was indeed changed.
2443+
2444+ =back
2445+
2446+ =back
2447+
2448+ Note that in all cases, C<*s_ptr> and C<*lenp> will have correct and consistent
2449+ values, updated as was necessary.
2450+
2451+ Also note that upon successful conversion, the number of variants in the string
2452+ can be computed by having saved the value of C<*lenp> before the call, and
2453+ subtracting the after-call value of C<*lenp> from it. This is also true for
2454+ the other two functions described below.
2455+
2456+ =item * C<utf8_to_bytes>
2457+
2458+ Plain C<utf8_to_bytes> (which has never lost its experimental status) also
2459+ converts a UTF-8 encoded string to bytes, but there are more glitches that the
2460+ caller has to be prepared to handle.
2461+
2462+ The input string is passed with one less indirection level, C<s>.
23742463
2375- If you need a copy of the string, see L</bytes_from_utf8>.
2464+ =over
2465+
2466+ =item If the conversion was a noop
2467+
2468+ The contents of C<s> and C<*lenp> are not changed, and the function returns
2469+ C<s>.
2470+
2471+ =item If the conversion was successful
2472+
2473+ The contents of C<s> were changed, and C<*lenp> updated to be the correct length.
2474+ The function returns C<s> (unchanged).
2475+
2476+ =item If the conversion failed
2477+
2478+ The contents of C<s> were not changed.
2479+
2480+ The function returns NULL and sets C<*lenp> to -1, cast to C<STRLEN>.
2481+ This means that you will have to use a temporary containing the string length
2482+ to pass to the function if you will need the value afterwards.
2483+
2484+ =back
2485+
2486+ =item * C<bytes_from_utf8>
2487+
2488+ C<bytes_from_utf8> also converts a potentially UTF-8 encoded string C<s> (of
2489+ length C<*lenp>) to bytes. It preserves C<s>, allocating new memory for the converted string.
2490+
2491+ In contrast to the other functions, the input string to this one need not
2492+ be UTF-8. If not, the caller has set C<*is_utf8p> to be C<false>, and the
2493+ function does nothing, returning the original C<s>.
2494+
2495+ The function also does nothing if there are code points in the string not
2496+ expressible in native byte encoding, returning the original C<s>.
2497+
2498+ Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
2499+ newly created string containing the native byte equivalent of C<s>, and whose
2500+ length is returned in C<*lenp>, updated. The new string is C<NUL>-terminated.
2501+ The caller is responsible for arranging for the memory used by this string to
2502+ get freed.
2503+
2504+ The major problem with this function is that memory is allocated and filled
2505+ even when the input string was already in bytes form.
2506+
2507+ =back
2508+
2509+ New code should use the first three functions listed above.
23762510
23772511=cut
23782512*/
@@ -2651,34 +2785,6 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
26512785 return NULL ;
26522786}
26532787
2654- /*
2655- =for apidoc bytes_from_utf8
2656-
2657- Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
2658- byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is
2659- actually encoded in UTF-8.
2660-
2661- Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2662- the input string.
2663-
2664- Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2665- not expressible in native byte encoding. In these cases, C<*is_utf8p> and
2666- C<*lenp> are unchanged, and the return value is the original C<s>.
2667-
2668- Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
2669- newly created string containing a downgraded copy of C<s>, and whose length is
2670- returned in C<*lenp>, updated. The new string is C<NUL>-terminated. The
2671- caller is responsible for arranging for the memory used by this string to get
2672- freed.
2673-
2674- Upon successful return, the number of variants in the string can be computed by
2675- having saved the value of C<*lenp> before the call, and subtracting the
2676- after-call value of C<*lenp> from it.
2677-
2678- =cut
2679-
2680- */
2681-
26822788U8 *
26832789Perl_bytes_from_utf8 (pTHX_ const U8 * s , STRLEN * lenp , bool * is_utf8p )
26842790{
0 commit comments