@@ -1591,7 +1591,12 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
15911591 PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS_HELPER_ ;
15921592
15931593 const U8 * s = s0 ;
1594+
1595+ /* The ending position, plus 1, of the first character in the sequence
1596+ * beginning at s0. In other words, 'e', adjusted down to to be no more
1597+ * than a single character */
15941598 const U8 * send = e ;
1599+
15951600 SSize_t curlen = send - s0 ;
15961601 U32 possible_problems ; /* A bit is set here for each potential problem
15971602 found as we go along */
@@ -1603,10 +1608,12 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
16031608
16041609 dTHX ;
16051610
1606- /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
1607- * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
1608- * syllables that the dfa doesn't properly handle. Quickly dispose of the
1609- * final case. */
1611+ /* Here, is one of:
1612+ * a) malformed;
1613+ * b) a problematic code point (surrogate, non-unicode, or nonchar); or
1614+ * c) on ASCII platforms, one of the Hangul syllables that the dfa
1615+ * doesn't properly handle. Quickly dispose of the final case.
1616+ */
16101617
16111618 /* Assume will be successful; override later if necessary */
16121619 if (UNLIKELY (errors )) {
@@ -1652,7 +1659,10 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
16521659 errors = & discard_errors ;
16531660 }
16541661
1655- /* The order of malformation tests here is important. We should consume as
1662+ /* Accumulate the code point translation of the input byte sequence
1663+ * s0 .. e-1, looking for malformations.
1664+ *
1665+ * The order of malformation tests here is important. We should consume as
16561666 * few bytes as possible in order to not skip any valid character. This is
16571667 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
16581668 * https://unicode.org/reports/tr36 for more discussion as to why. For
@@ -1711,8 +1721,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
17111721
17121722 /* Here is not a continuation byte, nor an invariant. The only thing left
17131723 * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
1714- * because it excludes start bytes like \xC0 that always lead to
1715- * overlongs.) */
1724+ * to check for sure because it excludes start bytes like \xC0 that always
1725+ * lead to overlongs.) */
17161726
17171727 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
17181728 * that indicate the number of bytes in the character's whole UTF-8
@@ -1874,24 +1884,30 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
18741884 }
18751885 }
18761886 }
1877- }
1887+ } /* End of checking if is a special code point */
18781888
18791889 ready_to_handle_errors : ;
18801890
18811891 /* At this point:
1882- * curlen contains the number of bytes in the sequence that
1883- * this call should advance the input by.
1884- * avail_len gives the available number of bytes passed in, but
1885- * only if this is less than the expected number of
1886- * bytes, based on the code point's start byte.
1892+ * s0 points to the first byte of the character
1893+ * expectlen gives the number of bytes that the character is
1894+ * expected to occupy, based on the value of the
1895+ * presumed start byte in s0. This will be 0 if the
1896+ * sequence is empty, or 1 if s0 isn't actually a
1897+ * start byte.
1898+ * avail_len gives the number of bytes in the sequence this
1899+ * call can look at, one character's worth at most.
1900+ * curlen gives the number of bytes in the sequence that
1901+ * this call actually looked at. This is returned to
1902+ * the caller as the value they should advance the
1903+ * input by for the next call to this function.
18871904 * possible_problems is 0 if there weren't any problems; otherwise a bit
18881905 * is set in it for each potential problem found.
18891906 * uv contains the code point the input sequence
18901907 * represents; or if there is a problem that prevents
18911908 * a well-defined value from being computed, it is
18921909 * some substitute value, typically the REPLACEMENT
18931910 * CHARACTER.
1894- * s0 points to the first byte of the character
18951911 * s points to just after where we left off processing
18961912 * the character
18971913 * send points to just after where that character should
@@ -1902,20 +1918,86 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
19021918 bool success = true;
19031919
19041920 if (UNLIKELY (possible_problems )) {
1921+
1922+ /* Here, the input sequence is potentially problematic. The code here
1923+ * determines if that is indeed the case and how to handle it. The
1924+ * possible outcomes are:
1925+ * 1) substituting the Unicode REPLACEMENT CHARACTER as the
1926+ * translation for this input sequence; and/or
1927+ * 2) returning information about the problem to the caller in
1928+ * *errors and/or *msgs; and/or
1929+ * 3) raising appropriate warnings.
1930+ *
1931+ * There are two main categories of potential problems.
1932+ *
1933+ * a) One type is by default not considered to be a problem. These
1934+ * are for when the input was syntactically valid
1935+ * Perl-extended-UTF-8 for a code point that is representable on
1936+ * this platform, but that code point isn't considered by Unicode
1937+ * to be freely exchangeable between applications. To get here,
1938+ * code earlier in this function has determined both that this
1939+ * sequence is for such a code point, and that the 'flags'
1940+ * parameter indicates that these are to be considered
1941+ * problematic, meaning this sequence should be rejected, merely
1942+ * warned about, or both. *errors will be set for each of these.
1943+ *
1944+ * If the caller to this function has set the corresponding
1945+ * DISALLOW bit in 'flags', the translation of this sequence will
1946+ * be the Unicode REPLACEMENT CHARACTER.
1947+ *
1948+ * If the caller to this function has set the corresponding WARN
1949+ * bit in 'flags' potentially a warning message will be generated,
1950+ * using the rules common to both types of problems, and detailed
1951+ * below.
1952+ *
1953+ * b) The other type is considered by default to be problematic.
1954+ * There are three subclasses:
1955+ * 1) Some syntactic malformation meant that no code point could
1956+ * be calculated for the input. An example is that the
1957+ * sequence was incomplete, more bytes were called for than
1958+ * the input contained. The function returns the Unicode
1959+ * REPLACEMENT CHARACTER as the translation of these.
1960+ * 2) The sequence is legal Perl extended UTF-8, but is for a
1961+ * code point too large to be represented on this platform.
1962+ * The function returns the Unicode REPLACEMENT CHARACTER as
1963+ * the translation of these.
1964+ * 3) The sequence represents a code point which can also be
1965+ * represented by a shorter sequence. These have been
1966+ * declared illegal by Unicode fiat because they were being
1967+ * used as Trojan horses to successfully attack applications.
1968+ * One undocumented flag causes these to be accepted, but
1969+ * otherwise the function returns the Unicode REPLACEMENT
1970+ * CHARACTER as the translation of these.
1971+ *
1972+ * In all cases the corresponding bit in *errors is set. This is
1973+ * in contrast to the other type of problem where the input
1974+ * 'flags' affect if the bit is set or not.
1975+ *
1976+ * The default is to generate a warning for each of these. If the
1977+ * input 'flags' has a corresponding ALLOW flag, warnings are
1978+ * suppressed. The only other thing the ALLOW flags do is
1979+ * determine if the function returns sucess or failure
1980+ *
1981+ * For both types of problems, if warnings are called for by the input
1982+ * flags, also setting the UTF8_CHECK_ONLY flag overrides
1983+ * generating them. If 'msgs' is not NULL, they all will be returned
1984+ * there; otherwise they will be raised if warnings are enabled.
1985+ */
1986+
19051987 bool disallowed = FALSE;
19061988 const U32 orig_problems = possible_problems ;
19071989
1908- /* Returns 0 if no message needs to be generated for this problem even
1909- * if everything else says to. Otherwise returns the warning category
1910- * to use for the message.
1990+ /* The following macro returns 0 if no message needs to be generated
1991+ * for this problem even if everything else says to. Otherwise returns
1992+ * the warning category to use for the message. .
19111993 *
19121994 * No message need be generated if the UTF8_CHECK_ONLY flag has been
19131995 * set by the caller. Otherwise, a message should be generated if
19141996 * either:
19151997 * 1) the caller has furnished a structure into which messages should
19161998 * be returned to it (so it itself can decide what to do); or
1917- * 2) warnings are enabled for either of the category parameters to the
1918- * macro.
1999+ * 2) warnings are enabled for either of the category parameters to
2000+ * the macro.
19192001 *
19202002 * The 'warning' parameter is the higher priority warning category to
19212003 * check. The macro calls ckWARN_d(warning), so warnings for it are
@@ -1940,21 +2022,29 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
19402022 while (possible_problems ) { /* Handle each possible problem */
19412023 char * message = NULL ;
19422024
1943- /* Each 'case' handles one problem given by a bit in
1944- * 'possible_problems'. The lowest bit positions, as #defined in
1945- * utf8.h, are are handled first. Some of the ordering is
1946- * important so that higher priority items are done before lower
1947- * ones; some of which may depend on earlier actions. Also the
1948- * ordering tries to cause any messages to be displayed in kind of
1949- * decreasing severity order. But the overlong must come last, as
1950- * it changes 'uv' looked at by the others */
2025+ /* The lowest bit positions, as #defined in utf8.h, are handled
2026+ * first. Some of the ordering is important so that higher
2027+ * priority items are done before lower ones; some of which may
2028+ * depend on earlier actions. Also the ordering tries to cause any
2029+ * messages to be displayed in kind of decreasing severity order.
2030+ * But the overlong must come last, as it changes 'uv' looked at by
2031+ * the others */
19512032
19522033 U32 this_problem = 1U << lsbit_pos32 (possible_problems );
19532034
19542035 U32 this_flag_bit = this_problem ;
19552036
2037+ /* Turn off so next iteration doesn't retry this */
19562038 possible_problems &= ~this_problem ;
19572039
2040+ /* The code is structured so that there is a case: in a switch()
2041+ * for each problem type, so as to handle the different details of
2042+ * each. The only common part after setting things up is the
2043+ * handling of any generated warning message. That means that if a
2044+ * case: finds there is no message, it can 'continue' to the next
2045+ * loop iteration instead of doing a 'break', whose only purpose
2046+ * would be to handle the message. */
2047+
19582048 /* Most case:s use this; overridden in a few */
19592049 U32 pack_warn = packWARN (WARN_UTF8 );
19602050
@@ -2088,8 +2178,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
20882178 if (NEED_MESSAGE (WARN_NONCHAR ,,)) {
20892179 /* The code above should have guaranteed that we don't
20902180 * get here with errors other than overlong */
2091- assert (! (orig_problems
2092- & ~(UTF8_GOT_LONG |UTF8_GOT_NONCHAR )));
2181+ assert (! ( orig_problems
2182+ & ~(UTF8_GOT_LONG |UTF8_GOT_NONCHAR )));
20932183
20942184 pack_warn = packWARN (WARN_NONCHAR );
20952185 message = Perl_form (aTHX_ nonchar_cp_format , uv );
@@ -2301,8 +2391,8 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
23012391 }
23022392
23032393 av_push (* msgs , newRV_noinc ((SV * ) new_msg_hv (message ,
2304- pack_warn ,
2305- this_flag_bit )));
2394+ pack_warn ,
2395+ this_flag_bit )));
23062396 }
23072397 else if (PL_op )
23082398 Perl_warner (aTHX_ pack_warn , "%s in %s" , message ,
@@ -2323,7 +2413,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
23232413 success = false;
23242414 uv = UNICODE_REPLACEMENT ;
23252415 }
2326- }
2416+ } /* End of there was a possible problem */
23272417
23282418 * cp_p = UNI_TO_NATIVE (uv );
23292419 return success ;
0 commit comments