@@ -1801,7 +1801,11 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
18011801 * small code point, it is still using this Perl invention, so mark it
18021802 * as such */
18031803 if (UNLIKELY (UTF8_IS_PERL_EXTENDED (s0 ))) {
1804- possible_problems |= UTF8_GOT_SUPER ;
1804+ if (flags & ( UTF8_DISALLOW_PERL_EXTENDED |UTF8_DISALLOW_SUPER
1805+ |UTF8_WARN_PERL_EXTENDED |UTF8_WARN_SUPER ))
1806+ {
1807+ possible_problems |= UTF8_GOT_PERL_EXTENDED ;
1808+ }
18051809 }
18061810 else {
18071811 /* See if the input has malformations besides possibly overlong */
@@ -2239,56 +2243,177 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
22392243 }
22402244
22412245 break ;
2246+
2247+ /* The remaining cases all involve non-Unicode code points.
2248+ * These come in three increasingly restrictive flavors.
2249+ * SUPERs are simply all the ones above Unicode;
2250+ * PERL_EXTENDED_UTF8 are the subset of these that are
2251+ * expressed in a non-standard extension to UTF-8. Unless also
2252+ * overlong, these have a very high ordinal value. Finally
2253+ * OVERFLOWS are for such a high code point that they don't fit
2254+ * into the word size of this platform. Perl extended-UTF-8 is
2255+ * required to express code points this high. So an overflow
2256+ * is a member of all three flavors; besides overflowing, it
2257+ * also is using perl extended UTF-8 and is also plain
2258+ * non-Unicode.
2259+ *
2260+ * There are cases in this switch for each of the three types.
2261+ * Because they are related, there are tests of the input flags
2262+ * to see what combination of these require warnings and/or
2263+ * rejection. And there a jumps between the cases. The task
2264+ * is simpler because the code earlier in the function has set
2265+ * things up so that at most one problem flag bit is set for
2266+ * any of them, the most restrictive case the input matches.
2267+ * Also, for the non-overflow cases, there is no problem flag
2268+ * bit if the caller doesn't want special handling for it.
2269+ *
2270+ * Each type has its own warning category and text,
2271+ * corresponding to the specific problem. Whenever a warning
2272+ * is generated, it uses the one for the most dire type the
2273+ * code point fits into. Suppose the flags say we warn on all
2274+ * non-Unicode code points, but not on overflowing and we get a
2275+ * code point too large for the platform. The generated
2276+ * warning will be the text that says it overflowed, while the
2277+ * returned bit will be for the SUPER type. To accomplish
2278+ * this, the formats are shared between the cases. 'cp_format'
2279+ * is used if there is a specific representable code point that
2280+ * the input translates to; if not, instead a more generic
2281+ * format, 'non_cp_format' is used */
2282+ const char * cp_format ;
2283+ const char * non_cp_format ;
2284+
22422285 case UTF8_GOT_OVERFLOW :
2286+ uv = UNICODE_REPLACEMENT ; /* Can't represent this on this
2287+ platform */
2288+ /* For this overflow case, any format and message text are set
2289+ * up to create the warning for it. If overflows are to be
2290+ * rejected, the warning is simply created, and we break to the
2291+ * end of the switch() (where code common to all cases will
2292+ * finish the job). Otherwise it looks to see if either the
2293+ * perl-extended or plain super cases are supposed to handle
2294+ * things. If so, it jumps into the code of the most
2295+ * restrictive one so that that they will use this more dire
2296+ * warning. If neither handle it, the code just breaks; doing
2297+ * nothing. */
2298+ non_cp_format = MALFORMED_TEXT ": %s (overflows)" ;
2299+
2300+ /* We can't exactly specify such a large code point, so can't
2301+ * output it */
2302+ cp_format = NULL ;
2303+
2304+ /* In the unlikely case that the caller has asked to "allow"
2305+ * this malformation, we transfer to the next lower severity of
2306+ * code that handles the case; or just 'break' if none. */
2307+ if (UNLIKELY (flags & UTF8_ALLOW_OVERFLOW )) {
2308+ if (flags & ( UTF8_DISALLOW_PERL_EXTENDED
2309+ |UTF8_WARN_PERL_EXTENDED ))
2310+ {
2311+ this_flag_bit = UTF8_GOT_PERL_EXTENDED ;
2312+ goto join_perl_extended ;
2313+ }
2314+ if (flags & (UTF8_DISALLOW_SUPER |UTF8_WARN_SUPER )) {
2315+ this_flag_bit = UTF8_GOT_SUPER ;
2316+ goto join_plain_supers ;
2317+ }
22432318
2244- /* Overflow means also got a super and are using Perl's
2245- * extended UTF-8, but we handle all three cases here */
2246- possible_problems &= ~(UTF8_GOT_SUPER |UTF8_GOT_PERL_EXTENDED );
2247- uv = UNICODE_REPLACEMENT ;
2319+ break ;
2320+ }
22482321
2249- /* But the API says we flag all errors found */
2250- if (flags & (UTF8_WARN_SUPER |UTF8_DISALLOW_SUPER )) {
2251- error_flags_return |= UTF8_GOT_SUPER ;
2322+ /* Here, overflow is disallowed; handle everything in this
2323+ * case: */
2324+ disallowed = true;
2325+
2326+ /* Overflow is a hybrid. If the word size on this platform
2327+ * were wide enough for this to not overflow, a non-Unicode
2328+ * code point would have been generated. If the caller wanted
2329+ * warnings for such code points, the warning category would be
2330+ * WARN_NON_UNICODE, On the other hand, overflow is considered
2331+ * a malformation, which is serious, and the category would be
2332+ * just WARN_UTF8. We clearly should warn if either category
2333+ * is enabled, but which category to use? Historically, we've
2334+ * used 'utf8' if it is enabled; and that seems like the more
2335+ * severe category, more befitting a malformation. */
2336+ pack_warn = NEED_MESSAGE (WARN_UTF8 , ckWARN_d , WARN_NON_UNICODE );
2337+ if (pack_warn ) {
2338+ message = Perl_form (aTHX_ non_cp_format ,
2339+ _byte_dump_string (s0 , curlen , 0 ));
22522340 }
2253- if (flags
2254- & (UTF8_WARN_PERL_EXTENDED |UTF8_DISALLOW_PERL_EXTENDED ))
2341+
2342+ /* But the API says we flag all errors found that the calling
2343+ * flags indicate should be */
2344+ if (flags & ( UTF8_WARN_PERL_EXTENDED
2345+ |UTF8_DISALLOW_PERL_EXTENDED ))
22552346 {
22562347 error_flags_return |= UTF8_GOT_PERL_EXTENDED ;
22572348 }
2258-
2259- /* Disallow if any of the three categories say to */
2260- if ( ! (flags & UTF8_ALLOW_OVERFLOW )
2261- || (flags & ( UTF8_DISALLOW_SUPER
2262- |UTF8_DISALLOW_PERL_EXTENDED )))
2263- {
2264- disallowed = TRUE;
2349+ if (flags & (UTF8_WARN_SUPER |UTF8_DISALLOW_SUPER )) {
2350+ error_flags_return |= UTF8_GOT_SUPER ;
22652351 }
22662352
2267- /* Likewise, warn if any say to */
2268- if ( ! (flags & UTF8_ALLOW_OVERFLOW )
2269- || (flags & (UTF8_WARN_SUPER |UTF8_WARN_PERL_EXTENDED )))
2353+ break ;
2354+
2355+ case UTF8_GOT_PERL_EXTENDED :
2356+
2357+ /* We get here when the input uses Perl extended UTF-8, and the
2358+ * caller has indicated that above-Unicode code points (of
2359+ * which these are a subset) are to be disallowed and/or warned
2360+ * about
2361+ *
2362+ * Set up the formats. We can include the code point in the
2363+ * message if we have an exact one (input not too short) and
2364+ * it's not an overlong that reduces down to something too low.
2365+ * (Otherwise, the message could say something untrue like
2366+ * "Code point 0x41 is not Unicode ...". But this would be a
2367+ * lie; 0x41 is Unicode. It was expressed in a non-standard
2368+ * form of UTF-8 that Unicode doesn't approve of.) */
2369+ cp_format = ( (orig_problems & (UTF8_GOT_TOO_SHORT ))
2370+ || ! UNICODE_IS_PERL_EXTENDED (uv ))
2371+ ? NULL
2372+ : PL_extended_cp_format ;
2373+ non_cp_format = "Any UTF-8 sequence that starts with \"%s\""
2374+ " is a Perl extension, and so is not portable" ;
2375+
2376+ /* We know here that the caller indicated at least one of the
2377+ * EXTENDED or SUPER flags. If it's not EXTENDED, use SUPER */
2378+ if (! (flags & ( UTF8_DISALLOW_PERL_EXTENDED
2379+ |UTF8_WARN_PERL_EXTENDED )))
22702380 {
2381+ this_flag_bit = UTF8_GOT_SUPER ;
2382+ }
22712383
2272- /* Overflow is a hybrid. If the word size on this platform
2273- * were wide enough for this to not overflow, a non-Unicode
2274- * code point would have been generated. If the caller
2275- * wanted warnings for such code points, the warning
2276- * category would be WARN_NON_UNICODE, On the other hand,
2277- * overflow is considered a malformation, which is serious,
2278- * and the category would be just WARN_UTF8. We clearly
2279- * should warn if either category is enabled, but which
2280- * category to use? Historically, we've used 'utf8' if it
2281- * is enabled; and that seems like the more severe
2282- * category, more befitting a malformation. */
2283- pack_warn = NEED_MESSAGE (WARN_UTF8 ,
2284- ckWARN_d , WARN_NON_UNICODE );
2285- if (pack_warn ) {
2286- message = Perl_form (aTHX_ MALFORMED_TEXT
2287- ": %s (overflows)" ,
2288- _byte_dump_string (s0 , curlen , 0 ));
2384+ join_perl_extended :
2385+
2386+ /* Here this level is to warn, reject, or both. The format has
2387+ * been set up to be for this level, or maybe the overflow
2388+ * case set up a more dire warning and jumped to the label just
2389+ * above (after determining that warning/rejecting here was
2390+ * enabled). We warn at this level if either it is supposed to
2391+ * warn, or plain supers are supposed to. In the latter case,
2392+ * we get this higher severity warning */
2393+ if (flags & (UTF8_WARN_PERL_EXTENDED |UTF8_WARN_SUPER )) {
2394+ error_flags_return |= this_flag_bit ;
2395+
2396+ /* These code points are non-portable, so warn if either
2397+ * category is enabled */
2398+ if (NEED_MESSAGE (WARN_NON_UNICODE , ckWARN , WARN_PORTABLE )) {
2399+ pack_warn = packWARN2 (WARN_NON_UNICODE , WARN_PORTABLE );
2400+ if (cp_format ) {
2401+ message = Perl_form (aTHX_ cp_format , uv );
2402+ }
2403+ else {
2404+ message = Perl_form (aTHX_
2405+ non_cp_format ,
2406+ _byte_dump_string (s0 , curlen , 0 ));
2407+ }
22892408 }
22902409 }
22912410
2411+ /* Similarly if either of the two levels reject this, do it */
2412+ if (flags & (UTF8_DISALLOW_PERL_EXTENDED |UTF8_DISALLOW_SUPER )) {
2413+ disallowed = true;
2414+ error_flags_return |= this_flag_bit ;
2415+ }
2416+
22922417 break ;
22932418
22942419 case UTF8_GOT_SUPER :
@@ -2298,71 +2423,41 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0,
22982423 * caller has indicated that these are to be disallowed and/or
22992424 * warned about */
23002425
2301- if (flags & UTF8_WARN_SUPER ) {
2302- error_flags_return |= UTF8_GOT_SUPER ;
2426+ non_cp_format = "Any UTF-8 sequence that starts with \"%s\""
2427+ " is for a non-Unicode code point, may not be"
2428+ " portable" ;
23032429
2304- if (NEED_MESSAGE (WARN_NON_UNICODE ,,)) {
2305- pack_warn = packWARN (WARN_NON_UNICODE );
2430+ /* We can include the code point in the message if we have an
2431+ * exact one (input not too short) */
2432+ cp_format = (orig_problems & (UTF8_GOT_TOO_SHORT ))
2433+ ? NULL
2434+ : super_cp_format ;
23062435
2307- if (orig_problems & UTF8_GOT_TOO_SHORT ) {
2308- message = Perl_form (aTHX_
2309- "Any UTF-8 sequence that starts with"
2310- " \"%s\" is for a non-Unicode code point,"
2311- " may not be portable" ,
2312- _byte_dump_string (s0 , curlen , 0 ));
2313- }
2314- else {
2315- message = Perl_form (aTHX_ super_cp_format , uv );
2316- }
2317- }
2318- }
2436+ join_plain_supers :
23192437
2320- /* Test for Perl's extended UTF-8 after the regular SUPER ones,
2321- * and before possibly bailing out, so that the more dire
2322- * warning will override the regular one. */
2323- if (UNLIKELY (UTF8_IS_PERL_EXTENDED (s0 ))) {
2324- if ( (flags & (UTF8_WARN_PERL_EXTENDED |UTF8_WARN_SUPER ))
2325- && NEED_MESSAGE (WARN_NON_UNICODE , ckWARN , WARN_PORTABLE ))
2326- {
2327- pack_warn = packWARN2 (WARN_NON_UNICODE , WARN_PORTABLE );
2328-
2329- /* If it is an overlong that evaluates to a code point
2330- * that doesn't have to use the Perl extended UTF-8, it
2331- * still used it, and so we output a message that
2332- * doesn't refer to the code point. The same is true
2333- * if there was a SHORT malformation where the code
2334- * point is not valid. In that case, 'uv' will have
2335- * been set to the REPLACEMENT CHAR, and the message
2336- * below without the code point in it will be selected
2337- * */
2338- if (UNICODE_IS_PERL_EXTENDED (uv )) {
2339- message = Perl_form (aTHX_
2340- PL_extended_cp_format , uv );
2438+ /* Here this level is to warn, reject, or both. The format has
2439+ * been set up to be for this level, or maybe the overflow
2440+ * case set up a more dire warning and jumped to the label just
2441+ * above (after determining that warning/rejecting here was
2442+ * enabled). */
2443+ if (flags & UTF8_WARN_SUPER ) {
2444+ error_flags_return |= this_flag_bit ;
2445+ if (NEED_MESSAGE (WARN_NON_UNICODE ,,)) {
2446+ pack_warn = packWARN (WARN_NON_UNICODE );
2447+ if (cp_format ) {
2448+ message = Perl_form (aTHX_ cp_format , uv );
23412449 }
23422450 else {
23432451 message = Perl_form (aTHX_
2344- "Any UTF-8 sequence that starts with"
2345- " \"%s\" is a Perl extension, and"
2346- " so is not portable" ,
2347- _byte_dump_string (s0 , curlen , 0 ));
2348- }
2349- this_flag_bit = UTF8_GOT_PERL_EXTENDED ;
2350- }
2351-
2352- if (flags & ( UTF8_WARN_PERL_EXTENDED
2353- |UTF8_DISALLOW_PERL_EXTENDED ))
2354- {
2355- error_flags_return |= UTF8_GOT_PERL_EXTENDED ;
2356-
2357- if (flags & UTF8_DISALLOW_PERL_EXTENDED ) {
2358- disallowed = TRUE;
2452+ non_cp_format ,
2453+ _byte_dump_string (s0 , curlen , 0 ));
23592454 }
23602455 }
23612456 }
23622457
23632458 if (flags & UTF8_DISALLOW_SUPER ) {
2364- error_flags_return |= UTF8_GOT_SUPER ;
2365- disallowed = TRUE ;
2459+ error_flags_return |= this_flag_bit ;
2460+ disallowed = true ;
23662461 }
23672462
23682463 break ;
0 commit comments