From 29d187e50785098e9184511e86da7dd20412370b Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 5 Nov 2025 07:17:00 -0700 Subject: [PATCH 1/8] perl.h: Comments/white space only Clarify some comments. --- perl.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/perl.h b/perl.h index f63e298a3240..2b67ee55809b 100644 --- a/perl.h +++ b/perl.h @@ -8313,7 +8313,11 @@ EXTERN_C int flock(int fd, int op); /* Number scan flags. All are used for input, the ones used for output are so * marked */ -#define PERL_SCAN_ALLOW_UNDERSCORES 0x01 /* grok_??? accept _ in numbers */ + +/* grok_??? accept a stand-alone underscore initially or between digits in + * numbers */ +#define PERL_SCAN_ALLOW_UNDERSCORES 0x01 + #define PERL_SCAN_DISALLOW_PREFIX 0x02 /* grok_??? reject 0x in hex etc */ /* grok_??? input: ignored; output: found overflow */ @@ -8324,21 +8328,21 @@ EXTERN_C int flock(int fd, int op); * PERL_SCAN_NOTIFY_ILLDIGIT. */ #define PERL_SCAN_SILENT_ILLDIGIT 0x08 -#define PERL_SCAN_TRAILING 0x10 /* grok_number_flags() allow trailing - and set IS_NUMBER_TRAILING */ +/* grok_number_flags() allow trailing and set IS_NUMBER_TRAILING */ +#define PERL_SCAN_TRAILING 0x10 /* These are considered experimental, so not exposed publicly */ #if defined(PERL_CORE) || defined(PERL_EXT) /* grok_??? don't warn about very large numbers which are <= UV_MAX; * output: found such a number */ -# define PERL_SCAN_SILENT_NON_PORTABLE 0x20 +# define PERL_SCAN_SILENT_NON_PORTABLE 0x20 /* If this is set on input, and no illegal digit is found, it will be cleared * on output; otherwise unchanged */ -# define PERL_SCAN_NOTIFY_ILLDIGIT 0x40 +# define PERL_SCAN_NOTIFY_ILLDIGIT 0x40 /* Don't warn on overflow; output flag still set */ -# define PERL_SCAN_SILENT_OVERFLOW 0x80 +# define PERL_SCAN_SILENT_OVERFLOW 0x80 /* Forbid a leading underscore, which the other one doesn't */ # define PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES (0x100|PERL_SCAN_ALLOW_UNDERSCORES) From 080bee0129d72602142b8f0b00b30f211154011e Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 5 Nov 2025 06:34:19 -0700 Subject: [PATCH 2/8] grok_bin_oct_hex: Shortcut leading zeros I noticed that leading zeros are quite common for octal and hex constants. This code is structured for speed, with a partially unrolled loop structured so that it is impossible to overflow the unrolled part. If we get to the end of the unrolled portion, and the accumulated value is still zero, it's because there have been only leading zeroes so far, and instead of dropping into the loop, we can re-enter the unrolled part without having to consider the possibility of overflowing. This allows the next chunk of digits to be processed without branching. --- numeric.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/numeric.c b/numeric.c index 351435cbc627..cbcb786f1a21 100644 --- a/numeric.c +++ b/numeric.c @@ -427,6 +427,7 @@ Perl_grok_bin_oct_hex(pTHX_ const char *start, /* Unroll the loop so that the first 8 digits are branchless except for the * switch. A ninth hex one overflows a 32 bit word. */ + redo_switch: switch (e - s) { default: if (UNLIKELY(! generic_isCC_(*s, class_bit))) break; @@ -473,6 +474,15 @@ Perl_grok_bin_oct_hex(pTHX_ const char *start, return value; } + /* If we get here, and the accumulated value is still 0, it is + * because there are more leading zeros than the cases of this + * switch(), These are common enough with these kinds of + * binary-style numbers that it is worth this extra conditional to + * continue absorbing them via the switch. */ + if (value == 0) { + goto redo_switch; + } + break; } @@ -549,7 +559,16 @@ Perl_grok_bin_oct_hex(pTHX_ const char *start, != PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES))) { ++s; - goto redo; + + /* To get here with the value so-far being 0 means we've only had + * leading zeros, then an underscore. We can continue with the + * branchless switch() instead of this loop */ + if (value == 0) { + goto redo_switch; + } + else { + goto redo; + } } if (*s) { From 6fa95f2ce2c01ec71852165474e9c39594202c45 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 5 Nov 2025 07:23:12 -0700 Subject: [PATCH 3/8] grok_bin_oct_hex: Simplify flag It's easier to understand and code if each flag name is for a single flag. --- numeric.c | 7 ++++--- perl.h | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/numeric.c b/numeric.c index cbcb786f1a21..fab4d66e3953 100644 --- a/numeric.c +++ b/numeric.c @@ -400,7 +400,8 @@ Perl_grok_bin_oct_hex(pTHX_ const char *start, *flags = 0; const bool allow_underscores = - cBOOL(input_flags & PERL_SCAN_ALLOW_UNDERSCORES); + cBOOL(input_flags & ( PERL_SCAN_ALLOW_UNDERSCORES + |PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES)); const char * s = start; const char * e = start + *len_p; @@ -555,8 +556,8 @@ Perl_grok_bin_oct_hex(pTHX_ const char *start, /* Don't allow a leading underscore if the only-medial bit is * set */ && ( LIKELY(s > s0) - || UNLIKELY((input_flags & PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES) - != PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES))) + || UNLIKELY(! ( input_flags + & PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES)))) { ++s; diff --git a/perl.h b/perl.h index 2b67ee55809b..c4c8577aed35 100644 --- a/perl.h +++ b/perl.h @@ -8345,7 +8345,8 @@ EXTERN_C int flock(int fd, int op); # define PERL_SCAN_SILENT_OVERFLOW 0x80 /* Forbid a leading underscore, which the other one doesn't */ -# define PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES (0x100|PERL_SCAN_ALLOW_UNDERSCORES) +# define PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES 0x100 + #endif From 5b88501374e3c9c6cd272363830f21ae7d507b87 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 5 Nov 2025 07:35:49 -0700 Subject: [PATCH 4/8] grok_bin_oct_hex: Change name of internal flag This clarifies its meaning --- numeric.c | 4 ++-- perl.h | 4 ++-- regcomp.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/numeric.c b/numeric.c index fab4d66e3953..c5ac72fef456 100644 --- a/numeric.c +++ b/numeric.c @@ -401,7 +401,7 @@ Perl_grok_bin_oct_hex(pTHX_ const char *start, const bool allow_underscores = cBOOL(input_flags & ( PERL_SCAN_ALLOW_UNDERSCORES - |PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES)); + |PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES_ONLY)); const char * s = start; const char * e = start + *len_p; @@ -557,7 +557,7 @@ Perl_grok_bin_oct_hex(pTHX_ const char *start, * set */ && ( LIKELY(s > s0) || UNLIKELY(! ( input_flags - & PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES)))) + & PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES_ONLY)))) { ++s; diff --git a/perl.h b/perl.h index c4c8577aed35..8707b00ad288 100644 --- a/perl.h +++ b/perl.h @@ -8344,8 +8344,8 @@ EXTERN_C int flock(int fd, int op); /* Don't warn on overflow; output flag still set */ # define PERL_SCAN_SILENT_OVERFLOW 0x80 -/* Forbid a leading underscore, which the other one doesn't */ -# define PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES 0x100 +/* grok_??? accept a stand-alone underscore between digits only in numbers */ +# define PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES_ONLY 0x100 #endif diff --git a/regcomp.c b/regcomp.c index c6a2e6ee0e14..56d3bb7986b8 100644 --- a/regcomp.c +++ b/regcomp.c @@ -5352,7 +5352,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, I32 flags = PERL_SCAN_SILENT_OVERFLOW | PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_NOTIFY_ILLDIGIT - | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES + | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES_ONLY | PERL_SCAN_DISALLOW_PREFIX; STRLEN len = e - RExC_parse; NV overflow_value; From 49ddb48f9c3adde9a315270a2982175999f6d552 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 5 Nov 2025 19:39:55 -0700 Subject: [PATCH 5/8] grok_bin_oct_hex: fix broken return flags Apparently no one has tried to use these before. These flags are to suppress the display of certain warnings, but to instead return that the suppression happened in output flags. The output flags were not getting set. I'm not adding a separate test, because a future commit will cause this feature to be used regularly. --- numeric.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/numeric.c b/numeric.c index c5ac72fef456..c667a9e7c9af 100644 --- a/numeric.c +++ b/numeric.c @@ -534,9 +534,10 @@ Perl_grok_bin_oct_hex(pTHX_ const char *start, if (! overflowed) { overflowed = TRUE; - if ( ! (input_flags & PERL_SCAN_SILENT_OVERFLOW) - && ckWARN_d(WARN_OVERFLOW)) - { + if (input_flags & PERL_SCAN_SILENT_OVERFLOW) { + *flags |= PERL_SCAN_SILENT_OVERFLOW; + } + else if (ckWARN_d(WARN_OVERFLOW)) { warner(packWARN(WARN_OVERFLOW), "Integer overflow in %s number", (base == 16) ? "hexadecimal" @@ -608,10 +609,10 @@ Perl_grok_bin_oct_hex(pTHX_ const char *start, if (LIKELY(! overflowed)) { #if UVSIZE > 4 - if ( UNLIKELY(value > 0xffffffff) - && ! (input_flags & PERL_SCAN_SILENT_NON_PORTABLE)) - { - output_non_portable(base); + if (UNLIKELY(value > 0xffffffff)) { + if (! (input_flags & PERL_SCAN_SILENT_NON_PORTABLE)) { + output_non_portable(base); + } *flags |= PERL_SCAN_SILENT_NON_PORTABLE; } #endif From c202865100db4deb895f8156bb8d390203629802 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 9 Nov 2025 18:02:40 -0700 Subject: [PATCH 6/8] grok_bin_oct_hex: Avoid two recalculations We already have this value in a variable. --- numeric.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numeric.c b/numeric.c index c667a9e7c9af..1214c699e5df 100644 --- a/numeric.c +++ b/numeric.c @@ -513,7 +513,7 @@ Perl_grok_bin_oct_hex(pTHX_ const char *start, /* Note XDIGIT_VALUE() is branchless, works on binary * and octal as well, so can be used here, without * slowing those down */ - factor *= 1 << shift; + factor *= base; continue; } @@ -530,7 +530,7 @@ Perl_grok_bin_oct_hex(pTHX_ const char *start, * 'value_nv' eventually, either when all digits are gone, or we * have overflowed this fresh start. */ value = XDIGIT_VALUE(*s); - factor = 1 << shift; + factor = base; if (! overflowed) { overflowed = TRUE; From bd5b5b5b38427737898315487082a3b06dd18f4b Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 9 Nov 2025 18:04:55 -0700 Subject: [PATCH 7/8] grok_bin_oct_hex: Improve overflow detection The new scheme is is what toke.c already uses for the same purpose, and I think is cleaner --- numeric.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/numeric.c b/numeric.c index 1214c699e5df..4094b53915b8 100644 --- a/numeric.c +++ b/numeric.c @@ -497,8 +497,6 @@ Perl_grok_bin_oct_hex(pTHX_ const char *start, bool overflowed = FALSE; NV value_nv = 0; const PERL_UINT_FAST8_T base = 1 << shift; /* 2, 8, or 16 */ - const UV max_div= UV_MAX / base; /* Value above which, the next digit - processed would overflow */ for (; s < e; s++) { if (generic_isCC_(*s, class_bit)) { @@ -507,9 +505,16 @@ Perl_grok_bin_oct_hex(pTHX_ const char *start, With gcc seems to be much straighter code than old scan_hex. (khw suspects that adding a LIKELY() just above would do the same thing) */ - redo: - if (LIKELY(value <= max_div)) { - value = (value << shift) | XDIGIT_VALUE(*s); + redo: ; + + /* Make room for the next digit */ + UV tentative_value = value << shift; + + /* If shiftng back doesn't yield the previous value, it was + * because a bit got shifted off the left end, so overflowed. + * But if it worked, add the new digit. */ + if (LIKELY((tentative_value >> shift) == value)) { + value = tentative_value | XDIGIT_VALUE(*s); /* Note XDIGIT_VALUE() is branchless, works on binary * and octal as well, so can be used here, without * slowing those down */ From bbd568b0276cf40119b5dd32a315fb62d1c7af41 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 9 Nov 2025 18:09:01 -0700 Subject: [PATCH 8/8] grok_bin_oct_hex: Make a parameter const This prevents a future maintainer from accidentally changing this pointer inside the function, where its original value is needed at the end --- embed.fnc | 2 +- numeric.c | 2 +- proto.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/embed.fnc b/embed.fnc index 8eb86869bbee..7e6ea1afe56d 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1404,7 +1404,7 @@ AMdp |UV |grok_bin |NN const char *start \ |NN I32 *flags \ |NULLOK NV *result Cp |UV |grok_bin_oct_hex \ - |NN const char *start \ + |NN const char * const start \ |NN STRLEN *len_p \ |NN I32 *flags \ |NULLOK NV *result \ diff --git a/numeric.c b/numeric.c index 4094b53915b8..4ed6a1409bda 100644 --- a/numeric.c +++ b/numeric.c @@ -365,7 +365,7 @@ S_output_non_portable(pTHX_ const U8 base) } UV -Perl_grok_bin_oct_hex(pTHX_ const char *start, +Perl_grok_bin_oct_hex(pTHX_ const char * const start, STRLEN *len_p, I32 *flags, NV *result, diff --git a/proto.h b/proto.h index 7aabf8d17f1c..acd96ff396ad 100644 --- a/proto.h +++ b/proto.h @@ -1263,7 +1263,7 @@ Perl_grok_bin(pTHX_ const char *start, STRLEN *len_p, I32 *flags, NV *result); assert(start); assert(len_p); assert(flags) PERL_CALLCONV UV -Perl_grok_bin_oct_hex(pTHX_ const char *start, STRLEN *len_p, I32 *flags, NV *result, const unsigned shift, const U8 lookup_bit, const char prefix); +Perl_grok_bin_oct_hex(pTHX_ const char * const start, STRLEN *len_p, I32 *flags, NV *result, const unsigned shift, const U8 lookup_bit, const char prefix); #define PERL_ARGS_ASSERT_GROK_BIN_OCT_HEX \ assert(start); assert(len_p); assert(flags)