@@ -10750,12 +10750,22 @@ Perl_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STR
1075010750 return s ;
1075110751}
1075210752
10753- /* scan s and extract an identifier ($var) from it if possible
10754- * into dest.
10753+ /* scan 's' and extract an identifier ($var) from it into 'dest' if possible.
10754+ *
10755+ * Unlike S_parse_ident which looks for the more usual types of identifiers
10756+ * (and which this calls if needed), this looks for every possible identifier
10757+ * type, such as punctuation ones.
10758+ *
10759+ * It returns a pointer into the input buffer pointing to just after all the
10760+ * bytes this function consumed; or croaks if an invalid identifier is found.
10761+ *
1075510762 * XXX: This function has subtle implications on parsing, and
1075610763 * changing how it behaves can cause a variable to change from
1075710764 * being a run time rv2sv call or a compile time binding to a
1075810765 * specific variable name.
10766+ *
10767+ * Use the CHECK_UNARY flag to cause this to look for ambiguities with unary
10768+ * operators.
1075910769 */
1076010770STATIC char *
1076110771S_scan_ident (pTHX_ char * s , char * dest , char * dest_end , U32 flags )
@@ -10781,14 +10791,16 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
1078110791 d = dest ;
1078210792
1078310793 if (* d ) {
10794+
1078410795 /* Here parse_ident() found a digit variable or an identifier
1078510796 (anything valid as a bareword), so job done and return. */
1078610797 if (PL_lex_state != LEX_NORMAL )
1078710798 PL_lex_state = LEX_INTERPENDMAYBE ;
1078810799 return s ;
1078910800 }
1079010801
10791- /* Here, it is not a run-of-the-mill identifier name */
10802+ /* Here, it is not a run-of-the-mill identifier name; maybe not an
10803+ * identifier at all. Note *d is a NUL */
1079210804
1079310805 if (* s == '$' && s [1 ]
1079410806 && ( isIDFIRST_lazy_if_safe (s + 1 , PL_bufend , is_utf8 )
@@ -10803,21 +10815,25 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
1080310815 return s ;
1080410816 }
1080510817
10806- /* Handle the opening { of @{...}, &{...}, *{...}, %{...}, ${...} */
10818+ /* Handle the opening { of @{...}, &{...}, *{...}, %{...}, ${...}
10819+ * Skip to the first non-space past the brace */
1080710820 if (* s == '{' ) {
10821+ /* 'bracket' becomes the offset from the beginning of this chunk */
1080810822 bracket = s - SvPVX (PL_linestr );
10823+
1080910824 s ++ ;
1081010825 orig_copline = CopLINE (PL_curcop );
1081110826 if (s < PL_bufend && isSPACE (* s )) {
1081210827 s = skipspace (s );
1081310828 }
1081410829 }
1081510830
10816- /* Extract the first character of the variable name from 's' and
10817- * copy it, null terminated into 'd'. Note that this does not
10818- * involve checking for just IDFIRST characters, as it allows the
10819- * '^' for ${^FOO} type variable names, and it allows all the
10820- * characters that are legal in a single character variable name.
10831+ /* Here, 's' points to the next "interesting" character.
10832+ * Extract the first character of the potential variable name from 's' and
10833+ * copy it, NUL terminated, into 'd'. Note that this does not involve
10834+ * checking for just IDFIRST characters, as it allows the '^' for ${^FOO}
10835+ * type variable names, and it allows all the characters that are legal in
10836+ * a single character variable name.
1082110837 *
1082210838 * The legal ones are any of:
1082310839 * a) all ASCII characters except:
@@ -10841,59 +10857,86 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
1084110857 && LIKELY ((U8 ) * s != LATIN1_TO_NATIVE (0xAD ))))))
1084210858 {
1084310859 if (is_utf8 ) {
10844- const STRLEN skip = UTF8SKIP (s );
10845- STRLEN i ;
10846- d [skip ] = '\0' ;
10847- for ( i = 0 ; i < skip ; i ++ )
10848- d [i ] = * s ++ ;
10860+ const STRLEN skip = UTF8SKIP (s );
10861+ STRLEN i ;
10862+ d [skip ] = '\0' ;
10863+ for ( i = 0 ; i < skip ; i ++ )
10864+ d [i ] = * s ++ ;
1084910865 }
1085010866 else {
1085110867 * d = * s ++ ;
1085210868 d [1 ] = '\0' ;
1085310869 }
1085410870 }
1085510871
10856- /* special case to handle ${10}, ${11} the same way we handle $1 etc */
10872+ /* 'd' has not been advanced, but if 's' pointed to a legal identifier
10873+ * character, it has been advanced to the next character, and the
10874+ * character it previously pointed to has been copied to where 'd'
10875+ * continues to point to.
10876+ *
10877+ * If that copied character is a digit, it means we have something like
10878+ * ${10}, ${1547}, etc. Handle those the same way we handle $1, etc */
1085710879 if (isDIGIT (* d )) {
1085810880 s = parse_ident (s - 1 , PL_bufend , & d , e , is_utf8 ,
1085910881 STOP_AT_FIRST_NON_DIGIT );
1086010882
1086110883 /* The code below is expecting d to point to the final digit */
1086210884 d -- ;
1086310885 }
10864-
10865- /* Convert $^F, ${^F} and the ^F of ${^FOO} to control characters */
10866- else if (* d == '^' && * s && isCONTROLVAR (* s )) {
10886+ else /* Convert $^F, ${^F} and the ^F of ${^FOO} to control characters */
10887+ if (* d == '^' && * s && isCONTROLVAR (* s )) {
1086710888 * d = toCTRL (* s );
1086810889 s ++ ;
1086910890 }
10870- /* Warn about ambiguous code after unary operators if {...} notation isn't
10871- used. There's no difference in ambiguity; it's merely a heuristic
10872- about when not to warn. */
10873- else if (chk_unary && bracket == -1 )
10891+ else /* Warn about ambiguous code after unary operators if {...} notation
10892+ isn't used. There's no difference in ambiguity; it's merely a
10893+ heuristic about when not to warn. */
10894+ if (chk_unary && bracket == -1 ) {
1087410895 check_unary ();
10896+ }
10897+
10898+ /* Here, 's' points to the next "interesting" character to be parsed. And
10899+ * *d points to the first byte of the final so-far parsed and copied
10900+ * character. This is one of four things:
10901+ * 1) The only byte of the final character of an all-digit numeric
10902+ * variable inside braces. e.g. if the input is ${ 123 }, '123' has
10903+ * been copied to 'dest', and 'd' points to the '3'. We don't know
10904+ * yet if there is a closing brace.
10905+ * 2) A control character
10906+ * 3) The first (or only) byte of some other identifier
10907+ * 4) *d is NUL for anything else.
10908+ */
1087510909
10876- if (bracket != -1 ) {
10910+ if (bracket != -1 ) { /* Found a '{' */
1087710911 bool skip ;
1087810912 char * s2 ;
10879- /* If we were processing {...} notation then... */
10913+
10914+ /* Handle the interior of braces. First look to see if the character
10915+ * pointed to by 'd' is legal as the start of an identifier.
10916+ * If it isn't a normal identifier, it could be a control-character
10917+ * one. Those have to be followed by a \w character. Prefer a normal
10918+ * identifier, as UTF-8 strings could erroneously be conflated with a
10919+ * control character identifier. */
1088010920 if ( isIDFIRST_lazy_if_safe (d , e , is_utf8 )
1088110921 || ( ! isPRINT (* d ) /* isCNTRL(d), plus all non-ASCII */
1088210922 && isWORDCHAR (* s ))
1088310923 ) {
10884- /* note we have to check for a normal identifier first,
10885- * as it handles utf8 symbols, and only after that has
10886- * been ruled out can we look at the caret words */
1088710924 Size_t advance ;
1088810925 if ((advance = isIDFIRST_lazy_if_safe (d , e , is_utf8 ) )) {
10889- /* if it starts as a valid identifier, assume that it is one.
10890- (the later check for } being at the expected point will trap
10891- cases where this doesn't pan out.) */
10926+
10927+ /* Now parse the normal identifier.
10928+ *
10929+ * khw: The code below is buggy because we already have parsed
10930+ * and copied the first character of it. The next character
10931+ * could be any IDCONT one, not just an IDFIRST */
1089210932 d += advance ;
1089310933 s = parse_ident (s , PL_bufend , & d , e , is_utf8 ,
1089410934 (ALLOW_PACKAGE | CHECK_DOLLAR ));
1089510935 }
1089610936 else { /* caret word: ${^Foo} ${^CAPTURE[0]} */
10937+
10938+ /* Now parse the control character identifier. Again, we have
10939+ * already copied the first character. */
1089710940 d ++ ;
1089810941 while (isWORDCHAR (* s ) && d < e ) {
1089910942 * d ++ = * s ++ ;
@@ -10902,12 +10945,15 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
1090210945 croak ("%s" , ident_too_long );
1090310946 * d = '\0' ;
1090410947 }
10948+
1090510949 tmp_copline = CopLINE (PL_curcop );
1090610950 if (s < PL_bufend && isSPACE (* s )) {
1090710951 s = skipspace (s );
1090810952 }
10909- if ((* s == '[' || (* s == '{' && strNE (dest , "sub" )))) {
10953+
10954+ if (* s == '[' || (* s == '{' && strNE (dest , "sub" ))) {
1091010955 /* ${foo[0]} and ${foo{bar}} and ${^CAPTURE[0]} notation. */
10956+
1091110957 if (ckWARN (WARN_AMBIGUOUS ) && keyword (dest , d - dest , 0 )) {
1091210958 const char * const brack =
1091310959 (const char * )
@@ -10929,6 +10975,7 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
1092910975
1093010976 if ( !tmp_copline )
1093110977 tmp_copline = CopLINE (PL_curcop );
10978+
1093210979 if ((skip = s < PL_bufend && isSPACE (* s ))) {
1093310980 /* Avoid incrementing line numbers or resetting PL_linestart,
1093410981 in case we have to back up. */
@@ -10939,10 +10986,9 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
1093910986 else
1094010987 s2 = s ;
1094110988
10942- /* Expect to find a closing } after consuming any trailing whitespace.
10943- */
10944- if (* s2 == '}' ) {
10945- /* Now increment line numbers if applicable. */
10989+ /* Expect to find a closing '}' after consuming any trailing
10990+ * whitespace. */
10991+ if (* s2 == '}' ) { /* Now increment line numbers if applicable. */
1094610992 if (skip )
1094710993 s = skipspace (s );
1094810994 s ++ ;
@@ -10971,9 +11017,10 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
1097111017 }
1097211018 }
1097311019 else {
10974- /* Didn't find the closing } at the point we expected, so restore
10975- state such that the next thing to process is the opening { and */
10976- s = SvPVX (PL_linestr ) + bracket ; /* let the parser handle it */
11020+ /* Didn't find the closing '}' at the point we expected, so
11021+ * restore the state such that the next thing to process is the
11022+ * opening '{" and let the parser handle it */
11023+ s = SvPVX (PL_linestr ) + bracket ;
1097711024 CopLINE_set (PL_curcop , orig_copline );
1097811025 PL_parser -> herelines = herelines ;
1097911026 * dest = '\0' ;
@@ -10984,6 +11031,7 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
1098411031 && !PL_lex_brackets
1098511032 && !intuit_more (s , PL_bufend , FROM_IDENT , NULL , 0 ))
1098611033 PL_lex_state = LEX_INTERPEND ;
11034+
1098711035 return s ;
1098811036}
1098911037
0 commit comments