Skip to content

Commit 845e6ce

Browse files
committed
S_scan_ident: Add some comments, white space, braces
This function is complicated, without enough documentation for me to understand the subtleties; I only studied it enough to change things I needed to, or which became obvious to me in the process. Other things remain undocumented by this commit. Some of the white space gives improper indentation which will fit a future commit. This commit also remove redundant parentheses in one statement
1 parent 2d683ad commit 845e6ce

File tree

1 file changed

+86
-38
lines changed

1 file changed

+86
-38
lines changed

toke.c

Lines changed: 86 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -10750,12 +10750,22 @@ Perl_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STR
1075010750
return s;
1075110751
}
1075210752

10753-
/* scan s and extract an identifier ($var) from it if possible
10754-
* into dest.
10753+
/* scan 's' and extract an identifier ($var) from it into 'dest' if possible.
10754+
*
10755+
* Unlike S_parse_ident which looks for the more usual types of identifiers
10756+
* (and which this calls if needed), this looks for every possible identifier
10757+
* type, such as punctuation ones.
10758+
*
10759+
* It returns a pointer into the input buffer pointing to just after all the
10760+
* bytes this function consumed; or croaks if an invalid identifier is found.
10761+
*
1075510762
* XXX: This function has subtle implications on parsing, and
1075610763
* changing how it behaves can cause a variable to change from
1075710764
* being a run time rv2sv call or a compile time binding to a
1075810765
* specific variable name.
10766+
*
10767+
* Use the CHECK_UNARY flag to cause this to look for ambiguities with unary
10768+
* operators.
1075910769
*/
1076010770
STATIC char *
1076110771
S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
@@ -10781,14 +10791,16 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
1078110791
d = dest;
1078210792

1078310793
if (*d) {
10794+
1078410795
/* Here parse_ident() found a digit variable or an identifier
1078510796
(anything valid as a bareword), so job done and return. */
1078610797
if (PL_lex_state != LEX_NORMAL)
1078710798
PL_lex_state = LEX_INTERPENDMAYBE;
1078810799
return s;
1078910800
}
1079010801

10791-
/* Here, it is not a run-of-the-mill identifier name */
10802+
/* Here, it is not a run-of-the-mill identifier name; maybe not an
10803+
* identifier at all. Note *d is a NUL */
1079210804

1079310805
if (*s == '$' && s[1]
1079410806
&& ( isIDFIRST_lazy_if_safe(s+1, PL_bufend, is_utf8)
@@ -10803,21 +10815,25 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
1080310815
return s;
1080410816
}
1080510817

10806-
/* Handle the opening { of @{...}, &{...}, *{...}, %{...}, ${...} */
10818+
/* Handle the opening { of @{...}, &{...}, *{...}, %{...}, ${...}
10819+
* Skip to the first non-space past the brace */
1080710820
if (*s == '{') {
10821+
/* 'bracket' becomes the offset from the beginning of this chunk */
1080810822
bracket = s - SvPVX(PL_linestr);
10823+
1080910824
s++;
1081010825
orig_copline = CopLINE(PL_curcop);
1081110826
if (s < PL_bufend && isSPACE(*s)) {
1081210827
s = skipspace(s);
1081310828
}
1081410829
}
1081510830

10816-
/* Extract the first character of the variable name from 's' and
10817-
* copy it, null terminated into 'd'. Note that this does not
10818-
* involve checking for just IDFIRST characters, as it allows the
10819-
* '^' for ${^FOO} type variable names, and it allows all the
10820-
* characters that are legal in a single character variable name.
10831+
/* Here, 's' points to the next "interesting" character.
10832+
* Extract the first character of the potential variable name from 's' and
10833+
* copy it, NUL terminated, into 'd'. Note that this does not involve
10834+
* checking for just IDFIRST characters, as it allows the '^' for ${^FOO}
10835+
* type variable names, and it allows all the characters that are legal in
10836+
* a single character variable name.
1082110837
*
1082210838
* The legal ones are any of:
1082310839
* a) all ASCII characters except:
@@ -10841,59 +10857,86 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
1084110857
&& LIKELY((U8) *s != LATIN1_TO_NATIVE(0xAD))))))
1084210858
{
1084310859
if (is_utf8) {
10844-
const STRLEN skip = UTF8SKIP(s);
10845-
STRLEN i;
10846-
d[skip] = '\0';
10847-
for ( i = 0; i < skip; i++ )
10848-
d[i] = *s++;
10860+
const STRLEN skip = UTF8SKIP(s);
10861+
STRLEN i;
10862+
d[skip] = '\0';
10863+
for ( i = 0; i < skip; i++ )
10864+
d[i] = *s++;
1084910865
}
1085010866
else {
1085110867
*d = *s++;
1085210868
d[1] = '\0';
1085310869
}
1085410870
}
1085510871

10856-
/* special case to handle ${10}, ${11} the same way we handle $1 etc */
10872+
/* 'd' has not been advanced, but if 's' pointed to a legal identifier
10873+
* character, it has been advanced to the next character, and the
10874+
* character it previously pointed to has been copied to where 'd'
10875+
* continues to point to.
10876+
*
10877+
* If that copied character is a digit, it means we have something like
10878+
* ${10}, ${1547}, etc. Handle those the same way we handle $1, etc */
1085710879
if (isDIGIT(*d)) {
1085810880
s = parse_ident(s - 1, PL_bufend, &d, e, is_utf8,
1085910881
STOP_AT_FIRST_NON_DIGIT);
1086010882

1086110883
/* The code below is expecting d to point to the final digit */
1086210884
d--;
1086310885
}
10864-
10865-
/* Convert $^F, ${^F} and the ^F of ${^FOO} to control characters */
10866-
else if (*d == '^' && *s && isCONTROLVAR(*s)) {
10886+
else /* Convert $^F, ${^F} and the ^F of ${^FOO} to control characters */
10887+
if (*d == '^' && *s && isCONTROLVAR(*s)) {
1086710888
*d = toCTRL(*s);
1086810889
s++;
1086910890
}
10870-
/* Warn about ambiguous code after unary operators if {...} notation isn't
10871-
used. There's no difference in ambiguity; it's merely a heuristic
10872-
about when not to warn. */
10873-
else if (chk_unary && bracket == -1)
10891+
else /* Warn about ambiguous code after unary operators if {...} notation
10892+
isn't used. There's no difference in ambiguity; it's merely a
10893+
heuristic about when not to warn. */
10894+
if (chk_unary && bracket == -1) {
1087410895
check_unary();
10896+
}
10897+
10898+
/* Here, 's' points to the next "interesting" character to be parsed. And
10899+
* *d points to the first byte of the final so-far parsed and copied
10900+
* character. This is one of four things:
10901+
* 1) The only byte of the final character of an all-digit numeric
10902+
* variable inside braces. e.g. if the input is ${ 123 }, '123' has
10903+
* been copied to 'dest', and 'd' points to the '3'. We don't know
10904+
* yet if there is a closing brace.
10905+
* 2) A control character
10906+
* 3) The first (or only) byte of some other identifier
10907+
* 4) *d is NUL for anything else.
10908+
*/
1087510909

10876-
if (bracket != -1) {
10910+
if (bracket != -1) { /* Found a '{' */
1087710911
bool skip;
1087810912
char *s2;
10879-
/* If we were processing {...} notation then... */
10913+
10914+
/* Handle the interior of braces. First look to see if the character
10915+
* pointed to by 'd' is legal as the start of an identifier.
10916+
* If it isn't a normal identifier, it could be a control-character
10917+
* one. Those have to be followed by a \w character. Prefer a normal
10918+
* identifier, as UTF-8 strings could erroneously be conflated with a
10919+
* control character identifier. */
1088010920
if ( isIDFIRST_lazy_if_safe(d, e, is_utf8)
1088110921
|| ( ! isPRINT(*d) /* isCNTRL(d), plus all non-ASCII */
1088210922
&& isWORDCHAR(*s))
1088310923
) {
10884-
/* note we have to check for a normal identifier first,
10885-
* as it handles utf8 symbols, and only after that has
10886-
* been ruled out can we look at the caret words */
1088710924
Size_t advance;
1088810925
if ((advance = isIDFIRST_lazy_if_safe(d, e, is_utf8) )) {
10889-
/* if it starts as a valid identifier, assume that it is one.
10890-
(the later check for } being at the expected point will trap
10891-
cases where this doesn't pan out.) */
10926+
10927+
/* Now parse the normal identifier.
10928+
*
10929+
* khw: The code below is buggy because we already have parsed
10930+
* and copied the first character of it. The next character
10931+
* could be any IDCONT one, not just an IDFIRST */
1089210932
d += advance;
1089310933
s = parse_ident(s, PL_bufend, &d, e, is_utf8,
1089410934
(ALLOW_PACKAGE | CHECK_DOLLAR));
1089510935
}
1089610936
else { /* caret word: ${^Foo} ${^CAPTURE[0]} */
10937+
10938+
/* Now parse the control character identifier. Again, we have
10939+
* already copied the first character. */
1089710940
d++;
1089810941
while (isWORDCHAR(*s) && d < e) {
1089910942
*d++ = *s++;
@@ -10902,12 +10945,15 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
1090210945
croak("%s", ident_too_long);
1090310946
*d = '\0';
1090410947
}
10948+
1090510949
tmp_copline = CopLINE(PL_curcop);
1090610950
if (s < PL_bufend && isSPACE(*s)) {
1090710951
s = skipspace(s);
1090810952
}
10909-
if ((*s == '[' || (*s == '{' && strNE(dest, "sub")))) {
10953+
10954+
if (*s == '[' || (*s == '{' && strNE(dest, "sub"))) {
1091010955
/* ${foo[0]} and ${foo{bar}} and ${^CAPTURE[0]} notation. */
10956+
1091110957
if (ckWARN(WARN_AMBIGUOUS) && keyword(dest, d - dest, 0)) {
1091210958
const char * const brack =
1091310959
(const char *)
@@ -10929,6 +10975,7 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
1092910975

1093010976
if ( !tmp_copline )
1093110977
tmp_copline = CopLINE(PL_curcop);
10978+
1093210979
if ((skip = s < PL_bufend && isSPACE(*s))) {
1093310980
/* Avoid incrementing line numbers or resetting PL_linestart,
1093410981
in case we have to back up. */
@@ -10939,10 +10986,9 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
1093910986
else
1094010987
s2 = s;
1094110988

10942-
/* Expect to find a closing } after consuming any trailing whitespace.
10943-
*/
10944-
if (*s2 == '}') {
10945-
/* Now increment line numbers if applicable. */
10989+
/* Expect to find a closing '}' after consuming any trailing
10990+
* whitespace. */
10991+
if (*s2 == '}') { /* Now increment line numbers if applicable. */
1094610992
if (skip)
1094710993
s = skipspace(s);
1094810994
s++;
@@ -10971,9 +11017,10 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
1097111017
}
1097211018
}
1097311019
else {
10974-
/* Didn't find the closing } at the point we expected, so restore
10975-
state such that the next thing to process is the opening { and */
10976-
s = SvPVX(PL_linestr) + bracket; /* let the parser handle it */
11020+
/* Didn't find the closing '}' at the point we expected, so
11021+
* restore the state such that the next thing to process is the
11022+
* opening '{" and let the parser handle it */
11023+
s = SvPVX(PL_linestr) + bracket;
1097711024
CopLINE_set(PL_curcop, orig_copline);
1097811025
PL_parser->herelines = herelines;
1097911026
*dest = '\0';
@@ -10984,6 +11031,7 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags)
1098411031
&& !PL_lex_brackets
1098511032
&& !intuit_more(s, PL_bufend, FROM_IDENT, NULL, 0))
1098611033
PL_lex_state = LEX_INTERPEND;
11034+
1098711035
return s;
1098811036
}
1098911037

0 commit comments

Comments
 (0)