Skip to content

Commit d563cbf

Browse files
committed
Stop parsing continuation lines in number literals
The previous algorithm handled line continuations inside number literals incorrectly. Rather than attempting to parse them and producing a wrong token, the scanner now treats a line continuation (`&`) directly after the digits of a number literal as a parse error. This allows library users to detect and handle such cases explicitly.
1 parent aec9397 commit d563cbf

File tree

2 files changed

+57
-39
lines changed

2 files changed

+57
-39
lines changed

src/scanner.c

Lines changed: 44 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,16 @@ typedef struct {
1919
bool in_line_continuation;
2020
} Scanner;
2121

22+
typedef enum {
23+
False,
24+
True,
25+
Error,
26+
} BoolOrErr;
27+
28+
static BoolOrErr bool_or_err_max(BoolOrErr lhs, BoolOrErr rhs) {
29+
return lhs >= rhs ? lhs : rhs;
30+
}
31+
2232
// consume current character into current token and advance
2333
static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
2434

@@ -54,61 +64,63 @@ static bool is_exp_sentinel(char chr) {
5464
}
5565
}
5666

57-
static bool scan_int(TSLexer *lexer) {
67+
static BoolOrErr scan_int(TSLexer *lexer) {
5868
if (!iswdigit(lexer->lookahead)) {
59-
return false;
69+
return False;
6070
}
6171
// consume digits
6272
while (iswdigit(lexer->lookahead)) {
6373
advance(lexer); // store all digits
6474
}
65-
66-
// handle line continuations
75+
lexer->mark_end(lexer);
76+
// Return an error if a line continuation is found. This scanner cannot
77+
// handle line continuations, particularly in cases like:
78+
//
79+
// ```fortran
80+
// b = 6& ! foo
81+
// &7;
82+
// ```
83+
//
84+
// Here, the scanner would need to return multiple tokens, but tree-sitter
85+
// expects only a single token.
6786
if (lexer->lookahead == '&') {
68-
skip(lexer);
69-
while (iswspace(lexer->lookahead)) {
70-
skip(lexer);
71-
}
72-
// second '&' required to continue the literal
73-
if (lexer->lookahead == '&') {
74-
skip(lexer);
75-
// don't return here, as we may have finished literal on first
76-
// line but still have second '&'
77-
scan_int(lexer);
78-
}
87+
return Error;
7988
}
80-
81-
lexer->mark_end(lexer);
82-
return true;
89+
return True;
8390
}
8491

8592
/// Scan a number of the forms 1XXX, 1.0XXX, 0.1XXX, 1.XDX, etc.
86-
static bool scan_number(TSLexer *lexer) {
93+
static BoolOrErr scan_number(TSLexer *lexer) {
8794
lexer->result_symbol = INTEGER_LITERAL;
88-
bool digits = scan_int(lexer);
95+
BoolOrErr digits = scan_int(lexer);
8996
if (lexer->lookahead == '.') {
9097
advance(lexer);
9198
// exclude decimal if followed by any letter other than d/D and e/E
9299
// if no leading digits are present and a non-digit follows
93100
// the decimal it's a nonmatch.
94-
if (digits && !iswalnum(lexer->lookahead)) {
101+
if ((digits == True) && !iswalnum(lexer->lookahead)) {
95102
lexer->mark_end(lexer); // add decimal to token
96103
}
97104
lexer->result_symbol = FLOAT_LITERAL;
98105
}
99106
// if next char isn't number return since we handle exp
100107
// notation and precision identifiers separately. If there are
101108
// no leading digit it's a nonmatch.
102-
digits = scan_int(lexer) || digits;
103-
if (digits) {
109+
digits = bool_or_err_max(scan_int(lexer), digits);
110+
if (digits == True) {
104111
// process exp notation
105112
if (is_exp_sentinel(lexer->lookahead)) {
106113
advance(lexer);
107114
if (lexer->lookahead == '+' || lexer->lookahead == '-') {
108115
advance(lexer);
109116
}
110-
if (!scan_int(lexer)) {
111-
return true; // valid number token with junk after it
117+
switch (scan_int(lexer)) {
118+
case False:
119+
return True; // valid number token with junk after it
120+
case True:
121+
break;
122+
case Error:
123+
return Error;
112124
}
113125
lexer->mark_end(lexer);
114126
lexer->result_symbol = FLOAT_LITERAL;
@@ -429,8 +441,13 @@ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
429441
if (valid_symbols[INTEGER_LITERAL] || valid_symbols[FLOAT_LITERAL] ||
430442
valid_symbols[BOZ_LITERAL]) {
431443
// extract out root number from expression
432-
if (scan_number(lexer)) {
433-
return true;
444+
switch (scan_number(lexer)) {
445+
case False:
446+
break;
447+
case True:
448+
return true;
449+
case Error:
450+
return false;
434451
}
435452
if (scan_boz(lexer)) {
436453
return true;

test/corpus/expressions.txt

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,6 +1030,8 @@ end program
10301030
(type_qualifier)
10311031
(init_declarator
10321032
(identifier)
1033+
(ERROR
1034+
(number_literal))
10331035
(number_literal)))
10341036
(end_of_statement)
10351037
(variable_declaration
@@ -1040,18 +1042,17 @@ end program
10401042
(init_declarator
10411043
(identifier)
10421044
(array_literal
1043-
(number_literal)
1044-
(number_literal))))
1045-
(end_of_statement)
1046-
(variable_declaration
1047-
(intrinsic_type)
1048-
(type_qualifier
1049-
(argument_list
1050-
(number_literal)))
1051-
(init_declarator
1052-
(identifier)
1053-
(array_literal
1054-
(number_literal)
1045+
(ERROR
1046+
(number_literal))
1047+
(identifier)
1048+
(call_expression
1049+
(identifier)
1050+
(argument_list
1051+
(number_literal)))
1052+
(ERROR
1053+
(ERROR
1054+
(number_literal))
1055+
(number_literal))
10551056
(number_literal))))
10561057
(end_of_statement)
10571058
(end_program_statement

0 commit comments

Comments
 (0)