@@ -2048,6 +2048,14 @@ const char *Lexer::tryScanRegexLiteral(const char *TokStart, bool MustBeRegex,
20482048
20492049 bool IsForwardSlash = (*TokStart == ' /' );
20502050
2051+ auto spaceOrTabDescription = [](char c) -> StringRef {
2052+ switch (c) {
2053+ case ' ' : return " space" ;
2054+ case ' \t ' : return " tab" ;
2055+ default : llvm_unreachable (" Unhandled case" );
2056+ }
2057+ };
2058+
20512059 // Check if we're able to lex a `/.../` regex.
20522060 if (IsForwardSlash) {
20532061 // For `/.../` regex literals, we need to ban space and tab at the start of
@@ -2063,33 +2071,17 @@ const char *Lexer::tryScanRegexLiteral(const char *TokStart, bool MustBeRegex,
20632071 // TODO: This heuristic should be sunk into the Swift library once we have a
20642072 // way of doing fix-its from there.
20652073 auto *RegexContentStart = TokStart + 1 ;
2066- switch (*RegexContentStart) {
2067- case ' ' :
2068- case ' \t ' : {
2074+ if (*RegexContentStart == ' ' || *RegexContentStart == ' \t ' ) {
20692075 if (!MustBeRegex)
20702076 return nullptr ;
20712077
20722078 if (Diags) {
20732079 // We must have a regex, so emit an error for space and tab.
2074- StringRef DiagChar;
2075- switch (*RegexContentStart) {
2076- case ' ' :
2077- DiagChar = " space" ;
2078- break ;
2079- case ' \t ' :
2080- DiagChar = " tab" ;
2081- break ;
2082- default :
2083- llvm_unreachable (" Unhandled case" );
2084- }
20852080 Diags->diagnose (getSourceLoc (RegexContentStart),
2086- diag::lex_regex_literal_invalid_starting_char, DiagChar)
2081+ diag::lex_regex_literal_invalid_starting_char,
2082+ spaceOrTabDescription (*RegexContentStart))
20872083 .fixItInsert (getSourceLoc (RegexContentStart), " \\ " );
20882084 }
2089- break ;
2090- }
2091- default :
2092- break ;
20932085 }
20942086 }
20952087
@@ -2106,60 +2098,82 @@ const char *Lexer::tryScanRegexLiteral(const char *TokStart, bool MustBeRegex,
21062098 if (Ptr == TokStart)
21072099 return nullptr ;
21082100
2109- // If we're lexing `/.../`, error if we ended on the opening of a comment.
2110- // We prefer to lex the comment as it's more likely than not that is what
2111- // the user is expecting.
2112- // TODO: This should be sunk into the Swift library.
2113- if (IsForwardSlash && Ptr[-1 ] == ' /' && (*Ptr == ' *' || *Ptr == ' /' )) {
2114- if (!MustBeRegex)
2115- return nullptr ;
2101+ // Perform some additional heuristics to see if we can lex `/.../`.
2102+ // TODO: These should all be sunk into the Swift library.
2103+ if (IsForwardSlash) {
2104+ // If we're lexing `/.../`, error if we ended on the opening of a comment.
2105+ // We prefer to lex the comment as it's more likely than not that is what
2106+ // the user is expecting.
2107+ if (Ptr[-1 ] == ' /' && (*Ptr == ' *' || *Ptr == ' /' )) {
2108+ if (!MustBeRegex)
2109+ return nullptr ;
21162110
2117- if (Diags) {
2118- Diags->diagnose (getSourceLoc (TokStart),
2119- diag::lex_regex_literal_unterminated);
2120- }
2121- // Move the pointer back to the '/' of the comment.
2122- Ptr--;
2123- }
2124-
2125- // If we're tentatively lexing `/.../`, scan to make sure we don't have any
2126- // unbalanced ')'s. This helps avoid ambiguity with unapplied operator
2127- // references e.g `reduce(1, /)` and `foo(/, 0) / 2`. This would be invalid
2128- // regex syntax anyways. This ensures users can surround their operator ref
2129- // in parens `(/)` to fix the issue. This also applies to prefix operators
2130- // that can be disambiguated as e.g `(/S.foo)`. Note we need to track whether
2131- // or not we're in a custom character class `[...]`, as parens are literal
2132- // there.
2133- // TODO: This should be sunk into the Swift library.
2134- if (IsForwardSlash && !MustBeRegex) {
2135- unsigned CharClassDepth = 0 ;
2136- unsigned GroupDepth = 0 ;
2137- for (auto *Cursor = TokStart + 1 ; Cursor < Ptr - 1 ; Cursor++) {
2138- switch (*Cursor) {
2139- case ' \\ ' :
2140- // Skip over the next character of an escape.
2141- Cursor++;
2142- break ;
2143- case ' (' :
2144- if (CharClassDepth == 0 )
2145- GroupDepth += 1 ;
2146- break ;
2147- case ' )' :
2148- if (CharClassDepth != 0 )
2111+ if (Diags) {
2112+ Diags->diagnose (getSourceLoc (TokStart),
2113+ diag::lex_regex_literal_unterminated);
2114+ }
2115+ // Move the pointer back to the '/' of the comment.
2116+ Ptr--;
2117+ }
2118+ auto *TokEnd = Ptr - 1 ;
2119+ auto *ContentEnd = TokEnd - 1 ;
2120+
2121+ // We also ban unescaped space and tab at the end of a `/.../` literal.
2122+ if (*TokEnd == ' /' && (TokEnd - TokStart > 2 ) && ContentEnd[-1 ] != ' \\ ' &&
2123+ (*ContentEnd == ' ' || *ContentEnd == ' \t ' )) {
2124+ if (!MustBeRegex)
2125+ return nullptr ;
2126+
2127+ if (Diags) {
2128+ // Diagnose and suggest using a `#/.../#` literal instead. We could
2129+ // suggest escaping, but that would be wrong if the user has written (?x).
2130+ // TODO: Should we suggest this for space-as-first character too?
2131+ Diags->diagnose (getSourceLoc (ContentEnd),
2132+ diag::lex_regex_literal_invalid_ending_char,
2133+ spaceOrTabDescription (*ContentEnd))
2134+ .fixItInsert (getSourceLoc (TokStart), " #" )
2135+ .fixItInsert (getSourceLoc (Ptr), " #" );
2136+ }
2137+ }
2138+
2139+ // If we're tentatively lexing `/.../`, scan to make sure we don't have any
2140+ // unbalanced ')'s. This helps avoid ambiguity with unapplied operator
2141+ // references e.g `reduce(1, /)` and `foo(/, 0) / 2`. This would be invalid
2142+ // regex syntax anyways. This ensures users can surround their operator ref
2143+ // in parens `(/)` to fix the issue. This also applies to prefix operators
2144+ // that can be disambiguated as e.g `(/S.foo)`. Note we need to track whether
2145+ // or not we're in a custom character class `[...]`, as parens are literal
2146+ // there.
2147+ if (!MustBeRegex) {
2148+ unsigned CharClassDepth = 0 ;
2149+ unsigned GroupDepth = 0 ;
2150+ for (auto *Cursor = TokStart + 1 ; Cursor < TokEnd; Cursor++) {
2151+ switch (*Cursor) {
2152+ case ' \\ ' :
2153+ // Skip over the next character of an escape.
2154+ Cursor++;
2155+ break ;
2156+ case ' (' :
2157+ if (CharClassDepth == 0 )
2158+ GroupDepth += 1 ;
21492159 break ;
2160+ case ' )' :
2161+ if (CharClassDepth != 0 )
2162+ break ;
21502163
2151- // Invalid, so bail.
2152- if (GroupDepth == 0 )
2153- return nullptr ;
2164+ // Invalid, so bail.
2165+ if (GroupDepth == 0 )
2166+ return nullptr ;
21542167
2155- GroupDepth -= 1 ;
2156- break ;
2157- case ' [' :
2158- CharClassDepth += 1 ;
2159- break ;
2160- case ' ]' :
2161- if (CharClassDepth != 0 )
2162- CharClassDepth -= 1 ;
2168+ GroupDepth -= 1 ;
2169+ break ;
2170+ case ' [' :
2171+ CharClassDepth += 1 ;
2172+ break ;
2173+ case ' ]' :
2174+ if (CharClassDepth != 0 )
2175+ CharClassDepth -= 1 ;
2176+ }
21632177 }
21642178 }
21652179 }
0 commit comments