From d211fd1030494d0db230ccd608f935edc5af8406 Mon Sep 17 00:00:00 2001 From: shivrm Date: Fri, 7 Nov 2025 17:02:47 +0530 Subject: [PATCH 1/4] Add splitting for user-defined suffixes --- clang/lib/Format/BreakableToken.cpp | 21 ++++++++--- clang/lib/Format/BreakableToken.h | 12 +++++-- clang/lib/Format/ContinuationIndenter.cpp | 44 +++++++++++++++++------ 3 files changed, 61 insertions(+), 16 deletions(-) diff --git a/clang/lib/Format/BreakableToken.cpp b/clang/lib/Format/BreakableToken.cpp index 994a427517ffc..ff9f2f10ffac0 100644 --- a/clang/lib/Format/BreakableToken.cpp +++ b/clang/lib/Format/BreakableToken.cpp @@ -253,10 +253,13 @@ unsigned BreakableStringLiteral::getContentStartColumn(unsigned LineIndex, BreakableStringLiteral::BreakableStringLiteral( const FormatToken &Tok, unsigned StartColumn, StringRef Prefix, - StringRef Postfix, unsigned UnbreakableTailLength, bool InPPDirective, - encoding::Encoding Encoding, const FormatStyle &Style) + StringRef Postfix, StringRef ContinuationPrefix, + StringRef ContinuationPostfix, unsigned UnbreakableTailLength, + bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style) : BreakableToken(Tok, InPPDirective, Encoding, Style), StartColumn(StartColumn), Prefix(Prefix), Postfix(Postfix), + ContinuationPrefix(ContinuationPrefix), + ContinuationPostfix(ContinuationPostfix), UnbreakableTailLength(UnbreakableTailLength) { assert(Tok.TokenText.starts_with(Prefix) && Tok.TokenText.ends_with(Postfix)); Line = Tok.TokenText.substr( @@ -274,9 +277,15 @@ void BreakableStringLiteral::insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split, unsigned ContentIndent, WhitespaceManager &Whitespaces) const { + + const unsigned SplitEnd = TailOffset + Split.first + Split.second; + const bool IsLastFragment = SplitEnd >= Line.size() - UnbreakableTailLength; + + StringRef LocalPostfix = (IsLastFragment) ? 
Postfix : ContinuationPostfix; + Whitespaces.replaceWhitespaceInToken( - Tok, Prefix.size() + TailOffset + Split.first, Split.second, Postfix, - Prefix, InPPDirective, 1, StartColumn); + Tok, ContinuationPrefix.size() + TailOffset + Split.first, Split.second, + LocalPostfix, ContinuationPrefix, InPPDirective, 1, StartColumn); } BreakableStringLiteralUsingOperators::BreakableStringLiteralUsingOperators( @@ -288,6 +297,10 @@ BreakableStringLiteralUsingOperators::BreakableStringLiteralUsingOperators( : QuoteStyle == AtDoubleQuotes ? "@\"" : "\"", /*Postfix=*/QuoteStyle == SingleQuotes ? "'" : "\"", + /*ContinuationPrefix=*/QuoteStyle == SingleQuotes ? "'" + : QuoteStyle == AtDoubleQuotes ? "@\"" + : "\"", + /*ContinuationPostfix=*/QuoteStyle == SingleQuotes ? "'" : "\"", UnbreakableTailLength, InPPDirective, Encoding, Style), BracesNeeded(Tok.isNot(TT_StringInConcatenation)), QuoteStyle(QuoteStyle) { diff --git a/clang/lib/Format/BreakableToken.h b/clang/lib/Format/BreakableToken.h index 45c00b35fd01e..2ee37d3e0e059 100644 --- a/clang/lib/Format/BreakableToken.h +++ b/clang/lib/Format/BreakableToken.h @@ -252,6 +252,8 @@ class BreakableStringLiteral : public BreakableToken { /// after formatting. BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn, StringRef Prefix, StringRef Postfix, + StringRef ContinuationPrefix, + StringRef ContinuationPostfix, unsigned UnbreakableTailLength, bool InPPDirective, encoding::Encoding Encoding, const FormatStyle &Style); @@ -274,15 +276,21 @@ class BreakableStringLiteral : public BreakableToken { protected: // The column in which the token starts. unsigned StartColumn; - // The prefix a line needs after a break in the token. + // The prefix a line needs at the start StringRef Prefix; - // The postfix a line needs before introducing a break. 
+ // The postfix the last line needs at its end. StringRef Postfix; + // The prefix every line except the first line needs. + StringRef ContinuationPrefix; + // The postfix every line except the last line needs. + StringRef ContinuationPostfix; // The token text excluding the prefix and postfix. StringRef Line; // Length of the sequence of tokens after this string literal that cannot // contain line breaks. unsigned UnbreakableTailLength; + // Whether the string prefix and postfix should be repeated on each line + // when breaking the string. }; class BreakableStringLiteralUsingOperators : public BreakableStringLiteral { diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 9ab024a03fbd7..6cfb7a505200e 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -2540,22 +2540,46 @@ ContinuationIndenter::createBreakableToken(const FormatToken &Current, StringRef Prefix; StringRef Postfix; + // FIXME: Handle whitespace between '_T', '(', '"..."', and ')'. // FIXME: Store Prefix and Suffix (or PrefixLength and SuffixLength to // reduce the overhead) for each FormatToken, which is a string, so that we // don't run multiple checks here on the hot path. 
- if ((Text.ends_with(Postfix = "\"") && - (Text.starts_with(Prefix = "@\"") || Text.starts_with(Prefix = "\"") || - Text.starts_with(Prefix = "u\"") || - Text.starts_with(Prefix = "U\"") || - Text.starts_with(Prefix = "u8\"") || - Text.starts_with(Prefix = "L\""))) || - (Text.starts_with(Prefix = "_T(\"") && - Text.ends_with(Postfix = "\")"))) { + if (Text.starts_with(Prefix = "_T(\"") && Text.ends_with(Postfix = "\")")) { + // We need to put `_T("` and `")` on each line because it is a macro + llvm::StringRef ContinuationPrefix = Prefix; + llvm::StringRef ContinuationPostfix = Postfix; + return std::make_unique<BreakableStringLiteral>( - Current, StartColumn, Prefix, Postfix, UnbreakableTailLength, - State.Line->InPPDirective, Encoding, Style); + Current, StartColumn, Prefix, Postfix, ContinuationPrefix, + ContinuationPostfix, UnbreakableTailLength, State.Line->InPPDirective, + Encoding, Style); + } + + static const auto PostfixRegex = + llvm::Regex(R"("(_[a-zA-Z_][a-zA-Z0-9_]*)?$)"); + llvm::SmallVector<llvm::StringRef> Matches; + + if (PostfixRegex.match(Text, &Matches)) { + Postfix = Matches.front(); + + if ((Text.starts_with(Prefix = "@\"") || + Text.starts_with(Prefix = "\"") || + Text.starts_with(Prefix = "u\"") || + Text.starts_with(Prefix = "U\"") || + Text.starts_with(Prefix = "u8\"") || + Text.starts_with(Prefix = "L\""))) { + + // Use quotes when breaking the string + llvm::StringRef ContinuationPrefix = "\""; + llvm::StringRef ContinuationPostfix = "\""; + return std::make_unique<BreakableStringLiteral>( + Current, StartColumn, Prefix, Postfix, ContinuationPrefix, + ContinuationPostfix, UnbreakableTailLength, + State.Line->InPPDirective, Encoding, Style); + } } + } else if (Current.is(TT_BlockComment)) { if (Style.ReflowComments == FormatStyle::RCS_Never || // If a comment token switches formatting, like From 93060fdd0a3b03ed6a9c38a06a5e6819f67c13e4 Mon Sep 17 00:00:00 2001 From: shivrm Date: Fri, 7 Nov 2025 22:06:12 +0530 Subject: [PATCH 2/4] Modify string splitting to repeat prefix --- 
clang/lib/Format/BreakableToken.cpp | 2 +- clang/lib/Format/ContinuationIndenter.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/lib/Format/BreakableToken.cpp b/clang/lib/Format/BreakableToken.cpp index ff9f2f10ffac0..872660535eb35 100644 --- a/clang/lib/Format/BreakableToken.cpp +++ b/clang/lib/Format/BreakableToken.cpp @@ -298,7 +298,7 @@ BreakableStringLiteralUsingOperators::BreakableStringLiteralUsingOperators( : "\"", /*Postfix=*/QuoteStyle == SingleQuotes ? "'" : "\"", /*ContinuationPrefix=*/QuoteStyle == SingleQuotes ? "'" - : QuoteStyle == AtDoubleQuotes ? "@\"" + : QuoteStyle == AtDoubleQuotes ? "@\"" : "\"", /*ContinuationPostfix=*/QuoteStyle == SingleQuotes ? "'" : "\"", UnbreakableTailLength, InPPDirective, Encoding, Style), diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 6cfb7a505200e..5badd6edf4a7b 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -2570,8 +2570,8 @@ ContinuationIndenter::createBreakableToken(const FormatToken &Current, Text.starts_with(Prefix = "u8\"") || Text.starts_with(Prefix = "L\""))) { - // Use quotes when breaking the string - llvm::StringRef ContinuationPrefix = "\""; + // Repeat the prefix on every line but don't repeat the suffix + llvm::StringRef ContinuationPrefix = Prefix; llvm::StringRef ContinuationPostfix = "\""; return std::make_unique<BreakableStringLiteral>( Current, StartColumn, Prefix, Postfix, ContinuationPrefix, From 91c9b81e83f82af2103f258b699fa5202fc2af89 Mon Sep 17 00:00:00 2001 From: shivrm Date: Fri, 7 Nov 2025 22:25:39 +0530 Subject: [PATCH 3/4] Fix bug causing repetition of suffixes --- clang/lib/Format/BreakableToken.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/Format/BreakableToken.cpp b/clang/lib/Format/BreakableToken.cpp index 872660535eb35..dd9d4ecb2f3c7 100644 --- a/clang/lib/Format/BreakableToken.cpp +++ b/clang/lib/Format/BreakableToken.cpp 
@@ -279,8 +279,7 @@ void BreakableStringLiteral::insertBreak(unsigned LineIndex, WhitespaceManager &Whitespaces) const { const unsigned SplitEnd = TailOffset + Split.first + Split.second; - const bool IsLastFragment = SplitEnd >= Line.size() - UnbreakableTailLength; - + const bool IsLastFragment = SplitEnd > Line.size() - UnbreakableTailLength; StringRef LocalPostfix = (IsLastFragment) ? Postfix : ContinuationPostfix; Whitespaces.replaceWhitespaceInToken( From faa1996fa3238d95386d8377da96689714551a25 Mon Sep 17 00:00:00 2001 From: shivrm Date: Sat, 8 Nov 2025 21:17:21 +0530 Subject: [PATCH 4/4] Add unit tests --- clang/unittests/Format/FormatTest.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 24235b966399d..4c7593b88202f 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -15838,6 +15838,10 @@ TEST_F(FormatTest, BreaksWideAndNSStringLiterals) { "@\"NSString literal\";", getGoogleStyleWithColumns(19)); verifyFormat(R"(NSString *s = @"那那那那";)", getLLVMStyleWithColumns(26)); + EXPECT_EQ("L\"suffixed \"\n" + "L\"string\"_s;", + format("L\"suffixed string\"_s;", getLLVMStyleWithColumns(19))); + // This input makes clang-format try to split the incomplete unicode escape // sequence, which used to lead to a crasher. verifyNoCrash(