1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file implements FormatTokenLexer, which tokenizes a source file 11 /// into a FormatToken stream suitable for ClangFormat. 12 /// 13 //===----------------------------------------------------------------------===// 14 15 #include "FormatTokenLexer.h" 16 #include "FormatToken.h" 17 #include "clang/Basic/SourceLocation.h" 18 #include "clang/Basic/SourceManager.h" 19 #include "clang/Format/Format.h" 20 #include "llvm/Support/Regex.h" 21 22 namespace clang { 23 namespace format { 24 25 FormatTokenLexer::FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, 26 unsigned Column, const FormatStyle &Style, 27 encoding::Encoding Encoding) 28 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}), 29 Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID), 30 Style(Style), IdentTable(getFormattingLangOpts(Style)), 31 Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0), 32 FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin), 33 MacroBlockEndRegex(Style.MacroBlockEnd) { 34 Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr, 35 getFormattingLangOpts(Style))); 36 Lex->SetKeepWhitespaceMode(true); 37 38 for (const std::string &ForEachMacro : Style.ForEachMacros) 39 Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro}); 40 for (const std::string &StatementMacro : Style.StatementMacros) 41 Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro}); 42 for (const std::string &TypenameMacro : Style.TypenameMacros) 43 Macros.insert({&IdentTable.get(TypenameMacro), TT_TypenameMacro}); 44 for (const std::string &NamespaceMacro : Style.NamespaceMacros) 45 Macros.insert({&IdentTable.get(NamespaceMacro), TT_NamespaceMacro}); 46 } 47 48 ArrayRef<FormatToken *> FormatTokenLexer::lex() { 49 assert(Tokens.empty()); 50 assert(FirstInLineIndex == 0); 51 do { 52 Tokens.push_back(getNextToken()); 53 if (Style.Language == FormatStyle::LK_JavaScript) { 54 tryParseJSRegexLiteral(); 55 handleTemplateStrings(); 56 } 57 if (Style.Language == FormatStyle::LK_TextProto) 58 tryParsePythonComment(); 59 tryMergePreviousTokens(); 60 if (Style.isCSharp()) 61 // This needs to come after tokens have been merged so that C# 62 // string literals are correctly identified. 63 handleCSharpVerbatimAndInterpolatedStrings(); 64 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline) 65 FirstInLineIndex = Tokens.size() - 1; 66 } while (Tokens.back()->Tok.isNot(tok::eof)); 67 return Tokens; 68 } 69 70 void FormatTokenLexer::tryMergePreviousTokens() { 71 if (tryMerge_TMacro()) 72 return; 73 if (tryMergeConflictMarkers()) 74 return; 75 if (tryMergeLessLess()) 76 return; 77 78 if (Style.isCSharp()) { 79 if (tryMergeCSharpNamedArgument()) 80 return; 81 if (tryMergeCSharpAttributeAndTarget()) 82 return; 83 if (tryMergeCSharpKeywordVariables()) 84 return; 85 if (tryMergeCSharpStringLiteral()) 86 return; 87 if (tryMergeCSharpDoubleQuestion()) 88 return; 89 if (tryMergeCSharpNullConditionals()) 90 return; 91 if (tryTransformCSharpForEach()) 92 return; 93 static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater}; 94 if (tryMergeTokens(JSRightArrow, TT_JsFatArrow)) 95 return; 96 } 97 98 if (tryMergeNSStringLiteral()) 99 return; 100 101 if (Style.Language == FormatStyle::LK_JavaScript) { 102 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal}; 103 static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal, 104 tok::equal}; 105 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater, 106 tok::greaterequal}; 107 static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater}; 108 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star}; 109 static const tok::TokenKind JSExponentiationEqual[] = {tok::star, 110 tok::starequal}; 111 static const tok::TokenKind JSNullPropagatingOperator[] = {tok::question, 112 tok::period}; 113 static const tok::TokenKind JSNullishOperator[] = {tok::question, 114 tok::question}; 115 116 // FIXME: Investigate what token type gives the correct operator priority. 117 if (tryMergeTokens(JSIdentity, TT_BinaryOperator)) 118 return; 119 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator)) 120 return; 121 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator)) 122 return; 123 if (tryMergeTokens(JSRightArrow, TT_JsFatArrow)) 124 return; 125 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation)) 126 return; 127 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) { 128 Tokens.back()->Tok.setKind(tok::starequal); 129 return; 130 } 131 if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator)) { 132 // Treat like the "||" operator (as opposed to the ternary ?). 133 Tokens.back()->Tok.setKind(tok::pipepipe); 134 return; 135 } 136 if (tryMergeTokens(JSNullPropagatingOperator, 137 TT_JsNullPropagatingOperator)) { 138 // Treat like a regular "." access. 139 Tokens.back()->Tok.setKind(tok::period); 140 return; 141 } 142 if (tryMergeJSPrivateIdentifier()) 143 return; 144 } 145 146 if (Style.Language == FormatStyle::LK_Java) { 147 static const tok::TokenKind JavaRightLogicalShiftAssign[] = { 148 tok::greater, tok::greater, tok::greaterequal}; 149 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator)) 150 return; 151 } 152 } 153 154 bool FormatTokenLexer::tryMergeNSStringLiteral() { 155 if (Tokens.size() < 2) 156 return false; 157 auto &At = *(Tokens.end() - 2); 158 auto &String = *(Tokens.end() - 1); 159 if (!At->is(tok::at) || !String->is(tok::string_literal)) 160 return false; 161 At->Tok.setKind(tok::string_literal); 162 At->TokenText = StringRef(At->TokenText.begin(), 163 String->TokenText.end() - At->TokenText.begin()); 164 At->ColumnWidth += String->ColumnWidth; 165 At->Type = TT_ObjCStringLiteral; 166 Tokens.erase(Tokens.end() - 1); 167 return true; 168 } 169 170 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() { 171 // Merges #idenfier into a single identifier with the text #identifier 172 // but the token tok::identifier. 173 if (Tokens.size() < 2) 174 return false; 175 auto &Hash = *(Tokens.end() - 2); 176 auto &Identifier = *(Tokens.end() - 1); 177 if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier)) 178 return false; 179 Hash->Tok.setKind(tok::identifier); 180 Hash->TokenText = 181 StringRef(Hash->TokenText.begin(), 182 Identifier->TokenText.end() - Hash->TokenText.begin()); 183 Hash->ColumnWidth += Identifier->ColumnWidth; 184 Hash->Type = TT_JsPrivateIdentifier; 185 Tokens.erase(Tokens.end() - 1); 186 return true; 187 } 188 189 // Merge 'argName' and ':' into a single token in `foo(argName: bar)`. 190 bool FormatTokenLexer::tryMergeCSharpNamedArgument() { 191 if (Tokens.size() < 2) 192 return false; 193 auto &Colon = *(Tokens.end() - 1); 194 if (!Colon->is(tok::colon)) 195 return false; 196 197 auto &Name = *(Tokens.end() - 2); 198 if (!Name->is(tok::identifier)) 199 return false; 200 201 const FormatToken *CommaOrLeftParen = nullptr; 202 for (auto I = Tokens.rbegin() + 2, E = Tokens.rend(); I != E; ++I) { 203 // NB: Because previous pointers are not initialized yet, this cannot use 204 // Token.getPreviousNonComment. 205 if ((*I)->isNot(tok::comment)) { 206 CommaOrLeftParen = *I; 207 break; 208 } 209 } 210 211 if (!CommaOrLeftParen || !CommaOrLeftParen->isOneOf(tok::l_paren, tok::comma)) 212 return false; 213 214 Name->TokenText = StringRef(Name->TokenText.begin(), 215 Colon->TokenText.end() - Name->TokenText.begin()); 216 Name->ColumnWidth += Colon->ColumnWidth; 217 Name->Type = TT_CSharpNamedArgument; 218 Tokens.erase(Tokens.end() - 1); 219 return true; 220 } 221 222 // Search for verbatim or interpolated string literals @"ABC" or 223 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to 224 // prevent splitting of @, $ and ". 225 // Merging of multiline verbatim strings with embedded '"' is handled in 226 // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing. 227 bool FormatTokenLexer::tryMergeCSharpStringLiteral() { 228 if (Tokens.size() < 2) 229 return false; 230 231 // Interpolated strings could contain { } with " characters inside. 232 // $"{x ?? "null"}" 233 // should not be split into $"{x ?? ", null, "}" but should treated as a 234 // single string-literal. 235 // 236 // We opt not to try and format expressions inside {} within a C# 237 // interpolated string. Formatting expressions within an interpolated string 238 // would require similar work as that done for JavaScript template strings 239 // in `handleTemplateStrings()`. 240 auto &CSharpInterpolatedString = *(Tokens.end() - 2); 241 if (CSharpInterpolatedString->Type == TT_CSharpStringLiteral && 242 (CSharpInterpolatedString->TokenText.startswith(R"($")") || 243 CSharpInterpolatedString->TokenText.startswith(R"($@")"))) { 244 int UnmatchedOpeningBraceCount = 0; 245 246 auto TokenTextSize = CSharpInterpolatedString->TokenText.size(); 247 for (size_t Index = 0; Index < TokenTextSize; ++Index) { 248 char C = CSharpInterpolatedString->TokenText[Index]; 249 if (C == '{') { 250 // "{{" inside an interpolated string is an escaped '{' so skip it. 251 if (Index + 1 < TokenTextSize && 252 CSharpInterpolatedString->TokenText[Index + 1] == '{') { 253 ++Index; 254 continue; 255 } 256 ++UnmatchedOpeningBraceCount; 257 } else if (C == '}') { 258 // "}}" inside an interpolated string is an escaped '}' so skip it. 259 if (Index + 1 < TokenTextSize && 260 CSharpInterpolatedString->TokenText[Index + 1] == '}') { 261 ++Index; 262 continue; 263 } 264 --UnmatchedOpeningBraceCount; 265 } 266 } 267 268 if (UnmatchedOpeningBraceCount > 0) { 269 auto &NextToken = *(Tokens.end() - 1); 270 CSharpInterpolatedString->TokenText = 271 StringRef(CSharpInterpolatedString->TokenText.begin(), 272 NextToken->TokenText.end() - 273 CSharpInterpolatedString->TokenText.begin()); 274 CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth; 275 Tokens.erase(Tokens.end() - 1); 276 return true; 277 } 278 } 279 280 // Look for @"aaaaaa" or $"aaaaaa". 281 auto &String = *(Tokens.end() - 1); 282 if (!String->is(tok::string_literal)) 283 return false; 284 285 auto &At = *(Tokens.end() - 2); 286 if (!(At->is(tok::at) || At->TokenText == "$")) 287 return false; 288 289 if (Tokens.size() > 2 && At->is(tok::at)) { 290 auto &Dollar = *(Tokens.end() - 3); 291 if (Dollar->TokenText == "$") { 292 // This looks like $@"aaaaa" so we need to combine all 3 tokens. 293 Dollar->Tok.setKind(tok::string_literal); 294 Dollar->TokenText = 295 StringRef(Dollar->TokenText.begin(), 296 String->TokenText.end() - Dollar->TokenText.begin()); 297 Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth); 298 Dollar->Type = TT_CSharpStringLiteral; 299 Tokens.erase(Tokens.end() - 2); 300 Tokens.erase(Tokens.end() - 1); 301 return true; 302 } 303 } 304 305 // Convert back into just a string_literal. 306 At->Tok.setKind(tok::string_literal); 307 At->TokenText = StringRef(At->TokenText.begin(), 308 String->TokenText.end() - At->TokenText.begin()); 309 At->ColumnWidth += String->ColumnWidth; 310 At->Type = TT_CSharpStringLiteral; 311 Tokens.erase(Tokens.end() - 1); 312 return true; 313 } 314 315 // Valid C# attribute targets: 316 // https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets 317 const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = { 318 "assembly", "module", "field", "event", "method", 319 "param", "property", "return", "type", 320 }; 321 322 bool FormatTokenLexer::tryMergeCSharpAttributeAndTarget() { 323 // Treat '[assembly:' and '[field:' as tokens in their own right. 324 if (Tokens.size() < 3) 325 return false; 326 327 auto &SquareBracket = *(Tokens.end() - 3); 328 auto &Target = *(Tokens.end() - 2); 329 auto &Colon = *(Tokens.end() - 1); 330 331 if (!SquareBracket->Tok.is(tok::l_square)) 332 return false; 333 334 if (CSharpAttributeTargets.find(Target->TokenText) == 335 CSharpAttributeTargets.end()) 336 return false; 337 338 if (!Colon->Tok.is(tok::colon)) 339 return false; 340 341 SquareBracket->TokenText = 342 StringRef(SquareBracket->TokenText.begin(), 343 Colon->TokenText.end() - SquareBracket->TokenText.begin()); 344 SquareBracket->ColumnWidth += (Target->ColumnWidth + Colon->ColumnWidth); 345 Tokens.erase(Tokens.end() - 2); 346 Tokens.erase(Tokens.end() - 1); 347 return true; 348 } 349 350 bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() { 351 if (Tokens.size() < 2) 352 return false; 353 auto &FirstQuestion = *(Tokens.end() - 2); 354 auto &SecondQuestion = *(Tokens.end() - 1); 355 if (!FirstQuestion->is(tok::question) || !SecondQuestion->is(tok::question)) 356 return false; 357 FirstQuestion->Tok.setKind(tok::question); 358 FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(), 359 SecondQuestion->TokenText.end() - 360 FirstQuestion->TokenText.begin()); 361 FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth; 362 FirstQuestion->Type = TT_CSharpNullCoalescing; 363 Tokens.erase(Tokens.end() - 1); 364 return true; 365 } 366 367 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() { 368 if (Tokens.size() < 2) 369 return false; 370 auto &At = *(Tokens.end() - 2); 371 auto &Keyword = *(Tokens.end() - 1); 372 if (!At->is(tok::at)) 373 return false; 374 if (!Keywords.isCSharpKeyword(*Keyword)) 375 return false; 376 377 At->Tok.setKind(tok::identifier); 378 At->TokenText = StringRef(At->TokenText.begin(), 379 Keyword->TokenText.end() - At->TokenText.begin()); 380 At->ColumnWidth += Keyword->ColumnWidth; 381 At->Type = Keyword->Type; 382 Tokens.erase(Tokens.end() - 1); 383 return true; 384 } 385 386 // In C# merge the Identifier and the ? together e.g. arg?. 387 bool FormatTokenLexer::tryMergeCSharpNullConditionals() { 388 if (Tokens.size() < 2) 389 return false; 390 auto &Identifier = *(Tokens.end() - 2); 391 auto &Question = *(Tokens.end() - 1); 392 if (!Identifier->isOneOf(tok::r_square, tok::identifier) || 393 !Question->is(tok::question)) 394 return false; 395 Identifier->TokenText = 396 StringRef(Identifier->TokenText.begin(), 397 Question->TokenText.end() - Identifier->TokenText.begin()); 398 Identifier->ColumnWidth += Question->ColumnWidth; 399 Tokens.erase(Tokens.end() - 1); 400 return true; 401 } 402 403 // In C# transform identifier foreach into kw_foreach 404 bool FormatTokenLexer::tryTransformCSharpForEach() { 405 if (Tokens.size() < 1) 406 return false; 407 auto &Identifier = *(Tokens.end() - 1); 408 if (!Identifier->is(tok::identifier)) 409 return false; 410 if (Identifier->TokenText != "foreach") 411 return false; 412 413 Identifier->Type = TT_ForEachMacro; 414 Identifier->Tok.setKind(tok::kw_for); 415 return true; 416 } 417 418 bool FormatTokenLexer::tryMergeLessLess() { 419 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less. 420 if (Tokens.size() < 3) 421 return false; 422 423 bool FourthTokenIsLess = false; 424 if (Tokens.size() > 3) 425 FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less); 426 427 auto First = Tokens.end() - 3; 428 if (First[2]->is(tok::less) || First[1]->isNot(tok::less) || 429 First[0]->isNot(tok::less) || FourthTokenIsLess) 430 return false; 431 432 // Only merge if there currently is no whitespace between the two "<". 433 if (First[1]->WhitespaceRange.getBegin() != 434 First[1]->WhitespaceRange.getEnd()) 435 return false; 436 437 First[0]->Tok.setKind(tok::lessless); 438 First[0]->TokenText = "<<"; 439 First[0]->ColumnWidth += 1; 440 Tokens.erase(Tokens.end() - 2); 441 return true; 442 } 443 444 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, 445 TokenType NewType) { 446 if (Tokens.size() < Kinds.size()) 447 return false; 448 449 SmallVectorImpl<FormatToken *>::const_iterator First = 450 Tokens.end() - Kinds.size(); 451 if (!First[0]->is(Kinds[0])) 452 return false; 453 unsigned AddLength = 0; 454 for (unsigned i = 1; i < Kinds.size(); ++i) { 455 if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() != 456 First[i]->WhitespaceRange.getEnd()) 457 return false; 458 AddLength += First[i]->TokenText.size(); 459 } 460 Tokens.resize(Tokens.size() - Kinds.size() + 1); 461 First[0]->TokenText = StringRef(First[0]->TokenText.data(), 462 First[0]->TokenText.size() + AddLength); 463 First[0]->ColumnWidth += AddLength; 464 First[0]->Type = NewType; 465 return true; 466 } 467 468 // Returns \c true if \p Tok can only be followed by an operand in JavaScript. 469 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) { 470 // NB: This is not entirely correct, as an r_paren can introduce an operand 471 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough 472 // corner case to not matter in practice, though. 473 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace, 474 tok::r_brace, tok::l_square, tok::semi, tok::exclaim, 475 tok::colon, tok::question, tok::tilde) || 476 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw, 477 tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void, 478 tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) || 479 Tok->isBinaryOperator(); 480 } 481 482 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) { 483 if (!Prev) 484 return true; 485 486 // Regex literals can only follow after prefix unary operators, not after 487 // postfix unary operators. If the '++' is followed by a non-operand 488 // introducing token, the slash here is the operand and not the start of a 489 // regex. 490 // `!` is an unary prefix operator, but also a post-fix operator that casts 491 // away nullability, so the same check applies. 492 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim)) 493 return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3])); 494 495 // The previous token must introduce an operand location where regex 496 // literals can occur. 497 if (!precedesOperand(Prev)) 498 return false; 499 500 return true; 501 } 502 503 // Tries to parse a JavaScript Regex literal starting at the current token, 504 // if that begins with a slash and is in a location where JavaScript allows 505 // regex literals. Changes the current token to a regex literal and updates 506 // its text if successful. 507 void FormatTokenLexer::tryParseJSRegexLiteral() { 508 FormatToken *RegexToken = Tokens.back(); 509 if (!RegexToken->isOneOf(tok::slash, tok::slashequal)) 510 return; 511 512 FormatToken *Prev = nullptr; 513 for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) { 514 // NB: Because previous pointers are not initialized yet, this cannot use 515 // Token.getPreviousNonComment. 516 if ((*I)->isNot(tok::comment)) { 517 Prev = *I; 518 break; 519 } 520 } 521 522 if (!canPrecedeRegexLiteral(Prev)) 523 return; 524 525 // 'Manually' lex ahead in the current file buffer. 526 const char *Offset = Lex->getBufferLocation(); 527 const char *RegexBegin = Offset - RegexToken->TokenText.size(); 528 StringRef Buffer = Lex->getBuffer(); 529 bool InCharacterClass = false; 530 bool HaveClosingSlash = false; 531 for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) { 532 // Regular expressions are terminated with a '/', which can only be 533 // escaped using '\' or a character class between '[' and ']'. 534 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5. 535 switch (*Offset) { 536 case '\\': 537 // Skip the escaped character. 538 ++Offset; 539 break; 540 case '[': 541 InCharacterClass = true; 542 break; 543 case ']': 544 InCharacterClass = false; 545 break; 546 case '/': 547 if (!InCharacterClass) 548 HaveClosingSlash = true; 549 break; 550 } 551 } 552 553 RegexToken->Type = TT_RegexLiteral; 554 // Treat regex literals like other string_literals. 555 RegexToken->Tok.setKind(tok::string_literal); 556 RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin); 557 RegexToken->ColumnWidth = RegexToken->TokenText.size(); 558 559 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset))); 560 } 561 562 void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() { 563 FormatToken *CSharpStringLiteral = Tokens.back(); 564 565 if (CSharpStringLiteral->Type != TT_CSharpStringLiteral) 566 return; 567 568 // Deal with multiline strings. 569 if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") || 570 CSharpStringLiteral->TokenText.startswith(R"($@")"))) 571 return; 572 573 const char *StrBegin = 574 Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size(); 575 const char *Offset = StrBegin; 576 if (CSharpStringLiteral->TokenText.startswith(R"(@")")) 577 Offset += 2; 578 else // CSharpStringLiteral->TokenText.startswith(R"($@")") 579 Offset += 3; 580 581 // Look for a terminating '"' in the current file buffer. 582 // Make no effort to format code within an interpolated or verbatim string. 583 for (; Offset != Lex->getBuffer().end(); ++Offset) { 584 if (Offset[0] == '"') { 585 // "" within a verbatim string is an escaped double quote: skip it. 586 if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"') 587 ++Offset; 588 else 589 break; 590 } 591 } 592 593 // Make no attempt to format code properly if a verbatim string is 594 // unterminated. 595 if (Offset == Lex->getBuffer().end()) 596 return; 597 598 StringRef LiteralText(StrBegin, Offset - StrBegin + 1); 599 CSharpStringLiteral->TokenText = LiteralText; 600 601 // Adjust width for potentially multiline string literals. 602 size_t FirstBreak = LiteralText.find('\n'); 603 StringRef FirstLineText = FirstBreak == StringRef::npos 604 ? LiteralText 605 : LiteralText.substr(0, FirstBreak); 606 CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs( 607 FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth, 608 Encoding); 609 size_t LastBreak = LiteralText.rfind('\n'); 610 if (LastBreak != StringRef::npos) { 611 CSharpStringLiteral->IsMultiline = true; 612 unsigned StartColumn = 0; // The template tail spans the entire line. 613 CSharpStringLiteral->LastLineColumnWidth = encoding::columnWidthWithTabs( 614 LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn, 615 Style.TabWidth, Encoding); 616 } 617 618 SourceLocation loc = Offset < Lex->getBuffer().end() 619 ? Lex->getSourceLocation(Offset + 1) 620 : SourceMgr.getLocForEndOfFile(ID); 621 resetLexer(SourceMgr.getFileOffset(loc)); 622 } 623 624 void FormatTokenLexer::handleTemplateStrings() { 625 FormatToken *BacktickToken = Tokens.back(); 626 627 if (BacktickToken->is(tok::l_brace)) { 628 StateStack.push(LexerState::NORMAL); 629 return; 630 } 631 if (BacktickToken->is(tok::r_brace)) { 632 if (StateStack.size() == 1) 633 return; 634 StateStack.pop(); 635 if (StateStack.top() != LexerState::TEMPLATE_STRING) 636 return; 637 // If back in TEMPLATE_STRING, fallthrough and continue parsing the 638 } else if (BacktickToken->is(tok::unknown) && 639 BacktickToken->TokenText == "`") { 640 StateStack.push(LexerState::TEMPLATE_STRING); 641 } else { 642 return; // Not actually a template 643 } 644 645 // 'Manually' lex ahead in the current file buffer. 646 const char *Offset = Lex->getBufferLocation(); 647 const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`" 648 for (; Offset != Lex->getBuffer().end(); ++Offset) { 649 if (Offset[0] == '`') { 650 StateStack.pop(); 651 break; 652 } 653 if (Offset[0] == '\\') { 654 ++Offset; // Skip the escaped character. 655 } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' && 656 Offset[1] == '{') { 657 // '${' introduces an expression interpolation in the template string. 658 StateStack.push(LexerState::NORMAL); 659 ++Offset; 660 break; 661 } 662 } 663 664 StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1); 665 BacktickToken->Type = TT_TemplateString; 666 BacktickToken->Tok.setKind(tok::string_literal); 667 BacktickToken->TokenText = LiteralText; 668 669 // Adjust width for potentially multiline string literals. 670 size_t FirstBreak = LiteralText.find('\n'); 671 StringRef FirstLineText = FirstBreak == StringRef::npos 672 ? LiteralText 673 : LiteralText.substr(0, FirstBreak); 674 BacktickToken->ColumnWidth = encoding::columnWidthWithTabs( 675 FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding); 676 size_t LastBreak = LiteralText.rfind('\n'); 677 if (LastBreak != StringRef::npos) { 678 BacktickToken->IsMultiline = true; 679 unsigned StartColumn = 0; // The template tail spans the entire line. 680 BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs( 681 LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn, 682 Style.TabWidth, Encoding); 683 } 684 685 SourceLocation loc = Offset < Lex->getBuffer().end() 686 ? Lex->getSourceLocation(Offset + 1) 687 : SourceMgr.getLocForEndOfFile(ID); 688 resetLexer(SourceMgr.getFileOffset(loc)); 689 } 690 691 void FormatTokenLexer::tryParsePythonComment() { 692 FormatToken *HashToken = Tokens.back(); 693 if (!HashToken->isOneOf(tok::hash, tok::hashhash)) 694 return; 695 // Turn the remainder of this line into a comment. 696 const char *CommentBegin = 697 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#" 698 size_t From = CommentBegin - Lex->getBuffer().begin(); 699 size_t To = Lex->getBuffer().find_first_of('\n', From); 700 if (To == StringRef::npos) 701 To = Lex->getBuffer().size(); 702 size_t Len = To - From; 703 HashToken->Type = TT_LineComment; 704 HashToken->Tok.setKind(tok::comment); 705 HashToken->TokenText = Lex->getBuffer().substr(From, Len); 706 SourceLocation Loc = To < Lex->getBuffer().size() 707 ? Lex->getSourceLocation(CommentBegin + Len) 708 : SourceMgr.getLocForEndOfFile(ID); 709 resetLexer(SourceMgr.getFileOffset(Loc)); 710 } 711 712 bool FormatTokenLexer::tryMerge_TMacro() { 713 if (Tokens.size() < 4) 714 return false; 715 FormatToken *Last = Tokens.back(); 716 if (!Last->is(tok::r_paren)) 717 return false; 718 719 FormatToken *String = Tokens[Tokens.size() - 2]; 720 if (!String->is(tok::string_literal) || String->IsMultiline) 721 return false; 722 723 if (!Tokens[Tokens.size() - 3]->is(tok::l_paren)) 724 return false; 725 726 FormatToken *Macro = Tokens[Tokens.size() - 4]; 727 if (Macro->TokenText != "_T") 728 return false; 729 730 const char *Start = Macro->TokenText.data(); 731 const char *End = Last->TokenText.data() + Last->TokenText.size(); 732 String->TokenText = StringRef(Start, End - Start); 733 String->IsFirst = Macro->IsFirst; 734 String->LastNewlineOffset = Macro->LastNewlineOffset; 735 String->WhitespaceRange = Macro->WhitespaceRange; 736 String->OriginalColumn = Macro->OriginalColumn; 737 String->ColumnWidth = encoding::columnWidthWithTabs( 738 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding); 739 String->NewlinesBefore = Macro->NewlinesBefore; 740 String->HasUnescapedNewline = Macro->HasUnescapedNewline; 741 742 Tokens.pop_back(); 743 Tokens.pop_back(); 744 Tokens.pop_back(); 745 Tokens.back() = String; 746 return true; 747 } 748 749 bool FormatTokenLexer::tryMergeConflictMarkers() { 750 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof)) 751 return false; 752 753 // Conflict lines look like: 754 // <marker> <text from the vcs> 755 // For example: 756 // >>>>>>> /file/in/file/system at revision 1234 757 // 758 // We merge all tokens in a line that starts with a conflict marker 759 // into a single token with a special token type that the unwrapped line 760 // parser will use to correctly rebuild the underlying code. 761 762 FileID ID; 763 // Get the position of the first token in the line. 764 unsigned FirstInLineOffset; 765 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc( 766 Tokens[FirstInLineIndex]->getStartOfNonWhitespace()); 767 StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer(); 768 // Calculate the offset of the start of the current line. 769 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset); 770 if (LineOffset == StringRef::npos) { 771 LineOffset = 0; 772 } else { 773 ++LineOffset; 774 } 775 776 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset); 777 StringRef LineStart; 778 if (FirstSpace == StringRef::npos) { 779 LineStart = Buffer.substr(LineOffset); 780 } else { 781 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset); 782 } 783 784 TokenType Type = TT_Unknown; 785 if (LineStart == "<<<<<<<" || LineStart == ">>>>") { 786 Type = TT_ConflictStart; 787 } else if (LineStart == "|||||||" || LineStart == "=======" || 788 LineStart == "====") { 789 Type = TT_ConflictAlternative; 790 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") { 791 Type = TT_ConflictEnd; 792 } 793 794 if (Type != TT_Unknown) { 795 FormatToken *Next = Tokens.back(); 796 797 Tokens.resize(FirstInLineIndex + 1); 798 // We do not need to build a complete token here, as we will skip it 799 // during parsing anyway (as we must not touch whitespace around conflict 800 // markers). 801 Tokens.back()->Type = Type; 802 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype); 803 804 Tokens.push_back(Next); 805 return true; 806 } 807 808 return false; 809 } 810 811 FormatToken *FormatTokenLexer::getStashedToken() { 812 // Create a synthesized second '>' or '<' token. 813 Token Tok = FormatTok->Tok; 814 StringRef TokenText = FormatTok->TokenText; 815 816 unsigned OriginalColumn = FormatTok->OriginalColumn; 817 FormatTok = new (Allocator.Allocate()) FormatToken; 818 FormatTok->Tok = Tok; 819 SourceLocation TokLocation = 820 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1); 821 FormatTok->Tok.setLocation(TokLocation); 822 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation); 823 FormatTok->TokenText = TokenText; 824 FormatTok->ColumnWidth = 1; 825 FormatTok->OriginalColumn = OriginalColumn + 1; 826 827 return FormatTok; 828 } 829 830 FormatToken *FormatTokenLexer::getNextToken() { 831 if (StateStack.top() == LexerState::TOKEN_STASHED) { 832 StateStack.pop(); 833 return getStashedToken(); 834 } 835 836 FormatTok = new (Allocator.Allocate()) FormatToken; 837 readRawToken(*FormatTok); 838 SourceLocation WhitespaceStart = 839 FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace); 840 FormatTok->IsFirst = IsFirstToken; 841 IsFirstToken = false; 842 843 // Consume and record whitespace until we find a significant token. 844 unsigned WhitespaceLength = TrailingWhitespace; 845 while (FormatTok->Tok.is(tok::unknown)) { 846 StringRef Text = FormatTok->TokenText; 847 auto EscapesNewline = [&](int pos) { 848 // A '\r' here is just part of '\r\n'. Skip it. 849 if (pos >= 0 && Text[pos] == '\r') 850 --pos; 851 // See whether there is an odd number of '\' before this. 852 // FIXME: This is wrong. A '\' followed by a newline is always removed, 853 // regardless of whether there is another '\' before it. 854 // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph. 855 unsigned count = 0; 856 for (; pos >= 0; --pos, ++count) 857 if (Text[pos] != '\\') 858 break; 859 return count & 1; 860 }; 861 // FIXME: This miscounts tok:unknown tokens that are not just 862 // whitespace, e.g. a '`' character. 863 for (int i = 0, e = Text.size(); i != e; ++i) { 864 switch (Text[i]) { 865 case '\n': 866 ++FormatTok->NewlinesBefore; 867 FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1); 868 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1; 869 Column = 0; 870 break; 871 case '\r': 872 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1; 873 Column = 0; 874 break; 875 case '\f': 876 case '\v': 877 Column = 0; 878 break; 879 case ' ': 880 ++Column; 881 break; 882 case '\t': 883 Column += 884 Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0); 885 break; 886 case '\\': 887 if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n')) 888 FormatTok->Type = TT_ImplicitStringLiteral; 889 break; 890 default: 891 FormatTok->Type = TT_ImplicitStringLiteral; 892 break; 893 } 894 if (FormatTok->Type == TT_ImplicitStringLiteral) 895 break; 896 } 897 898 if (FormatTok->is(TT_ImplicitStringLiteral)) 899 break; 900 WhitespaceLength += FormatTok->Tok.getLength(); 901 902 readRawToken(*FormatTok); 903 } 904 905 // JavaScript and Java do not allow to escape the end of the line with a 906 // backslash. Backslashes are syntax errors in plain source, but can occur in 907 // comments. When a single line comment ends with a \, it'll cause the next 908 // line of code to be lexed as a comment, breaking formatting. The code below 909 // finds comments that contain a backslash followed by a line break, truncates 910 // the comment token at the backslash, and resets the lexer to restart behind 911 // the backslash. 912 if ((Style.Language == FormatStyle::LK_JavaScript || 913 Style.Language == FormatStyle::LK_Java) && 914 FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) { 915 size_t BackslashPos = FormatTok->TokenText.find('\\'); 916 while (BackslashPos != StringRef::npos) { 917 if (BackslashPos + 1 < FormatTok->TokenText.size() && 918 FormatTok->TokenText[BackslashPos + 1] == '\n') { 919 const char *Offset = Lex->getBufferLocation(); 920 Offset -= FormatTok->TokenText.size(); 921 Offset += BackslashPos + 1; 922 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset))); 923 FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1); 924 FormatTok->ColumnWidth = encoding::columnWidthWithTabs( 925 FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth, 926 Encoding); 927 break; 928 } 929 BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1); 930 } 931 } 932 933 // In case the token starts with escaped newlines, we want to 934 // take them into account as whitespace - this pattern is quite frequent 935 // in macro definitions. 936 // FIXME: Add a more explicit test. 937 while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') { 938 unsigned SkippedWhitespace = 0; 939 if (FormatTok->TokenText.size() > 2 && 940 (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n')) 941 SkippedWhitespace = 3; 942 else if (FormatTok->TokenText[1] == '\n') 943 SkippedWhitespace = 2; 944 else 945 break; 946 947 ++FormatTok->NewlinesBefore; 948 WhitespaceLength += SkippedWhitespace; 949 FormatTok->LastNewlineOffset = SkippedWhitespace; 950 Column = 0; 951 FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace); 952 } 953 954 FormatTok->WhitespaceRange = SourceRange( 955 WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength)); 956 957 FormatTok->OriginalColumn = Column; 958 959 TrailingWhitespace = 0; 960 if (FormatTok->Tok.is(tok::comment)) { 961 // FIXME: Add the trimmed whitespace to Column. 962 StringRef UntrimmedText = FormatTok->TokenText; 963 FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f"); 964 TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size(); 965 } else if (FormatTok->Tok.is(tok::raw_identifier)) { 966 IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText); 967 FormatTok->Tok.setIdentifierInfo(&Info); 968 FormatTok->Tok.setKind(Info.getTokenID()); 969 if (Style.Language == FormatStyle::LK_Java && 970 FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete, 971 tok::kw_operator)) { 972 FormatTok->Tok.setKind(tok::identifier); 973 FormatTok->Tok.setIdentifierInfo(nullptr); 974 } else if (Style.Language == FormatStyle::LK_JavaScript && 975 FormatTok->isOneOf(tok::kw_struct, tok::kw_union, 976 tok::kw_operator)) { 977 FormatTok->Tok.setKind(tok::identifier); 978 FormatTok->Tok.setIdentifierInfo(nullptr); 979 } 980 } else if (FormatTok->Tok.is(tok::greatergreater)) { 981 FormatTok->Tok.setKind(tok::greater); 982 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1); 983 ++Column; 984 StateStack.push(LexerState::TOKEN_STASHED); 985 } else if (FormatTok->Tok.is(tok::lessless)) { 986 FormatTok->Tok.setKind(tok::less); 987 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1); 988 ++Column; 989 StateStack.push(LexerState::TOKEN_STASHED); 990 } 991 992 // Now FormatTok is the next non-whitespace token. 993 994 StringRef Text = FormatTok->TokenText; 995 size_t FirstNewlinePos = Text.find('\n'); 996 if (FirstNewlinePos == StringRef::npos) { 997 // FIXME: ColumnWidth actually depends on the start column, we need to 998 // take this into account when the token is moved. 999 FormatTok->ColumnWidth = 1000 encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding); 1001 Column += FormatTok->ColumnWidth; 1002 } else { 1003 FormatTok->IsMultiline = true; 1004 // FIXME: ColumnWidth actually depends on the start column, we need to 1005 // take this into account when the token is moved. 1006 FormatTok->ColumnWidth = encoding::columnWidthWithTabs( 1007 Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding); 1008 1009 // The last line of the token always starts in column 0. 1010 // Thus, the length can be precomputed even in the presence of tabs. 1011 FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs( 1012 Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding); 1013 Column = FormatTok->LastLineColumnWidth; 1014 } 1015 1016 if (Style.isCpp()) { 1017 auto it = Macros.find(FormatTok->Tok.getIdentifierInfo()); 1018 if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() && 1019 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() == 1020 tok::pp_define) && 1021 it != Macros.end()) { 1022 FormatTok->Type = it->second; 1023 } else if (FormatTok->is(tok::identifier)) { 1024 if (MacroBlockBeginRegex.match(Text)) { 1025 FormatTok->Type = TT_MacroBlockBegin; 1026 } else if (MacroBlockEndRegex.match(Text)) { 1027 FormatTok->Type = TT_MacroBlockEnd; 1028 } 1029 } 1030 } 1031 1032 return FormatTok; 1033 } 1034 1035 void FormatTokenLexer::readRawToken(FormatToken &Tok) { 1036 Lex->LexFromRawLexer(Tok.Tok); 1037 Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()), 1038 Tok.Tok.getLength()); 1039 // For formatting, treat unterminated string literals like normal string 1040 // literals. 1041 if (Tok.is(tok::unknown)) { 1042 if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') { 1043 Tok.Tok.setKind(tok::string_literal); 1044 Tok.IsUnterminatedLiteral = true; 1045 } else if (Style.Language == FormatStyle::LK_JavaScript && 1046 Tok.TokenText == "''") { 1047 Tok.Tok.setKind(tok::string_literal); 1048 } 1049 } 1050 1051 if ((Style.Language == FormatStyle::LK_JavaScript || 1052 Style.Language == FormatStyle::LK_Proto || 1053 Style.Language == FormatStyle::LK_TextProto) && 1054 Tok.is(tok::char_constant)) { 1055 Tok.Tok.setKind(tok::string_literal); 1056 } 1057 1058 if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" || 1059 Tok.TokenText == "/* clang-format on */")) { 1060 FormattingDisabled = false; 1061 } 1062 1063 Tok.Finalized = FormattingDisabled; 1064 1065 if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" || 1066 Tok.TokenText == "/* clang-format off */")) { 1067 FormattingDisabled = true; 1068 } 1069 } 1070 1071 void FormatTokenLexer::resetLexer(unsigned Offset) { 1072 StringRef Buffer = SourceMgr.getBufferData(ID); 1073 Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), 1074 getFormattingLangOpts(Style), Buffer.begin(), 1075 Buffer.begin() + Offset, Buffer.end())); 1076 Lex->SetKeepWhitespaceMode(true); 1077 TrailingWhitespace = 0; 1078 } 1079 1080 } // namespace format 1081 } // namespace clang 1082