1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file implements FormatTokenLexer, which tokenizes a source file 11 /// into a FormatToken stream suitable for ClangFormat. 12 /// 13 //===----------------------------------------------------------------------===// 14 15 #include "FormatTokenLexer.h" 16 #include "FormatToken.h" 17 #include "clang/Basic/SourceLocation.h" 18 #include "clang/Basic/SourceManager.h" 19 #include "clang/Format/Format.h" 20 #include "llvm/Support/Regex.h" 21 22 namespace clang { 23 namespace format { 24 25 FormatTokenLexer::FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, 26 unsigned Column, const FormatStyle &Style, 27 encoding::Encoding Encoding) 28 : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}), 29 Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID), 30 Style(Style), IdentTable(getFormattingLangOpts(Style)), 31 Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0), 32 FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin), 33 MacroBlockEndRegex(Style.MacroBlockEnd) { 34 Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr, 35 getFormattingLangOpts(Style))); 36 Lex->SetKeepWhitespaceMode(true); 37 38 for (const std::string &ForEachMacro : Style.ForEachMacros) 39 Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro}); 40 for (const std::string &StatementMacro : Style.StatementMacros) 41 Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro}); 42 for (const std::string &TypenameMacro : Style.TypenameMacros) 43 Macros.insert({&IdentTable.get(TypenameMacro), TT_TypenameMacro}); 44 for (const std::string &NamespaceMacro : Style.NamespaceMacros) 45 Macros.insert({&IdentTable.get(NamespaceMacro), TT_NamespaceMacro}); 46 } 47 48 ArrayRef<FormatToken *> FormatTokenLexer::lex() { 49 assert(Tokens.empty()); 50 assert(FirstInLineIndex == 0); 51 do { 52 Tokens.push_back(getNextToken()); 53 if (Style.Language == FormatStyle::LK_JavaScript) { 54 tryParseJSRegexLiteral(); 55 handleTemplateStrings(); 56 } 57 if (Style.Language == FormatStyle::LK_TextProto) 58 tryParsePythonComment(); 59 tryMergePreviousTokens(); 60 if (Style.isCSharp()) 61 // This needs to come after tokens have been merged so that C# 62 // string literals are correctly identified. 63 handleCSharpVerbatimAndInterpolatedStrings(); 64 if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline) 65 FirstInLineIndex = Tokens.size() - 1; 66 } while (Tokens.back()->Tok.isNot(tok::eof)); 67 return Tokens; 68 } 69 70 void FormatTokenLexer::tryMergePreviousTokens() { 71 if (tryMerge_TMacro()) 72 return; 73 if (tryMergeConflictMarkers()) 74 return; 75 if (tryMergeLessLess()) 76 return; 77 if (tryMergeForEach()) 78 return; 79 if (Style.isCpp() && tryTransformTryUsageForC()) 80 return; 81 82 if (Style.isCSharp()) { 83 if (tryMergeCSharpKeywordVariables()) 84 return; 85 if (tryMergeCSharpStringLiteral()) 86 return; 87 if (tryMergeCSharpDoubleQuestion()) 88 return; 89 if (tryMergeCSharpNullConditional()) 90 return; 91 if (tryTransformCSharpForEach()) 92 return; 93 static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater}; 94 if (tryMergeTokens(JSRightArrow, TT_JsFatArrow)) 95 return; 96 } 97 98 if (tryMergeNSStringLiteral()) 99 return; 100 101 if (Style.Language == FormatStyle::LK_JavaScript) { 102 static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal}; 103 static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal, 104 tok::equal}; 105 static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater, 106 tok::greaterequal}; 107 static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater}; 108 static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star}; 109 static const tok::TokenKind JSExponentiationEqual[] = {tok::star, 110 tok::starequal}; 111 static const tok::TokenKind JSNullPropagatingOperator[] = {tok::question, 112 tok::period}; 113 static const tok::TokenKind JSNullishOperator[] = {tok::question, 114 tok::question}; 115 116 // FIXME: Investigate what token type gives the correct operator priority. 117 if (tryMergeTokens(JSIdentity, TT_BinaryOperator)) 118 return; 119 if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator)) 120 return; 121 if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator)) 122 return; 123 if (tryMergeTokens(JSRightArrow, TT_JsFatArrow)) 124 return; 125 if (tryMergeTokens(JSExponentiation, TT_JsExponentiation)) 126 return; 127 if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) { 128 Tokens.back()->Tok.setKind(tok::starequal); 129 return; 130 } 131 if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator)) { 132 // Treat like the "||" operator (as opposed to the ternary ?). 133 Tokens.back()->Tok.setKind(tok::pipepipe); 134 return; 135 } 136 if (tryMergeTokens(JSNullPropagatingOperator, 137 TT_JsNullPropagatingOperator)) { 138 // Treat like a regular "." access. 139 Tokens.back()->Tok.setKind(tok::period); 140 return; 141 } 142 if (tryMergeJSPrivateIdentifier()) 143 return; 144 } 145 146 if (Style.Language == FormatStyle::LK_Java) { 147 static const tok::TokenKind JavaRightLogicalShiftAssign[] = { 148 tok::greater, tok::greater, tok::greaterequal}; 149 if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator)) 150 return; 151 } 152 } 153 154 bool FormatTokenLexer::tryMergeNSStringLiteral() { 155 if (Tokens.size() < 2) 156 return false; 157 auto &At = *(Tokens.end() - 2); 158 auto &String = *(Tokens.end() - 1); 159 if (!At->is(tok::at) || !String->is(tok::string_literal)) 160 return false; 161 At->Tok.setKind(tok::string_literal); 162 At->TokenText = StringRef(At->TokenText.begin(), 163 String->TokenText.end() - At->TokenText.begin()); 164 At->ColumnWidth += String->ColumnWidth; 165 At->setType(TT_ObjCStringLiteral); 166 Tokens.erase(Tokens.end() - 1); 167 return true; 168 } 169 170 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() { 171 // Merges #idenfier into a single identifier with the text #identifier 172 // but the token tok::identifier. 173 if (Tokens.size() < 2) 174 return false; 175 auto &Hash = *(Tokens.end() - 2); 176 auto &Identifier = *(Tokens.end() - 1); 177 if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier)) 178 return false; 179 Hash->Tok.setKind(tok::identifier); 180 Hash->TokenText = 181 StringRef(Hash->TokenText.begin(), 182 Identifier->TokenText.end() - Hash->TokenText.begin()); 183 Hash->ColumnWidth += Identifier->ColumnWidth; 184 Hash->setType(TT_JsPrivateIdentifier); 185 Tokens.erase(Tokens.end() - 1); 186 return true; 187 } 188 189 // Search for verbatim or interpolated string literals @"ABC" or 190 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to 191 // prevent splitting of @, $ and ". 192 // Merging of multiline verbatim strings with embedded '"' is handled in 193 // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing. 194 bool FormatTokenLexer::tryMergeCSharpStringLiteral() { 195 if (Tokens.size() < 2) 196 return false; 197 198 // Interpolated strings could contain { } with " characters inside. 199 // $"{x ?? "null"}" 200 // should not be split into $"{x ?? ", null, "}" but should treated as a 201 // single string-literal. 202 // 203 // We opt not to try and format expressions inside {} within a C# 204 // interpolated string. Formatting expressions within an interpolated string 205 // would require similar work as that done for JavaScript template strings 206 // in `handleTemplateStrings()`. 207 auto &CSharpInterpolatedString = *(Tokens.end() - 2); 208 if (CSharpInterpolatedString->getType() == TT_CSharpStringLiteral && 209 (CSharpInterpolatedString->TokenText.startswith(R"($")") || 210 CSharpInterpolatedString->TokenText.startswith(R"($@")"))) { 211 int UnmatchedOpeningBraceCount = 0; 212 213 auto TokenTextSize = CSharpInterpolatedString->TokenText.size(); 214 for (size_t Index = 0; Index < TokenTextSize; ++Index) { 215 char C = CSharpInterpolatedString->TokenText[Index]; 216 if (C == '{') { 217 // "{{" inside an interpolated string is an escaped '{' so skip it. 218 if (Index + 1 < TokenTextSize && 219 CSharpInterpolatedString->TokenText[Index + 1] == '{') { 220 ++Index; 221 continue; 222 } 223 ++UnmatchedOpeningBraceCount; 224 } else if (C == '}') { 225 // "}}" inside an interpolated string is an escaped '}' so skip it. 226 if (Index + 1 < TokenTextSize && 227 CSharpInterpolatedString->TokenText[Index + 1] == '}') { 228 ++Index; 229 continue; 230 } 231 --UnmatchedOpeningBraceCount; 232 } 233 } 234 235 if (UnmatchedOpeningBraceCount > 0) { 236 auto &NextToken = *(Tokens.end() - 1); 237 CSharpInterpolatedString->TokenText = 238 StringRef(CSharpInterpolatedString->TokenText.begin(), 239 NextToken->TokenText.end() - 240 CSharpInterpolatedString->TokenText.begin()); 241 CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth; 242 Tokens.erase(Tokens.end() - 1); 243 return true; 244 } 245 } 246 247 // Look for @"aaaaaa" or $"aaaaaa". 248 auto &String = *(Tokens.end() - 1); 249 if (!String->is(tok::string_literal)) 250 return false; 251 252 auto &At = *(Tokens.end() - 2); 253 if (!(At->is(tok::at) || At->TokenText == "$")) 254 return false; 255 256 if (Tokens.size() > 2 && At->is(tok::at)) { 257 auto &Dollar = *(Tokens.end() - 3); 258 if (Dollar->TokenText == "$") { 259 // This looks like $@"aaaaa" so we need to combine all 3 tokens. 260 Dollar->Tok.setKind(tok::string_literal); 261 Dollar->TokenText = 262 StringRef(Dollar->TokenText.begin(), 263 String->TokenText.end() - Dollar->TokenText.begin()); 264 Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth); 265 Dollar->setType(TT_CSharpStringLiteral); 266 Tokens.erase(Tokens.end() - 2); 267 Tokens.erase(Tokens.end() - 1); 268 return true; 269 } 270 } 271 272 // Convert back into just a string_literal. 273 At->Tok.setKind(tok::string_literal); 274 At->TokenText = StringRef(At->TokenText.begin(), 275 String->TokenText.end() - At->TokenText.begin()); 276 At->ColumnWidth += String->ColumnWidth; 277 At->setType(TT_CSharpStringLiteral); 278 Tokens.erase(Tokens.end() - 1); 279 return true; 280 } 281 282 // Valid C# attribute targets: 283 // https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets 284 const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = { 285 "assembly", "module", "field", "event", "method", 286 "param", "property", "return", "type", 287 }; 288 289 bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() { 290 if (Tokens.size() < 2) 291 return false; 292 auto &FirstQuestion = *(Tokens.end() - 2); 293 auto &SecondQuestion = *(Tokens.end() - 1); 294 if (!FirstQuestion->is(tok::question) || !SecondQuestion->is(tok::question)) 295 return false; 296 FirstQuestion->Tok.setKind(tok::question); // no '??' in clang tokens. 297 FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(), 298 SecondQuestion->TokenText.end() - 299 FirstQuestion->TokenText.begin()); 300 FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth; 301 FirstQuestion->setType(TT_CSharpNullCoalescing); 302 Tokens.erase(Tokens.end() - 1); 303 return true; 304 } 305 306 // Merge '?[' and '?.' pairs into single tokens. 307 bool FormatTokenLexer::tryMergeCSharpNullConditional() { 308 if (Tokens.size() < 2) 309 return false; 310 auto &Question = *(Tokens.end() - 2); 311 auto &PeriodOrLSquare = *(Tokens.end() - 1); 312 if (!Question->is(tok::question) || 313 !PeriodOrLSquare->isOneOf(tok::l_square, tok::period)) 314 return false; 315 Question->TokenText = 316 StringRef(Question->TokenText.begin(), 317 PeriodOrLSquare->TokenText.end() - Question->TokenText.begin()); 318 Question->ColumnWidth += PeriodOrLSquare->ColumnWidth; 319 320 if (PeriodOrLSquare->is(tok::l_square)) { 321 Question->Tok.setKind(tok::question); // no '?[' in clang tokens. 322 Question->setType(TT_CSharpNullConditionalLSquare); 323 } else { 324 Question->Tok.setKind(tok::question); // no '?.' in clang tokens. 325 Question->setType(TT_CSharpNullConditional); 326 } 327 328 Tokens.erase(Tokens.end() - 1); 329 return true; 330 } 331 332 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() { 333 if (Tokens.size() < 2) 334 return false; 335 auto &At = *(Tokens.end() - 2); 336 auto &Keyword = *(Tokens.end() - 1); 337 if (!At->is(tok::at)) 338 return false; 339 if (!Keywords.isCSharpKeyword(*Keyword)) 340 return false; 341 342 At->Tok.setKind(tok::identifier); 343 At->TokenText = StringRef(At->TokenText.begin(), 344 Keyword->TokenText.end() - At->TokenText.begin()); 345 At->ColumnWidth += Keyword->ColumnWidth; 346 At->setType(Keyword->getType()); 347 Tokens.erase(Tokens.end() - 1); 348 return true; 349 } 350 351 // In C# transform identifier foreach into kw_foreach 352 bool FormatTokenLexer::tryTransformCSharpForEach() { 353 if (Tokens.size() < 1) 354 return false; 355 auto &Identifier = *(Tokens.end() - 1); 356 if (!Identifier->is(tok::identifier)) 357 return false; 358 if (Identifier->TokenText != "foreach") 359 return false; 360 361 Identifier->setType(TT_ForEachMacro); 362 Identifier->Tok.setKind(tok::kw_for); 363 return true; 364 } 365 366 bool FormatTokenLexer::tryMergeForEach() { 367 if (Tokens.size() < 2) 368 return false; 369 auto &For = *(Tokens.end() - 2); 370 auto &Each = *(Tokens.end() - 1); 371 if (!For->is(tok::kw_for)) 372 return false; 373 if (!Each->is(tok::identifier)) 374 return false; 375 if (Each->TokenText != "each") 376 return false; 377 378 For->setType(TT_ForEachMacro); 379 For->Tok.setKind(tok::kw_for); 380 381 For->TokenText = StringRef(For->TokenText.begin(), 382 Each->TokenText.end() - For->TokenText.begin()); 383 For->ColumnWidth += Each->ColumnWidth; 384 Tokens.erase(Tokens.end() - 1); 385 return true; 386 } 387 388 bool FormatTokenLexer::tryTransformTryUsageForC() { 389 if (Tokens.size() < 2) 390 return false; 391 auto &Try = *(Tokens.end() - 2); 392 if (!Try->is(tok::kw_try)) 393 return false; 394 auto &Next = *(Tokens.end() - 1); 395 if (Next->isOneOf(tok::l_brace, tok::colon)) 396 return false; 397 398 if (Tokens.size() > 2) { 399 auto &At = *(Tokens.end() - 3); 400 if (At->is(tok::at)) 401 return false; 402 } 403 404 Try->Tok.setKind(tok::identifier); 405 return true; 406 } 407 408 bool FormatTokenLexer::tryMergeLessLess() { 409 // Merge X,less,less,Y into X,lessless,Y unless X or Y is less. 410 if (Tokens.size() < 3) 411 return false; 412 413 bool FourthTokenIsLess = false; 414 if (Tokens.size() > 3) 415 FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less); 416 417 auto First = Tokens.end() - 3; 418 if (First[2]->is(tok::less) || First[1]->isNot(tok::less) || 419 First[0]->isNot(tok::less) || FourthTokenIsLess) 420 return false; 421 422 // Only merge if there currently is no whitespace between the two "<". 423 if (First[1]->WhitespaceRange.getBegin() != 424 First[1]->WhitespaceRange.getEnd()) 425 return false; 426 427 First[0]->Tok.setKind(tok::lessless); 428 First[0]->TokenText = "<<"; 429 First[0]->ColumnWidth += 1; 430 Tokens.erase(Tokens.end() - 2); 431 return true; 432 } 433 434 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, 435 TokenType NewType) { 436 if (Tokens.size() < Kinds.size()) 437 return false; 438 439 SmallVectorImpl<FormatToken *>::const_iterator First = 440 Tokens.end() - Kinds.size(); 441 if (!First[0]->is(Kinds[0])) 442 return false; 443 unsigned AddLength = 0; 444 for (unsigned i = 1; i < Kinds.size(); ++i) { 445 if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() != 446 First[i]->WhitespaceRange.getEnd()) 447 return false; 448 AddLength += First[i]->TokenText.size(); 449 } 450 Tokens.resize(Tokens.size() - Kinds.size() + 1); 451 First[0]->TokenText = StringRef(First[0]->TokenText.data(), 452 First[0]->TokenText.size() + AddLength); 453 First[0]->ColumnWidth += AddLength; 454 First[0]->setType(NewType); 455 return true; 456 } 457 458 // Returns \c true if \p Tok can only be followed by an operand in JavaScript. 459 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) { 460 // NB: This is not entirely correct, as an r_paren can introduce an operand 461 // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough 462 // corner case to not matter in practice, though. 463 return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace, 464 tok::r_brace, tok::l_square, tok::semi, tok::exclaim, 465 tok::colon, tok::question, tok::tilde) || 466 Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw, 467 tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void, 468 tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) || 469 Tok->isBinaryOperator(); 470 } 471 472 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) { 473 if (!Prev) 474 return true; 475 476 // Regex literals can only follow after prefix unary operators, not after 477 // postfix unary operators. If the '++' is followed by a non-operand 478 // introducing token, the slash here is the operand and not the start of a 479 // regex. 480 // `!` is an unary prefix operator, but also a post-fix operator that casts 481 // away nullability, so the same check applies. 482 if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim)) 483 return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3])); 484 485 // The previous token must introduce an operand location where regex 486 // literals can occur. 487 if (!precedesOperand(Prev)) 488 return false; 489 490 return true; 491 } 492 493 // Tries to parse a JavaScript Regex literal starting at the current token, 494 // if that begins with a slash and is in a location where JavaScript allows 495 // regex literals. Changes the current token to a regex literal and updates 496 // its text if successful. 497 void FormatTokenLexer::tryParseJSRegexLiteral() { 498 FormatToken *RegexToken = Tokens.back(); 499 if (!RegexToken->isOneOf(tok::slash, tok::slashequal)) 500 return; 501 502 FormatToken *Prev = nullptr; 503 for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) { 504 // NB: Because previous pointers are not initialized yet, this cannot use 505 // Token.getPreviousNonComment. 506 if ((*I)->isNot(tok::comment)) { 507 Prev = *I; 508 break; 509 } 510 } 511 512 if (!canPrecedeRegexLiteral(Prev)) 513 return; 514 515 // 'Manually' lex ahead in the current file buffer. 516 const char *Offset = Lex->getBufferLocation(); 517 const char *RegexBegin = Offset - RegexToken->TokenText.size(); 518 StringRef Buffer = Lex->getBuffer(); 519 bool InCharacterClass = false; 520 bool HaveClosingSlash = false; 521 for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) { 522 // Regular expressions are terminated with a '/', which can only be 523 // escaped using '\' or a character class between '[' and ']'. 524 // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5. 525 switch (*Offset) { 526 case '\\': 527 // Skip the escaped character. 528 ++Offset; 529 break; 530 case '[': 531 InCharacterClass = true; 532 break; 533 case ']': 534 InCharacterClass = false; 535 break; 536 case '/': 537 if (!InCharacterClass) 538 HaveClosingSlash = true; 539 break; 540 } 541 } 542 543 RegexToken->setType(TT_RegexLiteral); 544 // Treat regex literals like other string_literals. 545 RegexToken->Tok.setKind(tok::string_literal); 546 RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin); 547 RegexToken->ColumnWidth = RegexToken->TokenText.size(); 548 549 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset))); 550 } 551 552 void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() { 553 FormatToken *CSharpStringLiteral = Tokens.back(); 554 555 if (CSharpStringLiteral->getType() != TT_CSharpStringLiteral) 556 return; 557 558 // Deal with multiline strings. 559 if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") || 560 CSharpStringLiteral->TokenText.startswith(R"($@")"))) 561 return; 562 563 const char *StrBegin = 564 Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size(); 565 const char *Offset = StrBegin; 566 if (CSharpStringLiteral->TokenText.startswith(R"(@")")) 567 Offset += 2; 568 else // CSharpStringLiteral->TokenText.startswith(R"($@")") 569 Offset += 3; 570 571 // Look for a terminating '"' in the current file buffer. 572 // Make no effort to format code within an interpolated or verbatim string. 573 for (; Offset != Lex->getBuffer().end(); ++Offset) { 574 if (Offset[0] == '"') { 575 // "" within a verbatim string is an escaped double quote: skip it. 576 if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"') 577 ++Offset; 578 else 579 break; 580 } 581 } 582 583 // Make no attempt to format code properly if a verbatim string is 584 // unterminated. 585 if (Offset == Lex->getBuffer().end()) 586 return; 587 588 StringRef LiteralText(StrBegin, Offset - StrBegin + 1); 589 CSharpStringLiteral->TokenText = LiteralText; 590 591 // Adjust width for potentially multiline string literals. 592 size_t FirstBreak = LiteralText.find('\n'); 593 StringRef FirstLineText = FirstBreak == StringRef::npos 594 ? LiteralText 595 : LiteralText.substr(0, FirstBreak); 596 CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs( 597 FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth, 598 Encoding); 599 size_t LastBreak = LiteralText.rfind('\n'); 600 if (LastBreak != StringRef::npos) { 601 CSharpStringLiteral->IsMultiline = true; 602 unsigned StartColumn = 0; 603 CSharpStringLiteral->LastLineColumnWidth = encoding::columnWidthWithTabs( 604 LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn, 605 Style.TabWidth, Encoding); 606 } 607 608 SourceLocation loc = Offset < Lex->getBuffer().end() 609 ? Lex->getSourceLocation(Offset + 1) 610 : SourceMgr.getLocForEndOfFile(ID); 611 resetLexer(SourceMgr.getFileOffset(loc)); 612 } 613 614 void FormatTokenLexer::handleTemplateStrings() { 615 FormatToken *BacktickToken = Tokens.back(); 616 617 if (BacktickToken->is(tok::l_brace)) { 618 StateStack.push(LexerState::NORMAL); 619 return; 620 } 621 if (BacktickToken->is(tok::r_brace)) { 622 if (StateStack.size() == 1) 623 return; 624 StateStack.pop(); 625 if (StateStack.top() != LexerState::TEMPLATE_STRING) 626 return; 627 // If back in TEMPLATE_STRING, fallthrough and continue parsing the 628 } else if (BacktickToken->is(tok::unknown) && 629 BacktickToken->TokenText == "`") { 630 StateStack.push(LexerState::TEMPLATE_STRING); 631 } else { 632 return; // Not actually a template 633 } 634 635 // 'Manually' lex ahead in the current file buffer. 636 const char *Offset = Lex->getBufferLocation(); 637 const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`" 638 for (; Offset != Lex->getBuffer().end(); ++Offset) { 639 if (Offset[0] == '`') { 640 StateStack.pop(); 641 break; 642 } 643 if (Offset[0] == '\\') { 644 ++Offset; // Skip the escaped character. 645 } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' && 646 Offset[1] == '{') { 647 // '${' introduces an expression interpolation in the template string. 648 StateStack.push(LexerState::NORMAL); 649 ++Offset; 650 break; 651 } 652 } 653 654 StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1); 655 BacktickToken->setType(TT_TemplateString); 656 BacktickToken->Tok.setKind(tok::string_literal); 657 BacktickToken->TokenText = LiteralText; 658 659 // Adjust width for potentially multiline string literals. 660 size_t FirstBreak = LiteralText.find('\n'); 661 StringRef FirstLineText = FirstBreak == StringRef::npos 662 ? LiteralText 663 : LiteralText.substr(0, FirstBreak); 664 BacktickToken->ColumnWidth = encoding::columnWidthWithTabs( 665 FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding); 666 size_t LastBreak = LiteralText.rfind('\n'); 667 if (LastBreak != StringRef::npos) { 668 BacktickToken->IsMultiline = true; 669 unsigned StartColumn = 0; // The template tail spans the entire line. 670 BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs( 671 LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn, 672 Style.TabWidth, Encoding); 673 } 674 675 SourceLocation loc = Offset < Lex->getBuffer().end() 676 ? Lex->getSourceLocation(Offset + 1) 677 : SourceMgr.getLocForEndOfFile(ID); 678 resetLexer(SourceMgr.getFileOffset(loc)); 679 } 680 681 void FormatTokenLexer::tryParsePythonComment() { 682 FormatToken *HashToken = Tokens.back(); 683 if (!HashToken->isOneOf(tok::hash, tok::hashhash)) 684 return; 685 // Turn the remainder of this line into a comment. 686 const char *CommentBegin = 687 Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#" 688 size_t From = CommentBegin - Lex->getBuffer().begin(); 689 size_t To = Lex->getBuffer().find_first_of('\n', From); 690 if (To == StringRef::npos) 691 To = Lex->getBuffer().size(); 692 size_t Len = To - From; 693 HashToken->setType(TT_LineComment); 694 HashToken->Tok.setKind(tok::comment); 695 HashToken->TokenText = Lex->getBuffer().substr(From, Len); 696 SourceLocation Loc = To < Lex->getBuffer().size() 697 ? Lex->getSourceLocation(CommentBegin + Len) 698 : SourceMgr.getLocForEndOfFile(ID); 699 resetLexer(SourceMgr.getFileOffset(Loc)); 700 } 701 702 bool FormatTokenLexer::tryMerge_TMacro() { 703 if (Tokens.size() < 4) 704 return false; 705 FormatToken *Last = Tokens.back(); 706 if (!Last->is(tok::r_paren)) 707 return false; 708 709 FormatToken *String = Tokens[Tokens.size() - 2]; 710 if (!String->is(tok::string_literal) || String->IsMultiline) 711 return false; 712 713 if (!Tokens[Tokens.size() - 3]->is(tok::l_paren)) 714 return false; 715 716 FormatToken *Macro = Tokens[Tokens.size() - 4]; 717 if (Macro->TokenText != "_T") 718 return false; 719 720 const char *Start = Macro->TokenText.data(); 721 const char *End = Last->TokenText.data() + Last->TokenText.size(); 722 String->TokenText = StringRef(Start, End - Start); 723 String->IsFirst = Macro->IsFirst; 724 String->LastNewlineOffset = Macro->LastNewlineOffset; 725 String->WhitespaceRange = Macro->WhitespaceRange; 726 String->OriginalColumn = Macro->OriginalColumn; 727 String->ColumnWidth = encoding::columnWidthWithTabs( 728 String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding); 729 String->NewlinesBefore = Macro->NewlinesBefore; 730 String->HasUnescapedNewline = Macro->HasUnescapedNewline; 731 732 Tokens.pop_back(); 733 Tokens.pop_back(); 734 Tokens.pop_back(); 735 Tokens.back() = String; 736 return true; 737 } 738 739 bool FormatTokenLexer::tryMergeConflictMarkers() { 740 if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof)) 741 return false; 742 743 // Conflict lines look like: 744 // <marker> <text from the vcs> 745 // For example: 746 // >>>>>>> /file/in/file/system at revision 1234 747 // 748 // We merge all tokens in a line that starts with a conflict marker 749 // into a single token with a special token type that the unwrapped line 750 // parser will use to correctly rebuild the underlying code. 751 752 FileID ID; 753 // Get the position of the first token in the line. 754 unsigned FirstInLineOffset; 755 std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc( 756 Tokens[FirstInLineIndex]->getStartOfNonWhitespace()); 757 StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer(); 758 // Calculate the offset of the start of the current line. 759 auto LineOffset = Buffer.rfind('\n', FirstInLineOffset); 760 if (LineOffset == StringRef::npos) { 761 LineOffset = 0; 762 } else { 763 ++LineOffset; 764 } 765 766 auto FirstSpace = Buffer.find_first_of(" \n", LineOffset); 767 StringRef LineStart; 768 if (FirstSpace == StringRef::npos) { 769 LineStart = Buffer.substr(LineOffset); 770 } else { 771 LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset); 772 } 773 774 TokenType Type = TT_Unknown; 775 if (LineStart == "<<<<<<<" || LineStart == ">>>>") { 776 Type = TT_ConflictStart; 777 } else if (LineStart == "|||||||" || LineStart == "=======" || 778 LineStart == "====") { 779 Type = TT_ConflictAlternative; 780 } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") { 781 Type = TT_ConflictEnd; 782 } 783 784 if (Type != TT_Unknown) { 785 FormatToken *Next = Tokens.back(); 786 787 Tokens.resize(FirstInLineIndex + 1); 788 // We do not need to build a complete token here, as we will skip it 789 // during parsing anyway (as we must not touch whitespace around conflict 790 // markers). 791 Tokens.back()->setType(Type); 792 Tokens.back()->Tok.setKind(tok::kw___unknown_anytype); 793 794 Tokens.push_back(Next); 795 return true; 796 } 797 798 return false; 799 } 800 801 FormatToken *FormatTokenLexer::getStashedToken() { 802 // Create a synthesized second '>' or '<' token. 803 Token Tok = FormatTok->Tok; 804 StringRef TokenText = FormatTok->TokenText; 805 806 unsigned OriginalColumn = FormatTok->OriginalColumn; 807 FormatTok = new (Allocator.Allocate()) FormatToken; 808 FormatTok->Tok = Tok; 809 SourceLocation TokLocation = 810 FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1); 811 FormatTok->Tok.setLocation(TokLocation); 812 FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation); 813 FormatTok->TokenText = TokenText; 814 FormatTok->ColumnWidth = 1; 815 FormatTok->OriginalColumn = OriginalColumn + 1; 816 817 return FormatTok; 818 } 819 820 FormatToken *FormatTokenLexer::getNextToken() { 821 if (StateStack.top() == LexerState::TOKEN_STASHED) { 822 StateStack.pop(); 823 return getStashedToken(); 824 } 825 826 FormatTok = new (Allocator.Allocate()) FormatToken; 827 readRawToken(*FormatTok); 828 SourceLocation WhitespaceStart = 829 FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace); 830 FormatTok->IsFirst = IsFirstToken; 831 IsFirstToken = false; 832 833 // Consume and record whitespace until we find a significant token. 834 unsigned WhitespaceLength = TrailingWhitespace; 835 while (FormatTok->Tok.is(tok::unknown)) { 836 StringRef Text = FormatTok->TokenText; 837 auto EscapesNewline = [&](int pos) { 838 // A '\r' here is just part of '\r\n'. Skip it. 839 if (pos >= 0 && Text[pos] == '\r') 840 --pos; 841 // See whether there is an odd number of '\' before this. 842 // FIXME: This is wrong. A '\' followed by a newline is always removed, 843 // regardless of whether there is another '\' before it. 844 // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph. 845 unsigned count = 0; 846 for (; pos >= 0; --pos, ++count) 847 if (Text[pos] != '\\') 848 break; 849 return count & 1; 850 }; 851 // FIXME: This miscounts tok:unknown tokens that are not just 852 // whitespace, e.g. a '`' character. 853 for (int i = 0, e = Text.size(); i != e; ++i) { 854 switch (Text[i]) { 855 case '\n': 856 ++FormatTok->NewlinesBefore; 857 FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1); 858 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1; 859 Column = 0; 860 break; 861 case '\r': 862 FormatTok->LastNewlineOffset = WhitespaceLength + i + 1; 863 Column = 0; 864 break; 865 case '\f': 866 case '\v': 867 Column = 0; 868 break; 869 case ' ': 870 ++Column; 871 break; 872 case '\t': 873 Column += 874 Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0); 875 break; 876 case '\\': 877 if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n')) 878 FormatTok->setType(TT_ImplicitStringLiteral); 879 break; 880 default: 881 FormatTok->setType(TT_ImplicitStringLiteral); 882 break; 883 } 884 if (FormatTok->getType() == TT_ImplicitStringLiteral) 885 break; 886 } 887 888 if (FormatTok->is(TT_ImplicitStringLiteral)) 889 break; 890 WhitespaceLength += FormatTok->Tok.getLength(); 891 892 readRawToken(*FormatTok); 893 } 894 895 // JavaScript and Java do not allow to escape the end of the line with a 896 // backslash. Backslashes are syntax errors in plain source, but can occur in 897 // comments. When a single line comment ends with a \, it'll cause the next 898 // line of code to be lexed as a comment, breaking formatting. The code below 899 // finds comments that contain a backslash followed by a line break, truncates 900 // the comment token at the backslash, and resets the lexer to restart behind 901 // the backslash. 902 if ((Style.Language == FormatStyle::LK_JavaScript || 903 Style.Language == FormatStyle::LK_Java) && 904 FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) { 905 size_t BackslashPos = FormatTok->TokenText.find('\\'); 906 while (BackslashPos != StringRef::npos) { 907 if (BackslashPos + 1 < FormatTok->TokenText.size() && 908 FormatTok->TokenText[BackslashPos + 1] == '\n') { 909 const char *Offset = Lex->getBufferLocation(); 910 Offset -= FormatTok->TokenText.size(); 911 Offset += BackslashPos + 1; 912 resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset))); 913 FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1); 914 FormatTok->ColumnWidth = encoding::columnWidthWithTabs( 915 FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth, 916 Encoding); 917 break; 918 } 919 BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1); 920 } 921 } 922 923 // In case the token starts with escaped newlines, we want to 924 // take them into account as whitespace - this pattern is quite frequent 925 // in macro definitions. 926 // FIXME: Add a more explicit test. 927 while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') { 928 unsigned SkippedWhitespace = 0; 929 if (FormatTok->TokenText.size() > 2 && 930 (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n')) 931 SkippedWhitespace = 3; 932 else if (FormatTok->TokenText[1] == '\n') 933 SkippedWhitespace = 2; 934 else 935 break; 936 937 ++FormatTok->NewlinesBefore; 938 WhitespaceLength += SkippedWhitespace; 939 FormatTok->LastNewlineOffset = SkippedWhitespace; 940 Column = 0; 941 FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace); 942 } 943 944 FormatTok->WhitespaceRange = SourceRange( 945 WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength)); 946 947 FormatTok->OriginalColumn = Column; 948 949 TrailingWhitespace = 0; 950 if (FormatTok->Tok.is(tok::comment)) { 951 // FIXME: Add the trimmed whitespace to Column. 952 StringRef UntrimmedText = FormatTok->TokenText; 953 FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f"); 954 TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size(); 955 } else if (FormatTok->Tok.is(tok::raw_identifier)) { 956 IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText); 957 FormatTok->Tok.setIdentifierInfo(&Info); 958 FormatTok->Tok.setKind(Info.getTokenID()); 959 if (Style.Language == FormatStyle::LK_Java && 960 FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete, 961 tok::kw_operator)) { 962 FormatTok->Tok.setKind(tok::identifier); 963 FormatTok->Tok.setIdentifierInfo(nullptr); 964 } else if (Style.Language == FormatStyle::LK_JavaScript && 965 FormatTok->isOneOf(tok::kw_struct, tok::kw_union, 966 tok::kw_operator)) { 967 FormatTok->Tok.setKind(tok::identifier); 968 FormatTok->Tok.setIdentifierInfo(nullptr); 969 } 970 } else if (FormatTok->Tok.is(tok::greatergreater)) { 971 FormatTok->Tok.setKind(tok::greater); 972 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1); 973 ++Column; 974 StateStack.push(LexerState::TOKEN_STASHED); 975 } else if (FormatTok->Tok.is(tok::lessless)) { 976 FormatTok->Tok.setKind(tok::less); 977 FormatTok->TokenText = FormatTok->TokenText.substr(0, 1); 978 ++Column; 979 StateStack.push(LexerState::TOKEN_STASHED); 980 } 981 982 // Now FormatTok is the next non-whitespace token. 983 984 StringRef Text = FormatTok->TokenText; 985 size_t FirstNewlinePos = Text.find('\n'); 986 if (FirstNewlinePos == StringRef::npos) { 987 // FIXME: ColumnWidth actually depends on the start column, we need to 988 // take this into account when the token is moved. 989 FormatTok->ColumnWidth = 990 encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding); 991 Column += FormatTok->ColumnWidth; 992 } else { 993 FormatTok->IsMultiline = true; 994 // FIXME: ColumnWidth actually depends on the start column, we need to 995 // take this into account when the token is moved. 996 FormatTok->ColumnWidth = encoding::columnWidthWithTabs( 997 Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding); 998 999 // The last line of the token always starts in column 0. 1000 // Thus, the length can be precomputed even in the presence of tabs. 1001 FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs( 1002 Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding); 1003 Column = FormatTok->LastLineColumnWidth; 1004 } 1005 1006 if (Style.isCpp()) { 1007 auto it = Macros.find(FormatTok->Tok.getIdentifierInfo()); 1008 if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() && 1009 Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() == 1010 tok::pp_define) && 1011 it != Macros.end()) { 1012 FormatTok->setType(it->second); 1013 } else if (FormatTok->is(tok::identifier)) { 1014 if (MacroBlockBeginRegex.match(Text)) { 1015 FormatTok->setType(TT_MacroBlockBegin); 1016 } else if (MacroBlockEndRegex.match(Text)) { 1017 FormatTok->setType(TT_MacroBlockEnd); 1018 } 1019 } 1020 } 1021 1022 return FormatTok; 1023 } 1024 1025 void FormatTokenLexer::readRawToken(FormatToken &Tok) { 1026 Lex->LexFromRawLexer(Tok.Tok); 1027 Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()), 1028 Tok.Tok.getLength()); 1029 // For formatting, treat unterminated string literals like normal string 1030 // literals. 1031 if (Tok.is(tok::unknown)) { 1032 if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') { 1033 Tok.Tok.setKind(tok::string_literal); 1034 Tok.IsUnterminatedLiteral = true; 1035 } else if (Style.Language == FormatStyle::LK_JavaScript && 1036 Tok.TokenText == "''") { 1037 Tok.Tok.setKind(tok::string_literal); 1038 } 1039 } 1040 1041 if ((Style.Language == FormatStyle::LK_JavaScript || 1042 Style.Language == FormatStyle::LK_Proto || 1043 Style.Language == FormatStyle::LK_TextProto) && 1044 Tok.is(tok::char_constant)) { 1045 Tok.Tok.setKind(tok::string_literal); 1046 } 1047 1048 if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" || 1049 Tok.TokenText == "/* clang-format on */")) { 1050 FormattingDisabled = false; 1051 } 1052 1053 Tok.Finalized = FormattingDisabled; 1054 1055 if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" || 1056 Tok.TokenText == "/* clang-format off */")) { 1057 FormattingDisabled = true; 1058 } 1059 } 1060 1061 void FormatTokenLexer::resetLexer(unsigned Offset) { 1062 StringRef Buffer = SourceMgr.getBufferData(ID); 1063 Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), 1064 getFormattingLangOpts(Style), Buffer.begin(), 1065 Buffer.begin() + Offset, Buffer.end())); 1066 Lex->SetKeepWhitespaceMode(true); 1067 TrailingWhitespace = 0; 1068 } 1069 1070 } // namespace format 1071 } // namespace clang 1072