1 //===--- SourceCode.h - Manipulating source code as strings -----*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 #include "SourceCode.h" 9 10 #include "FuzzyMatch.h" 11 #include "Preamble.h" 12 #include "Protocol.h" 13 #include "refactor/Tweak.h" 14 #include "support/Context.h" 15 #include "support/Logger.h" 16 #include "clang/AST/ASTContext.h" 17 #include "clang/Basic/LangOptions.h" 18 #include "clang/Basic/SourceLocation.h" 19 #include "clang/Basic/SourceManager.h" 20 #include "clang/Basic/TokenKinds.h" 21 #include "clang/Driver/Types.h" 22 #include "clang/Format/Format.h" 23 #include "clang/Lex/Lexer.h" 24 #include "clang/Lex/Preprocessor.h" 25 #include "clang/Lex/Token.h" 26 #include "clang/Tooling/Core/Replacement.h" 27 #include "clang/Tooling/Syntax/Tokens.h" 28 #include "llvm/ADT/ArrayRef.h" 29 #include "llvm/ADT/None.h" 30 #include "llvm/ADT/STLExtras.h" 31 #include "llvm/ADT/StringExtras.h" 32 #include "llvm/ADT/StringMap.h" 33 #include "llvm/ADT/StringRef.h" 34 #include "llvm/Support/Compiler.h" 35 #include "llvm/Support/Errc.h" 36 #include "llvm/Support/Error.h" 37 #include "llvm/Support/ErrorHandling.h" 38 #include "llvm/Support/LineIterator.h" 39 #include "llvm/Support/MemoryBuffer.h" 40 #include "llvm/Support/Path.h" 41 #include "llvm/Support/SHA1.h" 42 #include "llvm/Support/VirtualFileSystem.h" 43 #include "llvm/Support/xxhash.h" 44 #include <algorithm> 45 #include <cstddef> 46 #include <string> 47 #include <vector> 48 49 namespace clang { 50 namespace clangd { 51 52 // Here be dragons. LSP positions use columns measured in *UTF-16 code units*! 53 // Clangd uses UTF-8 and byte-offsets internally, so conversion is nontrivial. 54 55 // Iterates over unicode codepoints in the (UTF-8) string. For each, 56 // invokes CB(UTF-8 length, UTF-16 length), and breaks if it returns true. 57 // Returns true if CB returned true, false if we hit the end of string. 58 template <typename Callback> 59 static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) { 60 // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). 61 // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. 62 for (size_t I = 0; I < U8.size();) { 63 unsigned char C = static_cast<unsigned char>(U8[I]); 64 if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character. 65 if (CB(1, 1)) 66 return true; 67 ++I; 68 continue; 69 } 70 // This convenient property of UTF-8 holds for all non-ASCII characters. 71 size_t UTF8Length = llvm::countLeadingOnes(C); 72 // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here. 73 // 11111xxx is not valid UTF-8 at all. Assert because it's probably our bug. 74 assert((UTF8Length >= 2 && UTF8Length <= 4) && 75 "Invalid UTF-8, or transcoding bug?"); 76 I += UTF8Length; // Skip over all trailing bytes. 77 // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). 78 // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...) 79 if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1)) 80 return true; 81 } 82 return false; 83 } 84 85 // Returns the byte offset into the string that is an offset of \p Units in 86 // the specified encoding. 87 // Conceptually, this converts to the encoding, truncates to CodeUnits, 88 // converts back to UTF-8, and returns the length in bytes. 89 static size_t measureUnits(llvm::StringRef U8, int Units, OffsetEncoding Enc, 90 bool &Valid) { 91 Valid = Units >= 0; 92 if (Units <= 0) 93 return 0; 94 size_t Result = 0; 95 switch (Enc) { 96 case OffsetEncoding::UTF8: 97 Result = Units; 98 break; 99 case OffsetEncoding::UTF16: 100 Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) { 101 Result += U8Len; 102 Units -= U16Len; 103 return Units <= 0; 104 }); 105 if (Units < 0) // Offset in the middle of a surrogate pair. 106 Valid = false; 107 break; 108 case OffsetEncoding::UTF32: 109 Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) { 110 Result += U8Len; 111 Units--; 112 return Units <= 0; 113 }); 114 break; 115 case OffsetEncoding::UnsupportedEncoding: 116 llvm_unreachable("unsupported encoding"); 117 } 118 // Don't return an out-of-range index if we overran. 119 if (Result > U8.size()) { 120 Valid = false; 121 return U8.size(); 122 } 123 return Result; 124 } 125 126 Key<OffsetEncoding> kCurrentOffsetEncoding; 127 static OffsetEncoding lspEncoding() { 128 auto *Enc = Context::current().get(kCurrentOffsetEncoding); 129 return Enc ? *Enc : OffsetEncoding::UTF16; 130 } 131 132 // Like most strings in clangd, the input is UTF-8 encoded. 133 size_t lspLength(llvm::StringRef Code) { 134 size_t Count = 0; 135 switch (lspEncoding()) { 136 case OffsetEncoding::UTF8: 137 Count = Code.size(); 138 break; 139 case OffsetEncoding::UTF16: 140 iterateCodepoints(Code, [&](int U8Len, int U16Len) { 141 Count += U16Len; 142 return false; 143 }); 144 break; 145 case OffsetEncoding::UTF32: 146 iterateCodepoints(Code, [&](int U8Len, int U16Len) { 147 ++Count; 148 return false; 149 }); 150 break; 151 case OffsetEncoding::UnsupportedEncoding: 152 llvm_unreachable("unsupported encoding"); 153 } 154 return Count; 155 } 156 157 llvm::Expected<size_t> positionToOffset(llvm::StringRef Code, Position P, 158 bool AllowColumnsBeyondLineLength) { 159 if (P.line < 0) 160 return llvm::make_error<llvm::StringError>( 161 llvm::formatv("Line value can't be negative ({0})", P.line), 162 llvm::errc::invalid_argument); 163 if (P.character < 0) 164 return llvm::make_error<llvm::StringError>( 165 llvm::formatv("Character value can't be negative ({0})", P.character), 166 llvm::errc::invalid_argument); 167 size_t StartOfLine = 0; 168 for (int I = 0; I != P.line; ++I) { 169 size_t NextNL = Code.find('\n', StartOfLine); 170 if (NextNL == llvm::StringRef::npos) 171 return llvm::make_error<llvm::StringError>( 172 llvm::formatv("Line value is out of range ({0})", P.line), 173 llvm::errc::invalid_argument); 174 StartOfLine = NextNL + 1; 175 } 176 StringRef Line = 177 Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; }); 178 179 // P.character may be in UTF-16, transcode if necessary. 180 bool Valid; 181 size_t ByteInLine = measureUnits(Line, P.character, lspEncoding(), Valid); 182 if (!Valid && !AllowColumnsBeyondLineLength) 183 return llvm::make_error<llvm::StringError>( 184 llvm::formatv("{0} offset {1} is invalid for line {2}", lspEncoding(), 185 P.character, P.line), 186 llvm::errc::invalid_argument); 187 return StartOfLine + ByteInLine; 188 } 189 190 Position offsetToPosition(llvm::StringRef Code, size_t Offset) { 191 Offset = std::min(Code.size(), Offset); 192 llvm::StringRef Before = Code.substr(0, Offset); 193 int Lines = Before.count('\n'); 194 size_t PrevNL = Before.rfind('\n'); 195 size_t StartOfLine = (PrevNL == llvm::StringRef::npos) ? 0 : (PrevNL + 1); 196 Position Pos; 197 Pos.line = Lines; 198 Pos.character = lspLength(Before.substr(StartOfLine)); 199 return Pos; 200 } 201 202 Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc) { 203 // We use the SourceManager's line tables, but its column number is in bytes. 204 FileID FID; 205 unsigned Offset; 206 std::tie(FID, Offset) = SM.getDecomposedSpellingLoc(Loc); 207 Position P; 208 P.line = static_cast<int>(SM.getLineNumber(FID, Offset)) - 1; 209 bool Invalid = false; 210 llvm::StringRef Code = SM.getBufferData(FID, &Invalid); 211 if (!Invalid) { 212 auto ColumnInBytes = SM.getColumnNumber(FID, Offset) - 1; 213 auto LineSoFar = Code.substr(Offset - ColumnInBytes, ColumnInBytes); 214 P.character = lspLength(LineSoFar); 215 } 216 return P; 217 } 218 219 bool isSpelledInSource(SourceLocation Loc, const SourceManager &SM) { 220 if (Loc.isMacroID()) { 221 std::string PrintLoc = SM.getSpellingLoc(Loc).printToString(SM); 222 if (llvm::StringRef(PrintLoc).startswith("<scratch") || 223 llvm::StringRef(PrintLoc).startswith("<command line>")) 224 return false; 225 } 226 return true; 227 } 228 229 bool isValidFileRange(const SourceManager &Mgr, SourceRange R) { 230 if (!R.getBegin().isValid() || !R.getEnd().isValid()) 231 return false; 232 233 FileID BeginFID; 234 size_t BeginOffset = 0; 235 std::tie(BeginFID, BeginOffset) = Mgr.getDecomposedLoc(R.getBegin()); 236 237 FileID EndFID; 238 size_t EndOffset = 0; 239 std::tie(EndFID, EndOffset) = Mgr.getDecomposedLoc(R.getEnd()); 240 241 return BeginFID.isValid() && BeginFID == EndFID && BeginOffset <= EndOffset; 242 } 243 244 SourceLocation includeHashLoc(FileID IncludedFile, const SourceManager &SM) { 245 assert(SM.getLocForEndOfFile(IncludedFile).isFileID()); 246 FileID IncludingFile; 247 unsigned Offset; 248 std::tie(IncludingFile, Offset) = 249 SM.getDecomposedExpansionLoc(SM.getIncludeLoc(IncludedFile)); 250 bool Invalid = false; 251 llvm::StringRef Buf = SM.getBufferData(IncludingFile, &Invalid); 252 if (Invalid) 253 return SourceLocation(); 254 // Now buf is "...\n#include <foo>\n..." 255 // and Offset points here: ^ 256 // Rewind to the preceding # on the line. 257 assert(Offset < Buf.size()); 258 for (;; --Offset) { 259 if (Buf[Offset] == '#') 260 return SM.getComposedLoc(IncludingFile, Offset); 261 if (Buf[Offset] == '\n' || Offset == 0) // no hash, what's going on? 262 return SourceLocation(); 263 } 264 } 265 266 static unsigned getTokenLengthAtLoc(SourceLocation Loc, const SourceManager &SM, 267 const LangOptions &LangOpts) { 268 Token TheTok; 269 if (Lexer::getRawToken(Loc, TheTok, SM, LangOpts)) 270 return 0; 271 // FIXME: Here we check whether the token at the location is a greatergreater 272 // (>>) token and consider it as a single greater (>). This is to get it 273 // working for templates but it isn't correct for the right shift operator. We 274 // can avoid this by using half open char ranges in getFileRange() but getting 275 // token ending is not well supported in macroIDs. 276 if (TheTok.is(tok::greatergreater)) 277 return 1; 278 return TheTok.getLength(); 279 } 280 281 // Returns location of the last character of the token at a given loc 282 static SourceLocation getLocForTokenEnd(SourceLocation BeginLoc, 283 const SourceManager &SM, 284 const LangOptions &LangOpts) { 285 unsigned Len = getTokenLengthAtLoc(BeginLoc, SM, LangOpts); 286 return BeginLoc.getLocWithOffset(Len ? Len - 1 : 0); 287 } 288 289 // Returns location of the starting of the token at a given EndLoc 290 static SourceLocation getLocForTokenBegin(SourceLocation EndLoc, 291 const SourceManager &SM, 292 const LangOptions &LangOpts) { 293 return EndLoc.getLocWithOffset( 294 -(signed)getTokenLengthAtLoc(EndLoc, SM, LangOpts)); 295 } 296 297 // Converts a char source range to a token range. 298 static SourceRange toTokenRange(CharSourceRange Range, const SourceManager &SM, 299 const LangOptions &LangOpts) { 300 if (!Range.isTokenRange()) 301 Range.setEnd(getLocForTokenBegin(Range.getEnd(), SM, LangOpts)); 302 return Range.getAsRange(); 303 } 304 // Returns the union of two token ranges. 305 // To find the maximum of the Ends of the ranges, we compare the location of the 306 // last character of the token. 307 static SourceRange unionTokenRange(SourceRange R1, SourceRange R2, 308 const SourceManager &SM, 309 const LangOptions &LangOpts) { 310 SourceLocation Begin = 311 SM.isBeforeInTranslationUnit(R1.getBegin(), R2.getBegin()) 312 ? R1.getBegin() 313 : R2.getBegin(); 314 SourceLocation End = 315 SM.isBeforeInTranslationUnit(getLocForTokenEnd(R1.getEnd(), SM, LangOpts), 316 getLocForTokenEnd(R2.getEnd(), SM, LangOpts)) 317 ? R2.getEnd() 318 : R1.getEnd(); 319 return SourceRange(Begin, End); 320 } 321 322 // Given a range whose endpoints may be in different expansions or files, 323 // tries to find a range within a common file by following up the expansion and 324 // include location in each. 325 static SourceRange rangeInCommonFile(SourceRange R, const SourceManager &SM, 326 const LangOptions &LangOpts) { 327 // Fast path for most common cases. 328 if (SM.isWrittenInSameFile(R.getBegin(), R.getEnd())) 329 return R; 330 // Record the stack of expansion locations for the beginning, keyed by FileID. 331 llvm::DenseMap<FileID, SourceLocation> BeginExpansions; 332 for (SourceLocation Begin = R.getBegin(); Begin.isValid(); 333 Begin = Begin.isFileID() 334 ? includeHashLoc(SM.getFileID(Begin), SM) 335 : SM.getImmediateExpansionRange(Begin).getBegin()) { 336 BeginExpansions[SM.getFileID(Begin)] = Begin; 337 } 338 // Move up the stack of expansion locations for the end until we find the 339 // location in BeginExpansions with that has the same file id. 340 for (SourceLocation End = R.getEnd(); End.isValid(); 341 End = End.isFileID() ? includeHashLoc(SM.getFileID(End), SM) 342 : toTokenRange(SM.getImmediateExpansionRange(End), 343 SM, LangOpts) 344 .getEnd()) { 345 auto It = BeginExpansions.find(SM.getFileID(End)); 346 if (It != BeginExpansions.end()) { 347 if (SM.getFileOffset(It->second) > SM.getFileOffset(End)) 348 return SourceLocation(); 349 return {It->second, End}; 350 } 351 } 352 return SourceRange(); 353 } 354 355 // Find an expansion range (not necessarily immediate) the ends of which are in 356 // the same file id. 357 static SourceRange 358 getExpansionTokenRangeInSameFile(SourceLocation Loc, const SourceManager &SM, 359 const LangOptions &LangOpts) { 360 return rangeInCommonFile( 361 toTokenRange(SM.getImmediateExpansionRange(Loc), SM, LangOpts), SM, 362 LangOpts); 363 } 364 365 // Returns the file range for a given Location as a Token Range 366 // This is quite similar to getFileLoc in SourceManager as both use 367 // getImmediateExpansionRange and getImmediateSpellingLoc (for macro IDs). 368 // However: 369 // - We want to maintain the full range information as we move from one file to 370 // the next. getFileLoc only uses the BeginLoc of getImmediateExpansionRange. 371 // - We want to split '>>' tokens as the lexer parses the '>>' in nested 372 // template instantiations as a '>>' instead of two '>'s. 373 // There is also getExpansionRange but it simply calls 374 // getImmediateExpansionRange on the begin and ends separately which is wrong. 375 static SourceRange getTokenFileRange(SourceLocation Loc, 376 const SourceManager &SM, 377 const LangOptions &LangOpts) { 378 SourceRange FileRange = Loc; 379 while (!FileRange.getBegin().isFileID()) { 380 if (SM.isMacroArgExpansion(FileRange.getBegin())) { 381 FileRange = unionTokenRange( 382 SM.getImmediateSpellingLoc(FileRange.getBegin()), 383 SM.getImmediateSpellingLoc(FileRange.getEnd()), SM, LangOpts); 384 assert(SM.isWrittenInSameFile(FileRange.getBegin(), FileRange.getEnd())); 385 } else { 386 SourceRange ExpansionRangeForBegin = 387 getExpansionTokenRangeInSameFile(FileRange.getBegin(), SM, LangOpts); 388 SourceRange ExpansionRangeForEnd = 389 getExpansionTokenRangeInSameFile(FileRange.getEnd(), SM, LangOpts); 390 if (ExpansionRangeForBegin.isInvalid() || 391 ExpansionRangeForEnd.isInvalid()) 392 return SourceRange(); 393 assert(SM.isWrittenInSameFile(ExpansionRangeForBegin.getBegin(), 394 ExpansionRangeForEnd.getBegin()) && 395 "Both Expansion ranges should be in same file."); 396 FileRange = unionTokenRange(ExpansionRangeForBegin, ExpansionRangeForEnd, 397 SM, LangOpts); 398 } 399 } 400 return FileRange; 401 } 402 403 bool isInsideMainFile(SourceLocation Loc, const SourceManager &SM) { 404 if (!Loc.isValid()) 405 return false; 406 FileID FID = SM.getFileID(SM.getExpansionLoc(Loc)); 407 return FID == SM.getMainFileID() || FID == SM.getPreambleFileID(); 408 } 409 410 llvm::Optional<SourceRange> toHalfOpenFileRange(const SourceManager &SM, 411 const LangOptions &LangOpts, 412 SourceRange R) { 413 SourceRange R1 = getTokenFileRange(R.getBegin(), SM, LangOpts); 414 if (!isValidFileRange(SM, R1)) 415 return llvm::None; 416 417 SourceRange R2 = getTokenFileRange(R.getEnd(), SM, LangOpts); 418 if (!isValidFileRange(SM, R2)) 419 return llvm::None; 420 421 SourceRange Result = 422 rangeInCommonFile(unionTokenRange(R1, R2, SM, LangOpts), SM, LangOpts); 423 unsigned TokLen = getTokenLengthAtLoc(Result.getEnd(), SM, LangOpts); 424 // Convert from closed token range to half-open (char) range 425 Result.setEnd(Result.getEnd().getLocWithOffset(TokLen)); 426 if (!isValidFileRange(SM, Result)) 427 return llvm::None; 428 429 return Result; 430 } 431 432 llvm::StringRef toSourceCode(const SourceManager &SM, SourceRange R) { 433 assert(isValidFileRange(SM, R)); 434 bool Invalid = false; 435 auto *Buf = SM.getBuffer(SM.getFileID(R.getBegin()), &Invalid); 436 assert(!Invalid); 437 438 size_t BeginOffset = SM.getFileOffset(R.getBegin()); 439 size_t EndOffset = SM.getFileOffset(R.getEnd()); 440 return Buf->getBuffer().substr(BeginOffset, EndOffset - BeginOffset); 441 } 442 443 llvm::Expected<SourceLocation> sourceLocationInMainFile(const SourceManager &SM, 444 Position P) { 445 llvm::StringRef Code = SM.getBuffer(SM.getMainFileID())->getBuffer(); 446 auto Offset = 447 positionToOffset(Code, P, /*AllowColumnBeyondLineLength=*/false); 448 if (!Offset) 449 return Offset.takeError(); 450 return SM.getLocForStartOfFile(SM.getMainFileID()).getLocWithOffset(*Offset); 451 } 452 453 Range halfOpenToRange(const SourceManager &SM, CharSourceRange R) { 454 // Clang is 1-based, LSP uses 0-based indexes. 455 Position Begin = sourceLocToPosition(SM, R.getBegin()); 456 Position End = sourceLocToPosition(SM, R.getEnd()); 457 458 return {Begin, End}; 459 } 460 461 std::pair<size_t, size_t> offsetToClangLineColumn(llvm::StringRef Code, 462 size_t Offset) { 463 Offset = std::min(Code.size(), Offset); 464 llvm::StringRef Before = Code.substr(0, Offset); 465 int Lines = Before.count('\n'); 466 size_t PrevNL = Before.rfind('\n'); 467 size_t StartOfLine = (PrevNL == llvm::StringRef::npos) ? 0 : (PrevNL + 1); 468 return {Lines + 1, Offset - StartOfLine + 1}; 469 } 470 471 std::pair<StringRef, StringRef> splitQualifiedName(StringRef QName) { 472 size_t Pos = QName.rfind("::"); 473 if (Pos == llvm::StringRef::npos) 474 return {llvm::StringRef(), QName}; 475 return {QName.substr(0, Pos + 2), QName.substr(Pos + 2)}; 476 } 477 478 TextEdit replacementToEdit(llvm::StringRef Code, 479 const tooling::Replacement &R) { 480 Range ReplacementRange = { 481 offsetToPosition(Code, R.getOffset()), 482 offsetToPosition(Code, R.getOffset() + R.getLength())}; 483 return {ReplacementRange, std::string(R.getReplacementText())}; 484 } 485 486 std::vector<TextEdit> replacementsToEdits(llvm::StringRef Code, 487 const tooling::Replacements &Repls) { 488 std::vector<TextEdit> Edits; 489 for (const auto &R : Repls) 490 Edits.push_back(replacementToEdit(Code, R)); 491 return Edits; 492 } 493 494 llvm::Optional<std::string> getCanonicalPath(const FileEntry *F, 495 const SourceManager &SourceMgr) { 496 if (!F) 497 return None; 498 499 llvm::SmallString<128> FilePath = F->getName(); 500 if (!llvm::sys::path::is_absolute(FilePath)) { 501 if (auto EC = 502 SourceMgr.getFileManager().getVirtualFileSystem().makeAbsolute( 503 FilePath)) { 504 elog("Could not turn relative path '{0}' to absolute: {1}", FilePath, 505 EC.message()); 506 return None; 507 } 508 } 509 510 // Handle the symbolic link path case where the current working directory 511 // (getCurrentWorkingDirectory) is a symlink. We always want to the real 512 // file path (instead of the symlink path) for the C++ symbols. 513 // 514 // Consider the following example: 515 // 516 // src dir: /project/src/foo.h 517 // current working directory (symlink): /tmp/build -> /project/src/ 518 // 519 // The file path of Symbol is "/project/src/foo.h" instead of 520 // "/tmp/build/foo.h" 521 if (auto Dir = SourceMgr.getFileManager().getDirectory( 522 llvm::sys::path::parent_path(FilePath))) { 523 llvm::SmallString<128> RealPath; 524 llvm::StringRef DirName = SourceMgr.getFileManager().getCanonicalName(*Dir); 525 llvm::sys::path::append(RealPath, DirName, 526 llvm::sys::path::filename(FilePath)); 527 return RealPath.str().str(); 528 } 529 530 return FilePath.str().str(); 531 } 532 533 TextEdit toTextEdit(const FixItHint &FixIt, const SourceManager &M, 534 const LangOptions &L) { 535 TextEdit Result; 536 Result.range = 537 halfOpenToRange(M, Lexer::makeFileCharRange(FixIt.RemoveRange, M, L)); 538 Result.newText = FixIt.CodeToInsert; 539 return Result; 540 } 541 542 FileDigest digest(llvm::StringRef Content) { 543 uint64_t Hash{llvm::xxHash64(Content)}; 544 FileDigest Result; 545 for (unsigned I = 0; I < Result.size(); ++I) { 546 Result[I] = uint8_t(Hash); 547 Hash >>= 8; 548 } 549 return Result; 550 } 551 552 llvm::Optional<FileDigest> digestFile(const SourceManager &SM, FileID FID) { 553 bool Invalid = false; 554 llvm::StringRef Content = SM.getBufferData(FID, &Invalid); 555 if (Invalid) 556 return None; 557 return digest(Content); 558 } 559 560 format::FormatStyle getFormatStyleForFile(llvm::StringRef File, 561 llvm::StringRef Content, 562 llvm::vfs::FileSystem *FS) { 563 auto Style = format::getStyle(format::DefaultFormatStyle, File, 564 format::DefaultFallbackStyle, Content, FS); 565 if (!Style) { 566 log("getStyle() failed for file {0}: {1}. Fallback is LLVM style.", File, 567 Style.takeError()); 568 return format::getLLVMStyle(); 569 } 570 return *Style; 571 } 572 573 llvm::Expected<tooling::Replacements> 574 cleanupAndFormat(StringRef Code, const tooling::Replacements &Replaces, 575 const format::FormatStyle &Style) { 576 auto CleanReplaces = cleanupAroundReplacements(Code, Replaces, Style); 577 if (!CleanReplaces) 578 return CleanReplaces; 579 return formatReplacements(Code, std::move(*CleanReplaces), Style); 580 } 581 582 static void 583 lex(llvm::StringRef Code, const LangOptions &LangOpts, 584 llvm::function_ref<void(const syntax::Token &, const SourceManager &SM)> 585 Action) { 586 // FIXME: InMemoryFileAdapter crashes unless the buffer is null terminated! 587 std::string NullTerminatedCode = Code.str(); 588 SourceManagerForFile FileSM("dummy.cpp", NullTerminatedCode); 589 auto &SM = FileSM.get(); 590 for (const auto &Tok : syntax::tokenize(SM.getMainFileID(), SM, LangOpts)) 591 Action(Tok, SM); 592 } 593 594 llvm::StringMap<unsigned> collectIdentifiers(llvm::StringRef Content, 595 const format::FormatStyle &Style) { 596 llvm::StringMap<unsigned> Identifiers; 597 auto LangOpt = format::getFormattingLangOpts(Style); 598 lex(Content, LangOpt, [&](const syntax::Token &Tok, const SourceManager &SM) { 599 if (Tok.kind() == tok::identifier) 600 ++Identifiers[Tok.text(SM)]; 601 // FIXME: Should this function really return keywords too ? 602 else if (const auto *Keyword = tok::getKeywordSpelling(Tok.kind())) 603 ++Identifiers[Keyword]; 604 }); 605 return Identifiers; 606 } 607 608 std::vector<Range> collectIdentifierRanges(llvm::StringRef Identifier, 609 llvm::StringRef Content, 610 const LangOptions &LangOpts) { 611 std::vector<Range> Ranges; 612 lex(Content, LangOpts, 613 [&](const syntax::Token &Tok, const SourceManager &SM) { 614 if (Tok.kind() != tok::identifier || Tok.text(SM) != Identifier) 615 return; 616 Ranges.push_back(halfOpenToRange(SM, Tok.range(SM).toCharRange(SM))); 617 }); 618 return Ranges; 619 } 620 621 namespace { 622 struct NamespaceEvent { 623 enum { 624 BeginNamespace, // namespace <ns> {. Payload is resolved <ns>. 625 EndNamespace, // } // namespace <ns>. Payload is resolved *outer* 626 // namespace. 627 UsingDirective // using namespace <ns>. Payload is unresolved <ns>. 628 } Trigger; 629 std::string Payload; 630 Position Pos; 631 }; 632 // Scans C++ source code for constructs that change the visible namespaces. 633 void parseNamespaceEvents(llvm::StringRef Code, const LangOptions &LangOpts, 634 llvm::function_ref<void(NamespaceEvent)> Callback) { 635 636 // Stack of enclosing namespaces, e.g. {"clang", "clangd"} 637 std::vector<std::string> Enclosing; // Contains e.g. "clang", "clangd" 638 // Stack counts open braces. true if the brace opened a namespace. 639 std::vector<bool> BraceStack; 640 641 enum { 642 Default, 643 Namespace, // just saw 'namespace' 644 NamespaceName, // just saw 'namespace' NSName 645 Using, // just saw 'using' 646 UsingNamespace, // just saw 'using namespace' 647 UsingNamespaceName, // just saw 'using namespace' NSName 648 } State = Default; 649 std::string NSName; 650 651 NamespaceEvent Event; 652 lex(Code, LangOpts, [&](const syntax::Token &Tok, const SourceManager &SM) { 653 Event.Pos = sourceLocToPosition(SM, Tok.location()); 654 switch (Tok.kind()) { 655 case tok::kw_using: 656 State = State == Default ? Using : Default; 657 break; 658 case tok::kw_namespace: 659 switch (State) { 660 case Using: 661 State = UsingNamespace; 662 break; 663 case Default: 664 State = Namespace; 665 break; 666 default: 667 State = Default; 668 break; 669 } 670 break; 671 case tok::identifier: 672 switch (State) { 673 case UsingNamespace: 674 NSName.clear(); 675 LLVM_FALLTHROUGH; 676 case UsingNamespaceName: 677 NSName.append(Tok.text(SM).str()); 678 State = UsingNamespaceName; 679 break; 680 case Namespace: 681 NSName.clear(); 682 LLVM_FALLTHROUGH; 683 case NamespaceName: 684 NSName.append(Tok.text(SM).str()); 685 State = NamespaceName; 686 break; 687 case Using: 688 case Default: 689 State = Default; 690 break; 691 } 692 break; 693 case tok::coloncolon: 694 // This can come at the beginning or in the middle of a namespace 695 // name. 696 switch (State) { 697 case UsingNamespace: 698 NSName.clear(); 699 LLVM_FALLTHROUGH; 700 case UsingNamespaceName: 701 NSName.append("::"); 702 State = UsingNamespaceName; 703 break; 704 case NamespaceName: 705 NSName.append("::"); 706 State = NamespaceName; 707 break; 708 case Namespace: // Not legal here. 709 case Using: 710 case Default: 711 State = Default; 712 break; 713 } 714 break; 715 case tok::l_brace: 716 // Record which { started a namespace, so we know when } ends one. 717 if (State == NamespaceName) { 718 // Parsed: namespace <name> { 719 BraceStack.push_back(true); 720 Enclosing.push_back(NSName); 721 Event.Trigger = NamespaceEvent::BeginNamespace; 722 Event.Payload = llvm::join(Enclosing, "::"); 723 Callback(Event); 724 } else { 725 // This case includes anonymous namespaces (State = Namespace). 726 // For our purposes, they're not namespaces and we ignore them. 727 BraceStack.push_back(false); 728 } 729 State = Default; 730 break; 731 case tok::r_brace: 732 // If braces are unmatched, we're going to be confused, but don't 733 // crash. 734 if (!BraceStack.empty()) { 735 if (BraceStack.back()) { 736 // Parsed: } // namespace 737 Enclosing.pop_back(); 738 Event.Trigger = NamespaceEvent::EndNamespace; 739 Event.Payload = llvm::join(Enclosing, "::"); 740 Callback(Event); 741 } 742 BraceStack.pop_back(); 743 } 744 break; 745 case tok::semi: 746 if (State == UsingNamespaceName) { 747 // Parsed: using namespace <name> ; 748 Event.Trigger = NamespaceEvent::UsingDirective; 749 Event.Payload = std::move(NSName); 750 Callback(Event); 751 } 752 State = Default; 753 break; 754 default: 755 State = Default; 756 break; 757 } 758 }); 759 } 760 761 // Returns the prefix namespaces of NS: {"" ... NS}. 762 llvm::SmallVector<llvm::StringRef, 8> ancestorNamespaces(llvm::StringRef NS) { 763 llvm::SmallVector<llvm::StringRef, 8> Results; 764 Results.push_back(NS.take_front(0)); 765 NS.split(Results, "::", /*MaxSplit=*/-1, /*KeepEmpty=*/false); 766 for (llvm::StringRef &R : Results) 767 R = NS.take_front(R.end() - NS.begin()); 768 return Results; 769 } 770 771 } // namespace 772 773 std::vector<std::string> visibleNamespaces(llvm::StringRef Code, 774 const LangOptions &LangOpts) { 775 std::string Current; 776 // Map from namespace to (resolved) namespaces introduced via using directive. 777 llvm::StringMap<llvm::StringSet<>> UsingDirectives; 778 779 parseNamespaceEvents(Code, LangOpts, [&](NamespaceEvent Event) { 780 llvm::StringRef NS = Event.Payload; 781 switch (Event.Trigger) { 782 case NamespaceEvent::BeginNamespace: 783 case NamespaceEvent::EndNamespace: 784 Current = std::move(Event.Payload); 785 break; 786 case NamespaceEvent::UsingDirective: 787 if (NS.consume_front("::")) 788 UsingDirectives[Current].insert(NS); 789 else { 790 for (llvm::StringRef Enclosing : ancestorNamespaces(Current)) { 791 if (Enclosing.empty()) 792 UsingDirectives[Current].insert(NS); 793 else 794 UsingDirectives[Current].insert((Enclosing + "::" + NS).str()); 795 } 796 } 797 break; 798 } 799 }); 800 801 std::vector<std::string> Found; 802 for (llvm::StringRef Enclosing : ancestorNamespaces(Current)) { 803 Found.push_back(std::string(Enclosing)); 804 auto It = UsingDirectives.find(Enclosing); 805 if (It != UsingDirectives.end()) 806 for (const auto &Used : It->second) 807 Found.push_back(std::string(Used.getKey())); 808 } 809 810 llvm::sort(Found, [&](const std::string &LHS, const std::string &RHS) { 811 if (Current == RHS) 812 return false; 813 if (Current == LHS) 814 return true; 815 return LHS < RHS; 816 }); 817 Found.erase(std::unique(Found.begin(), Found.end()), Found.end()); 818 return Found; 819 } 820 821 llvm::StringSet<> collectWords(llvm::StringRef Content) { 822 // We assume short words are not significant. 823 // We may want to consider other stopwords, e.g. language keywords. 824 // (A very naive implementation showed no benefit, but lexing might do better) 825 static constexpr int MinWordLength = 4; 826 827 std::vector<CharRole> Roles(Content.size()); 828 calculateRoles(Content, Roles); 829 830 llvm::StringSet<> Result; 831 llvm::SmallString<256> Word; 832 auto Flush = [&] { 833 if (Word.size() >= MinWordLength) { 834 for (char &C : Word) 835 C = llvm::toLower(C); 836 Result.insert(Word); 837 } 838 Word.clear(); 839 }; 840 for (unsigned I = 0; I < Content.size(); ++I) { 841 switch (Roles[I]) { 842 case Head: 843 Flush(); 844 LLVM_FALLTHROUGH; 845 case Tail: 846 Word.push_back(Content[I]); 847 break; 848 case Unknown: 849 case Separator: 850 Flush(); 851 break; 852 } 853 } 854 Flush(); 855 856 return Result; 857 } 858 859 static bool isLikelyIdentifier(llvm::StringRef Word, llvm::StringRef Before, 860 llvm::StringRef After) { 861 // `foo` is an identifier. 862 if (Before.endswith("`") && After.startswith("`")) 863 return true; 864 // In foo::bar, both foo and bar are identifiers. 865 if (Before.endswith("::") || After.startswith("::")) 866 return true; 867 // Doxygen tags like \c foo indicate identifiers. 868 // Don't search too far back. 869 // This duplicates clang's doxygen parser, revisit if it gets complicated. 870 Before = Before.take_back(100); // Don't search too far back. 871 auto Pos = Before.find_last_of("\\@"); 872 if (Pos != llvm::StringRef::npos) { 873 llvm::StringRef Tag = Before.substr(Pos + 1).rtrim(' '); 874 if (Tag == "p" || Tag == "c" || Tag == "class" || Tag == "tparam" || 875 Tag == "param" || Tag == "param[in]" || Tag == "param[out]" || 876 Tag == "param[in,out]" || Tag == "retval" || Tag == "throw" || 877 Tag == "throws" || Tag == "link") 878 return true; 879 } 880 881 // Word contains underscore. 882 // This handles things like snake_case and MACRO_CASE. 883 if (Word.contains('_')) { 884 return true; 885 } 886 // Word contains capital letter other than at beginning. 887 // This handles things like lowerCamel and UpperCamel. 888 // The check for also containing a lowercase letter is to rule out 889 // initialisms like "HTTP". 890 bool HasLower = Word.find_if(clang::isLowercase) != StringRef::npos; 891 bool HasUpper = Word.substr(1).find_if(clang::isUppercase) != StringRef::npos; 892 if (HasLower && HasUpper) { 893 return true; 894 } 895 // FIXME: consider mid-sentence Capitalization? 896 return false; 897 } 898 899 llvm::Optional<SpelledWord> SpelledWord::touching(SourceLocation SpelledLoc, 900 const syntax::TokenBuffer &TB, 901 const LangOptions &LangOpts) { 902 const auto &SM = TB.sourceManager(); 903 auto Touching = syntax::spelledTokensTouching(SpelledLoc, TB); 904 for (const auto &T : Touching) { 905 // If the token is an identifier or a keyword, don't use any heuristics. 906 if (tok::isAnyIdentifier(T.kind()) || tok::getKeywordSpelling(T.kind())) { 907 SpelledWord Result; 908 Result.Location = T.location(); 909 Result.Text = T.text(SM); 910 Result.LikelyIdentifier = tok::isAnyIdentifier(T.kind()); 911 Result.PartOfSpelledToken = &T; 912 Result.SpelledToken = &T; 913 auto Expanded = 914 TB.expandedTokens(SM.getMacroArgExpandedLocation(T.location())); 915 if (Expanded.size() == 1 && Expanded.front().text(SM) == Result.Text) 916 Result.ExpandedToken = &Expanded.front(); 917 return Result; 918 } 919 } 920 FileID File; 921 unsigned Offset; 922 std::tie(File, Offset) = SM.getDecomposedLoc(SpelledLoc); 923 bool Invalid = false; 924 llvm::StringRef Code = SM.getBufferData(File, &Invalid); 925 if (Invalid) 926 return llvm::None; 927 unsigned B = Offset, E = Offset; 928 while (B > 0 && isIdentifierBody(Code[B - 1])) 929 --B; 930 while (E < Code.size() && isIdentifierBody(Code[E])) 931 ++E; 932 if (B == E) 933 return llvm::None; 934 935 SpelledWord Result; 936 Result.Location = SM.getComposedLoc(File, B); 937 Result.Text = Code.slice(B, E); 938 Result.LikelyIdentifier = 939 isLikelyIdentifier(Result.Text, Code.substr(0, B), Code.substr(E)) && 940 // should not be a keyword 941 tok::isAnyIdentifier( 942 IdentifierTable(LangOpts).get(Result.Text).getTokenID()); 943 for (const auto &T : Touching) 944 if (T.location() <= Result.Location) 945 Result.PartOfSpelledToken = &T; 946 return Result; 947 } 948 949 llvm::Optional<DefinedMacro> locateMacroAt(const syntax::Token &SpelledTok, 950 Preprocessor &PP) { 951 SourceLocation Loc = SpelledTok.location(); 952 assert(Loc.isFileID()); 953 const auto &SM = PP.getSourceManager(); 954 IdentifierInfo *IdentifierInfo = PP.getIdentifierInfo(SpelledTok.text(SM)); 955 if (!IdentifierInfo || !IdentifierInfo->hadMacroDefinition()) 956 return None; 957 958 // Get the definition just before the searched location so that a macro 959 // referenced in a '#undef MACRO' can still be found. Note that we only do 960 // that if Loc is not pointing at start of file. 961 if (SM.getLocForStartOfFile(SM.getFileID(Loc)) != Loc) 962 Loc = Loc.getLocWithOffset(-1); 963 MacroDefinition MacroDef = PP.getMacroDefinitionAtLoc(IdentifierInfo, Loc); 964 if (auto *MI = MacroDef.getMacroInfo()) 965 return DefinedMacro{ 966 IdentifierInfo->getName(), MI, 967 translatePreamblePatchLocation(MI->getDefinitionLoc(), SM)}; 968 return None; 969 } 970 971 llvm::Expected<std::string> Edit::apply() const { 972 return tooling::applyAllReplacements(InitialCode, Replacements); 973 } 974 975 std::vector<TextEdit> Edit::asTextEdits() const { 976 return replacementsToEdits(InitialCode, Replacements); 977 } 978 979 bool Edit::canApplyTo(llvm::StringRef Code) const { 980 // Create line iterators, since line numbers are important while applying our 981 // edit we cannot skip blank lines. 982 auto LHS = llvm::MemoryBuffer::getMemBuffer(Code); 983 llvm::line_iterator LHSIt(*LHS, /*SkipBlanks=*/false); 984 985 auto RHS = llvm::MemoryBuffer::getMemBuffer(InitialCode); 986 llvm::line_iterator RHSIt(*RHS, /*SkipBlanks=*/false); 987 988 // Compare the InitialCode we prepared the edit for with the Code we received 989 // line by line to make sure there are no differences. 990 // FIXME: This check is too conservative now, it should be enough to only 991 // check lines around the replacements contained inside the Edit. 992 while (!LHSIt.is_at_eof() && !RHSIt.is_at_eof()) { 993 if (*LHSIt != *RHSIt) 994 return false; 995 ++LHSIt; 996 ++RHSIt; 997 } 998 999 // After we reach EOF for any of the files we make sure the other one doesn't 1000 // contain any additional content except empty lines, they should not 1001 // interfere with the edit we produced. 1002 while (!LHSIt.is_at_eof()) { 1003 if (!LHSIt->empty()) 1004 return false; 1005 ++LHSIt; 1006 } 1007 while (!RHSIt.is_at_eof()) { 1008 if (!RHSIt->empty()) 1009 return false; 1010 ++RHSIt; 1011 } 1012 return true; 1013 } 1014 1015 llvm::Error reformatEdit(Edit &E, const format::FormatStyle &Style) { 1016 if (auto NewEdits = cleanupAndFormat(E.InitialCode, E.Replacements, Style)) 1017 E.Replacements = std::move(*NewEdits); 1018 else 1019 return NewEdits.takeError(); 1020 return llvm::Error::success(); 1021 } 1022 1023 EligibleRegion getEligiblePoints(llvm::StringRef Code, 1024 llvm::StringRef FullyQualifiedName, 1025 const LangOptions &LangOpts) { 1026 EligibleRegion ER; 1027 // Start with global namespace. 1028 std::vector<std::string> Enclosing = {""}; 1029 // FIXME: In addition to namespaces try to generate events for function 1030 // definitions as well. One might use a closing parantheses(")" followed by an 1031 // opening brace "{" to trigger the start. 1032 parseNamespaceEvents(Code, LangOpts, [&](NamespaceEvent Event) { 1033 // Using Directives only introduces declarations to current scope, they do 1034 // not change the current namespace, so skip them. 1035 if (Event.Trigger == NamespaceEvent::UsingDirective) 1036 return; 1037 // Do not qualify the global namespace. 1038 if (!Event.Payload.empty()) 1039 Event.Payload.append("::"); 1040 1041 std::string CurrentNamespace; 1042 if (Event.Trigger == NamespaceEvent::BeginNamespace) { 1043 Enclosing.emplace_back(std::move(Event.Payload)); 1044 CurrentNamespace = Enclosing.back(); 1045 // parseNameSpaceEvents reports the beginning position of a token; we want 1046 // to insert after '{', so increment by one. 1047 ++Event.Pos.character; 1048 } else { 1049 // Event.Payload points to outer namespace when exiting a scope, so use 1050 // the namespace we've last entered instead. 1051 CurrentNamespace = std::move(Enclosing.back()); 1052 Enclosing.pop_back(); 1053 assert(Enclosing.back() == Event.Payload); 1054 } 1055 1056 // Ignore namespaces that are not a prefix of the target. 1057 if (!FullyQualifiedName.startswith(CurrentNamespace)) 1058 return; 1059 1060 // Prefer the namespace that shares the longest prefix with target. 1061 if (CurrentNamespace.size() > ER.EnclosingNamespace.size()) { 1062 ER.EligiblePoints.clear(); 1063 ER.EnclosingNamespace = CurrentNamespace; 1064 } 1065 if (CurrentNamespace.size() == ER.EnclosingNamespace.size()) 1066 ER.EligiblePoints.emplace_back(std::move(Event.Pos)); 1067 }); 1068 // If there were no shared namespaces just return EOF. 1069 if (ER.EligiblePoints.empty()) { 1070 assert(ER.EnclosingNamespace.empty()); 1071 ER.EligiblePoints.emplace_back(offsetToPosition(Code, Code.size())); 1072 } 1073 return ER; 1074 } 1075 1076 bool isHeaderFile(llvm::StringRef FileName, 1077 llvm::Optional<LangOptions> LangOpts) { 1078 // Respect the langOpts, for non-file-extension cases, e.g. standard library 1079 // files. 1080 if (LangOpts && LangOpts->IsHeaderFile) 1081 return true; 1082 namespace types = clang::driver::types; 1083 auto Lang = types::lookupTypeForExtension( 1084 llvm::sys::path::extension(FileName).substr(1)); 1085 return Lang != types::TY_INVALID && types::onlyPrecompileType(Lang); 1086 } 1087 1088 bool isProtoFile(SourceLocation Loc, const SourceManager &SM) { 1089 auto FileName = SM.getFilename(Loc); 1090 if (!FileName.endswith(".proto.h") && !FileName.endswith(".pb.h")) 1091 return false; 1092 auto FID = SM.getFileID(Loc); 1093 // All proto generated headers should start with this line. 1094 static const char *PROTO_HEADER_COMMENT = 1095 "// Generated by the protocol buffer compiler. DO NOT EDIT!"; 1096 // Double check that this is an actual protobuf header. 1097 return SM.getBufferData(FID).startswith(PROTO_HEADER_COMMENT); 1098 } 1099 1100 } // namespace clangd 1101 } // namespace clang 1102