1 //===--- SourceCode.h - Manipulating source code as strings -----*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 #include "SourceCode.h" 9 10 #include "FuzzyMatch.h" 11 #include "Preamble.h" 12 #include "Protocol.h" 13 #include "refactor/Tweak.h" 14 #include "support/Context.h" 15 #include "support/Logger.h" 16 #include "support/Threading.h" 17 #include "clang/AST/ASTContext.h" 18 #include "clang/Basic/LangOptions.h" 19 #include "clang/Basic/SourceLocation.h" 20 #include "clang/Basic/SourceManager.h" 21 #include "clang/Basic/TokenKinds.h" 22 #include "clang/Driver/Types.h" 23 #include "clang/Format/Format.h" 24 #include "clang/Lex/Lexer.h" 25 #include "clang/Lex/Preprocessor.h" 26 #include "clang/Lex/Token.h" 27 #include "clang/Tooling/Core/Replacement.h" 28 #include "clang/Tooling/Syntax/Tokens.h" 29 #include "llvm/ADT/ArrayRef.h" 30 #include "llvm/ADT/None.h" 31 #include "llvm/ADT/STLExtras.h" 32 #include "llvm/ADT/StringExtras.h" 33 #include "llvm/ADT/StringMap.h" 34 #include "llvm/ADT/StringRef.h" 35 #include "llvm/Support/Compiler.h" 36 #include "llvm/Support/Errc.h" 37 #include "llvm/Support/Error.h" 38 #include "llvm/Support/ErrorHandling.h" 39 #include "llvm/Support/LineIterator.h" 40 #include "llvm/Support/MemoryBuffer.h" 41 #include "llvm/Support/Path.h" 42 #include "llvm/Support/SHA1.h" 43 #include "llvm/Support/VirtualFileSystem.h" 44 #include "llvm/Support/xxhash.h" 45 #include <algorithm> 46 #include <cstddef> 47 #include <string> 48 #include <vector> 49 50 namespace clang { 51 namespace clangd { 52 53 // Here be dragons. LSP positions use columns measured in *UTF-16 code units*! 54 // Clangd uses UTF-8 and byte-offsets internally, so conversion is nontrivial. 55 56 // Iterates over unicode codepoints in the (UTF-8) string. For each, 57 // invokes CB(UTF-8 length, UTF-16 length), and breaks if it returns true. 58 // Returns true if CB returned true, false if we hit the end of string. 59 // 60 // If the string is not valid UTF-8, we log this error and "decode" the 61 // text in some arbitrary way. This is pretty sad, but this tends to happen deep 62 // within indexing of headers where clang misdetected the encoding, and 63 // propagating the error all the way back up is (probably?) not be worth it. 64 template <typename Callback> 65 static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) { 66 bool LoggedInvalid = false; 67 // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). 68 // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. 69 for (size_t I = 0; I < U8.size();) { 70 unsigned char C = static_cast<unsigned char>(U8[I]); 71 if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character. 72 if (CB(1, 1)) 73 return true; 74 ++I; 75 continue; 76 } 77 // This convenient property of UTF-8 holds for all non-ASCII characters. 78 size_t UTF8Length = llvm::countLeadingOnes(C); 79 // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here. 80 // 11111xxx is not valid UTF-8 at all, maybe some ISO-8859-*. 81 if (LLVM_UNLIKELY(UTF8Length < 2 || UTF8Length > 4)) { 82 if (!LoggedInvalid) { 83 elog("File has invalid UTF-8 near offset {0}: {1}", I, llvm::toHex(U8)); 84 LoggedInvalid = true; 85 } 86 // We can't give a correct result, but avoid returning something wild. 87 // Pretend this is a valid ASCII byte, for lack of better options. 88 // (Too late to get ISO-8859-* right, we've skipped some bytes already). 89 if (CB(1, 1)) 90 return true; 91 ++I; 92 continue; 93 } 94 I += UTF8Length; // Skip over all trailing bytes. 95 // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). 96 // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...) 97 if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1)) 98 return true; 99 } 100 return false; 101 } 102 103 // Returns the byte offset into the string that is an offset of \p Units in 104 // the specified encoding. 105 // Conceptually, this converts to the encoding, truncates to CodeUnits, 106 // converts back to UTF-8, and returns the length in bytes. 107 static size_t measureUnits(llvm::StringRef U8, int Units, OffsetEncoding Enc, 108 bool &Valid) { 109 Valid = Units >= 0; 110 if (Units <= 0) 111 return 0; 112 size_t Result = 0; 113 switch (Enc) { 114 case OffsetEncoding::UTF8: 115 Result = Units; 116 break; 117 case OffsetEncoding::UTF16: 118 Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) { 119 Result += U8Len; 120 Units -= U16Len; 121 return Units <= 0; 122 }); 123 if (Units < 0) // Offset in the middle of a surrogate pair. 124 Valid = false; 125 break; 126 case OffsetEncoding::UTF32: 127 Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) { 128 Result += U8Len; 129 Units--; 130 return Units <= 0; 131 }); 132 break; 133 case OffsetEncoding::UnsupportedEncoding: 134 llvm_unreachable("unsupported encoding"); 135 } 136 // Don't return an out-of-range index if we overran. 137 if (Result > U8.size()) { 138 Valid = false; 139 return U8.size(); 140 } 141 return Result; 142 } 143 144 Key<OffsetEncoding> kCurrentOffsetEncoding; 145 static OffsetEncoding lspEncoding() { 146 auto *Enc = Context::current().get(kCurrentOffsetEncoding); 147 return Enc ? *Enc : OffsetEncoding::UTF16; 148 } 149 150 // Like most strings in clangd, the input is UTF-8 encoded. 151 size_t lspLength(llvm::StringRef Code) { 152 size_t Count = 0; 153 switch (lspEncoding()) { 154 case OffsetEncoding::UTF8: 155 Count = Code.size(); 156 break; 157 case OffsetEncoding::UTF16: 158 iterateCodepoints(Code, [&](int U8Len, int U16Len) { 159 Count += U16Len; 160 return false; 161 }); 162 break; 163 case OffsetEncoding::UTF32: 164 iterateCodepoints(Code, [&](int U8Len, int U16Len) { 165 ++Count; 166 return false; 167 }); 168 break; 169 case OffsetEncoding::UnsupportedEncoding: 170 llvm_unreachable("unsupported encoding"); 171 } 172 return Count; 173 } 174 175 llvm::Expected<size_t> positionToOffset(llvm::StringRef Code, Position P, 176 bool AllowColumnsBeyondLineLength) { 177 if (P.line < 0) 178 return llvm::make_error<llvm::StringError>( 179 llvm::formatv("Line value can't be negative ({0})", P.line), 180 llvm::errc::invalid_argument); 181 if (P.character < 0) 182 return llvm::make_error<llvm::StringError>( 183 llvm::formatv("Character value can't be negative ({0})", P.character), 184 llvm::errc::invalid_argument); 185 size_t StartOfLine = 0; 186 for (int I = 0; I != P.line; ++I) { 187 size_t NextNL = Code.find('\n', StartOfLine); 188 if (NextNL == llvm::StringRef::npos) 189 return llvm::make_error<llvm::StringError>( 190 llvm::formatv("Line value is out of range ({0})", P.line), 191 llvm::errc::invalid_argument); 192 StartOfLine = NextNL + 1; 193 } 194 StringRef Line = 195 Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; }); 196 197 // P.character may be in UTF-16, transcode if necessary. 198 bool Valid; 199 size_t ByteInLine = measureUnits(Line, P.character, lspEncoding(), Valid); 200 if (!Valid && !AllowColumnsBeyondLineLength) 201 return llvm::make_error<llvm::StringError>( 202 llvm::formatv("{0} offset {1} is invalid for line {2}", lspEncoding(), 203 P.character, P.line), 204 llvm::errc::invalid_argument); 205 return StartOfLine + ByteInLine; 206 } 207 208 Position offsetToPosition(llvm::StringRef Code, size_t Offset) { 209 Offset = std::min(Code.size(), Offset); 210 llvm::StringRef Before = Code.substr(0, Offset); 211 int Lines = Before.count('\n'); 212 size_t PrevNL = Before.rfind('\n'); 213 size_t StartOfLine = (PrevNL == llvm::StringRef::npos) ? 0 : (PrevNL + 1); 214 Position Pos; 215 Pos.line = Lines; 216 Pos.character = lspLength(Before.substr(StartOfLine)); 217 return Pos; 218 } 219 220 Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc) { 221 // We use the SourceManager's line tables, but its column number is in bytes. 222 FileID FID; 223 unsigned Offset; 224 std::tie(FID, Offset) = SM.getDecomposedSpellingLoc(Loc); 225 Position P; 226 P.line = static_cast<int>(SM.getLineNumber(FID, Offset)) - 1; 227 bool Invalid = false; 228 llvm::StringRef Code = SM.getBufferData(FID, &Invalid); 229 if (!Invalid) { 230 auto ColumnInBytes = SM.getColumnNumber(FID, Offset) - 1; 231 auto LineSoFar = Code.substr(Offset - ColumnInBytes, ColumnInBytes); 232 P.character = lspLength(LineSoFar); 233 } 234 return P; 235 } 236 237 bool isSpelledInSource(SourceLocation Loc, const SourceManager &SM) { 238 if (Loc.isMacroID()) { 239 std::string PrintLoc = SM.getSpellingLoc(Loc).printToString(SM); 240 if (llvm::StringRef(PrintLoc).startswith("<scratch") || 241 llvm::StringRef(PrintLoc).startswith("<command line>")) 242 return false; 243 } 244 return true; 245 } 246 247 bool isValidFileRange(const SourceManager &Mgr, SourceRange R) { 248 if (!R.getBegin().isValid() || !R.getEnd().isValid()) 249 return false; 250 251 FileID BeginFID; 252 size_t BeginOffset = 0; 253 std::tie(BeginFID, BeginOffset) = Mgr.getDecomposedLoc(R.getBegin()); 254 255 FileID EndFID; 256 size_t EndOffset = 0; 257 std::tie(EndFID, EndOffset) = Mgr.getDecomposedLoc(R.getEnd()); 258 259 return BeginFID.isValid() && BeginFID == EndFID && BeginOffset <= EndOffset; 260 } 261 262 SourceLocation includeHashLoc(FileID IncludedFile, const SourceManager &SM) { 263 assert(SM.getLocForEndOfFile(IncludedFile).isFileID()); 264 FileID IncludingFile; 265 unsigned Offset; 266 std::tie(IncludingFile, Offset) = 267 SM.getDecomposedExpansionLoc(SM.getIncludeLoc(IncludedFile)); 268 bool Invalid = false; 269 llvm::StringRef Buf = SM.getBufferData(IncludingFile, &Invalid); 270 if (Invalid) 271 return SourceLocation(); 272 // Now buf is "...\n#include <foo>\n..." 273 // and Offset points here: ^ 274 // Rewind to the preceding # on the line. 275 assert(Offset < Buf.size()); 276 for (;; --Offset) { 277 if (Buf[Offset] == '#') 278 return SM.getComposedLoc(IncludingFile, Offset); 279 if (Buf[Offset] == '\n' || Offset == 0) // no hash, what's going on? 280 return SourceLocation(); 281 } 282 } 283 284 static unsigned getTokenLengthAtLoc(SourceLocation Loc, const SourceManager &SM, 285 const LangOptions &LangOpts) { 286 Token TheTok; 287 if (Lexer::getRawToken(Loc, TheTok, SM, LangOpts)) 288 return 0; 289 // FIXME: Here we check whether the token at the location is a greatergreater 290 // (>>) token and consider it as a single greater (>). This is to get it 291 // working for templates but it isn't correct for the right shift operator. We 292 // can avoid this by using half open char ranges in getFileRange() but getting 293 // token ending is not well supported in macroIDs. 294 if (TheTok.is(tok::greatergreater)) 295 return 1; 296 return TheTok.getLength(); 297 } 298 299 // Returns location of the last character of the token at a given loc 300 static SourceLocation getLocForTokenEnd(SourceLocation BeginLoc, 301 const SourceManager &SM, 302 const LangOptions &LangOpts) { 303 unsigned Len = getTokenLengthAtLoc(BeginLoc, SM, LangOpts); 304 return BeginLoc.getLocWithOffset(Len ? Len - 1 : 0); 305 } 306 307 // Returns location of the starting of the token at a given EndLoc 308 static SourceLocation getLocForTokenBegin(SourceLocation EndLoc, 309 const SourceManager &SM, 310 const LangOptions &LangOpts) { 311 return EndLoc.getLocWithOffset( 312 -(signed)getTokenLengthAtLoc(EndLoc, SM, LangOpts)); 313 } 314 315 // Converts a char source range to a token range. 316 static SourceRange toTokenRange(CharSourceRange Range, const SourceManager &SM, 317 const LangOptions &LangOpts) { 318 if (!Range.isTokenRange()) 319 Range.setEnd(getLocForTokenBegin(Range.getEnd(), SM, LangOpts)); 320 return Range.getAsRange(); 321 } 322 // Returns the union of two token ranges. 323 // To find the maximum of the Ends of the ranges, we compare the location of the 324 // last character of the token. 325 static SourceRange unionTokenRange(SourceRange R1, SourceRange R2, 326 const SourceManager &SM, 327 const LangOptions &LangOpts) { 328 SourceLocation Begin = 329 SM.isBeforeInTranslationUnit(R1.getBegin(), R2.getBegin()) 330 ? R1.getBegin() 331 : R2.getBegin(); 332 SourceLocation End = 333 SM.isBeforeInTranslationUnit(getLocForTokenEnd(R1.getEnd(), SM, LangOpts), 334 getLocForTokenEnd(R2.getEnd(), SM, LangOpts)) 335 ? R2.getEnd() 336 : R1.getEnd(); 337 return SourceRange(Begin, End); 338 } 339 340 // Given a range whose endpoints may be in different expansions or files, 341 // tries to find a range within a common file by following up the expansion and 342 // include location in each. 343 static SourceRange rangeInCommonFile(SourceRange R, const SourceManager &SM, 344 const LangOptions &LangOpts) { 345 // Fast path for most common cases. 346 if (SM.isWrittenInSameFile(R.getBegin(), R.getEnd())) 347 return R; 348 // Record the stack of expansion locations for the beginning, keyed by FileID. 349 llvm::DenseMap<FileID, SourceLocation> BeginExpansions; 350 for (SourceLocation Begin = R.getBegin(); Begin.isValid(); 351 Begin = Begin.isFileID() 352 ? includeHashLoc(SM.getFileID(Begin), SM) 353 : SM.getImmediateExpansionRange(Begin).getBegin()) { 354 BeginExpansions[SM.getFileID(Begin)] = Begin; 355 } 356 // Move up the stack of expansion locations for the end until we find the 357 // location in BeginExpansions with that has the same file id. 358 for (SourceLocation End = R.getEnd(); End.isValid(); 359 End = End.isFileID() ? includeHashLoc(SM.getFileID(End), SM) 360 : toTokenRange(SM.getImmediateExpansionRange(End), 361 SM, LangOpts) 362 .getEnd()) { 363 auto It = BeginExpansions.find(SM.getFileID(End)); 364 if (It != BeginExpansions.end()) { 365 if (SM.getFileOffset(It->second) > SM.getFileOffset(End)) 366 return SourceLocation(); 367 return {It->second, End}; 368 } 369 } 370 return SourceRange(); 371 } 372 373 // Find an expansion range (not necessarily immediate) the ends of which are in 374 // the same file id. 375 static SourceRange 376 getExpansionTokenRangeInSameFile(SourceLocation Loc, const SourceManager &SM, 377 const LangOptions &LangOpts) { 378 return rangeInCommonFile( 379 toTokenRange(SM.getImmediateExpansionRange(Loc), SM, LangOpts), SM, 380 LangOpts); 381 } 382 383 // Returns the file range for a given Location as a Token Range 384 // This is quite similar to getFileLoc in SourceManager as both use 385 // getImmediateExpansionRange and getImmediateSpellingLoc (for macro IDs). 386 // However: 387 // - We want to maintain the full range information as we move from one file to 388 // the next. getFileLoc only uses the BeginLoc of getImmediateExpansionRange. 389 // - We want to split '>>' tokens as the lexer parses the '>>' in nested 390 // template instantiations as a '>>' instead of two '>'s. 391 // There is also getExpansionRange but it simply calls 392 // getImmediateExpansionRange on the begin and ends separately which is wrong. 393 static SourceRange getTokenFileRange(SourceLocation Loc, 394 const SourceManager &SM, 395 const LangOptions &LangOpts) { 396 SourceRange FileRange = Loc; 397 while (!FileRange.getBegin().isFileID()) { 398 if (SM.isMacroArgExpansion(FileRange.getBegin())) { 399 FileRange = unionTokenRange( 400 SM.getImmediateSpellingLoc(FileRange.getBegin()), 401 SM.getImmediateSpellingLoc(FileRange.getEnd()), SM, LangOpts); 402 assert(SM.isWrittenInSameFile(FileRange.getBegin(), FileRange.getEnd())); 403 } else { 404 SourceRange ExpansionRangeForBegin = 405 getExpansionTokenRangeInSameFile(FileRange.getBegin(), SM, LangOpts); 406 SourceRange ExpansionRangeForEnd = 407 getExpansionTokenRangeInSameFile(FileRange.getEnd(), SM, LangOpts); 408 if (ExpansionRangeForBegin.isInvalid() || 409 ExpansionRangeForEnd.isInvalid()) 410 return SourceRange(); 411 assert(SM.isWrittenInSameFile(ExpansionRangeForBegin.getBegin(), 412 ExpansionRangeForEnd.getBegin()) && 413 "Both Expansion ranges should be in same file."); 414 FileRange = unionTokenRange(ExpansionRangeForBegin, ExpansionRangeForEnd, 415 SM, LangOpts); 416 } 417 } 418 return FileRange; 419 } 420 421 bool isInsideMainFile(SourceLocation Loc, const SourceManager &SM) { 422 if (!Loc.isValid()) 423 return false; 424 FileID FID = SM.getFileID(SM.getExpansionLoc(Loc)); 425 return FID == SM.getMainFileID() || FID == SM.getPreambleFileID(); 426 } 427 428 llvm::Optional<SourceRange> toHalfOpenFileRange(const SourceManager &SM, 429 const LangOptions &LangOpts, 430 SourceRange R) { 431 SourceRange R1 = getTokenFileRange(R.getBegin(), SM, LangOpts); 432 if (!isValidFileRange(SM, R1)) 433 return llvm::None; 434 435 SourceRange R2 = getTokenFileRange(R.getEnd(), SM, LangOpts); 436 if (!isValidFileRange(SM, R2)) 437 return llvm::None; 438 439 SourceRange Result = 440 rangeInCommonFile(unionTokenRange(R1, R2, SM, LangOpts), SM, LangOpts); 441 unsigned TokLen = getTokenLengthAtLoc(Result.getEnd(), SM, LangOpts); 442 // Convert from closed token range to half-open (char) range 443 Result.setEnd(Result.getEnd().getLocWithOffset(TokLen)); 444 if (!isValidFileRange(SM, Result)) 445 return llvm::None; 446 447 return Result; 448 } 449 450 llvm::StringRef toSourceCode(const SourceManager &SM, SourceRange R) { 451 assert(isValidFileRange(SM, R)); 452 bool Invalid = false; 453 auto *Buf = SM.getBuffer(SM.getFileID(R.getBegin()), &Invalid); 454 assert(!Invalid); 455 456 size_t BeginOffset = SM.getFileOffset(R.getBegin()); 457 size_t EndOffset = SM.getFileOffset(R.getEnd()); 458 return Buf->getBuffer().substr(BeginOffset, EndOffset - BeginOffset); 459 } 460 461 llvm::Expected<SourceLocation> sourceLocationInMainFile(const SourceManager &SM, 462 Position P) { 463 llvm::StringRef Code = SM.getBuffer(SM.getMainFileID())->getBuffer(); 464 auto Offset = 465 positionToOffset(Code, P, /*AllowColumnBeyondLineLength=*/false); 466 if (!Offset) 467 return Offset.takeError(); 468 return SM.getLocForStartOfFile(SM.getMainFileID()).getLocWithOffset(*Offset); 469 } 470 471 Range halfOpenToRange(const SourceManager &SM, CharSourceRange R) { 472 // Clang is 1-based, LSP uses 0-based indexes. 473 Position Begin = sourceLocToPosition(SM, R.getBegin()); 474 Position End = sourceLocToPosition(SM, R.getEnd()); 475 476 return {Begin, End}; 477 } 478 479 std::pair<size_t, size_t> offsetToClangLineColumn(llvm::StringRef Code, 480 size_t Offset) { 481 Offset = std::min(Code.size(), Offset); 482 llvm::StringRef Before = Code.substr(0, Offset); 483 int Lines = Before.count('\n'); 484 size_t PrevNL = Before.rfind('\n'); 485 size_t StartOfLine = (PrevNL == llvm::StringRef::npos) ? 0 : (PrevNL + 1); 486 return {Lines + 1, Offset - StartOfLine + 1}; 487 } 488 489 std::pair<StringRef, StringRef> splitQualifiedName(StringRef QName) { 490 size_t Pos = QName.rfind("::"); 491 if (Pos == llvm::StringRef::npos) 492 return {llvm::StringRef(), QName}; 493 return {QName.substr(0, Pos + 2), QName.substr(Pos + 2)}; 494 } 495 496 TextEdit replacementToEdit(llvm::StringRef Code, 497 const tooling::Replacement &R) { 498 Range ReplacementRange = { 499 offsetToPosition(Code, R.getOffset()), 500 offsetToPosition(Code, R.getOffset() + R.getLength())}; 501 return {ReplacementRange, std::string(R.getReplacementText())}; 502 } 503 504 std::vector<TextEdit> replacementsToEdits(llvm::StringRef Code, 505 const tooling::Replacements &Repls) { 506 std::vector<TextEdit> Edits; 507 for (const auto &R : Repls) 508 Edits.push_back(replacementToEdit(Code, R)); 509 return Edits; 510 } 511 512 llvm::Optional<std::string> getCanonicalPath(const FileEntry *F, 513 const SourceManager &SourceMgr) { 514 if (!F) 515 return None; 516 517 llvm::SmallString<128> FilePath = F->getName(); 518 if (!llvm::sys::path::is_absolute(FilePath)) { 519 if (auto EC = 520 SourceMgr.getFileManager().getVirtualFileSystem().makeAbsolute( 521 FilePath)) { 522 elog("Could not turn relative path '{0}' to absolute: {1}", FilePath, 523 EC.message()); 524 return None; 525 } 526 } 527 528 // Handle the symbolic link path case where the current working directory 529 // (getCurrentWorkingDirectory) is a symlink. We always want to the real 530 // file path (instead of the symlink path) for the C++ symbols. 531 // 532 // Consider the following example: 533 // 534 // src dir: /project/src/foo.h 535 // current working directory (symlink): /tmp/build -> /project/src/ 536 // 537 // The file path of Symbol is "/project/src/foo.h" instead of 538 // "/tmp/build/foo.h" 539 if (auto Dir = SourceMgr.getFileManager().getDirectory( 540 llvm::sys::path::parent_path(FilePath))) { 541 llvm::SmallString<128> RealPath; 542 llvm::StringRef DirName = SourceMgr.getFileManager().getCanonicalName(*Dir); 543 llvm::sys::path::append(RealPath, DirName, 544 llvm::sys::path::filename(FilePath)); 545 return RealPath.str().str(); 546 } 547 548 return FilePath.str().str(); 549 } 550 551 TextEdit toTextEdit(const FixItHint &FixIt, const SourceManager &M, 552 const LangOptions &L) { 553 TextEdit Result; 554 Result.range = 555 halfOpenToRange(M, Lexer::makeFileCharRange(FixIt.RemoveRange, M, L)); 556 Result.newText = FixIt.CodeToInsert; 557 return Result; 558 } 559 560 FileDigest digest(llvm::StringRef Content) { 561 uint64_t Hash{llvm::xxHash64(Content)}; 562 FileDigest Result; 563 for (unsigned I = 0; I < Result.size(); ++I) { 564 Result[I] = uint8_t(Hash); 565 Hash >>= 8; 566 } 567 return Result; 568 } 569 570 llvm::Optional<FileDigest> digestFile(const SourceManager &SM, FileID FID) { 571 bool Invalid = false; 572 llvm::StringRef Content = SM.getBufferData(FID, &Invalid); 573 if (Invalid) 574 return None; 575 return digest(Content); 576 } 577 578 format::FormatStyle getFormatStyleForFile(llvm::StringRef File, 579 llvm::StringRef Content, 580 const ThreadsafeFS &TFS) { 581 auto Style = format::getStyle(format::DefaultFormatStyle, File, 582 format::DefaultFallbackStyle, Content, 583 TFS.view(/*CWD=*/llvm::None).get()); 584 if (!Style) { 585 log("getStyle() failed for file {0}: {1}. Fallback is LLVM style.", File, 586 Style.takeError()); 587 return format::getLLVMStyle(); 588 } 589 return *Style; 590 } 591 592 llvm::Expected<tooling::Replacements> 593 cleanupAndFormat(StringRef Code, const tooling::Replacements &Replaces, 594 const format::FormatStyle &Style) { 595 auto CleanReplaces = cleanupAroundReplacements(Code, Replaces, Style); 596 if (!CleanReplaces) 597 return CleanReplaces; 598 return formatReplacements(Code, std::move(*CleanReplaces), Style); 599 } 600 601 static void 602 lex(llvm::StringRef Code, const LangOptions &LangOpts, 603 llvm::function_ref<void(const syntax::Token &, const SourceManager &SM)> 604 Action) { 605 // FIXME: InMemoryFileAdapter crashes unless the buffer is null terminated! 606 std::string NullTerminatedCode = Code.str(); 607 SourceManagerForFile FileSM("dummy.cpp", NullTerminatedCode); 608 auto &SM = FileSM.get(); 609 for (const auto &Tok : syntax::tokenize(SM.getMainFileID(), SM, LangOpts)) 610 Action(Tok, SM); 611 } 612 613 llvm::StringMap<unsigned> collectIdentifiers(llvm::StringRef Content, 614 const format::FormatStyle &Style) { 615 llvm::StringMap<unsigned> Identifiers; 616 auto LangOpt = format::getFormattingLangOpts(Style); 617 lex(Content, LangOpt, [&](const syntax::Token &Tok, const SourceManager &SM) { 618 if (Tok.kind() == tok::identifier) 619 ++Identifiers[Tok.text(SM)]; 620 // FIXME: Should this function really return keywords too ? 621 else if (const auto *Keyword = tok::getKeywordSpelling(Tok.kind())) 622 ++Identifiers[Keyword]; 623 }); 624 return Identifiers; 625 } 626 627 std::vector<Range> collectIdentifierRanges(llvm::StringRef Identifier, 628 llvm::StringRef Content, 629 const LangOptions &LangOpts) { 630 std::vector<Range> Ranges; 631 lex(Content, LangOpts, 632 [&](const syntax::Token &Tok, const SourceManager &SM) { 633 if (Tok.kind() != tok::identifier || Tok.text(SM) != Identifier) 634 return; 635 Ranges.push_back(halfOpenToRange(SM, Tok.range(SM).toCharRange(SM))); 636 }); 637 return Ranges; 638 } 639 640 namespace { 641 struct NamespaceEvent { 642 enum { 643 BeginNamespace, // namespace <ns> {. Payload is resolved <ns>. 644 EndNamespace, // } // namespace <ns>. Payload is resolved *outer* 645 // namespace. 646 UsingDirective // using namespace <ns>. Payload is unresolved <ns>. 647 } Trigger; 648 std::string Payload; 649 Position Pos; 650 }; 651 // Scans C++ source code for constructs that change the visible namespaces. 652 void parseNamespaceEvents(llvm::StringRef Code, const LangOptions &LangOpts, 653 llvm::function_ref<void(NamespaceEvent)> Callback) { 654 655 // Stack of enclosing namespaces, e.g. {"clang", "clangd"} 656 std::vector<std::string> Enclosing; // Contains e.g. "clang", "clangd" 657 // Stack counts open braces. true if the brace opened a namespace. 658 std::vector<bool> BraceStack; 659 660 enum { 661 Default, 662 Namespace, // just saw 'namespace' 663 NamespaceName, // just saw 'namespace' NSName 664 Using, // just saw 'using' 665 UsingNamespace, // just saw 'using namespace' 666 UsingNamespaceName, // just saw 'using namespace' NSName 667 } State = Default; 668 std::string NSName; 669 670 NamespaceEvent Event; 671 lex(Code, LangOpts, [&](const syntax::Token &Tok, const SourceManager &SM) { 672 Event.Pos = sourceLocToPosition(SM, Tok.location()); 673 switch (Tok.kind()) { 674 case tok::kw_using: 675 State = State == Default ? Using : Default; 676 break; 677 case tok::kw_namespace: 678 switch (State) { 679 case Using: 680 State = UsingNamespace; 681 break; 682 case Default: 683 State = Namespace; 684 break; 685 default: 686 State = Default; 687 break; 688 } 689 break; 690 case tok::identifier: 691 switch (State) { 692 case UsingNamespace: 693 NSName.clear(); 694 LLVM_FALLTHROUGH; 695 case UsingNamespaceName: 696 NSName.append(Tok.text(SM).str()); 697 State = UsingNamespaceName; 698 break; 699 case Namespace: 700 NSName.clear(); 701 LLVM_FALLTHROUGH; 702 case NamespaceName: 703 NSName.append(Tok.text(SM).str()); 704 State = NamespaceName; 705 break; 706 case Using: 707 case Default: 708 State = Default; 709 break; 710 } 711 break; 712 case tok::coloncolon: 713 // This can come at the beginning or in the middle of a namespace 714 // name. 715 switch (State) { 716 case UsingNamespace: 717 NSName.clear(); 718 LLVM_FALLTHROUGH; 719 case UsingNamespaceName: 720 NSName.append("::"); 721 State = UsingNamespaceName; 722 break; 723 case NamespaceName: 724 NSName.append("::"); 725 State = NamespaceName; 726 break; 727 case Namespace: // Not legal here. 728 case Using: 729 case Default: 730 State = Default; 731 break; 732 } 733 break; 734 case tok::l_brace: 735 // Record which { started a namespace, so we know when } ends one. 736 if (State == NamespaceName) { 737 // Parsed: namespace <name> { 738 BraceStack.push_back(true); 739 Enclosing.push_back(NSName); 740 Event.Trigger = NamespaceEvent::BeginNamespace; 741 Event.Payload = llvm::join(Enclosing, "::"); 742 Callback(Event); 743 } else { 744 // This case includes anonymous namespaces (State = Namespace). 745 // For our purposes, they're not namespaces and we ignore them. 746 BraceStack.push_back(false); 747 } 748 State = Default; 749 break; 750 case tok::r_brace: 751 // If braces are unmatched, we're going to be confused, but don't 752 // crash. 753 if (!BraceStack.empty()) { 754 if (BraceStack.back()) { 755 // Parsed: } // namespace 756 Enclosing.pop_back(); 757 Event.Trigger = NamespaceEvent::EndNamespace; 758 Event.Payload = llvm::join(Enclosing, "::"); 759 Callback(Event); 760 } 761 BraceStack.pop_back(); 762 } 763 break; 764 case tok::semi: 765 if (State == UsingNamespaceName) { 766 // Parsed: using namespace <name> ; 767 Event.Trigger = NamespaceEvent::UsingDirective; 768 Event.Payload = std::move(NSName); 769 Callback(Event); 770 } 771 State = Default; 772 break; 773 default: 774 State = Default; 775 break; 776 } 777 }); 778 } 779 780 // Returns the prefix namespaces of NS: {"" ... NS}. 781 llvm::SmallVector<llvm::StringRef, 8> ancestorNamespaces(llvm::StringRef NS) { 782 llvm::SmallVector<llvm::StringRef, 8> Results; 783 Results.push_back(NS.take_front(0)); 784 NS.split(Results, "::", /*MaxSplit=*/-1, /*KeepEmpty=*/false); 785 for (llvm::StringRef &R : Results) 786 R = NS.take_front(R.end() - NS.begin()); 787 return Results; 788 } 789 790 } // namespace 791 792 std::vector<std::string> visibleNamespaces(llvm::StringRef Code, 793 const LangOptions &LangOpts) { 794 std::string Current; 795 // Map from namespace to (resolved) namespaces introduced via using directive. 796 llvm::StringMap<llvm::StringSet<>> UsingDirectives; 797 798 parseNamespaceEvents(Code, LangOpts, [&](NamespaceEvent Event) { 799 llvm::StringRef NS = Event.Payload; 800 switch (Event.Trigger) { 801 case NamespaceEvent::BeginNamespace: 802 case NamespaceEvent::EndNamespace: 803 Current = std::move(Event.Payload); 804 break; 805 case NamespaceEvent::UsingDirective: 806 if (NS.consume_front("::")) 807 UsingDirectives[Current].insert(NS); 808 else { 809 for (llvm::StringRef Enclosing : ancestorNamespaces(Current)) { 810 if (Enclosing.empty()) 811 UsingDirectives[Current].insert(NS); 812 else 813 UsingDirectives[Current].insert((Enclosing + "::" + NS).str()); 814 } 815 } 816 break; 817 } 818 }); 819 820 std::vector<std::string> Found; 821 for (llvm::StringRef Enclosing : ancestorNamespaces(Current)) { 822 Found.push_back(std::string(Enclosing)); 823 auto It = UsingDirectives.find(Enclosing); 824 if (It != UsingDirectives.end()) 825 for (const auto &Used : It->second) 826 Found.push_back(std::string(Used.getKey())); 827 } 828 829 llvm::sort(Found, [&](const std::string &LHS, const std::string &RHS) { 830 if (Current == RHS) 831 return false; 832 if (Current == LHS) 833 return true; 834 return LHS < RHS; 835 }); 836 Found.erase(std::unique(Found.begin(), Found.end()), Found.end()); 837 return Found; 838 } 839 840 llvm::StringSet<> collectWords(llvm::StringRef Content) { 841 // We assume short words are not significant. 842 // We may want to consider other stopwords, e.g. language keywords. 843 // (A very naive implementation showed no benefit, but lexing might do better) 844 static constexpr int MinWordLength = 4; 845 846 std::vector<CharRole> Roles(Content.size()); 847 calculateRoles(Content, Roles); 848 849 llvm::StringSet<> Result; 850 llvm::SmallString<256> Word; 851 auto Flush = [&] { 852 if (Word.size() >= MinWordLength) { 853 for (char &C : Word) 854 C = llvm::toLower(C); 855 Result.insert(Word); 856 } 857 Word.clear(); 858 }; 859 for (unsigned I = 0; I < Content.size(); ++I) { 860 switch (Roles[I]) { 861 case Head: 862 Flush(); 863 LLVM_FALLTHROUGH; 864 case Tail: 865 Word.push_back(Content[I]); 866 break; 867 case Unknown: 868 case Separator: 869 Flush(); 870 break; 871 } 872 } 873 Flush(); 874 875 return Result; 876 } 877 878 static bool isLikelyIdentifier(llvm::StringRef Word, llvm::StringRef Before, 879 llvm::StringRef After) { 880 // `foo` is an identifier. 881 if (Before.endswith("`") && After.startswith("`")) 882 return true; 883 // In foo::bar, both foo and bar are identifiers. 884 if (Before.endswith("::") || After.startswith("::")) 885 return true; 886 // Doxygen tags like \c foo indicate identifiers. 887 // Don't search too far back. 888 // This duplicates clang's doxygen parser, revisit if it gets complicated. 889 Before = Before.take_back(100); // Don't search too far back. 890 auto Pos = Before.find_last_of("\\@"); 891 if (Pos != llvm::StringRef::npos) { 892 llvm::StringRef Tag = Before.substr(Pos + 1).rtrim(' '); 893 if (Tag == "p" || Tag == "c" || Tag == "class" || Tag == "tparam" || 894 Tag == "param" || Tag == "param[in]" || Tag == "param[out]" || 895 Tag == "param[in,out]" || Tag == "retval" || Tag == "throw" || 896 Tag == "throws" || Tag == "link") 897 return true; 898 } 899 900 // Word contains underscore. 901 // This handles things like snake_case and MACRO_CASE. 902 if (Word.contains('_')) { 903 return true; 904 } 905 // Word contains capital letter other than at beginning. 906 // This handles things like lowerCamel and UpperCamel. 907 // The check for also containing a lowercase letter is to rule out 908 // initialisms like "HTTP". 909 bool HasLower = Word.find_if(clang::isLowercase) != StringRef::npos; 910 bool HasUpper = Word.substr(1).find_if(clang::isUppercase) != StringRef::npos; 911 if (HasLower && HasUpper) { 912 return true; 913 } 914 // FIXME: consider mid-sentence Capitalization? 915 return false; 916 } 917 918 llvm::Optional<SpelledWord> SpelledWord::touching(SourceLocation SpelledLoc, 919 const syntax::TokenBuffer &TB, 920 const LangOptions &LangOpts) { 921 const auto &SM = TB.sourceManager(); 922 auto Touching = syntax::spelledTokensTouching(SpelledLoc, TB); 923 for (const auto &T : Touching) { 924 // If the token is an identifier or a keyword, don't use any heuristics. 925 if (tok::isAnyIdentifier(T.kind()) || tok::getKeywordSpelling(T.kind())) { 926 SpelledWord Result; 927 Result.Location = T.location(); 928 Result.Text = T.text(SM); 929 Result.LikelyIdentifier = tok::isAnyIdentifier(T.kind()); 930 Result.PartOfSpelledToken = &T; 931 Result.SpelledToken = &T; 932 auto Expanded = 933 TB.expandedTokens(SM.getMacroArgExpandedLocation(T.location())); 934 if (Expanded.size() == 1 && Expanded.front().text(SM) == Result.Text) 935 Result.ExpandedToken = &Expanded.front(); 936 return Result; 937 } 938 } 939 FileID File; 940 unsigned Offset; 941 std::tie(File, Offset) = SM.getDecomposedLoc(SpelledLoc); 942 bool Invalid = false; 943 llvm::StringRef Code = SM.getBufferData(File, &Invalid); 944 if (Invalid) 945 return llvm::None; 946 unsigned B = Offset, E = Offset; 947 while (B > 0 && isIdentifierBody(Code[B - 1])) 948 --B; 949 while (E < Code.size() && isIdentifierBody(Code[E])) 950 ++E; 951 if (B == E) 952 return llvm::None; 953 954 SpelledWord Result; 955 Result.Location = SM.getComposedLoc(File, B); 956 Result.Text = Code.slice(B, E); 957 Result.LikelyIdentifier = 958 isLikelyIdentifier(Result.Text, Code.substr(0, B), Code.substr(E)) && 959 // should not be a keyword 960 tok::isAnyIdentifier( 961 IdentifierTable(LangOpts).get(Result.Text).getTokenID()); 962 for (const auto &T : Touching) 963 if (T.location() <= Result.Location) 964 Result.PartOfSpelledToken = &T; 965 return Result; 966 } 967 968 llvm::Optional<DefinedMacro> locateMacroAt(const syntax::Token &SpelledTok, 969 Preprocessor &PP) { 970 SourceLocation Loc = SpelledTok.location(); 971 assert(Loc.isFileID()); 972 const auto &SM = PP.getSourceManager(); 973 IdentifierInfo *IdentifierInfo = PP.getIdentifierInfo(SpelledTok.text(SM)); 974 if (!IdentifierInfo || !IdentifierInfo->hadMacroDefinition()) 975 return None; 976 977 // Get the definition just before the searched location so that a macro 978 // referenced in a '#undef MACRO' can still be found. Note that we only do 979 // that if Loc is not pointing at start of file. 980 if (SM.getLocForStartOfFile(SM.getFileID(Loc)) != Loc) 981 Loc = Loc.getLocWithOffset(-1); 982 MacroDefinition MacroDef = PP.getMacroDefinitionAtLoc(IdentifierInfo, Loc); 983 if (auto *MI = MacroDef.getMacroInfo()) 984 return DefinedMacro{ 985 IdentifierInfo->getName(), MI, 986 translatePreamblePatchLocation(MI->getDefinitionLoc(), SM)}; 987 return None; 988 } 989 990 llvm::Expected<std::string> Edit::apply() const { 991 return tooling::applyAllReplacements(InitialCode, Replacements); 992 } 993 994 std::vector<TextEdit> Edit::asTextEdits() const { 995 return replacementsToEdits(InitialCode, Replacements); 996 } 997 998 bool Edit::canApplyTo(llvm::StringRef Code) const { 999 // Create line iterators, since line numbers are important while applying our 1000 // edit we cannot skip blank lines. 1001 auto LHS = llvm::MemoryBuffer::getMemBuffer(Code); 1002 llvm::line_iterator LHSIt(*LHS, /*SkipBlanks=*/false); 1003 1004 auto RHS = llvm::MemoryBuffer::getMemBuffer(InitialCode); 1005 llvm::line_iterator RHSIt(*RHS, /*SkipBlanks=*/false); 1006 1007 // Compare the InitialCode we prepared the edit for with the Code we received 1008 // line by line to make sure there are no differences. 1009 // FIXME: This check is too conservative now, it should be enough to only 1010 // check lines around the replacements contained inside the Edit. 1011 while (!LHSIt.is_at_eof() && !RHSIt.is_at_eof()) { 1012 if (*LHSIt != *RHSIt) 1013 return false; 1014 ++LHSIt; 1015 ++RHSIt; 1016 } 1017 1018 // After we reach EOF for any of the files we make sure the other one doesn't 1019 // contain any additional content except empty lines, they should not 1020 // interfere with the edit we produced. 1021 while (!LHSIt.is_at_eof()) { 1022 if (!LHSIt->empty()) 1023 return false; 1024 ++LHSIt; 1025 } 1026 while (!RHSIt.is_at_eof()) { 1027 if (!RHSIt->empty()) 1028 return false; 1029 ++RHSIt; 1030 } 1031 return true; 1032 } 1033 1034 llvm::Error reformatEdit(Edit &E, const format::FormatStyle &Style) { 1035 if (auto NewEdits = cleanupAndFormat(E.InitialCode, E.Replacements, Style)) 1036 E.Replacements = std::move(*NewEdits); 1037 else 1038 return NewEdits.takeError(); 1039 return llvm::Error::success(); 1040 } 1041 1042 EligibleRegion getEligiblePoints(llvm::StringRef Code, 1043 llvm::StringRef FullyQualifiedName, 1044 const LangOptions &LangOpts) { 1045 EligibleRegion ER; 1046 // Start with global namespace. 1047 std::vector<std::string> Enclosing = {""}; 1048 // FIXME: In addition to namespaces try to generate events for function 1049 // definitions as well. One might use a closing parantheses(")" followed by an 1050 // opening brace "{" to trigger the start. 1051 parseNamespaceEvents(Code, LangOpts, [&](NamespaceEvent Event) { 1052 // Using Directives only introduces declarations to current scope, they do 1053 // not change the current namespace, so skip them. 1054 if (Event.Trigger == NamespaceEvent::UsingDirective) 1055 return; 1056 // Do not qualify the global namespace. 1057 if (!Event.Payload.empty()) 1058 Event.Payload.append("::"); 1059 1060 std::string CurrentNamespace; 1061 if (Event.Trigger == NamespaceEvent::BeginNamespace) { 1062 Enclosing.emplace_back(std::move(Event.Payload)); 1063 CurrentNamespace = Enclosing.back(); 1064 // parseNameSpaceEvents reports the beginning position of a token; we want 1065 // to insert after '{', so increment by one. 1066 ++Event.Pos.character; 1067 } else { 1068 // Event.Payload points to outer namespace when exiting a scope, so use 1069 // the namespace we've last entered instead. 1070 CurrentNamespace = std::move(Enclosing.back()); 1071 Enclosing.pop_back(); 1072 assert(Enclosing.back() == Event.Payload); 1073 } 1074 1075 // Ignore namespaces that are not a prefix of the target. 1076 if (!FullyQualifiedName.startswith(CurrentNamespace)) 1077 return; 1078 1079 // Prefer the namespace that shares the longest prefix with target. 1080 if (CurrentNamespace.size() > ER.EnclosingNamespace.size()) { 1081 ER.EligiblePoints.clear(); 1082 ER.EnclosingNamespace = CurrentNamespace; 1083 } 1084 if (CurrentNamespace.size() == ER.EnclosingNamespace.size()) 1085 ER.EligiblePoints.emplace_back(std::move(Event.Pos)); 1086 }); 1087 // If there were no shared namespaces just return EOF. 1088 if (ER.EligiblePoints.empty()) { 1089 assert(ER.EnclosingNamespace.empty()); 1090 ER.EligiblePoints.emplace_back(offsetToPosition(Code, Code.size())); 1091 } 1092 return ER; 1093 } 1094 1095 bool isHeaderFile(llvm::StringRef FileName, 1096 llvm::Optional<LangOptions> LangOpts) { 1097 // Respect the langOpts, for non-file-extension cases, e.g. standard library 1098 // files. 1099 if (LangOpts && LangOpts->IsHeaderFile) 1100 return true; 1101 namespace types = clang::driver::types; 1102 auto Lang = types::lookupTypeForExtension( 1103 llvm::sys::path::extension(FileName).substr(1)); 1104 return Lang != types::TY_INVALID && types::onlyPrecompileType(Lang); 1105 } 1106 1107 bool isProtoFile(SourceLocation Loc, const SourceManager &SM) { 1108 auto FileName = SM.getFilename(Loc); 1109 if (!FileName.endswith(".proto.h") && !FileName.endswith(".pb.h")) 1110 return false; 1111 auto FID = SM.getFileID(Loc); 1112 // All proto generated headers should start with this line. 1113 static const char *PROTO_HEADER_COMMENT = 1114 "// Generated by the protocol buffer compiler. DO NOT EDIT!"; 1115 // Double check that this is an actual protobuf header. 1116 return SM.getBufferData(FID).startswith(PROTO_HEADER_COMMENT); 1117 } 1118 1119 } // namespace clangd 1120 } // namespace clang 1121