1 //===--- SourceCode.h - Manipulating source code as strings -----*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 #include "SourceCode.h" 9 10 #include "FuzzyMatch.h" 11 #include "Preamble.h" 12 #include "Protocol.h" 13 #include "refactor/Tweak.h" 14 #include "support/Context.h" 15 #include "support/Logger.h" 16 #include "clang/AST/ASTContext.h" 17 #include "clang/Basic/LangOptions.h" 18 #include "clang/Basic/SourceLocation.h" 19 #include "clang/Basic/SourceManager.h" 20 #include "clang/Basic/TokenKinds.h" 21 #include "clang/Driver/Types.h" 22 #include "clang/Format/Format.h" 23 #include "clang/Lex/Lexer.h" 24 #include "clang/Lex/Preprocessor.h" 25 #include "clang/Lex/Token.h" 26 #include "clang/Tooling/Core/Replacement.h" 27 #include "clang/Tooling/Syntax/Tokens.h" 28 #include "llvm/ADT/ArrayRef.h" 29 #include "llvm/ADT/None.h" 30 #include "llvm/ADT/STLExtras.h" 31 #include "llvm/ADT/StringExtras.h" 32 #include "llvm/ADT/StringMap.h" 33 #include "llvm/ADT/StringRef.h" 34 #include "llvm/Support/Compiler.h" 35 #include "llvm/Support/Errc.h" 36 #include "llvm/Support/Error.h" 37 #include "llvm/Support/ErrorHandling.h" 38 #include "llvm/Support/LineIterator.h" 39 #include "llvm/Support/MemoryBuffer.h" 40 #include "llvm/Support/Path.h" 41 #include "llvm/Support/SHA1.h" 42 #include "llvm/Support/VirtualFileSystem.h" 43 #include "llvm/Support/xxhash.h" 44 #include <algorithm> 45 #include <cstddef> 46 #include <string> 47 #include <vector> 48 49 namespace clang { 50 namespace clangd { 51 52 // Here be dragons. LSP positions use columns measured in *UTF-16 code units*! 53 // Clangd uses UTF-8 and byte-offsets internally, so conversion is nontrivial. 54 55 // Iterates over unicode codepoints in the (UTF-8) string. For each, 56 // invokes CB(UTF-8 length, UTF-16 length), and breaks if it returns true. 57 // Returns true if CB returned true, false if we hit the end of string. 58 // 59 // If the string is not valid UTF-8, we log this error and "decode" the 60 // text in some arbitrary way. This is pretty sad, but this tends to happen deep 61 // within indexing of headers where clang misdetected the encoding, and 62 // propagating the error all the way back up is (probably?) not be worth it. 63 template <typename Callback> 64 static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) { 65 bool LoggedInvalid = false; 66 // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). 67 // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. 68 for (size_t I = 0; I < U8.size();) { 69 unsigned char C = static_cast<unsigned char>(U8[I]); 70 if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character. 71 if (CB(1, 1)) 72 return true; 73 ++I; 74 continue; 75 } 76 // This convenient property of UTF-8 holds for all non-ASCII characters. 77 size_t UTF8Length = llvm::countLeadingOnes(C); 78 // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here. 79 // 11111xxx is not valid UTF-8 at all, maybe some ISO-8859-*. 80 if (LLVM_UNLIKELY(UTF8Length < 2 || UTF8Length > 4)) { 81 if (!LoggedInvalid) { 82 elog("File has invalid UTF-8 near offset {0}: {1}", I, llvm::toHex(U8)); 83 LoggedInvalid = true; 84 } 85 // We can't give a correct result, but avoid returning something wild. 86 // Pretend this is a valid ASCII byte, for lack of better options. 87 // (Too late to get ISO-8859-* right, we've skipped some bytes already). 88 if (CB(1, 1)) 89 return true; 90 ++I; 91 continue; 92 } 93 I += UTF8Length; // Skip over all trailing bytes. 94 // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). 95 // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...) 96 if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1)) 97 return true; 98 } 99 return false; 100 } 101 102 // Returns the byte offset into the string that is an offset of \p Units in 103 // the specified encoding. 104 // Conceptually, this converts to the encoding, truncates to CodeUnits, 105 // converts back to UTF-8, and returns the length in bytes. 106 static size_t measureUnits(llvm::StringRef U8, int Units, OffsetEncoding Enc, 107 bool &Valid) { 108 Valid = Units >= 0; 109 if (Units <= 0) 110 return 0; 111 size_t Result = 0; 112 switch (Enc) { 113 case OffsetEncoding::UTF8: 114 Result = Units; 115 break; 116 case OffsetEncoding::UTF16: 117 Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) { 118 Result += U8Len; 119 Units -= U16Len; 120 return Units <= 0; 121 }); 122 if (Units < 0) // Offset in the middle of a surrogate pair. 123 Valid = false; 124 break; 125 case OffsetEncoding::UTF32: 126 Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) { 127 Result += U8Len; 128 Units--; 129 return Units <= 0; 130 }); 131 break; 132 case OffsetEncoding::UnsupportedEncoding: 133 llvm_unreachable("unsupported encoding"); 134 } 135 // Don't return an out-of-range index if we overran. 136 if (Result > U8.size()) { 137 Valid = false; 138 return U8.size(); 139 } 140 return Result; 141 } 142 143 Key<OffsetEncoding> kCurrentOffsetEncoding; 144 static OffsetEncoding lspEncoding() { 145 auto *Enc = Context::current().get(kCurrentOffsetEncoding); 146 return Enc ? *Enc : OffsetEncoding::UTF16; 147 } 148 149 // Like most strings in clangd, the input is UTF-8 encoded. 150 size_t lspLength(llvm::StringRef Code) { 151 size_t Count = 0; 152 switch (lspEncoding()) { 153 case OffsetEncoding::UTF8: 154 Count = Code.size(); 155 break; 156 case OffsetEncoding::UTF16: 157 iterateCodepoints(Code, [&](int U8Len, int U16Len) { 158 Count += U16Len; 159 return false; 160 }); 161 break; 162 case OffsetEncoding::UTF32: 163 iterateCodepoints(Code, [&](int U8Len, int U16Len) { 164 ++Count; 165 return false; 166 }); 167 break; 168 case OffsetEncoding::UnsupportedEncoding: 169 llvm_unreachable("unsupported encoding"); 170 } 171 return Count; 172 } 173 174 llvm::Expected<size_t> positionToOffset(llvm::StringRef Code, Position P, 175 bool AllowColumnsBeyondLineLength) { 176 if (P.line < 0) 177 return llvm::make_error<llvm::StringError>( 178 llvm::formatv("Line value can't be negative ({0})", P.line), 179 llvm::errc::invalid_argument); 180 if (P.character < 0) 181 return llvm::make_error<llvm::StringError>( 182 llvm::formatv("Character value can't be negative ({0})", P.character), 183 llvm::errc::invalid_argument); 184 size_t StartOfLine = 0; 185 for (int I = 0; I != P.line; ++I) { 186 size_t NextNL = Code.find('\n', StartOfLine); 187 if (NextNL == llvm::StringRef::npos) 188 return llvm::make_error<llvm::StringError>( 189 llvm::formatv("Line value is out of range ({0})", P.line), 190 llvm::errc::invalid_argument); 191 StartOfLine = NextNL + 1; 192 } 193 StringRef Line = 194 Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; }); 195 196 // P.character may be in UTF-16, transcode if necessary. 197 bool Valid; 198 size_t ByteInLine = measureUnits(Line, P.character, lspEncoding(), Valid); 199 if (!Valid && !AllowColumnsBeyondLineLength) 200 return llvm::make_error<llvm::StringError>( 201 llvm::formatv("{0} offset {1} is invalid for line {2}", lspEncoding(), 202 P.character, P.line), 203 llvm::errc::invalid_argument); 204 return StartOfLine + ByteInLine; 205 } 206 207 Position offsetToPosition(llvm::StringRef Code, size_t Offset) { 208 Offset = std::min(Code.size(), Offset); 209 llvm::StringRef Before = Code.substr(0, Offset); 210 int Lines = Before.count('\n'); 211 size_t PrevNL = Before.rfind('\n'); 212 size_t StartOfLine = (PrevNL == llvm::StringRef::npos) ? 0 : (PrevNL + 1); 213 Position Pos; 214 Pos.line = Lines; 215 Pos.character = lspLength(Before.substr(StartOfLine)); 216 return Pos; 217 } 218 219 Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc) { 220 // We use the SourceManager's line tables, but its column number is in bytes. 221 FileID FID; 222 unsigned Offset; 223 std::tie(FID, Offset) = SM.getDecomposedSpellingLoc(Loc); 224 Position P; 225 P.line = static_cast<int>(SM.getLineNumber(FID, Offset)) - 1; 226 bool Invalid = false; 227 llvm::StringRef Code = SM.getBufferData(FID, &Invalid); 228 if (!Invalid) { 229 auto ColumnInBytes = SM.getColumnNumber(FID, Offset) - 1; 230 auto LineSoFar = Code.substr(Offset - ColumnInBytes, ColumnInBytes); 231 P.character = lspLength(LineSoFar); 232 } 233 return P; 234 } 235 236 bool isSpelledInSource(SourceLocation Loc, const SourceManager &SM) { 237 if (Loc.isMacroID()) { 238 std::string PrintLoc = SM.getSpellingLoc(Loc).printToString(SM); 239 if (llvm::StringRef(PrintLoc).startswith("<scratch") || 240 llvm::StringRef(PrintLoc).startswith("<command line>")) 241 return false; 242 } 243 return true; 244 } 245 246 bool isValidFileRange(const SourceManager &Mgr, SourceRange R) { 247 if (!R.getBegin().isValid() || !R.getEnd().isValid()) 248 return false; 249 250 FileID BeginFID; 251 size_t BeginOffset = 0; 252 std::tie(BeginFID, BeginOffset) = Mgr.getDecomposedLoc(R.getBegin()); 253 254 FileID EndFID; 255 size_t EndOffset = 0; 256 std::tie(EndFID, EndOffset) = Mgr.getDecomposedLoc(R.getEnd()); 257 258 return BeginFID.isValid() && BeginFID == EndFID && BeginOffset <= EndOffset; 259 } 260 261 SourceLocation includeHashLoc(FileID IncludedFile, const SourceManager &SM) { 262 assert(SM.getLocForEndOfFile(IncludedFile).isFileID()); 263 FileID IncludingFile; 264 unsigned Offset; 265 std::tie(IncludingFile, Offset) = 266 SM.getDecomposedExpansionLoc(SM.getIncludeLoc(IncludedFile)); 267 bool Invalid = false; 268 llvm::StringRef Buf = SM.getBufferData(IncludingFile, &Invalid); 269 if (Invalid) 270 return SourceLocation(); 271 // Now buf is "...\n#include <foo>\n..." 272 // and Offset points here: ^ 273 // Rewind to the preceding # on the line. 274 assert(Offset < Buf.size()); 275 for (;; --Offset) { 276 if (Buf[Offset] == '#') 277 return SM.getComposedLoc(IncludingFile, Offset); 278 if (Buf[Offset] == '\n' || Offset == 0) // no hash, what's going on? 279 return SourceLocation(); 280 } 281 } 282 283 static unsigned getTokenLengthAtLoc(SourceLocation Loc, const SourceManager &SM, 284 const LangOptions &LangOpts) { 285 Token TheTok; 286 if (Lexer::getRawToken(Loc, TheTok, SM, LangOpts)) 287 return 0; 288 // FIXME: Here we check whether the token at the location is a greatergreater 289 // (>>) token and consider it as a single greater (>). This is to get it 290 // working for templates but it isn't correct for the right shift operator. We 291 // can avoid this by using half open char ranges in getFileRange() but getting 292 // token ending is not well supported in macroIDs. 293 if (TheTok.is(tok::greatergreater)) 294 return 1; 295 return TheTok.getLength(); 296 } 297 298 // Returns location of the last character of the token at a given loc 299 static SourceLocation getLocForTokenEnd(SourceLocation BeginLoc, 300 const SourceManager &SM, 301 const LangOptions &LangOpts) { 302 unsigned Len = getTokenLengthAtLoc(BeginLoc, SM, LangOpts); 303 return BeginLoc.getLocWithOffset(Len ? Len - 1 : 0); 304 } 305 306 // Returns location of the starting of the token at a given EndLoc 307 static SourceLocation getLocForTokenBegin(SourceLocation EndLoc, 308 const SourceManager &SM, 309 const LangOptions &LangOpts) { 310 return EndLoc.getLocWithOffset( 311 -(signed)getTokenLengthAtLoc(EndLoc, SM, LangOpts)); 312 } 313 314 // Converts a char source range to a token range. 315 static SourceRange toTokenRange(CharSourceRange Range, const SourceManager &SM, 316 const LangOptions &LangOpts) { 317 if (!Range.isTokenRange()) 318 Range.setEnd(getLocForTokenBegin(Range.getEnd(), SM, LangOpts)); 319 return Range.getAsRange(); 320 } 321 // Returns the union of two token ranges. 322 // To find the maximum of the Ends of the ranges, we compare the location of the 323 // last character of the token. 324 static SourceRange unionTokenRange(SourceRange R1, SourceRange R2, 325 const SourceManager &SM, 326 const LangOptions &LangOpts) { 327 SourceLocation Begin = 328 SM.isBeforeInTranslationUnit(R1.getBegin(), R2.getBegin()) 329 ? R1.getBegin() 330 : R2.getBegin(); 331 SourceLocation End = 332 SM.isBeforeInTranslationUnit(getLocForTokenEnd(R1.getEnd(), SM, LangOpts), 333 getLocForTokenEnd(R2.getEnd(), SM, LangOpts)) 334 ? R2.getEnd() 335 : R1.getEnd(); 336 return SourceRange(Begin, End); 337 } 338 339 // Given a range whose endpoints may be in different expansions or files, 340 // tries to find a range within a common file by following up the expansion and 341 // include location in each. 342 static SourceRange rangeInCommonFile(SourceRange R, const SourceManager &SM, 343 const LangOptions &LangOpts) { 344 // Fast path for most common cases. 345 if (SM.isWrittenInSameFile(R.getBegin(), R.getEnd())) 346 return R; 347 // Record the stack of expansion locations for the beginning, keyed by FileID. 348 llvm::DenseMap<FileID, SourceLocation> BeginExpansions; 349 for (SourceLocation Begin = R.getBegin(); Begin.isValid(); 350 Begin = Begin.isFileID() 351 ? includeHashLoc(SM.getFileID(Begin), SM) 352 : SM.getImmediateExpansionRange(Begin).getBegin()) { 353 BeginExpansions[SM.getFileID(Begin)] = Begin; 354 } 355 // Move up the stack of expansion locations for the end until we find the 356 // location in BeginExpansions with that has the same file id. 357 for (SourceLocation End = R.getEnd(); End.isValid(); 358 End = End.isFileID() ? includeHashLoc(SM.getFileID(End), SM) 359 : toTokenRange(SM.getImmediateExpansionRange(End), 360 SM, LangOpts) 361 .getEnd()) { 362 auto It = BeginExpansions.find(SM.getFileID(End)); 363 if (It != BeginExpansions.end()) { 364 if (SM.getFileOffset(It->second) > SM.getFileOffset(End)) 365 return SourceLocation(); 366 return {It->second, End}; 367 } 368 } 369 return SourceRange(); 370 } 371 372 // Find an expansion range (not necessarily immediate) the ends of which are in 373 // the same file id. 374 static SourceRange 375 getExpansionTokenRangeInSameFile(SourceLocation Loc, const SourceManager &SM, 376 const LangOptions &LangOpts) { 377 return rangeInCommonFile( 378 toTokenRange(SM.getImmediateExpansionRange(Loc), SM, LangOpts), SM, 379 LangOpts); 380 } 381 382 // Returns the file range for a given Location as a Token Range 383 // This is quite similar to getFileLoc in SourceManager as both use 384 // getImmediateExpansionRange and getImmediateSpellingLoc (for macro IDs). 385 // However: 386 // - We want to maintain the full range information as we move from one file to 387 // the next. getFileLoc only uses the BeginLoc of getImmediateExpansionRange. 388 // - We want to split '>>' tokens as the lexer parses the '>>' in nested 389 // template instantiations as a '>>' instead of two '>'s. 390 // There is also getExpansionRange but it simply calls 391 // getImmediateExpansionRange on the begin and ends separately which is wrong. 392 static SourceRange getTokenFileRange(SourceLocation Loc, 393 const SourceManager &SM, 394 const LangOptions &LangOpts) { 395 SourceRange FileRange = Loc; 396 while (!FileRange.getBegin().isFileID()) { 397 if (SM.isMacroArgExpansion(FileRange.getBegin())) { 398 FileRange = unionTokenRange( 399 SM.getImmediateSpellingLoc(FileRange.getBegin()), 400 SM.getImmediateSpellingLoc(FileRange.getEnd()), SM, LangOpts); 401 assert(SM.isWrittenInSameFile(FileRange.getBegin(), FileRange.getEnd())); 402 } else { 403 SourceRange ExpansionRangeForBegin = 404 getExpansionTokenRangeInSameFile(FileRange.getBegin(), SM, LangOpts); 405 SourceRange ExpansionRangeForEnd = 406 getExpansionTokenRangeInSameFile(FileRange.getEnd(), SM, LangOpts); 407 if (ExpansionRangeForBegin.isInvalid() || 408 ExpansionRangeForEnd.isInvalid()) 409 return SourceRange(); 410 assert(SM.isWrittenInSameFile(ExpansionRangeForBegin.getBegin(), 411 ExpansionRangeForEnd.getBegin()) && 412 "Both Expansion ranges should be in same file."); 413 FileRange = unionTokenRange(ExpansionRangeForBegin, ExpansionRangeForEnd, 414 SM, LangOpts); 415 } 416 } 417 return FileRange; 418 } 419 420 bool isInsideMainFile(SourceLocation Loc, const SourceManager &SM) { 421 if (!Loc.isValid()) 422 return false; 423 FileID FID = SM.getFileID(SM.getExpansionLoc(Loc)); 424 return FID == SM.getMainFileID() || FID == SM.getPreambleFileID(); 425 } 426 427 llvm::Optional<SourceRange> toHalfOpenFileRange(const SourceManager &SM, 428 const LangOptions &LangOpts, 429 SourceRange R) { 430 SourceRange R1 = getTokenFileRange(R.getBegin(), SM, LangOpts); 431 if (!isValidFileRange(SM, R1)) 432 return llvm::None; 433 434 SourceRange R2 = getTokenFileRange(R.getEnd(), SM, LangOpts); 435 if (!isValidFileRange(SM, R2)) 436 return llvm::None; 437 438 SourceRange Result = 439 rangeInCommonFile(unionTokenRange(R1, R2, SM, LangOpts), SM, LangOpts); 440 unsigned TokLen = getTokenLengthAtLoc(Result.getEnd(), SM, LangOpts); 441 // Convert from closed token range to half-open (char) range 442 Result.setEnd(Result.getEnd().getLocWithOffset(TokLen)); 443 if (!isValidFileRange(SM, Result)) 444 return llvm::None; 445 446 return Result; 447 } 448 449 llvm::StringRef toSourceCode(const SourceManager &SM, SourceRange R) { 450 assert(isValidFileRange(SM, R)); 451 bool Invalid = false; 452 auto *Buf = SM.getBuffer(SM.getFileID(R.getBegin()), &Invalid); 453 assert(!Invalid); 454 455 size_t BeginOffset = SM.getFileOffset(R.getBegin()); 456 size_t EndOffset = SM.getFileOffset(R.getEnd()); 457 return Buf->getBuffer().substr(BeginOffset, EndOffset - BeginOffset); 458 } 459 460 llvm::Expected<SourceLocation> sourceLocationInMainFile(const SourceManager &SM, 461 Position P) { 462 llvm::StringRef Code = SM.getBuffer(SM.getMainFileID())->getBuffer(); 463 auto Offset = 464 positionToOffset(Code, P, /*AllowColumnBeyondLineLength=*/false); 465 if (!Offset) 466 return Offset.takeError(); 467 return SM.getLocForStartOfFile(SM.getMainFileID()).getLocWithOffset(*Offset); 468 } 469 470 Range halfOpenToRange(const SourceManager &SM, CharSourceRange R) { 471 // Clang is 1-based, LSP uses 0-based indexes. 472 Position Begin = sourceLocToPosition(SM, R.getBegin()); 473 Position End = sourceLocToPosition(SM, R.getEnd()); 474 475 return {Begin, End}; 476 } 477 478 std::pair<size_t, size_t> offsetToClangLineColumn(llvm::StringRef Code, 479 size_t Offset) { 480 Offset = std::min(Code.size(), Offset); 481 llvm::StringRef Before = Code.substr(0, Offset); 482 int Lines = Before.count('\n'); 483 size_t PrevNL = Before.rfind('\n'); 484 size_t StartOfLine = (PrevNL == llvm::StringRef::npos) ? 0 : (PrevNL + 1); 485 return {Lines + 1, Offset - StartOfLine + 1}; 486 } 487 488 std::pair<StringRef, StringRef> splitQualifiedName(StringRef QName) { 489 size_t Pos = QName.rfind("::"); 490 if (Pos == llvm::StringRef::npos) 491 return {llvm::StringRef(), QName}; 492 return {QName.substr(0, Pos + 2), QName.substr(Pos + 2)}; 493 } 494 495 TextEdit replacementToEdit(llvm::StringRef Code, 496 const tooling::Replacement &R) { 497 Range ReplacementRange = { 498 offsetToPosition(Code, R.getOffset()), 499 offsetToPosition(Code, R.getOffset() + R.getLength())}; 500 return {ReplacementRange, std::string(R.getReplacementText())}; 501 } 502 503 std::vector<TextEdit> replacementsToEdits(llvm::StringRef Code, 504 const tooling::Replacements &Repls) { 505 std::vector<TextEdit> Edits; 506 for (const auto &R : Repls) 507 Edits.push_back(replacementToEdit(Code, R)); 508 return Edits; 509 } 510 511 llvm::Optional<std::string> getCanonicalPath(const FileEntry *F, 512 const SourceManager &SourceMgr) { 513 if (!F) 514 return None; 515 516 llvm::SmallString<128> FilePath = F->getName(); 517 if (!llvm::sys::path::is_absolute(FilePath)) { 518 if (auto EC = 519 SourceMgr.getFileManager().getVirtualFileSystem().makeAbsolute( 520 FilePath)) { 521 elog("Could not turn relative path '{0}' to absolute: {1}", FilePath, 522 EC.message()); 523 return None; 524 } 525 } 526 527 // Handle the symbolic link path case where the current working directory 528 // (getCurrentWorkingDirectory) is a symlink. We always want to the real 529 // file path (instead of the symlink path) for the C++ symbols. 530 // 531 // Consider the following example: 532 // 533 // src dir: /project/src/foo.h 534 // current working directory (symlink): /tmp/build -> /project/src/ 535 // 536 // The file path of Symbol is "/project/src/foo.h" instead of 537 // "/tmp/build/foo.h" 538 if (auto Dir = SourceMgr.getFileManager().getDirectory( 539 llvm::sys::path::parent_path(FilePath))) { 540 llvm::SmallString<128> RealPath; 541 llvm::StringRef DirName = SourceMgr.getFileManager().getCanonicalName(*Dir); 542 llvm::sys::path::append(RealPath, DirName, 543 llvm::sys::path::filename(FilePath)); 544 return RealPath.str().str(); 545 } 546 547 return FilePath.str().str(); 548 } 549 550 TextEdit toTextEdit(const FixItHint &FixIt, const SourceManager &M, 551 const LangOptions &L) { 552 TextEdit Result; 553 Result.range = 554 halfOpenToRange(M, Lexer::makeFileCharRange(FixIt.RemoveRange, M, L)); 555 Result.newText = FixIt.CodeToInsert; 556 return Result; 557 } 558 559 FileDigest digest(llvm::StringRef Content) { 560 uint64_t Hash{llvm::xxHash64(Content)}; 561 FileDigest Result; 562 for (unsigned I = 0; I < Result.size(); ++I) { 563 Result[I] = uint8_t(Hash); 564 Hash >>= 8; 565 } 566 return Result; 567 } 568 569 llvm::Optional<FileDigest> digestFile(const SourceManager &SM, FileID FID) { 570 bool Invalid = false; 571 llvm::StringRef Content = SM.getBufferData(FID, &Invalid); 572 if (Invalid) 573 return None; 574 return digest(Content); 575 } 576 577 format::FormatStyle getFormatStyleForFile(llvm::StringRef File, 578 llvm::StringRef Content, 579 llvm::vfs::FileSystem *FS) { 580 auto Style = format::getStyle(format::DefaultFormatStyle, File, 581 format::DefaultFallbackStyle, Content, FS); 582 if (!Style) { 583 log("getStyle() failed for file {0}: {1}. Fallback is LLVM style.", File, 584 Style.takeError()); 585 return format::getLLVMStyle(); 586 } 587 return *Style; 588 } 589 590 llvm::Expected<tooling::Replacements> 591 cleanupAndFormat(StringRef Code, const tooling::Replacements &Replaces, 592 const format::FormatStyle &Style) { 593 auto CleanReplaces = cleanupAroundReplacements(Code, Replaces, Style); 594 if (!CleanReplaces) 595 return CleanReplaces; 596 return formatReplacements(Code, std::move(*CleanReplaces), Style); 597 } 598 599 static void 600 lex(llvm::StringRef Code, const LangOptions &LangOpts, 601 llvm::function_ref<void(const syntax::Token &, const SourceManager &SM)> 602 Action) { 603 // FIXME: InMemoryFileAdapter crashes unless the buffer is null terminated! 604 std::string NullTerminatedCode = Code.str(); 605 SourceManagerForFile FileSM("dummy.cpp", NullTerminatedCode); 606 auto &SM = FileSM.get(); 607 for (const auto &Tok : syntax::tokenize(SM.getMainFileID(), SM, LangOpts)) 608 Action(Tok, SM); 609 } 610 611 llvm::StringMap<unsigned> collectIdentifiers(llvm::StringRef Content, 612 const format::FormatStyle &Style) { 613 llvm::StringMap<unsigned> Identifiers; 614 auto LangOpt = format::getFormattingLangOpts(Style); 615 lex(Content, LangOpt, [&](const syntax::Token &Tok, const SourceManager &SM) { 616 if (Tok.kind() == tok::identifier) 617 ++Identifiers[Tok.text(SM)]; 618 // FIXME: Should this function really return keywords too ? 619 else if (const auto *Keyword = tok::getKeywordSpelling(Tok.kind())) 620 ++Identifiers[Keyword]; 621 }); 622 return Identifiers; 623 } 624 625 std::vector<Range> collectIdentifierRanges(llvm::StringRef Identifier, 626 llvm::StringRef Content, 627 const LangOptions &LangOpts) { 628 std::vector<Range> Ranges; 629 lex(Content, LangOpts, 630 [&](const syntax::Token &Tok, const SourceManager &SM) { 631 if (Tok.kind() != tok::identifier || Tok.text(SM) != Identifier) 632 return; 633 Ranges.push_back(halfOpenToRange(SM, Tok.range(SM).toCharRange(SM))); 634 }); 635 return Ranges; 636 } 637 638 namespace { 639 struct NamespaceEvent { 640 enum { 641 BeginNamespace, // namespace <ns> {. Payload is resolved <ns>. 642 EndNamespace, // } // namespace <ns>. Payload is resolved *outer* 643 // namespace. 644 UsingDirective // using namespace <ns>. Payload is unresolved <ns>. 645 } Trigger; 646 std::string Payload; 647 Position Pos; 648 }; 649 // Scans C++ source code for constructs that change the visible namespaces. 650 void parseNamespaceEvents(llvm::StringRef Code, const LangOptions &LangOpts, 651 llvm::function_ref<void(NamespaceEvent)> Callback) { 652 653 // Stack of enclosing namespaces, e.g. {"clang", "clangd"} 654 std::vector<std::string> Enclosing; // Contains e.g. "clang", "clangd" 655 // Stack counts open braces. true if the brace opened a namespace. 656 std::vector<bool> BraceStack; 657 658 enum { 659 Default, 660 Namespace, // just saw 'namespace' 661 NamespaceName, // just saw 'namespace' NSName 662 Using, // just saw 'using' 663 UsingNamespace, // just saw 'using namespace' 664 UsingNamespaceName, // just saw 'using namespace' NSName 665 } State = Default; 666 std::string NSName; 667 668 NamespaceEvent Event; 669 lex(Code, LangOpts, [&](const syntax::Token &Tok, const SourceManager &SM) { 670 Event.Pos = sourceLocToPosition(SM, Tok.location()); 671 switch (Tok.kind()) { 672 case tok::kw_using: 673 State = State == Default ? Using : Default; 674 break; 675 case tok::kw_namespace: 676 switch (State) { 677 case Using: 678 State = UsingNamespace; 679 break; 680 case Default: 681 State = Namespace; 682 break; 683 default: 684 State = Default; 685 break; 686 } 687 break; 688 case tok::identifier: 689 switch (State) { 690 case UsingNamespace: 691 NSName.clear(); 692 LLVM_FALLTHROUGH; 693 case UsingNamespaceName: 694 NSName.append(Tok.text(SM).str()); 695 State = UsingNamespaceName; 696 break; 697 case Namespace: 698 NSName.clear(); 699 LLVM_FALLTHROUGH; 700 case NamespaceName: 701 NSName.append(Tok.text(SM).str()); 702 State = NamespaceName; 703 break; 704 case Using: 705 case Default: 706 State = Default; 707 break; 708 } 709 break; 710 case tok::coloncolon: 711 // This can come at the beginning or in the middle of a namespace 712 // name. 713 switch (State) { 714 case UsingNamespace: 715 NSName.clear(); 716 LLVM_FALLTHROUGH; 717 case UsingNamespaceName: 718 NSName.append("::"); 719 State = UsingNamespaceName; 720 break; 721 case NamespaceName: 722 NSName.append("::"); 723 State = NamespaceName; 724 break; 725 case Namespace: // Not legal here. 726 case Using: 727 case Default: 728 State = Default; 729 break; 730 } 731 break; 732 case tok::l_brace: 733 // Record which { started a namespace, so we know when } ends one. 734 if (State == NamespaceName) { 735 // Parsed: namespace <name> { 736 BraceStack.push_back(true); 737 Enclosing.push_back(NSName); 738 Event.Trigger = NamespaceEvent::BeginNamespace; 739 Event.Payload = llvm::join(Enclosing, "::"); 740 Callback(Event); 741 } else { 742 // This case includes anonymous namespaces (State = Namespace). 743 // For our purposes, they're not namespaces and we ignore them. 744 BraceStack.push_back(false); 745 } 746 State = Default; 747 break; 748 case tok::r_brace: 749 // If braces are unmatched, we're going to be confused, but don't 750 // crash. 751 if (!BraceStack.empty()) { 752 if (BraceStack.back()) { 753 // Parsed: } // namespace 754 Enclosing.pop_back(); 755 Event.Trigger = NamespaceEvent::EndNamespace; 756 Event.Payload = llvm::join(Enclosing, "::"); 757 Callback(Event); 758 } 759 BraceStack.pop_back(); 760 } 761 break; 762 case tok::semi: 763 if (State == UsingNamespaceName) { 764 // Parsed: using namespace <name> ; 765 Event.Trigger = NamespaceEvent::UsingDirective; 766 Event.Payload = std::move(NSName); 767 Callback(Event); 768 } 769 State = Default; 770 break; 771 default: 772 State = Default; 773 break; 774 } 775 }); 776 } 777 778 // Returns the prefix namespaces of NS: {"" ... NS}. 779 llvm::SmallVector<llvm::StringRef, 8> ancestorNamespaces(llvm::StringRef NS) { 780 llvm::SmallVector<llvm::StringRef, 8> Results; 781 Results.push_back(NS.take_front(0)); 782 NS.split(Results, "::", /*MaxSplit=*/-1, /*KeepEmpty=*/false); 783 for (llvm::StringRef &R : Results) 784 R = NS.take_front(R.end() - NS.begin()); 785 return Results; 786 } 787 788 } // namespace 789 790 std::vector<std::string> visibleNamespaces(llvm::StringRef Code, 791 const LangOptions &LangOpts) { 792 std::string Current; 793 // Map from namespace to (resolved) namespaces introduced via using directive. 794 llvm::StringMap<llvm::StringSet<>> UsingDirectives; 795 796 parseNamespaceEvents(Code, LangOpts, [&](NamespaceEvent Event) { 797 llvm::StringRef NS = Event.Payload; 798 switch (Event.Trigger) { 799 case NamespaceEvent::BeginNamespace: 800 case NamespaceEvent::EndNamespace: 801 Current = std::move(Event.Payload); 802 break; 803 case NamespaceEvent::UsingDirective: 804 if (NS.consume_front("::")) 805 UsingDirectives[Current].insert(NS); 806 else { 807 for (llvm::StringRef Enclosing : ancestorNamespaces(Current)) { 808 if (Enclosing.empty()) 809 UsingDirectives[Current].insert(NS); 810 else 811 UsingDirectives[Current].insert((Enclosing + "::" + NS).str()); 812 } 813 } 814 break; 815 } 816 }); 817 818 std::vector<std::string> Found; 819 for (llvm::StringRef Enclosing : ancestorNamespaces(Current)) { 820 Found.push_back(std::string(Enclosing)); 821 auto It = UsingDirectives.find(Enclosing); 822 if (It != UsingDirectives.end()) 823 for (const auto &Used : It->second) 824 Found.push_back(std::string(Used.getKey())); 825 } 826 827 llvm::sort(Found, [&](const std::string &LHS, const std::string &RHS) { 828 if (Current == RHS) 829 return false; 830 if (Current == LHS) 831 return true; 832 return LHS < RHS; 833 }); 834 Found.erase(std::unique(Found.begin(), Found.end()), Found.end()); 835 return Found; 836 } 837 838 llvm::StringSet<> collectWords(llvm::StringRef Content) { 839 // We assume short words are not significant. 840 // We may want to consider other stopwords, e.g. language keywords. 841 // (A very naive implementation showed no benefit, but lexing might do better) 842 static constexpr int MinWordLength = 4; 843 844 std::vector<CharRole> Roles(Content.size()); 845 calculateRoles(Content, Roles); 846 847 llvm::StringSet<> Result; 848 llvm::SmallString<256> Word; 849 auto Flush = [&] { 850 if (Word.size() >= MinWordLength) { 851 for (char &C : Word) 852 C = llvm::toLower(C); 853 Result.insert(Word); 854 } 855 Word.clear(); 856 }; 857 for (unsigned I = 0; I < Content.size(); ++I) { 858 switch (Roles[I]) { 859 case Head: 860 Flush(); 861 LLVM_FALLTHROUGH; 862 case Tail: 863 Word.push_back(Content[I]); 864 break; 865 case Unknown: 866 case Separator: 867 Flush(); 868 break; 869 } 870 } 871 Flush(); 872 873 return Result; 874 } 875 876 static bool isLikelyIdentifier(llvm::StringRef Word, llvm::StringRef Before, 877 llvm::StringRef After) { 878 // `foo` is an identifier. 879 if (Before.endswith("`") && After.startswith("`")) 880 return true; 881 // In foo::bar, both foo and bar are identifiers. 882 if (Before.endswith("::") || After.startswith("::")) 883 return true; 884 // Doxygen tags like \c foo indicate identifiers. 885 // Don't search too far back. 886 // This duplicates clang's doxygen parser, revisit if it gets complicated. 887 Before = Before.take_back(100); // Don't search too far back. 888 auto Pos = Before.find_last_of("\\@"); 889 if (Pos != llvm::StringRef::npos) { 890 llvm::StringRef Tag = Before.substr(Pos + 1).rtrim(' '); 891 if (Tag == "p" || Tag == "c" || Tag == "class" || Tag == "tparam" || 892 Tag == "param" || Tag == "param[in]" || Tag == "param[out]" || 893 Tag == "param[in,out]" || Tag == "retval" || Tag == "throw" || 894 Tag == "throws" || Tag == "link") 895 return true; 896 } 897 898 // Word contains underscore. 899 // This handles things like snake_case and MACRO_CASE. 900 if (Word.contains('_')) { 901 return true; 902 } 903 // Word contains capital letter other than at beginning. 904 // This handles things like lowerCamel and UpperCamel. 905 // The check for also containing a lowercase letter is to rule out 906 // initialisms like "HTTP". 907 bool HasLower = Word.find_if(clang::isLowercase) != StringRef::npos; 908 bool HasUpper = Word.substr(1).find_if(clang::isUppercase) != StringRef::npos; 909 if (HasLower && HasUpper) { 910 return true; 911 } 912 // FIXME: consider mid-sentence Capitalization? 913 return false; 914 } 915 916 llvm::Optional<SpelledWord> SpelledWord::touching(SourceLocation SpelledLoc, 917 const syntax::TokenBuffer &TB, 918 const LangOptions &LangOpts) { 919 const auto &SM = TB.sourceManager(); 920 auto Touching = syntax::spelledTokensTouching(SpelledLoc, TB); 921 for (const auto &T : Touching) { 922 // If the token is an identifier or a keyword, don't use any heuristics. 923 if (tok::isAnyIdentifier(T.kind()) || tok::getKeywordSpelling(T.kind())) { 924 SpelledWord Result; 925 Result.Location = T.location(); 926 Result.Text = T.text(SM); 927 Result.LikelyIdentifier = tok::isAnyIdentifier(T.kind()); 928 Result.PartOfSpelledToken = &T; 929 Result.SpelledToken = &T; 930 auto Expanded = 931 TB.expandedTokens(SM.getMacroArgExpandedLocation(T.location())); 932 if (Expanded.size() == 1 && Expanded.front().text(SM) == Result.Text) 933 Result.ExpandedToken = &Expanded.front(); 934 return Result; 935 } 936 } 937 FileID File; 938 unsigned Offset; 939 std::tie(File, Offset) = SM.getDecomposedLoc(SpelledLoc); 940 bool Invalid = false; 941 llvm::StringRef Code = SM.getBufferData(File, &Invalid); 942 if (Invalid) 943 return llvm::None; 944 unsigned B = Offset, E = Offset; 945 while (B > 0 && isIdentifierBody(Code[B - 1])) 946 --B; 947 while (E < Code.size() && isIdentifierBody(Code[E])) 948 ++E; 949 if (B == E) 950 return llvm::None; 951 952 SpelledWord Result; 953 Result.Location = SM.getComposedLoc(File, B); 954 Result.Text = Code.slice(B, E); 955 Result.LikelyIdentifier = 956 isLikelyIdentifier(Result.Text, Code.substr(0, B), Code.substr(E)) && 957 // should not be a keyword 958 tok::isAnyIdentifier( 959 IdentifierTable(LangOpts).get(Result.Text).getTokenID()); 960 for (const auto &T : Touching) 961 if (T.location() <= Result.Location) 962 Result.PartOfSpelledToken = &T; 963 return Result; 964 } 965 966 llvm::Optional<DefinedMacro> locateMacroAt(const syntax::Token &SpelledTok, 967 Preprocessor &PP) { 968 SourceLocation Loc = SpelledTok.location(); 969 assert(Loc.isFileID()); 970 const auto &SM = PP.getSourceManager(); 971 IdentifierInfo *IdentifierInfo = PP.getIdentifierInfo(SpelledTok.text(SM)); 972 if (!IdentifierInfo || !IdentifierInfo->hadMacroDefinition()) 973 return None; 974 975 // Get the definition just before the searched location so that a macro 976 // referenced in a '#undef MACRO' can still be found. Note that we only do 977 // that if Loc is not pointing at start of file. 978 if (SM.getLocForStartOfFile(SM.getFileID(Loc)) != Loc) 979 Loc = Loc.getLocWithOffset(-1); 980 MacroDefinition MacroDef = PP.getMacroDefinitionAtLoc(IdentifierInfo, Loc); 981 if (auto *MI = MacroDef.getMacroInfo()) 982 return DefinedMacro{ 983 IdentifierInfo->getName(), MI, 984 translatePreamblePatchLocation(MI->getDefinitionLoc(), SM)}; 985 return None; 986 } 987 988 llvm::Expected<std::string> Edit::apply() const { 989 return tooling::applyAllReplacements(InitialCode, Replacements); 990 } 991 992 std::vector<TextEdit> Edit::asTextEdits() const { 993 return replacementsToEdits(InitialCode, Replacements); 994 } 995 996 bool Edit::canApplyTo(llvm::StringRef Code) const { 997 // Create line iterators, since line numbers are important while applying our 998 // edit we cannot skip blank lines. 999 auto LHS = llvm::MemoryBuffer::getMemBuffer(Code); 1000 llvm::line_iterator LHSIt(*LHS, /*SkipBlanks=*/false); 1001 1002 auto RHS = llvm::MemoryBuffer::getMemBuffer(InitialCode); 1003 llvm::line_iterator RHSIt(*RHS, /*SkipBlanks=*/false); 1004 1005 // Compare the InitialCode we prepared the edit for with the Code we received 1006 // line by line to make sure there are no differences. 1007 // FIXME: This check is too conservative now, it should be enough to only 1008 // check lines around the replacements contained inside the Edit. 1009 while (!LHSIt.is_at_eof() && !RHSIt.is_at_eof()) { 1010 if (*LHSIt != *RHSIt) 1011 return false; 1012 ++LHSIt; 1013 ++RHSIt; 1014 } 1015 1016 // After we reach EOF for any of the files we make sure the other one doesn't 1017 // contain any additional content except empty lines, they should not 1018 // interfere with the edit we produced. 1019 while (!LHSIt.is_at_eof()) { 1020 if (!LHSIt->empty()) 1021 return false; 1022 ++LHSIt; 1023 } 1024 while (!RHSIt.is_at_eof()) { 1025 if (!RHSIt->empty()) 1026 return false; 1027 ++RHSIt; 1028 } 1029 return true; 1030 } 1031 1032 llvm::Error reformatEdit(Edit &E, const format::FormatStyle &Style) { 1033 if (auto NewEdits = cleanupAndFormat(E.InitialCode, E.Replacements, Style)) 1034 E.Replacements = std::move(*NewEdits); 1035 else 1036 return NewEdits.takeError(); 1037 return llvm::Error::success(); 1038 } 1039 1040 EligibleRegion getEligiblePoints(llvm::StringRef Code, 1041 llvm::StringRef FullyQualifiedName, 1042 const LangOptions &LangOpts) { 1043 EligibleRegion ER; 1044 // Start with global namespace. 1045 std::vector<std::string> Enclosing = {""}; 1046 // FIXME: In addition to namespaces try to generate events for function 1047 // definitions as well. One might use a closing parantheses(")" followed by an 1048 // opening brace "{" to trigger the start. 1049 parseNamespaceEvents(Code, LangOpts, [&](NamespaceEvent Event) { 1050 // Using Directives only introduces declarations to current scope, they do 1051 // not change the current namespace, so skip them. 1052 if (Event.Trigger == NamespaceEvent::UsingDirective) 1053 return; 1054 // Do not qualify the global namespace. 1055 if (!Event.Payload.empty()) 1056 Event.Payload.append("::"); 1057 1058 std::string CurrentNamespace; 1059 if (Event.Trigger == NamespaceEvent::BeginNamespace) { 1060 Enclosing.emplace_back(std::move(Event.Payload)); 1061 CurrentNamespace = Enclosing.back(); 1062 // parseNameSpaceEvents reports the beginning position of a token; we want 1063 // to insert after '{', so increment by one. 1064 ++Event.Pos.character; 1065 } else { 1066 // Event.Payload points to outer namespace when exiting a scope, so use 1067 // the namespace we've last entered instead. 1068 CurrentNamespace = std::move(Enclosing.back()); 1069 Enclosing.pop_back(); 1070 assert(Enclosing.back() == Event.Payload); 1071 } 1072 1073 // Ignore namespaces that are not a prefix of the target. 1074 if (!FullyQualifiedName.startswith(CurrentNamespace)) 1075 return; 1076 1077 // Prefer the namespace that shares the longest prefix with target. 1078 if (CurrentNamespace.size() > ER.EnclosingNamespace.size()) { 1079 ER.EligiblePoints.clear(); 1080 ER.EnclosingNamespace = CurrentNamespace; 1081 } 1082 if (CurrentNamespace.size() == ER.EnclosingNamespace.size()) 1083 ER.EligiblePoints.emplace_back(std::move(Event.Pos)); 1084 }); 1085 // If there were no shared namespaces just return EOF. 1086 if (ER.EligiblePoints.empty()) { 1087 assert(ER.EnclosingNamespace.empty()); 1088 ER.EligiblePoints.emplace_back(offsetToPosition(Code, Code.size())); 1089 } 1090 return ER; 1091 } 1092 1093 bool isHeaderFile(llvm::StringRef FileName, 1094 llvm::Optional<LangOptions> LangOpts) { 1095 // Respect the langOpts, for non-file-extension cases, e.g. standard library 1096 // files. 1097 if (LangOpts && LangOpts->IsHeaderFile) 1098 return true; 1099 namespace types = clang::driver::types; 1100 auto Lang = types::lookupTypeForExtension( 1101 llvm::sys::path::extension(FileName).substr(1)); 1102 return Lang != types::TY_INVALID && types::onlyPrecompileType(Lang); 1103 } 1104 1105 bool isProtoFile(SourceLocation Loc, const SourceManager &SM) { 1106 auto FileName = SM.getFilename(Loc); 1107 if (!FileName.endswith(".proto.h") && !FileName.endswith(".pb.h")) 1108 return false; 1109 auto FID = SM.getFileID(Loc); 1110 // All proto generated headers should start with this line. 1111 static const char *PROTO_HEADER_COMMENT = 1112 "// Generated by the protocol buffer compiler. DO NOT EDIT!"; 1113 // Double check that this is an actual protobuf header. 1114 return SM.getBufferData(FID).startswith(PROTO_HEADER_COMMENT); 1115 } 1116 1117 } // namespace clangd 1118 } // namespace clang 1119