1 //===--- SourceCode.h - Manipulating source code as strings -----*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 #include "SourceCode.h" 9 10 #include "Context.h" 11 #include "FuzzyMatch.h" 12 #include "Logger.h" 13 #include "Protocol.h" 14 #include "refactor/Tweak.h" 15 #include "clang/AST/ASTContext.h" 16 #include "clang/Basic/LangOptions.h" 17 #include "clang/Basic/SourceLocation.h" 18 #include "clang/Basic/SourceManager.h" 19 #include "clang/Basic/TokenKinds.h" 20 #include "clang/Format/Format.h" 21 #include "clang/Lex/Lexer.h" 22 #include "clang/Lex/Preprocessor.h" 23 #include "clang/Lex/Token.h" 24 #include "clang/Tooling/Core/Replacement.h" 25 #include "llvm/ADT/ArrayRef.h" 26 #include "llvm/ADT/None.h" 27 #include "llvm/ADT/STLExtras.h" 28 #include "llvm/ADT/StringExtras.h" 29 #include "llvm/ADT/StringMap.h" 30 #include "llvm/ADT/StringRef.h" 31 #include "llvm/Support/Compiler.h" 32 #include "llvm/Support/Errc.h" 33 #include "llvm/Support/Error.h" 34 #include "llvm/Support/ErrorHandling.h" 35 #include "llvm/Support/LineIterator.h" 36 #include "llvm/Support/MemoryBuffer.h" 37 #include "llvm/Support/Path.h" 38 #include "llvm/Support/SHA1.h" 39 #include "llvm/Support/VirtualFileSystem.h" 40 #include "llvm/Support/xxhash.h" 41 #include <algorithm> 42 #include <cstddef> 43 #include <string> 44 #include <vector> 45 46 namespace clang { 47 namespace clangd { 48 49 // Here be dragons. LSP positions use columns measured in *UTF-16 code units*! 50 // Clangd uses UTF-8 and byte-offsets internally, so conversion is nontrivial. 51 52 // Iterates over unicode codepoints in the (UTF-8) string. For each, 53 // invokes CB(UTF-8 length, UTF-16 length), and breaks if it returns true. 54 // Returns true if CB returned true, false if we hit the end of string. 55 template <typename Callback> 56 static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) { 57 // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). 58 // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. 59 for (size_t I = 0; I < U8.size();) { 60 unsigned char C = static_cast<unsigned char>(U8[I]); 61 if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character. 62 if (CB(1, 1)) 63 return true; 64 ++I; 65 continue; 66 } 67 // This convenient property of UTF-8 holds for all non-ASCII characters. 68 size_t UTF8Length = llvm::countLeadingOnes(C); 69 // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here. 70 // 11111xxx is not valid UTF-8 at all. Assert because it's probably our bug. 71 assert((UTF8Length >= 2 && UTF8Length <= 4) && 72 "Invalid UTF-8, or transcoding bug?"); 73 I += UTF8Length; // Skip over all trailing bytes. 74 // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). 75 // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...) 76 if (CB(UTF8Length, UTF8Length == 4 ? 2 : 1)) 77 return true; 78 } 79 return false; 80 } 81 82 // Returns the byte offset into the string that is an offset of \p Units in 83 // the specified encoding. 84 // Conceptually, this converts to the encoding, truncates to CodeUnits, 85 // converts back to UTF-8, and returns the length in bytes. 86 static size_t measureUnits(llvm::StringRef U8, int Units, OffsetEncoding Enc, 87 bool &Valid) { 88 Valid = Units >= 0; 89 if (Units <= 0) 90 return 0; 91 size_t Result = 0; 92 switch (Enc) { 93 case OffsetEncoding::UTF8: 94 Result = Units; 95 break; 96 case OffsetEncoding::UTF16: 97 Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) { 98 Result += U8Len; 99 Units -= U16Len; 100 return Units <= 0; 101 }); 102 if (Units < 0) // Offset in the middle of a surrogate pair. 103 Valid = false; 104 break; 105 case OffsetEncoding::UTF32: 106 Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) { 107 Result += U8Len; 108 Units--; 109 return Units <= 0; 110 }); 111 break; 112 case OffsetEncoding::UnsupportedEncoding: 113 llvm_unreachable("unsupported encoding"); 114 } 115 // Don't return an out-of-range index if we overran. 116 if (Result > U8.size()) { 117 Valid = false; 118 return U8.size(); 119 } 120 return Result; 121 } 122 123 Key<OffsetEncoding> kCurrentOffsetEncoding; 124 static OffsetEncoding lspEncoding() { 125 auto *Enc = Context::current().get(kCurrentOffsetEncoding); 126 return Enc ? *Enc : OffsetEncoding::UTF16; 127 } 128 129 // Like most strings in clangd, the input is UTF-8 encoded. 130 size_t lspLength(llvm::StringRef Code) { 131 size_t Count = 0; 132 switch (lspEncoding()) { 133 case OffsetEncoding::UTF8: 134 Count = Code.size(); 135 break; 136 case OffsetEncoding::UTF16: 137 iterateCodepoints(Code, [&](int U8Len, int U16Len) { 138 Count += U16Len; 139 return false; 140 }); 141 break; 142 case OffsetEncoding::UTF32: 143 iterateCodepoints(Code, [&](int U8Len, int U16Len) { 144 ++Count; 145 return false; 146 }); 147 break; 148 case OffsetEncoding::UnsupportedEncoding: 149 llvm_unreachable("unsupported encoding"); 150 } 151 return Count; 152 } 153 154 llvm::Expected<size_t> positionToOffset(llvm::StringRef Code, Position P, 155 bool AllowColumnsBeyondLineLength) { 156 if (P.line < 0) 157 return llvm::make_error<llvm::StringError>( 158 llvm::formatv("Line value can't be negative ({0})", P.line), 159 llvm::errc::invalid_argument); 160 if (P.character < 0) 161 return llvm::make_error<llvm::StringError>( 162 llvm::formatv("Character value can't be negative ({0})", P.character), 163 llvm::errc::invalid_argument); 164 size_t StartOfLine = 0; 165 for (int I = 0; I != P.line; ++I) { 166 size_t NextNL = Code.find('\n', StartOfLine); 167 if (NextNL == llvm::StringRef::npos) 168 return llvm::make_error<llvm::StringError>( 169 llvm::formatv("Line value is out of range ({0})", P.line), 170 llvm::errc::invalid_argument); 171 StartOfLine = NextNL + 1; 172 } 173 StringRef Line = 174 Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; }); 175 176 // P.character may be in UTF-16, transcode if necessary. 177 bool Valid; 178 size_t ByteInLine = measureUnits(Line, P.character, lspEncoding(), Valid); 179 if (!Valid && !AllowColumnsBeyondLineLength) 180 return llvm::make_error<llvm::StringError>( 181 llvm::formatv("{0} offset {1} is invalid for line {2}", lspEncoding(), 182 P.character, P.line), 183 llvm::errc::invalid_argument); 184 return StartOfLine + ByteInLine; 185 } 186 187 Position offsetToPosition(llvm::StringRef Code, size_t Offset) { 188 Offset = std::min(Code.size(), Offset); 189 llvm::StringRef Before = Code.substr(0, Offset); 190 int Lines = Before.count('\n'); 191 size_t PrevNL = Before.rfind('\n'); 192 size_t StartOfLine = (PrevNL == llvm::StringRef::npos) ? 0 : (PrevNL + 1); 193 Position Pos; 194 Pos.line = Lines; 195 Pos.character = lspLength(Before.substr(StartOfLine)); 196 return Pos; 197 } 198 199 Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc) { 200 // We use the SourceManager's line tables, but its column number is in bytes. 201 FileID FID; 202 unsigned Offset; 203 std::tie(FID, Offset) = SM.getDecomposedSpellingLoc(Loc); 204 Position P; 205 P.line = static_cast<int>(SM.getLineNumber(FID, Offset)) - 1; 206 bool Invalid = false; 207 llvm::StringRef Code = SM.getBufferData(FID, &Invalid); 208 if (!Invalid) { 209 auto ColumnInBytes = SM.getColumnNumber(FID, Offset) - 1; 210 auto LineSoFar = Code.substr(Offset - ColumnInBytes, ColumnInBytes); 211 P.character = lspLength(LineSoFar); 212 } 213 return P; 214 } 215 216 bool isSpelledInSource(SourceLocation Loc, const SourceManager &SM) { 217 if (Loc.isMacroID()) { 218 std::string PrintLoc = SM.getSpellingLoc(Loc).printToString(SM); 219 if (llvm::StringRef(PrintLoc).startswith("<scratch") || 220 llvm::StringRef(PrintLoc).startswith("<command line>")) 221 return false; 222 } 223 return true; 224 } 225 226 SourceLocation spellingLocIfSpelled(SourceLocation Loc, 227 const SourceManager &SM) { 228 if (!isSpelledInSource(Loc, SM)) 229 // Use the expansion location as spelling location is not interesting. 230 return SM.getExpansionRange(Loc).getBegin(); 231 return SM.getSpellingLoc(Loc); 232 } 233 234 llvm::Optional<Range> getTokenRange(const SourceManager &SM, 235 const LangOptions &LangOpts, 236 SourceLocation TokLoc) { 237 if (!TokLoc.isValid()) 238 return llvm::None; 239 SourceLocation End = Lexer::getLocForEndOfToken(TokLoc, 0, SM, LangOpts); 240 if (!End.isValid()) 241 return llvm::None; 242 return halfOpenToRange(SM, CharSourceRange::getCharRange(TokLoc, End)); 243 } 244 245 namespace { 246 247 enum TokenFlavor { Identifier, Operator, Whitespace, Other }; 248 249 bool isOverloadedOperator(const Token &Tok) { 250 switch (Tok.getKind()) { 251 #define OVERLOADED_OPERATOR(Name, Spelling, Token, Unary, Binary, MemOnly) \ 252 case tok::Token: 253 #define OVERLOADED_OPERATOR_MULTI(Name, Spelling, Unary, Binary, MemOnly) 254 #include "clang/Basic/OperatorKinds.def" 255 return true; 256 257 default: 258 break; 259 } 260 return false; 261 } 262 263 TokenFlavor getTokenFlavor(SourceLocation Loc, const SourceManager &SM, 264 const LangOptions &LangOpts) { 265 Token Tok; 266 Tok.setKind(tok::NUM_TOKENS); 267 if (Lexer::getRawToken(Loc, Tok, SM, LangOpts, 268 /*IgnoreWhiteSpace*/ false)) 269 return Other; 270 271 // getRawToken will return false without setting Tok when the token is 272 // whitespace, so if the flag is not set, we are sure this is a whitespace. 273 if (Tok.is(tok::TokenKind::NUM_TOKENS)) 274 return Whitespace; 275 if (Tok.is(tok::TokenKind::raw_identifier)) 276 return Identifier; 277 if (isOverloadedOperator(Tok)) 278 return Operator; 279 return Other; 280 } 281 282 } // namespace 283 284 SourceLocation getBeginningOfIdentifier(const Position &Pos, 285 const SourceManager &SM, 286 const LangOptions &LangOpts) { 287 FileID FID = SM.getMainFileID(); 288 auto Offset = positionToOffset(SM.getBufferData(FID), Pos); 289 if (!Offset) { 290 log("getBeginningOfIdentifier: {0}", Offset.takeError()); 291 return SourceLocation(); 292 } 293 294 // GetBeginningOfToken(InputLoc) is almost what we want, but does the wrong 295 // thing if the cursor is at the end of the token (identifier or operator). 296 // The cases are: 297 // 1) at the beginning of the token 298 // 2) at the middle of the token 299 // 3) at the end of the token 300 // 4) anywhere outside the identifier or operator 301 // To distinguish all cases, we lex both at the 302 // GetBeginningOfToken(InputLoc-1) and GetBeginningOfToken(InputLoc), for 303 // cases 1 and 4, we just return the original location. 304 SourceLocation InputLoc = SM.getComposedLoc(FID, *Offset); 305 if (*Offset == 0) // Case 1 or 4. 306 return InputLoc; 307 SourceLocation Before = SM.getComposedLoc(FID, *Offset - 1); 308 SourceLocation BeforeTokBeginning = 309 Lexer::GetBeginningOfToken(Before, SM, LangOpts); 310 TokenFlavor BeforeKind = getTokenFlavor(BeforeTokBeginning, SM, LangOpts); 311 312 SourceLocation CurrentTokBeginning = 313 Lexer::GetBeginningOfToken(InputLoc, SM, LangOpts); 314 TokenFlavor CurrentKind = getTokenFlavor(CurrentTokBeginning, SM, LangOpts); 315 316 // At the middle of the token. 317 if (BeforeTokBeginning == CurrentTokBeginning) { 318 // For interesting token, we return the beginning of the token. 319 if (CurrentKind == Identifier || CurrentKind == Operator) 320 return CurrentTokBeginning; 321 // otherwise, we return the original loc. 322 return InputLoc; 323 } 324 325 // Whitespace is not interesting. 326 if (BeforeKind == Whitespace) 327 return CurrentTokBeginning; 328 if (CurrentKind == Whitespace) 329 return BeforeTokBeginning; 330 331 // The cursor is at the token boundary, e.g. "Before^Current", we prefer 332 // identifiers to other tokens. 333 if (CurrentKind == Identifier) 334 return CurrentTokBeginning; 335 if (BeforeKind == Identifier) 336 return BeforeTokBeginning; 337 // Then prefer overloaded operators to other tokens. 338 if (CurrentKind == Operator) 339 return CurrentTokBeginning; 340 if (BeforeKind == Operator) 341 return BeforeTokBeginning; 342 343 // Non-interesting case, we just return the original location. 344 return InputLoc; 345 } 346 347 bool isValidFileRange(const SourceManager &Mgr, SourceRange R) { 348 if (!R.getBegin().isValid() || !R.getEnd().isValid()) 349 return false; 350 351 FileID BeginFID; 352 size_t BeginOffset = 0; 353 std::tie(BeginFID, BeginOffset) = Mgr.getDecomposedLoc(R.getBegin()); 354 355 FileID EndFID; 356 size_t EndOffset = 0; 357 std::tie(EndFID, EndOffset) = Mgr.getDecomposedLoc(R.getEnd()); 358 359 return BeginFID.isValid() && BeginFID == EndFID && BeginOffset <= EndOffset; 360 } 361 362 bool halfOpenRangeContains(const SourceManager &Mgr, SourceRange R, 363 SourceLocation L) { 364 assert(isValidFileRange(Mgr, R)); 365 366 FileID BeginFID; 367 size_t BeginOffset = 0; 368 std::tie(BeginFID, BeginOffset) = Mgr.getDecomposedLoc(R.getBegin()); 369 size_t EndOffset = Mgr.getFileOffset(R.getEnd()); 370 371 FileID LFid; 372 size_t LOffset; 373 std::tie(LFid, LOffset) = Mgr.getDecomposedLoc(L); 374 return BeginFID == LFid && BeginOffset <= LOffset && LOffset < EndOffset; 375 } 376 377 bool halfOpenRangeTouches(const SourceManager &Mgr, SourceRange R, 378 SourceLocation L) { 379 return L == R.getEnd() || halfOpenRangeContains(Mgr, R, L); 380 } 381 382 SourceLocation includeHashLoc(FileID IncludedFile, const SourceManager &SM) { 383 assert(SM.getLocForEndOfFile(IncludedFile).isFileID()); 384 FileID IncludingFile; 385 unsigned Offset; 386 std::tie(IncludingFile, Offset) = 387 SM.getDecomposedExpansionLoc(SM.getIncludeLoc(IncludedFile)); 388 bool Invalid = false; 389 llvm::StringRef Buf = SM.getBufferData(IncludingFile, &Invalid); 390 if (Invalid) 391 return SourceLocation(); 392 // Now buf is "...\n#include <foo>\n..." 393 // and Offset points here: ^ 394 // Rewind to the preceding # on the line. 395 assert(Offset < Buf.size()); 396 for (;; --Offset) { 397 if (Buf[Offset] == '#') 398 return SM.getComposedLoc(IncludingFile, Offset); 399 if (Buf[Offset] == '\n' || Offset == 0) // no hash, what's going on? 400 return SourceLocation(); 401 } 402 } 403 404 static unsigned getTokenLengthAtLoc(SourceLocation Loc, const SourceManager &SM, 405 const LangOptions &LangOpts) { 406 Token TheTok; 407 if (Lexer::getRawToken(Loc, TheTok, SM, LangOpts)) 408 return 0; 409 // FIXME: Here we check whether the token at the location is a greatergreater 410 // (>>) token and consider it as a single greater (>). This is to get it 411 // working for templates but it isn't correct for the right shift operator. We 412 // can avoid this by using half open char ranges in getFileRange() but getting 413 // token ending is not well supported in macroIDs. 414 if (TheTok.is(tok::greatergreater)) 415 return 1; 416 return TheTok.getLength(); 417 } 418 419 // Returns location of the last character of the token at a given loc 420 static SourceLocation getLocForTokenEnd(SourceLocation BeginLoc, 421 const SourceManager &SM, 422 const LangOptions &LangOpts) { 423 unsigned Len = getTokenLengthAtLoc(BeginLoc, SM, LangOpts); 424 return BeginLoc.getLocWithOffset(Len ? Len - 1 : 0); 425 } 426 427 // Returns location of the starting of the token at a given EndLoc 428 static SourceLocation getLocForTokenBegin(SourceLocation EndLoc, 429 const SourceManager &SM, 430 const LangOptions &LangOpts) { 431 return EndLoc.getLocWithOffset( 432 -(signed)getTokenLengthAtLoc(EndLoc, SM, LangOpts)); 433 } 434 435 // Converts a char source range to a token range. 436 static SourceRange toTokenRange(CharSourceRange Range, const SourceManager &SM, 437 const LangOptions &LangOpts) { 438 if (!Range.isTokenRange()) 439 Range.setEnd(getLocForTokenBegin(Range.getEnd(), SM, LangOpts)); 440 return Range.getAsRange(); 441 } 442 // Returns the union of two token ranges. 443 // To find the maximum of the Ends of the ranges, we compare the location of the 444 // last character of the token. 445 static SourceRange unionTokenRange(SourceRange R1, SourceRange R2, 446 const SourceManager &SM, 447 const LangOptions &LangOpts) { 448 SourceLocation Begin = 449 SM.isBeforeInTranslationUnit(R1.getBegin(), R2.getBegin()) 450 ? R1.getBegin() 451 : R2.getBegin(); 452 SourceLocation End = 453 SM.isBeforeInTranslationUnit(getLocForTokenEnd(R1.getEnd(), SM, LangOpts), 454 getLocForTokenEnd(R2.getEnd(), SM, LangOpts)) 455 ? R2.getEnd() 456 : R1.getEnd(); 457 return SourceRange(Begin, End); 458 } 459 460 // Given a range whose endpoints may be in different expansions or files, 461 // tries to find a range within a common file by following up the expansion and 462 // include location in each. 463 static SourceRange rangeInCommonFile(SourceRange R, const SourceManager &SM, 464 const LangOptions &LangOpts) { 465 // Fast path for most common cases. 466 if (SM.isWrittenInSameFile(R.getBegin(), R.getEnd())) 467 return R; 468 // Record the stack of expansion locations for the beginning, keyed by FileID. 469 llvm::DenseMap<FileID, SourceLocation> BeginExpansions; 470 for (SourceLocation Begin = R.getBegin(); Begin.isValid(); 471 Begin = Begin.isFileID() 472 ? includeHashLoc(SM.getFileID(Begin), SM) 473 : SM.getImmediateExpansionRange(Begin).getBegin()) { 474 BeginExpansions[SM.getFileID(Begin)] = Begin; 475 } 476 // Move up the stack of expansion locations for the end until we find the 477 // location in BeginExpansions with that has the same file id. 478 for (SourceLocation End = R.getEnd(); End.isValid(); 479 End = End.isFileID() ? includeHashLoc(SM.getFileID(End), SM) 480 : toTokenRange(SM.getImmediateExpansionRange(End), 481 SM, LangOpts) 482 .getEnd()) { 483 auto It = BeginExpansions.find(SM.getFileID(End)); 484 if (It != BeginExpansions.end()) { 485 if (SM.getFileOffset(It->second) > SM.getFileOffset(End)) 486 return SourceLocation(); 487 return {It->second, End}; 488 } 489 } 490 return SourceRange(); 491 } 492 493 // Find an expansion range (not necessarily immediate) the ends of which are in 494 // the same file id. 495 static SourceRange 496 getExpansionTokenRangeInSameFile(SourceLocation Loc, const SourceManager &SM, 497 const LangOptions &LangOpts) { 498 return rangeInCommonFile( 499 toTokenRange(SM.getImmediateExpansionRange(Loc), SM, LangOpts), SM, 500 LangOpts); 501 } 502 503 // Returns the file range for a given Location as a Token Range 504 // This is quite similar to getFileLoc in SourceManager as both use 505 // getImmediateExpansionRange and getImmediateSpellingLoc (for macro IDs). 506 // However: 507 // - We want to maintain the full range information as we move from one file to 508 // the next. getFileLoc only uses the BeginLoc of getImmediateExpansionRange. 509 // - We want to split '>>' tokens as the lexer parses the '>>' in nested 510 // template instantiations as a '>>' instead of two '>'s. 511 // There is also getExpansionRange but it simply calls 512 // getImmediateExpansionRange on the begin and ends separately which is wrong. 513 static SourceRange getTokenFileRange(SourceLocation Loc, 514 const SourceManager &SM, 515 const LangOptions &LangOpts) { 516 SourceRange FileRange = Loc; 517 while (!FileRange.getBegin().isFileID()) { 518 if (SM.isMacroArgExpansion(FileRange.getBegin())) { 519 FileRange = unionTokenRange( 520 SM.getImmediateSpellingLoc(FileRange.getBegin()), 521 SM.getImmediateSpellingLoc(FileRange.getEnd()), SM, LangOpts); 522 assert(SM.isWrittenInSameFile(FileRange.getBegin(), FileRange.getEnd())); 523 } else { 524 SourceRange ExpansionRangeForBegin = 525 getExpansionTokenRangeInSameFile(FileRange.getBegin(), SM, LangOpts); 526 SourceRange ExpansionRangeForEnd = 527 getExpansionTokenRangeInSameFile(FileRange.getEnd(), SM, LangOpts); 528 if (ExpansionRangeForBegin.isInvalid() || 529 ExpansionRangeForEnd.isInvalid()) 530 return SourceRange(); 531 assert(SM.isWrittenInSameFile(ExpansionRangeForBegin.getBegin(), 532 ExpansionRangeForEnd.getBegin()) && 533 "Both Expansion ranges should be in same file."); 534 FileRange = unionTokenRange(ExpansionRangeForBegin, ExpansionRangeForEnd, 535 SM, LangOpts); 536 } 537 } 538 return FileRange; 539 } 540 541 bool isInsideMainFile(SourceLocation Loc, const SourceManager &SM) { 542 return Loc.isValid() && SM.isWrittenInMainFile(SM.getExpansionLoc(Loc)); 543 } 544 545 llvm::Optional<SourceRange> toHalfOpenFileRange(const SourceManager &SM, 546 const LangOptions &LangOpts, 547 SourceRange R) { 548 SourceRange R1 = getTokenFileRange(R.getBegin(), SM, LangOpts); 549 if (!isValidFileRange(SM, R1)) 550 return llvm::None; 551 552 SourceRange R2 = getTokenFileRange(R.getEnd(), SM, LangOpts); 553 if (!isValidFileRange(SM, R2)) 554 return llvm::None; 555 556 SourceRange Result = 557 rangeInCommonFile(unionTokenRange(R1, R2, SM, LangOpts), SM, LangOpts); 558 unsigned TokLen = getTokenLengthAtLoc(Result.getEnd(), SM, LangOpts); 559 // Convert from closed token range to half-open (char) range 560 Result.setEnd(Result.getEnd().getLocWithOffset(TokLen)); 561 if (!isValidFileRange(SM, Result)) 562 return llvm::None; 563 564 return Result; 565 } 566 567 llvm::StringRef toSourceCode(const SourceManager &SM, SourceRange R) { 568 assert(isValidFileRange(SM, R)); 569 bool Invalid = false; 570 auto *Buf = SM.getBuffer(SM.getFileID(R.getBegin()), &Invalid); 571 assert(!Invalid); 572 573 size_t BeginOffset = SM.getFileOffset(R.getBegin()); 574 size_t EndOffset = SM.getFileOffset(R.getEnd()); 575 return Buf->getBuffer().substr(BeginOffset, EndOffset - BeginOffset); 576 } 577 578 llvm::Expected<SourceLocation> sourceLocationInMainFile(const SourceManager &SM, 579 Position P) { 580 llvm::StringRef Code = SM.getBuffer(SM.getMainFileID())->getBuffer(); 581 auto Offset = 582 positionToOffset(Code, P, /*AllowColumnBeyondLineLength=*/false); 583 if (!Offset) 584 return Offset.takeError(); 585 return SM.getLocForStartOfFile(SM.getMainFileID()).getLocWithOffset(*Offset); 586 } 587 588 Range halfOpenToRange(const SourceManager &SM, CharSourceRange R) { 589 // Clang is 1-based, LSP uses 0-based indexes. 590 Position Begin = sourceLocToPosition(SM, R.getBegin()); 591 Position End = sourceLocToPosition(SM, R.getEnd()); 592 593 return {Begin, End}; 594 } 595 596 std::pair<size_t, size_t> offsetToClangLineColumn(llvm::StringRef Code, 597 size_t Offset) { 598 Offset = std::min(Code.size(), Offset); 599 llvm::StringRef Before = Code.substr(0, Offset); 600 int Lines = Before.count('\n'); 601 size_t PrevNL = Before.rfind('\n'); 602 size_t StartOfLine = (PrevNL == llvm::StringRef::npos) ? 0 : (PrevNL + 1); 603 return {Lines + 1, Offset - StartOfLine + 1}; 604 } 605 606 std::pair<StringRef, StringRef> splitQualifiedName(StringRef QName) { 607 size_t Pos = QName.rfind("::"); 608 if (Pos == llvm::StringRef::npos) 609 return {llvm::StringRef(), QName}; 610 return {QName.substr(0, Pos + 2), QName.substr(Pos + 2)}; 611 } 612 613 TextEdit replacementToEdit(llvm::StringRef Code, 614 const tooling::Replacement &R) { 615 Range ReplacementRange = { 616 offsetToPosition(Code, R.getOffset()), 617 offsetToPosition(Code, R.getOffset() + R.getLength())}; 618 return {ReplacementRange, R.getReplacementText()}; 619 } 620 621 std::vector<TextEdit> replacementsToEdits(llvm::StringRef Code, 622 const tooling::Replacements &Repls) { 623 std::vector<TextEdit> Edits; 624 for (const auto &R : Repls) 625 Edits.push_back(replacementToEdit(Code, R)); 626 return Edits; 627 } 628 629 llvm::Optional<std::string> getCanonicalPath(const FileEntry *F, 630 const SourceManager &SourceMgr) { 631 if (!F) 632 return None; 633 634 llvm::SmallString<128> FilePath = F->getName(); 635 if (!llvm::sys::path::is_absolute(FilePath)) { 636 if (auto EC = 637 SourceMgr.getFileManager().getVirtualFileSystem().makeAbsolute( 638 FilePath)) { 639 elog("Could not turn relative path '{0}' to absolute: {1}", FilePath, 640 EC.message()); 641 return None; 642 } 643 } 644 645 // Handle the symbolic link path case where the current working directory 646 // (getCurrentWorkingDirectory) is a symlink. We always want to the real 647 // file path (instead of the symlink path) for the C++ symbols. 648 // 649 // Consider the following example: 650 // 651 // src dir: /project/src/foo.h 652 // current working directory (symlink): /tmp/build -> /project/src/ 653 // 654 // The file path of Symbol is "/project/src/foo.h" instead of 655 // "/tmp/build/foo.h" 656 if (auto Dir = SourceMgr.getFileManager().getDirectory( 657 llvm::sys::path::parent_path(FilePath))) { 658 llvm::SmallString<128> RealPath; 659 llvm::StringRef DirName = SourceMgr.getFileManager().getCanonicalName(*Dir); 660 llvm::sys::path::append(RealPath, DirName, 661 llvm::sys::path::filename(FilePath)); 662 return RealPath.str().str(); 663 } 664 665 return FilePath.str().str(); 666 } 667 668 TextEdit toTextEdit(const FixItHint &FixIt, const SourceManager &M, 669 const LangOptions &L) { 670 TextEdit Result; 671 Result.range = 672 halfOpenToRange(M, Lexer::makeFileCharRange(FixIt.RemoveRange, M, L)); 673 Result.newText = FixIt.CodeToInsert; 674 return Result; 675 } 676 677 bool isRangeConsecutive(const Range &Left, const Range &Right) { 678 return Left.end.line == Right.start.line && 679 Left.end.character == Right.start.character; 680 } 681 682 FileDigest digest(llvm::StringRef Content) { 683 uint64_t Hash{llvm::xxHash64(Content)}; 684 FileDigest Result; 685 for (unsigned I = 0; I < Result.size(); ++I) { 686 Result[I] = uint8_t(Hash); 687 Hash >>= 8; 688 } 689 return Result; 690 } 691 692 llvm::Optional<FileDigest> digestFile(const SourceManager &SM, FileID FID) { 693 bool Invalid = false; 694 llvm::StringRef Content = SM.getBufferData(FID, &Invalid); 695 if (Invalid) 696 return None; 697 return digest(Content); 698 } 699 700 format::FormatStyle getFormatStyleForFile(llvm::StringRef File, 701 llvm::StringRef Content, 702 llvm::vfs::FileSystem *FS) { 703 auto Style = format::getStyle(format::DefaultFormatStyle, File, 704 format::DefaultFallbackStyle, Content, FS); 705 if (!Style) { 706 log("getStyle() failed for file {0}: {1}. Fallback is LLVM style.", File, 707 Style.takeError()); 708 Style = format::getLLVMStyle(); 709 } 710 return *Style; 711 } 712 713 llvm::Expected<tooling::Replacements> 714 cleanupAndFormat(StringRef Code, const tooling::Replacements &Replaces, 715 const format::FormatStyle &Style) { 716 auto CleanReplaces = cleanupAroundReplacements(Code, Replaces, Style); 717 if (!CleanReplaces) 718 return CleanReplaces; 719 return formatReplacements(Code, std::move(*CleanReplaces), Style); 720 } 721 722 void lex(llvm::StringRef Code, const format::FormatStyle &Style, 723 llvm::function_ref<void(const clang::Token &, Position)> Action) { 724 // FIXME: InMemoryFileAdapter crashes unless the buffer is null terminated! 725 std::string NullTerminatedCode = Code.str(); 726 SourceManagerForFile FileSM("dummy.cpp", NullTerminatedCode); 727 auto &SM = FileSM.get(); 728 auto FID = SM.getMainFileID(); 729 Lexer Lex(FID, SM.getBuffer(FID), SM, format::getFormattingLangOpts(Style)); 730 Token Tok; 731 732 while (!Lex.LexFromRawLexer(Tok)) 733 Action(Tok, sourceLocToPosition(SM, Tok.getLocation())); 734 // LexFromRawLexer returns true after it lexes last token, so we still have 735 // one more token to report. 736 Action(Tok, sourceLocToPosition(SM, Tok.getLocation())); 737 } 738 739 llvm::StringMap<unsigned> collectIdentifiers(llvm::StringRef Content, 740 const format::FormatStyle &Style) { 741 llvm::StringMap<unsigned> Identifiers; 742 lex(Content, Style, [&](const clang::Token &Tok, Position) { 743 switch (Tok.getKind()) { 744 case tok::identifier: 745 ++Identifiers[Tok.getIdentifierInfo()->getName()]; 746 break; 747 case tok::raw_identifier: 748 ++Identifiers[Tok.getRawIdentifier()]; 749 break; 750 default: 751 break; 752 } 753 }); 754 return Identifiers; 755 } 756 757 namespace { 758 struct NamespaceEvent { 759 enum { 760 BeginNamespace, // namespace <ns> {. Payload is resolved <ns>. 761 EndNamespace, // } // namespace <ns>. Payload is resolved *outer* 762 // namespace. 763 UsingDirective // using namespace <ns>. Payload is unresolved <ns>. 764 } Trigger; 765 std::string Payload; 766 Position Pos; 767 }; 768 // Scans C++ source code for constructs that change the visible namespaces. 769 void parseNamespaceEvents(llvm::StringRef Code, 770 const format::FormatStyle &Style, 771 llvm::function_ref<void(NamespaceEvent)> Callback) { 772 773 // Stack of enclosing namespaces, e.g. {"clang", "clangd"} 774 std::vector<std::string> Enclosing; // Contains e.g. "clang", "clangd" 775 // Stack counts open braces. true if the brace opened a namespace. 776 std::vector<bool> BraceStack; 777 778 enum { 779 Default, 780 Namespace, // just saw 'namespace' 781 NamespaceName, // just saw 'namespace' NSName 782 Using, // just saw 'using' 783 UsingNamespace, // just saw 'using namespace' 784 UsingNamespaceName, // just saw 'using namespace' NSName 785 } State = Default; 786 std::string NSName; 787 788 NamespaceEvent Event; 789 lex(Code, Style, [&](const clang::Token &Tok, Position P) { 790 Event.Pos = std::move(P); 791 switch (Tok.getKind()) { 792 case tok::raw_identifier: 793 // In raw mode, this could be a keyword or a name. 794 switch (State) { 795 case UsingNamespace: 796 case UsingNamespaceName: 797 NSName.append(Tok.getRawIdentifier()); 798 State = UsingNamespaceName; 799 break; 800 case Namespace: 801 case NamespaceName: 802 NSName.append(Tok.getRawIdentifier()); 803 State = NamespaceName; 804 break; 805 case Using: 806 State = 807 (Tok.getRawIdentifier() == "namespace") ? UsingNamespace : Default; 808 break; 809 case Default: 810 NSName.clear(); 811 if (Tok.getRawIdentifier() == "namespace") 812 State = Namespace; 813 else if (Tok.getRawIdentifier() == "using") 814 State = Using; 815 break; 816 } 817 break; 818 case tok::coloncolon: 819 // This can come at the beginning or in the middle of a namespace name. 820 switch (State) { 821 case UsingNamespace: 822 case UsingNamespaceName: 823 NSName.append("::"); 824 State = UsingNamespaceName; 825 break; 826 case NamespaceName: 827 NSName.append("::"); 828 State = NamespaceName; 829 break; 830 case Namespace: // Not legal here. 831 case Using: 832 case Default: 833 State = Default; 834 break; 835 } 836 break; 837 case tok::l_brace: 838 // Record which { started a namespace, so we know when } ends one. 839 if (State == NamespaceName) { 840 // Parsed: namespace <name> { 841 BraceStack.push_back(true); 842 Enclosing.push_back(NSName); 843 Event.Trigger = NamespaceEvent::BeginNamespace; 844 Event.Payload = llvm::join(Enclosing, "::"); 845 Callback(Event); 846 } else { 847 // This case includes anonymous namespaces (State = Namespace). 848 // For our purposes, they're not namespaces and we ignore them. 849 BraceStack.push_back(false); 850 } 851 State = Default; 852 break; 853 case tok::r_brace: 854 // If braces are unmatched, we're going to be confused, but don't crash. 855 if (!BraceStack.empty()) { 856 if (BraceStack.back()) { 857 // Parsed: } // namespace 858 Enclosing.pop_back(); 859 Event.Trigger = NamespaceEvent::EndNamespace; 860 Event.Payload = llvm::join(Enclosing, "::"); 861 Callback(Event); 862 } 863 BraceStack.pop_back(); 864 } 865 break; 866 case tok::semi: 867 if (State == UsingNamespaceName) { 868 // Parsed: using namespace <name> ; 869 Event.Trigger = NamespaceEvent::UsingDirective; 870 Event.Payload = std::move(NSName); 871 Callback(Event); 872 } 873 State = Default; 874 break; 875 default: 876 State = Default; 877 break; 878 } 879 }); 880 } 881 882 // Returns the prefix namespaces of NS: {"" ... NS}. 883 llvm::SmallVector<llvm::StringRef, 8> ancestorNamespaces(llvm::StringRef NS) { 884 llvm::SmallVector<llvm::StringRef, 8> Results; 885 Results.push_back(NS.take_front(0)); 886 NS.split(Results, "::", /*MaxSplit=*/-1, /*KeepEmpty=*/false); 887 for (llvm::StringRef &R : Results) 888 R = NS.take_front(R.end() - NS.begin()); 889 return Results; 890 } 891 892 } // namespace 893 894 std::vector<std::string> visibleNamespaces(llvm::StringRef Code, 895 const format::FormatStyle &Style) { 896 std::string Current; 897 // Map from namespace to (resolved) namespaces introduced via using directive. 898 llvm::StringMap<llvm::StringSet<>> UsingDirectives; 899 900 parseNamespaceEvents(Code, Style, [&](NamespaceEvent Event) { 901 llvm::StringRef NS = Event.Payload; 902 switch (Event.Trigger) { 903 case NamespaceEvent::BeginNamespace: 904 case NamespaceEvent::EndNamespace: 905 Current = std::move(Event.Payload); 906 break; 907 case NamespaceEvent::UsingDirective: 908 if (NS.consume_front("::")) 909 UsingDirectives[Current].insert(NS); 910 else { 911 for (llvm::StringRef Enclosing : ancestorNamespaces(Current)) { 912 if (Enclosing.empty()) 913 UsingDirectives[Current].insert(NS); 914 else 915 UsingDirectives[Current].insert((Enclosing + "::" + NS).str()); 916 } 917 } 918 break; 919 } 920 }); 921 922 std::vector<std::string> Found; 923 for (llvm::StringRef Enclosing : ancestorNamespaces(Current)) { 924 Found.push_back(Enclosing); 925 auto It = UsingDirectives.find(Enclosing); 926 if (It != UsingDirectives.end()) 927 for (const auto &Used : It->second) 928 Found.push_back(Used.getKey()); 929 } 930 931 llvm::sort(Found, [&](const std::string &LHS, const std::string &RHS) { 932 if (Current == RHS) 933 return false; 934 if (Current == LHS) 935 return true; 936 return LHS < RHS; 937 }); 938 Found.erase(std::unique(Found.begin(), Found.end()), Found.end()); 939 return Found; 940 } 941 942 llvm::StringSet<> collectWords(llvm::StringRef Content) { 943 // We assume short words are not significant. 944 // We may want to consider other stopwords, e.g. language keywords. 945 // (A very naive implementation showed no benefit, but lexing might do better) 946 static constexpr int MinWordLength = 4; 947 948 std::vector<CharRole> Roles(Content.size()); 949 calculateRoles(Content, Roles); 950 951 llvm::StringSet<> Result; 952 llvm::SmallString<256> Word; 953 auto Flush = [&] { 954 if (Word.size() >= MinWordLength) { 955 for (char &C : Word) 956 C = llvm::toLower(C); 957 Result.insert(Word); 958 } 959 Word.clear(); 960 }; 961 for (unsigned I = 0; I < Content.size(); ++I) { 962 switch (Roles[I]) { 963 case Head: 964 Flush(); 965 LLVM_FALLTHROUGH; 966 case Tail: 967 Word.push_back(Content[I]); 968 break; 969 case Unknown: 970 case Separator: 971 Flush(); 972 break; 973 } 974 } 975 Flush(); 976 977 return Result; 978 } 979 980 llvm::Optional<DefinedMacro> locateMacroAt(SourceLocation Loc, 981 Preprocessor &PP) { 982 const auto &SM = PP.getSourceManager(); 983 const auto &LangOpts = PP.getLangOpts(); 984 Token Result; 985 if (Lexer::getRawToken(SM.getSpellingLoc(Loc), Result, SM, LangOpts, false)) 986 return None; 987 if (Result.is(tok::raw_identifier)) 988 PP.LookUpIdentifierInfo(Result); 989 IdentifierInfo *IdentifierInfo = Result.getIdentifierInfo(); 990 if (!IdentifierInfo || !IdentifierInfo->hadMacroDefinition()) 991 return None; 992 993 std::pair<FileID, unsigned int> DecLoc = SM.getDecomposedExpansionLoc(Loc); 994 // Get the definition just before the searched location so that a macro 995 // referenced in a '#undef MACRO' can still be found. 996 SourceLocation BeforeSearchedLocation = 997 SM.getMacroArgExpandedLocation(SM.getLocForStartOfFile(DecLoc.first) 998 .getLocWithOffset(DecLoc.second - 1)); 999 MacroDefinition MacroDef = 1000 PP.getMacroDefinitionAtLoc(IdentifierInfo, BeforeSearchedLocation); 1001 if (auto *MI = MacroDef.getMacroInfo()) 1002 return DefinedMacro{IdentifierInfo->getName(), MI}; 1003 return None; 1004 } 1005 1006 llvm::Expected<std::string> Edit::apply() const { 1007 return tooling::applyAllReplacements(InitialCode, Replacements); 1008 } 1009 1010 std::vector<TextEdit> Edit::asTextEdits() const { 1011 return replacementsToEdits(InitialCode, Replacements); 1012 } 1013 1014 bool Edit::canApplyTo(llvm::StringRef Code) const { 1015 // Create line iterators, since line numbers are important while applying our 1016 // edit we cannot skip blank lines. 1017 auto LHS = llvm::MemoryBuffer::getMemBuffer(Code); 1018 llvm::line_iterator LHSIt(*LHS, /*SkipBlanks=*/false); 1019 1020 auto RHS = llvm::MemoryBuffer::getMemBuffer(InitialCode); 1021 llvm::line_iterator RHSIt(*RHS, /*SkipBlanks=*/false); 1022 1023 // Compare the InitialCode we prepared the edit for with the Code we received 1024 // line by line to make sure there are no differences. 1025 // FIXME: This check is too conservative now, it should be enough to only 1026 // check lines around the replacements contained inside the Edit. 1027 while (!LHSIt.is_at_eof() && !RHSIt.is_at_eof()) { 1028 if (*LHSIt != *RHSIt) 1029 return false; 1030 ++LHSIt; 1031 ++RHSIt; 1032 } 1033 1034 // After we reach EOF for any of the files we make sure the other one doesn't 1035 // contain any additional content except empty lines, they should not 1036 // interfere with the edit we produced. 1037 while (!LHSIt.is_at_eof()) { 1038 if (!LHSIt->empty()) 1039 return false; 1040 ++LHSIt; 1041 } 1042 while (!RHSIt.is_at_eof()) { 1043 if (!RHSIt->empty()) 1044 return false; 1045 ++RHSIt; 1046 } 1047 return true; 1048 } 1049 1050 llvm::Error reformatEdit(Edit &E, const format::FormatStyle &Style) { 1051 if (auto NewEdits = cleanupAndFormat(E.InitialCode, E.Replacements, Style)) 1052 E.Replacements = std::move(*NewEdits); 1053 else 1054 return NewEdits.takeError(); 1055 return llvm::Error::success(); 1056 } 1057 1058 EligibleRegion getEligiblePoints(llvm::StringRef Code, 1059 llvm::StringRef FullyQualifiedName, 1060 const format::FormatStyle &Style) { 1061 EligibleRegion ER; 1062 // Start with global namespace. 1063 std::vector<std::string> Enclosing = {""}; 1064 // FIXME: In addition to namespaces try to generate events for function 1065 // definitions as well. One might use a closing parantheses(")" followed by an 1066 // opening brace "{" to trigger the start. 1067 parseNamespaceEvents(Code, Style, [&](NamespaceEvent Event) { 1068 // Using Directives only introduces declarations to current scope, they do 1069 // not change the current namespace, so skip them. 1070 if (Event.Trigger == NamespaceEvent::UsingDirective) 1071 return; 1072 // Do not qualify the global namespace. 1073 if (!Event.Payload.empty()) 1074 Event.Payload.append("::"); 1075 1076 std::string CurrentNamespace; 1077 if (Event.Trigger == NamespaceEvent::BeginNamespace) { 1078 Enclosing.emplace_back(std::move(Event.Payload)); 1079 CurrentNamespace = Enclosing.back(); 1080 // parseNameSpaceEvents reports the beginning position of a token; we want 1081 // to insert after '{', so increment by one. 1082 ++Event.Pos.character; 1083 } else { 1084 // Event.Payload points to outer namespace when exiting a scope, so use 1085 // the namespace we've last entered instead. 1086 CurrentNamespace = std::move(Enclosing.back()); 1087 Enclosing.pop_back(); 1088 assert(Enclosing.back() == Event.Payload); 1089 } 1090 1091 // Ignore namespaces that are not a prefix of the target. 1092 if (!FullyQualifiedName.startswith(CurrentNamespace)) 1093 return; 1094 1095 // Prefer the namespace that shares the longest prefix with target. 1096 if (CurrentNamespace.size() > ER.EnclosingNamespace.size()) { 1097 ER.EligiblePoints.clear(); 1098 ER.EnclosingNamespace = CurrentNamespace; 1099 } 1100 if (CurrentNamespace.size() == ER.EnclosingNamespace.size()) 1101 ER.EligiblePoints.emplace_back(std::move(Event.Pos)); 1102 }); 1103 // If there were no shared namespaces just return EOF. 1104 if (ER.EligiblePoints.empty()) { 1105 assert(ER.EnclosingNamespace.empty()); 1106 ER.EligiblePoints.emplace_back(offsetToPosition(Code, Code.size())); 1107 } 1108 return ER; 1109 } 1110 1111 } // namespace clangd 1112 } // namespace clang 1113