1 //===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// This file implements classes for searching and analyzing source code clones. 10 /// 11 //===----------------------------------------------------------------------===// 12 13 #include "clang/Analysis/CloneDetection.h" 14 15 #include "clang/AST/DataCollection.h" 16 #include "clang/AST/DeclTemplate.h" 17 #include "llvm/Support/MD5.h" 18 #include "llvm/Support/Path.h" 19 20 using namespace clang; 21 22 StmtSequence::StmtSequence(const CompoundStmt *Stmt, const Decl *D, 23 unsigned StartIndex, unsigned EndIndex) 24 : S(Stmt), D(D), StartIndex(StartIndex), EndIndex(EndIndex) { 25 assert(Stmt && "Stmt must not be a nullptr"); 26 assert(StartIndex < EndIndex && "Given array should not be empty"); 27 assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt"); 28 } 29 30 StmtSequence::StmtSequence(const Stmt *Stmt, const Decl *D) 31 : S(Stmt), D(D), StartIndex(0), EndIndex(0) {} 32 33 StmtSequence::StmtSequence() 34 : S(nullptr), D(nullptr), StartIndex(0), EndIndex(0) {} 35 36 bool StmtSequence::contains(const StmtSequence &Other) const { 37 // If both sequences reside in different declarations, they can never contain 38 // each other. 39 if (D != Other.D) 40 return false; 41 42 const SourceManager &SM = getASTContext().getSourceManager(); 43 44 // Otherwise check if the start and end locations of the current sequence 45 // surround the other sequence. 46 bool StartIsInBounds = 47 SM.isBeforeInTranslationUnit(getBeginLoc(), Other.getBeginLoc()) || 48 getBeginLoc() == Other.getBeginLoc(); 49 if (!StartIsInBounds) 50 return false; 51 52 bool EndIsInBounds = 53 SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) || 54 Other.getEndLoc() == getEndLoc(); 55 return EndIsInBounds; 56 } 57 58 StmtSequence::iterator StmtSequence::begin() const { 59 if (!holdsSequence()) { 60 return &S; 61 } 62 auto CS = cast<CompoundStmt>(S); 63 return CS->body_begin() + StartIndex; 64 } 65 66 StmtSequence::iterator StmtSequence::end() const { 67 if (!holdsSequence()) { 68 return reinterpret_cast<StmtSequence::iterator>(&S) + 1; 69 } 70 auto CS = cast<CompoundStmt>(S); 71 return CS->body_begin() + EndIndex; 72 } 73 74 ASTContext &StmtSequence::getASTContext() const { 75 assert(D); 76 return D->getASTContext(); 77 } 78 79 SourceLocation StmtSequence::getBeginLoc() const { 80 return front()->getBeginLoc(); 81 } 82 83 SourceLocation StmtSequence::getEndLoc() const { return back()->getEndLoc(); } 84 85 SourceRange StmtSequence::getSourceRange() const { 86 return SourceRange(getBeginLoc(), getEndLoc()); 87 } 88 89 void CloneDetector::analyzeCodeBody(const Decl *D) { 90 assert(D); 91 assert(D->hasBody()); 92 93 Sequences.push_back(StmtSequence(D->getBody(), D)); 94 } 95 96 /// Returns true if and only if \p Stmt contains at least one other 97 /// sequence in the \p Group. 98 static bool containsAnyInGroup(StmtSequence &Seq, 99 CloneDetector::CloneGroup &Group) { 100 for (StmtSequence &GroupSeq : Group) { 101 if (Seq.contains(GroupSeq)) 102 return true; 103 } 104 return false; 105 } 106 107 /// Returns true if and only if all sequences in \p OtherGroup are 108 /// contained by a sequence in \p Group. 109 static bool containsGroup(CloneDetector::CloneGroup &Group, 110 CloneDetector::CloneGroup &OtherGroup) { 111 // We have less sequences in the current group than we have in the other, 112 // so we will never fulfill the requirement for returning true. This is only 113 // possible because we know that a sequence in Group can contain at most 114 // one sequence in OtherGroup. 115 if (Group.size() < OtherGroup.size()) 116 return false; 117 118 for (StmtSequence &Stmt : Group) { 119 if (!containsAnyInGroup(Stmt, OtherGroup)) 120 return false; 121 } 122 return true; 123 } 124 125 void OnlyLargestCloneConstraint::constrain( 126 std::vector<CloneDetector::CloneGroup> &Result) { 127 std::vector<unsigned> IndexesToRemove; 128 129 // Compare every group in the result with the rest. If one groups contains 130 // another group, we only need to return the bigger group. 131 // Note: This doesn't scale well, so if possible avoid calling any heavy 132 // function from this loop to minimize the performance impact. 133 for (unsigned i = 0; i < Result.size(); ++i) { 134 for (unsigned j = 0; j < Result.size(); ++j) { 135 // Don't compare a group with itself. 136 if (i == j) 137 continue; 138 139 if (containsGroup(Result[j], Result[i])) { 140 IndexesToRemove.push_back(i); 141 break; 142 } 143 } 144 } 145 146 // Erasing a list of indexes from the vector should be done with decreasing 147 // indexes. As IndexesToRemove is constructed with increasing values, we just 148 // reverse iterate over it to get the desired order. 149 for (auto I = IndexesToRemove.rbegin(); I != IndexesToRemove.rend(); ++I) { 150 Result.erase(Result.begin() + *I); 151 } 152 } 153 154 bool FilenamePatternConstraint::isAutoGenerated( 155 const CloneDetector::CloneGroup &Group) { 156 std::string Error; 157 if (IgnoredFilesPattern.empty() || Group.empty() || 158 !IgnoredFilesRegex->isValid(Error)) 159 return false; 160 161 for (const StmtSequence &S : Group) { 162 const SourceManager &SM = S.getASTContext().getSourceManager(); 163 StringRef Filename = llvm::sys::path::filename( 164 SM.getFilename(S.getContainingDecl()->getLocation())); 165 if (IgnoredFilesRegex->match(Filename)) 166 return true; 167 } 168 169 return false; 170 } 171 172 /// This class defines what a type II code clone is: If it collects for two 173 /// statements the same data, then those two statements are considered to be 174 /// clones of each other. 175 /// 176 /// All collected data is forwarded to the given data consumer of the type T. 177 /// The data consumer class needs to provide a member method with the signature: 178 /// update(StringRef Str) 179 namespace { 180 template <class T> 181 class CloneTypeIIStmtDataCollector 182 : public ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>> { 183 ASTContext &Context; 184 /// The data sink to which all data is forwarded. 185 T &DataConsumer; 186 187 template <class Ty> void addData(const Ty &Data) { 188 data_collection::addDataToConsumer(DataConsumer, Data); 189 } 190 191 public: 192 CloneTypeIIStmtDataCollector(const Stmt *S, ASTContext &Context, 193 T &DataConsumer) 194 : Context(Context), DataConsumer(DataConsumer) { 195 this->Visit(S); 196 } 197 198 // Define a visit method for each class to collect data and subsequently visit 199 // all parent classes. This uses a template so that custom visit methods by us 200 // take precedence. 201 #define DEF_ADD_DATA(CLASS, CODE) \ 202 template <class = void> void Visit##CLASS(const CLASS *S) { \ 203 CODE; \ 204 ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>::Visit##CLASS(S); \ 205 } 206 207 #include "clang/AST/StmtDataCollectors.inc" 208 209 // Type II clones ignore variable names and literals, so let's skip them. 210 #define SKIP(CLASS) \ 211 void Visit##CLASS(const CLASS *S) { \ 212 ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>::Visit##CLASS(S); \ 213 } 214 SKIP(DeclRefExpr) 215 SKIP(MemberExpr) 216 SKIP(IntegerLiteral) 217 SKIP(FloatingLiteral) 218 SKIP(StringLiteral) 219 SKIP(CXXBoolLiteralExpr) 220 SKIP(CharacterLiteral) 221 #undef SKIP 222 }; 223 } // end anonymous namespace 224 225 static size_t createHash(llvm::MD5 &Hash) { 226 size_t HashCode; 227 228 // Create the final hash code for the current Stmt. 229 llvm::MD5::MD5Result HashResult; 230 Hash.final(HashResult); 231 232 // Copy as much as possible of the generated hash code to the Stmt's hash 233 // code. 234 std::memcpy(&HashCode, &HashResult, 235 std::min(sizeof(HashCode), sizeof(HashResult))); 236 237 return HashCode; 238 } 239 240 /// Generates and saves a hash code for the given Stmt. 241 /// \param S The given Stmt. 242 /// \param D The Decl containing S. 243 /// \param StmtsByHash Output parameter that will contain the hash codes for 244 /// each StmtSequence in the given Stmt. 245 /// \return The hash code of the given Stmt. 246 /// 247 /// If the given Stmt is a CompoundStmt, this method will also generate 248 /// hashes for all possible StmtSequences in the children of this Stmt. 249 static size_t 250 saveHash(const Stmt *S, const Decl *D, 251 std::vector<std::pair<size_t, StmtSequence>> &StmtsByHash) { 252 llvm::MD5 Hash; 253 ASTContext &Context = D->getASTContext(); 254 255 CloneTypeIIStmtDataCollector<llvm::MD5>(S, Context, Hash); 256 257 auto CS = dyn_cast<CompoundStmt>(S); 258 SmallVector<size_t, 8> ChildHashes; 259 260 for (const Stmt *Child : S->children()) { 261 if (Child == nullptr) { 262 ChildHashes.push_back(0); 263 continue; 264 } 265 size_t ChildHash = saveHash(Child, D, StmtsByHash); 266 Hash.update( 267 StringRef(reinterpret_cast<char *>(&ChildHash), sizeof(ChildHash))); 268 ChildHashes.push_back(ChildHash); 269 } 270 271 if (CS) { 272 // If we're in a CompoundStmt, we hash all possible combinations of child 273 // statements to find clones in those subsequences. 274 // We first go through every possible starting position of a subsequence. 275 for (unsigned Pos = 0; Pos < CS->size(); ++Pos) { 276 // Then we try all possible lengths this subsequence could have and 277 // reuse the same hash object to make sure we only hash every child 278 // hash exactly once. 279 llvm::MD5 Hash; 280 for (unsigned Length = 1; Length <= CS->size() - Pos; ++Length) { 281 // Grab the current child hash and put it into our hash. We do 282 // -1 on the index because we start counting the length at 1. 283 size_t ChildHash = ChildHashes[Pos + Length - 1]; 284 Hash.update( 285 StringRef(reinterpret_cast<char *>(&ChildHash), sizeof(ChildHash))); 286 // If we have at least two elements in our subsequence, we can start 287 // saving it. 288 if (Length > 1) { 289 llvm::MD5 SubHash = Hash; 290 StmtsByHash.push_back(std::make_pair( 291 createHash(SubHash), StmtSequence(CS, D, Pos, Pos + Length))); 292 } 293 } 294 } 295 } 296 297 size_t HashCode = createHash(Hash); 298 StmtsByHash.push_back(std::make_pair(HashCode, StmtSequence(S, D))); 299 return HashCode; 300 } 301 302 namespace { 303 /// Wrapper around FoldingSetNodeID that it can be used as the template 304 /// argument of the StmtDataCollector. 305 class FoldingSetNodeIDWrapper { 306 307 llvm::FoldingSetNodeID &FS; 308 309 public: 310 FoldingSetNodeIDWrapper(llvm::FoldingSetNodeID &FS) : FS(FS) {} 311 312 void update(StringRef Str) { FS.AddString(Str); } 313 }; 314 } // end anonymous namespace 315 316 /// Writes the relevant data from all statements and child statements 317 /// in the given StmtSequence into the given FoldingSetNodeID. 318 static void CollectStmtSequenceData(const StmtSequence &Sequence, 319 FoldingSetNodeIDWrapper &OutputData) { 320 for (const Stmt *S : Sequence) { 321 CloneTypeIIStmtDataCollector<FoldingSetNodeIDWrapper>( 322 S, Sequence.getASTContext(), OutputData); 323 324 for (const Stmt *Child : S->children()) { 325 if (!Child) 326 continue; 327 328 CollectStmtSequenceData(StmtSequence(Child, Sequence.getContainingDecl()), 329 OutputData); 330 } 331 } 332 } 333 334 /// Returns true if both sequences are clones of each other. 335 static bool areSequencesClones(const StmtSequence &LHS, 336 const StmtSequence &RHS) { 337 // We collect the data from all statements in the sequence as we did before 338 // when generating a hash value for each sequence. But this time we don't 339 // hash the collected data and compare the whole data set instead. This 340 // prevents any false-positives due to hash code collisions. 341 llvm::FoldingSetNodeID DataLHS, DataRHS; 342 FoldingSetNodeIDWrapper LHSWrapper(DataLHS); 343 FoldingSetNodeIDWrapper RHSWrapper(DataRHS); 344 345 CollectStmtSequenceData(LHS, LHSWrapper); 346 CollectStmtSequenceData(RHS, RHSWrapper); 347 348 return DataLHS == DataRHS; 349 } 350 351 void RecursiveCloneTypeIIHashConstraint::constrain( 352 std::vector<CloneDetector::CloneGroup> &Sequences) { 353 // FIXME: Maybe we can do this in-place and don't need this additional vector. 354 std::vector<CloneDetector::CloneGroup> Result; 355 356 for (CloneDetector::CloneGroup &Group : Sequences) { 357 // We assume in the following code that the Group is non-empty, so we 358 // skip all empty groups. 359 if (Group.empty()) 360 continue; 361 362 std::vector<std::pair<size_t, StmtSequence>> StmtsByHash; 363 364 // Generate hash codes for all children of S and save them in StmtsByHash. 365 for (const StmtSequence &S : Group) { 366 saveHash(S.front(), S.getContainingDecl(), StmtsByHash); 367 } 368 369 // Sort hash_codes in StmtsByHash. 370 std::stable_sort(StmtsByHash.begin(), StmtsByHash.end(), 371 [](std::pair<size_t, StmtSequence> LHS, 372 std::pair<size_t, StmtSequence> RHS) { 373 return LHS.first < RHS.first; 374 }); 375 376 // Check for each StmtSequence if its successor has the same hash value. 377 // We don't check the last StmtSequence as it has no successor. 378 // Note: The 'size - 1 ' in the condition is safe because we check for an 379 // empty Group vector at the beginning of this function. 380 for (unsigned i = 0; i < StmtsByHash.size() - 1; ++i) { 381 const auto Current = StmtsByHash[i]; 382 383 // It's likely that we just found a sequence of StmtSequences that 384 // represent a CloneGroup, so we create a new group and start checking and 385 // adding the StmtSequences in this sequence. 386 CloneDetector::CloneGroup NewGroup; 387 388 size_t PrototypeHash = Current.first; 389 390 for (; i < StmtsByHash.size(); ++i) { 391 // A different hash value means we have reached the end of the sequence. 392 if (PrototypeHash != StmtsByHash[i].first) { 393 // The current sequence could be the start of a new CloneGroup. So we 394 // decrement i so that we visit it again in the outer loop. 395 // Note: i can never be 0 at this point because we are just comparing 396 // the hash of the Current StmtSequence with itself in the 'if' above. 397 assert(i != 0); 398 --i; 399 break; 400 } 401 // Same hash value means we should add the StmtSequence to the current 402 // group. 403 NewGroup.push_back(StmtsByHash[i].second); 404 } 405 406 // We created a new clone group with matching hash codes and move it to 407 // the result vector. 408 Result.push_back(NewGroup); 409 } 410 } 411 // Sequences is the output parameter, so we copy our result into it. 412 Sequences = Result; 413 } 414 415 void RecursiveCloneTypeIIVerifyConstraint::constrain( 416 std::vector<CloneDetector::CloneGroup> &Sequences) { 417 CloneConstraint::splitCloneGroups( 418 Sequences, [](const StmtSequence &A, const StmtSequence &B) { 419 return areSequencesClones(A, B); 420 }); 421 } 422 423 size_t MinComplexityConstraint::calculateStmtComplexity( 424 const StmtSequence &Seq, std::size_t Limit, 425 const std::string &ParentMacroStack) { 426 if (Seq.empty()) 427 return 0; 428 429 size_t Complexity = 1; 430 431 ASTContext &Context = Seq.getASTContext(); 432 433 // Look up what macros expanded into the current statement. 434 std::string MacroStack = 435 data_collection::getMacroStack(Seq.getBeginLoc(), Context); 436 437 // First, check if ParentMacroStack is not empty which means we are currently 438 // dealing with a parent statement which was expanded from a macro. 439 // If this parent statement was expanded from the same macros as this 440 // statement, we reduce the initial complexity of this statement to zero. 441 // This causes that a group of statements that were generated by a single 442 // macro expansion will only increase the total complexity by one. 443 // Note: This is not the final complexity of this statement as we still 444 // add the complexity of the child statements to the complexity value. 445 if (!ParentMacroStack.empty() && MacroStack == ParentMacroStack) { 446 Complexity = 0; 447 } 448 449 // Iterate over the Stmts in the StmtSequence and add their complexity values 450 // to the current complexity value. 451 if (Seq.holdsSequence()) { 452 for (const Stmt *S : Seq) { 453 Complexity += calculateStmtComplexity( 454 StmtSequence(S, Seq.getContainingDecl()), Limit, MacroStack); 455 if (Complexity >= Limit) 456 return Limit; 457 } 458 } else { 459 for (const Stmt *S : Seq.front()->children()) { 460 Complexity += calculateStmtComplexity( 461 StmtSequence(S, Seq.getContainingDecl()), Limit, MacroStack); 462 if (Complexity >= Limit) 463 return Limit; 464 } 465 } 466 return Complexity; 467 } 468 469 void MatchingVariablePatternConstraint::constrain( 470 std::vector<CloneDetector::CloneGroup> &CloneGroups) { 471 CloneConstraint::splitCloneGroups( 472 CloneGroups, [](const StmtSequence &A, const StmtSequence &B) { 473 VariablePattern PatternA(A); 474 VariablePattern PatternB(B); 475 return PatternA.countPatternDifferences(PatternB) == 0; 476 }); 477 } 478 479 void CloneConstraint::splitCloneGroups( 480 std::vector<CloneDetector::CloneGroup> &CloneGroups, 481 llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)> 482 Compare) { 483 std::vector<CloneDetector::CloneGroup> Result; 484 for (auto &HashGroup : CloneGroups) { 485 // Contains all indexes in HashGroup that were already added to a 486 // CloneGroup. 487 std::vector<char> Indexes; 488 Indexes.resize(HashGroup.size()); 489 490 for (unsigned i = 0; i < HashGroup.size(); ++i) { 491 // Skip indexes that are already part of a CloneGroup. 492 if (Indexes[i]) 493 continue; 494 495 // Pick the first unhandled StmtSequence and consider it as the 496 // beginning 497 // of a new CloneGroup for now. 498 // We don't add i to Indexes because we never iterate back. 499 StmtSequence Prototype = HashGroup[i]; 500 CloneDetector::CloneGroup PotentialGroup = {Prototype}; 501 ++Indexes[i]; 502 503 // Check all following StmtSequences for clones. 504 for (unsigned j = i + 1; j < HashGroup.size(); ++j) { 505 // Skip indexes that are already part of a CloneGroup. 506 if (Indexes[j]) 507 continue; 508 509 // If a following StmtSequence belongs to our CloneGroup, we add it. 510 const StmtSequence &Candidate = HashGroup[j]; 511 512 if (!Compare(Prototype, Candidate)) 513 continue; 514 515 PotentialGroup.push_back(Candidate); 516 // Make sure we never visit this StmtSequence again. 517 ++Indexes[j]; 518 } 519 520 // Otherwise, add it to the result and continue searching for more 521 // groups. 522 Result.push_back(PotentialGroup); 523 } 524 525 assert(llvm::all_of(Indexes, [](char c) { return c == 1; })); 526 } 527 CloneGroups = Result; 528 } 529 530 void VariablePattern::addVariableOccurence(const VarDecl *VarDecl, 531 const Stmt *Mention) { 532 // First check if we already reference this variable 533 for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) { 534 if (Variables[KindIndex] == VarDecl) { 535 // If yes, add a new occurrence that points to the existing entry in 536 // the Variables vector. 537 Occurences.emplace_back(KindIndex, Mention); 538 return; 539 } 540 } 541 // If this variable wasn't already referenced, add it to the list of 542 // referenced variables and add a occurrence that points to this new entry. 543 Occurences.emplace_back(Variables.size(), Mention); 544 Variables.push_back(VarDecl); 545 } 546 547 void VariablePattern::addVariables(const Stmt *S) { 548 // Sometimes we get a nullptr (such as from IfStmts which often have nullptr 549 // children). We skip such statements as they don't reference any 550 // variables. 551 if (!S) 552 return; 553 554 // Check if S is a reference to a variable. If yes, add it to the pattern. 555 if (auto D = dyn_cast<DeclRefExpr>(S)) { 556 if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl())) 557 addVariableOccurence(VD, D); 558 } 559 560 // Recursively check all children of the given statement. 561 for (const Stmt *Child : S->children()) { 562 addVariables(Child); 563 } 564 } 565 566 unsigned VariablePattern::countPatternDifferences( 567 const VariablePattern &Other, 568 VariablePattern::SuspiciousClonePair *FirstMismatch) { 569 unsigned NumberOfDifferences = 0; 570 571 assert(Other.Occurences.size() == Occurences.size()); 572 for (unsigned i = 0; i < Occurences.size(); ++i) { 573 auto ThisOccurence = Occurences[i]; 574 auto OtherOccurence = Other.Occurences[i]; 575 if (ThisOccurence.KindID == OtherOccurence.KindID) 576 continue; 577 578 ++NumberOfDifferences; 579 580 // If FirstMismatch is not a nullptr, we need to store information about 581 // the first difference between the two patterns. 582 if (FirstMismatch == nullptr) 583 continue; 584 585 // Only proceed if we just found the first difference as we only store 586 // information about the first difference. 587 if (NumberOfDifferences != 1) 588 continue; 589 590 const VarDecl *FirstSuggestion = nullptr; 591 // If there is a variable available in the list of referenced variables 592 // which wouldn't break the pattern if it is used in place of the 593 // current variable, we provide this variable as the suggested fix. 594 if (OtherOccurence.KindID < Variables.size()) 595 FirstSuggestion = Variables[OtherOccurence.KindID]; 596 597 // Store information about the first clone. 598 FirstMismatch->FirstCloneInfo = 599 VariablePattern::SuspiciousClonePair::SuspiciousCloneInfo( 600 Variables[ThisOccurence.KindID], ThisOccurence.Mention, 601 FirstSuggestion); 602 603 // Same as above but with the other clone. We do this for both clones as 604 // we don't know which clone is the one containing the unintended 605 // pattern error. 606 const VarDecl *SecondSuggestion = nullptr; 607 if (ThisOccurence.KindID < Other.Variables.size()) 608 SecondSuggestion = Other.Variables[ThisOccurence.KindID]; 609 610 // Store information about the second clone. 611 FirstMismatch->SecondCloneInfo = 612 VariablePattern::SuspiciousClonePair::SuspiciousCloneInfo( 613 Other.Variables[OtherOccurence.KindID], OtherOccurence.Mention, 614 SecondSuggestion); 615 616 // SuspiciousClonePair guarantees that the first clone always has a 617 // suggested variable associated with it. As we know that one of the two 618 // clones in the pair always has suggestion, we swap the two clones 619 // in case the first clone has no suggested variable which means that 620 // the second clone has a suggested variable and should be first. 621 if (!FirstMismatch->FirstCloneInfo.Suggestion) 622 std::swap(FirstMismatch->FirstCloneInfo, FirstMismatch->SecondCloneInfo); 623 624 // This ensures that we always have at least one suggestion in a pair. 625 assert(FirstMismatch->FirstCloneInfo.Suggestion); 626 } 627 628 return NumberOfDifferences; 629 } 630