1 //===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// 10 /// This file implements classes for searching and anlyzing source code clones. 11 /// 12 //===----------------------------------------------------------------------===// 13 14 #include "clang/Analysis/CloneDetection.h" 15 16 #include "clang/AST/DataCollection.h" 17 #include "clang/AST/DeclTemplate.h" 18 #include "llvm/Support/MD5.h" 19 #include "llvm/Support/Path.h" 20 21 using namespace clang; 22 23 StmtSequence::StmtSequence(const CompoundStmt *Stmt, const Decl *D, 24 unsigned StartIndex, unsigned EndIndex) 25 : S(Stmt), D(D), StartIndex(StartIndex), EndIndex(EndIndex) { 26 assert(Stmt && "Stmt must not be a nullptr"); 27 assert(StartIndex < EndIndex && "Given array should not be empty"); 28 assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt"); 29 } 30 31 StmtSequence::StmtSequence(const Stmt *Stmt, const Decl *D) 32 : S(Stmt), D(D), StartIndex(0), EndIndex(0) {} 33 34 StmtSequence::StmtSequence() 35 : S(nullptr), D(nullptr), StartIndex(0), EndIndex(0) {} 36 37 bool StmtSequence::contains(const StmtSequence &Other) const { 38 // If both sequences reside in different declarations, they can never contain 39 // each other. 40 if (D != Other.D) 41 return false; 42 43 const SourceManager &SM = getASTContext().getSourceManager(); 44 45 // Otherwise check if the start and end locations of the current sequence 46 // surround the other sequence. 47 bool StartIsInBounds = 48 SM.isBeforeInTranslationUnit(getStartLoc(), Other.getStartLoc()) || 49 getStartLoc() == Other.getStartLoc(); 50 if (!StartIsInBounds) 51 return false; 52 53 bool EndIsInBounds = 54 SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) || 55 Other.getEndLoc() == getEndLoc(); 56 return EndIsInBounds; 57 } 58 59 StmtSequence::iterator StmtSequence::begin() const { 60 if (!holdsSequence()) { 61 return &S; 62 } 63 auto CS = cast<CompoundStmt>(S); 64 return CS->body_begin() + StartIndex; 65 } 66 67 StmtSequence::iterator StmtSequence::end() const { 68 if (!holdsSequence()) { 69 return reinterpret_cast<StmtSequence::iterator>(&S) + 1; 70 } 71 auto CS = cast<CompoundStmt>(S); 72 return CS->body_begin() + EndIndex; 73 } 74 75 ASTContext &StmtSequence::getASTContext() const { 76 assert(D); 77 return D->getASTContext(); 78 } 79 80 SourceLocation StmtSequence::getStartLoc() const { 81 return front()->getLocStart(); 82 } 83 84 SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); } 85 86 SourceRange StmtSequence::getSourceRange() const { 87 return SourceRange(getStartLoc(), getEndLoc()); 88 } 89 90 void CloneDetector::analyzeCodeBody(const Decl *D) { 91 assert(D); 92 assert(D->hasBody()); 93 94 Sequences.push_back(StmtSequence(D->getBody(), D)); 95 } 96 97 /// Returns true if and only if \p Stmt contains at least one other 98 /// sequence in the \p Group. 99 static bool containsAnyInGroup(StmtSequence &Seq, 100 CloneDetector::CloneGroup &Group) { 101 for (StmtSequence &GroupSeq : Group) { 102 if (Seq.contains(GroupSeq)) 103 return true; 104 } 105 return false; 106 } 107 108 /// Returns true if and only if all sequences in \p OtherGroup are 109 /// contained by a sequence in \p Group. 110 static bool containsGroup(CloneDetector::CloneGroup &Group, 111 CloneDetector::CloneGroup &OtherGroup) { 112 // We have less sequences in the current group than we have in the other, 113 // so we will never fulfill the requirement for returning true. This is only 114 // possible because we know that a sequence in Group can contain at most 115 // one sequence in OtherGroup. 116 if (Group.size() < OtherGroup.size()) 117 return false; 118 119 for (StmtSequence &Stmt : Group) { 120 if (!containsAnyInGroup(Stmt, OtherGroup)) 121 return false; 122 } 123 return true; 124 } 125 126 void OnlyLargestCloneConstraint::constrain( 127 std::vector<CloneDetector::CloneGroup> &Result) { 128 std::vector<unsigned> IndexesToRemove; 129 130 // Compare every group in the result with the rest. If one groups contains 131 // another group, we only need to return the bigger group. 132 // Note: This doesn't scale well, so if possible avoid calling any heavy 133 // function from this loop to minimize the performance impact. 134 for (unsigned i = 0; i < Result.size(); ++i) { 135 for (unsigned j = 0; j < Result.size(); ++j) { 136 // Don't compare a group with itself. 137 if (i == j) 138 continue; 139 140 if (containsGroup(Result[j], Result[i])) { 141 IndexesToRemove.push_back(i); 142 break; 143 } 144 } 145 } 146 147 // Erasing a list of indexes from the vector should be done with decreasing 148 // indexes. As IndexesToRemove is constructed with increasing values, we just 149 // reverse iterate over it to get the desired order. 150 for (auto I = IndexesToRemove.rbegin(); I != IndexesToRemove.rend(); ++I) { 151 Result.erase(Result.begin() + *I); 152 } 153 } 154 155 bool FilenamePatternConstraint::isAutoGenerated( 156 const CloneDetector::CloneGroup &Group) { 157 std::string Error; 158 if (IgnoredFilesPattern.empty() || Group.empty() || 159 !IgnoredFilesRegex->isValid(Error)) 160 return false; 161 162 for (const StmtSequence &S : Group) { 163 const SourceManager &SM = S.getASTContext().getSourceManager(); 164 StringRef Filename = llvm::sys::path::filename( 165 SM.getFilename(S.getContainingDecl()->getLocation())); 166 if (IgnoredFilesRegex->match(Filename)) 167 return true; 168 } 169 170 return false; 171 } 172 173 /// This class defines what a type II code clone is: If it collects for two 174 /// statements the same data, then those two statements are considered to be 175 /// clones of each other. 176 /// 177 /// All collected data is forwarded to the given data consumer of the type T. 178 /// The data consumer class needs to provide a member method with the signature: 179 /// update(StringRef Str) 180 namespace { 181 template <class T> 182 class CloneTypeIIStmtDataCollector 183 : public ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>> { 184 ASTContext &Context; 185 /// The data sink to which all data is forwarded. 186 T &DataConsumer; 187 188 template <class Ty> void addData(const Ty &Data) { 189 data_collection::addDataToConsumer(DataConsumer, Data); 190 } 191 192 public: 193 CloneTypeIIStmtDataCollector(const Stmt *S, ASTContext &Context, 194 T &DataConsumer) 195 : Context(Context), DataConsumer(DataConsumer) { 196 this->Visit(S); 197 } 198 199 // Define a visit method for each class to collect data and subsequently visit 200 // all parent classes. This uses a template so that custom visit methods by us 201 // take precedence. 202 #define DEF_ADD_DATA(CLASS, CODE) \ 203 template <class = void> void Visit##CLASS(const CLASS *S) { \ 204 CODE; \ 205 ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>::Visit##CLASS(S); \ 206 } 207 208 #include "clang/AST/StmtDataCollectors.inc" 209 210 // Type II clones ignore variable names and literals, so let's skip them. 211 #define SKIP(CLASS) \ 212 void Visit##CLASS(const CLASS *S) { \ 213 ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>::Visit##CLASS(S); \ 214 } 215 SKIP(DeclRefExpr) 216 SKIP(MemberExpr) 217 SKIP(IntegerLiteral) 218 SKIP(FloatingLiteral) 219 SKIP(StringLiteral) 220 SKIP(CXXBoolLiteralExpr) 221 SKIP(CharacterLiteral) 222 #undef SKIP 223 }; 224 } // end anonymous namespace 225 226 static size_t createHash(llvm::MD5 &Hash) { 227 size_t HashCode; 228 229 // Create the final hash code for the current Stmt. 230 llvm::MD5::MD5Result HashResult; 231 Hash.final(HashResult); 232 233 // Copy as much as possible of the generated hash code to the Stmt's hash 234 // code. 235 std::memcpy(&HashCode, &HashResult, 236 std::min(sizeof(HashCode), sizeof(HashResult))); 237 238 return HashCode; 239 } 240 241 /// Generates and saves a hash code for the given Stmt. 242 /// \param S The given Stmt. 243 /// \param D The Decl containing S. 244 /// \param StmtsByHash Output parameter that will contain the hash codes for 245 /// each StmtSequence in the given Stmt. 246 /// \return The hash code of the given Stmt. 247 /// 248 /// If the given Stmt is a CompoundStmt, this method will also generate 249 /// hashes for all possible StmtSequences in the children of this Stmt. 250 static size_t 251 saveHash(const Stmt *S, const Decl *D, 252 std::vector<std::pair<size_t, StmtSequence>> &StmtsByHash) { 253 llvm::MD5 Hash; 254 ASTContext &Context = D->getASTContext(); 255 256 CloneTypeIIStmtDataCollector<llvm::MD5>(S, Context, Hash); 257 258 auto CS = dyn_cast<CompoundStmt>(S); 259 SmallVector<size_t, 8> ChildHashes; 260 261 for (const Stmt *Child : S->children()) { 262 if (Child == nullptr) { 263 ChildHashes.push_back(0); 264 continue; 265 } 266 size_t ChildHash = saveHash(Child, D, StmtsByHash); 267 Hash.update( 268 StringRef(reinterpret_cast<char *>(&ChildHash), sizeof(ChildHash))); 269 ChildHashes.push_back(ChildHash); 270 } 271 272 if (CS) { 273 // If we're in a CompoundStmt, we hash all possible combinations of child 274 // statements to find clones in those subsequences. 275 // We first go through every possible starting position of a subsequence. 276 for (unsigned Pos = 0; Pos < CS->size(); ++Pos) { 277 // Then we try all possible lengths this subsequence could have and 278 // reuse the same hash object to make sure we only hash every child 279 // hash exactly once. 280 llvm::MD5 Hash; 281 for (unsigned Length = 1; Length <= CS->size() - Pos; ++Length) { 282 // Grab the current child hash and put it into our hash. We do 283 // -1 on the index because we start counting the length at 1. 284 size_t ChildHash = ChildHashes[Pos + Length - 1]; 285 Hash.update( 286 StringRef(reinterpret_cast<char *>(&ChildHash), sizeof(ChildHash))); 287 // If we have at least two elements in our subsequence, we can start 288 // saving it. 289 if (Length > 1) { 290 llvm::MD5 SubHash = Hash; 291 StmtsByHash.push_back(std::make_pair( 292 createHash(SubHash), StmtSequence(CS, D, Pos, Pos + Length))); 293 } 294 } 295 } 296 } 297 298 size_t HashCode = createHash(Hash); 299 StmtsByHash.push_back(std::make_pair(HashCode, StmtSequence(S, D))); 300 return HashCode; 301 } 302 303 namespace { 304 /// Wrapper around FoldingSetNodeID that it can be used as the template 305 /// argument of the StmtDataCollector. 306 class FoldingSetNodeIDWrapper { 307 308 llvm::FoldingSetNodeID &FS; 309 310 public: 311 FoldingSetNodeIDWrapper(llvm::FoldingSetNodeID &FS) : FS(FS) {} 312 313 void update(StringRef Str) { FS.AddString(Str); } 314 }; 315 } // end anonymous namespace 316 317 /// Writes the relevant data from all statements and child statements 318 /// in the given StmtSequence into the given FoldingSetNodeID. 319 static void CollectStmtSequenceData(const StmtSequence &Sequence, 320 FoldingSetNodeIDWrapper &OutputData) { 321 for (const Stmt *S : Sequence) { 322 CloneTypeIIStmtDataCollector<FoldingSetNodeIDWrapper>( 323 S, Sequence.getASTContext(), OutputData); 324 325 for (const Stmt *Child : S->children()) { 326 if (!Child) 327 continue; 328 329 CollectStmtSequenceData(StmtSequence(Child, Sequence.getContainingDecl()), 330 OutputData); 331 } 332 } 333 } 334 335 /// Returns true if both sequences are clones of each other. 336 static bool areSequencesClones(const StmtSequence &LHS, 337 const StmtSequence &RHS) { 338 // We collect the data from all statements in the sequence as we did before 339 // when generating a hash value for each sequence. But this time we don't 340 // hash the collected data and compare the whole data set instead. This 341 // prevents any false-positives due to hash code collisions. 342 llvm::FoldingSetNodeID DataLHS, DataRHS; 343 FoldingSetNodeIDWrapper LHSWrapper(DataLHS); 344 FoldingSetNodeIDWrapper RHSWrapper(DataRHS); 345 346 CollectStmtSequenceData(LHS, LHSWrapper); 347 CollectStmtSequenceData(RHS, RHSWrapper); 348 349 return DataLHS == DataRHS; 350 } 351 352 void RecursiveCloneTypeIIHashConstraint::constrain( 353 std::vector<CloneDetector::CloneGroup> &Sequences) { 354 // FIXME: Maybe we can do this in-place and don't need this additional vector. 355 std::vector<CloneDetector::CloneGroup> Result; 356 357 for (CloneDetector::CloneGroup &Group : Sequences) { 358 // We assume in the following code that the Group is non-empty, so we 359 // skip all empty groups. 360 if (Group.empty()) 361 continue; 362 363 std::vector<std::pair<size_t, StmtSequence>> StmtsByHash; 364 365 // Generate hash codes for all children of S and save them in StmtsByHash. 366 for (const StmtSequence &S : Group) { 367 saveHash(S.front(), S.getContainingDecl(), StmtsByHash); 368 } 369 370 // Sort hash_codes in StmtsByHash. 371 std::stable_sort(StmtsByHash.begin(), StmtsByHash.end(), 372 [](std::pair<size_t, StmtSequence> LHS, 373 std::pair<size_t, StmtSequence> RHS) { 374 return LHS.first < RHS.first; 375 }); 376 377 // Check for each StmtSequence if its successor has the same hash value. 378 // We don't check the last StmtSequence as it has no successor. 379 // Note: The 'size - 1 ' in the condition is safe because we check for an 380 // empty Group vector at the beginning of this function. 381 for (unsigned i = 0; i < StmtsByHash.size() - 1; ++i) { 382 const auto Current = StmtsByHash[i]; 383 384 // It's likely that we just found a sequence of StmtSequences that 385 // represent a CloneGroup, so we create a new group and start checking and 386 // adding the StmtSequences in this sequence. 387 CloneDetector::CloneGroup NewGroup; 388 389 size_t PrototypeHash = Current.first; 390 391 for (; i < StmtsByHash.size(); ++i) { 392 // A different hash value means we have reached the end of the sequence. 393 if (PrototypeHash != StmtsByHash[i].first) { 394 // The current sequence could be the start of a new CloneGroup. So we 395 // decrement i so that we visit it again in the outer loop. 396 // Note: i can never be 0 at this point because we are just comparing 397 // the hash of the Current StmtSequence with itself in the 'if' above. 398 assert(i != 0); 399 --i; 400 break; 401 } 402 // Same hash value means we should add the StmtSequence to the current 403 // group. 404 NewGroup.push_back(StmtsByHash[i].second); 405 } 406 407 // We created a new clone group with matching hash codes and move it to 408 // the result vector. 409 Result.push_back(NewGroup); 410 } 411 } 412 // Sequences is the output parameter, so we copy our result into it. 413 Sequences = Result; 414 } 415 416 void RecursiveCloneTypeIIVerifyConstraint::constrain( 417 std::vector<CloneDetector::CloneGroup> &Sequences) { 418 CloneConstraint::splitCloneGroups( 419 Sequences, [](const StmtSequence &A, const StmtSequence &B) { 420 return areSequencesClones(A, B); 421 }); 422 } 423 424 size_t MinComplexityConstraint::calculateStmtComplexity( 425 const StmtSequence &Seq, std::size_t Limit, 426 const std::string &ParentMacroStack) { 427 if (Seq.empty()) 428 return 0; 429 430 size_t Complexity = 1; 431 432 ASTContext &Context = Seq.getASTContext(); 433 434 // Look up what macros expanded into the current statement. 435 std::string MacroStack = 436 data_collection::getMacroStack(Seq.getStartLoc(), Context); 437 438 // First, check if ParentMacroStack is not empty which means we are currently 439 // dealing with a parent statement which was expanded from a macro. 440 // If this parent statement was expanded from the same macros as this 441 // statement, we reduce the initial complexity of this statement to zero. 442 // This causes that a group of statements that were generated by a single 443 // macro expansion will only increase the total complexity by one. 444 // Note: This is not the final complexity of this statement as we still 445 // add the complexity of the child statements to the complexity value. 446 if (!ParentMacroStack.empty() && MacroStack == ParentMacroStack) { 447 Complexity = 0; 448 } 449 450 // Iterate over the Stmts in the StmtSequence and add their complexity values 451 // to the current complexity value. 452 if (Seq.holdsSequence()) { 453 for (const Stmt *S : Seq) { 454 Complexity += calculateStmtComplexity( 455 StmtSequence(S, Seq.getContainingDecl()), Limit, MacroStack); 456 if (Complexity >= Limit) 457 return Limit; 458 } 459 } else { 460 for (const Stmt *S : Seq.front()->children()) { 461 Complexity += calculateStmtComplexity( 462 StmtSequence(S, Seq.getContainingDecl()), Limit, MacroStack); 463 if (Complexity >= Limit) 464 return Limit; 465 } 466 } 467 return Complexity; 468 } 469 470 void MatchingVariablePatternConstraint::constrain( 471 std::vector<CloneDetector::CloneGroup> &CloneGroups) { 472 CloneConstraint::splitCloneGroups( 473 CloneGroups, [](const StmtSequence &A, const StmtSequence &B) { 474 VariablePattern PatternA(A); 475 VariablePattern PatternB(B); 476 return PatternA.countPatternDifferences(PatternB) == 0; 477 }); 478 } 479 480 void CloneConstraint::splitCloneGroups( 481 std::vector<CloneDetector::CloneGroup> &CloneGroups, 482 llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)> 483 Compare) { 484 std::vector<CloneDetector::CloneGroup> Result; 485 for (auto &HashGroup : CloneGroups) { 486 // Contains all indexes in HashGroup that were already added to a 487 // CloneGroup. 488 std::vector<char> Indexes; 489 Indexes.resize(HashGroup.size()); 490 491 for (unsigned i = 0; i < HashGroup.size(); ++i) { 492 // Skip indexes that are already part of a CloneGroup. 493 if (Indexes[i]) 494 continue; 495 496 // Pick the first unhandled StmtSequence and consider it as the 497 // beginning 498 // of a new CloneGroup for now. 499 // We don't add i to Indexes because we never iterate back. 500 StmtSequence Prototype = HashGroup[i]; 501 CloneDetector::CloneGroup PotentialGroup = {Prototype}; 502 ++Indexes[i]; 503 504 // Check all following StmtSequences for clones. 505 for (unsigned j = i + 1; j < HashGroup.size(); ++j) { 506 // Skip indexes that are already part of a CloneGroup. 507 if (Indexes[j]) 508 continue; 509 510 // If a following StmtSequence belongs to our CloneGroup, we add it. 511 const StmtSequence &Candidate = HashGroup[j]; 512 513 if (!Compare(Prototype, Candidate)) 514 continue; 515 516 PotentialGroup.push_back(Candidate); 517 // Make sure we never visit this StmtSequence again. 518 ++Indexes[j]; 519 } 520 521 // Otherwise, add it to the result and continue searching for more 522 // groups. 523 Result.push_back(PotentialGroup); 524 } 525 526 assert(std::all_of(Indexes.begin(), Indexes.end(), 527 [](char c) { return c == 1; })); 528 } 529 CloneGroups = Result; 530 } 531 532 void VariablePattern::addVariableOccurence(const VarDecl *VarDecl, 533 const Stmt *Mention) { 534 // First check if we already reference this variable 535 for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) { 536 if (Variables[KindIndex] == VarDecl) { 537 // If yes, add a new occurrence that points to the existing entry in 538 // the Variables vector. 539 Occurences.emplace_back(KindIndex, Mention); 540 return; 541 } 542 } 543 // If this variable wasn't already referenced, add it to the list of 544 // referenced variables and add a occurrence that points to this new entry. 545 Occurences.emplace_back(Variables.size(), Mention); 546 Variables.push_back(VarDecl); 547 } 548 549 void VariablePattern::addVariables(const Stmt *S) { 550 // Sometimes we get a nullptr (such as from IfStmts which often have nullptr 551 // children). We skip such statements as they don't reference any 552 // variables. 553 if (!S) 554 return; 555 556 // Check if S is a reference to a variable. If yes, add it to the pattern. 557 if (auto D = dyn_cast<DeclRefExpr>(S)) { 558 if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl())) 559 addVariableOccurence(VD, D); 560 } 561 562 // Recursively check all children of the given statement. 563 for (const Stmt *Child : S->children()) { 564 addVariables(Child); 565 } 566 } 567 568 unsigned VariablePattern::countPatternDifferences( 569 const VariablePattern &Other, 570 VariablePattern::SuspiciousClonePair *FirstMismatch) { 571 unsigned NumberOfDifferences = 0; 572 573 assert(Other.Occurences.size() == Occurences.size()); 574 for (unsigned i = 0; i < Occurences.size(); ++i) { 575 auto ThisOccurence = Occurences[i]; 576 auto OtherOccurence = Other.Occurences[i]; 577 if (ThisOccurence.KindID == OtherOccurence.KindID) 578 continue; 579 580 ++NumberOfDifferences; 581 582 // If FirstMismatch is not a nullptr, we need to store information about 583 // the first difference between the two patterns. 584 if (FirstMismatch == nullptr) 585 continue; 586 587 // Only proceed if we just found the first difference as we only store 588 // information about the first difference. 589 if (NumberOfDifferences != 1) 590 continue; 591 592 const VarDecl *FirstSuggestion = nullptr; 593 // If there is a variable available in the list of referenced variables 594 // which wouldn't break the pattern if it is used in place of the 595 // current variable, we provide this variable as the suggested fix. 596 if (OtherOccurence.KindID < Variables.size()) 597 FirstSuggestion = Variables[OtherOccurence.KindID]; 598 599 // Store information about the first clone. 600 FirstMismatch->FirstCloneInfo = 601 VariablePattern::SuspiciousClonePair::SuspiciousCloneInfo( 602 Variables[ThisOccurence.KindID], ThisOccurence.Mention, 603 FirstSuggestion); 604 605 // Same as above but with the other clone. We do this for both clones as 606 // we don't know which clone is the one containing the unintended 607 // pattern error. 608 const VarDecl *SecondSuggestion = nullptr; 609 if (ThisOccurence.KindID < Other.Variables.size()) 610 SecondSuggestion = Other.Variables[ThisOccurence.KindID]; 611 612 // Store information about the second clone. 613 FirstMismatch->SecondCloneInfo = 614 VariablePattern::SuspiciousClonePair::SuspiciousCloneInfo( 615 Other.Variables[OtherOccurence.KindID], OtherOccurence.Mention, 616 SecondSuggestion); 617 618 // SuspiciousClonePair guarantees that the first clone always has a 619 // suggested variable associated with it. As we know that one of the two 620 // clones in the pair always has suggestion, we swap the two clones 621 // in case the first clone has no suggested variable which means that 622 // the second clone has a suggested variable and should be first. 623 if (!FirstMismatch->FirstCloneInfo.Suggestion) 624 std::swap(FirstMismatch->FirstCloneInfo, FirstMismatch->SecondCloneInfo); 625 626 // This ensures that we always have at least one suggestion in a pair. 627 assert(FirstMismatch->FirstCloneInfo.Suggestion); 628 } 629 630 return NumberOfDifferences; 631 } 632