1 //===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// 10 /// This file implements classes for searching and anlyzing source code clones. 11 /// 12 //===----------------------------------------------------------------------===// 13 14 #include "clang/Analysis/CloneDetection.h" 15 16 #include "clang/AST/ASTContext.h" 17 #include "clang/AST/RecursiveASTVisitor.h" 18 #include "clang/AST/Stmt.h" 19 #include "clang/AST/StmtVisitor.h" 20 #include "llvm/ADT/StringRef.h" 21 22 using namespace clang; 23 24 StmtSequence::StmtSequence(const CompoundStmt *Stmt, ASTContext &Context, 25 unsigned StartIndex, unsigned EndIndex) 26 : S(Stmt), Context(&Context), StartIndex(StartIndex), EndIndex(EndIndex) { 27 assert(Stmt && "Stmt must not be a nullptr"); 28 assert(StartIndex < EndIndex && "Given array should not be empty"); 29 assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt"); 30 } 31 32 StmtSequence::StmtSequence(const Stmt *Stmt, ASTContext &Context) 33 : S(Stmt), Context(&Context), StartIndex(0), EndIndex(0) {} 34 35 StmtSequence::StmtSequence() 36 : S(nullptr), Context(nullptr), StartIndex(0), EndIndex(0) {} 37 38 bool StmtSequence::contains(const StmtSequence &Other) const { 39 // If both sequences reside in different translation units, they can never 40 // contain each other. 41 if (Context != Other.Context) 42 return false; 43 44 const SourceManager &SM = Context->getSourceManager(); 45 46 // Otherwise check if the start and end locations of the current sequence 47 // surround the other sequence. 48 bool StartIsInBounds = 49 SM.isBeforeInTranslationUnit(getStartLoc(), Other.getStartLoc()) || 50 getStartLoc() == Other.getStartLoc(); 51 if (!StartIsInBounds) 52 return false; 53 54 bool EndIsInBounds = 55 SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) || 56 Other.getEndLoc() == getEndLoc(); 57 return EndIsInBounds; 58 } 59 60 StmtSequence::iterator StmtSequence::begin() const { 61 if (!holdsSequence()) { 62 return &S; 63 } 64 auto CS = cast<CompoundStmt>(S); 65 return CS->body_begin() + StartIndex; 66 } 67 68 StmtSequence::iterator StmtSequence::end() const { 69 if (!holdsSequence()) { 70 return reinterpret_cast<StmtSequence::iterator>(&S) + 1; 71 } 72 auto CS = cast<CompoundStmt>(S); 73 return CS->body_begin() + EndIndex; 74 } 75 76 SourceLocation StmtSequence::getStartLoc() const { 77 return front()->getLocStart(); 78 } 79 80 SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); } 81 82 namespace { 83 84 /// \brief Analyzes the pattern of the referenced variables in a statement. 85 class VariablePattern { 86 87 /// \brief Describes an occurence of a variable reference in a statement. 88 struct VariableOccurence { 89 /// The index of the associated VarDecl in the Variables vector. 90 size_t KindID; 91 /// The source range in the code where the variable was referenced. 92 SourceRange Range; 93 94 VariableOccurence(size_t KindID, SourceRange Range) 95 : KindID(KindID), Range(Range) {} 96 }; 97 98 /// All occurences of referenced variables in the order of appearance. 99 std::vector<VariableOccurence> Occurences; 100 /// List of referenced variables in the order of appearance. 101 /// Every item in this list is unique. 102 std::vector<const VarDecl *> Variables; 103 104 /// \brief Adds a new variable referenced to this pattern. 105 /// \param VarDecl The declaration of the variable that is referenced. 106 /// \param Range The SourceRange where this variable is referenced. 107 void addVariableOccurence(const VarDecl *VarDecl, SourceRange Range) { 108 // First check if we already reference this variable 109 for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) { 110 if (Variables[KindIndex] == VarDecl) { 111 // If yes, add a new occurence that points to the existing entry in 112 // the Variables vector. 113 Occurences.emplace_back(KindIndex, Range); 114 return; 115 } 116 } 117 // If this variable wasn't already referenced, add it to the list of 118 // referenced variables and add a occurence that points to this new entry. 119 Occurences.emplace_back(Variables.size(), Range); 120 Variables.push_back(VarDecl); 121 } 122 123 /// \brief Adds each referenced variable from the given statement. 124 void addVariables(const Stmt *S) { 125 // Sometimes we get a nullptr (such as from IfStmts which often have nullptr 126 // children). We skip such statements as they don't reference any 127 // variables. 128 if (!S) 129 return; 130 131 // Check if S is a reference to a variable. If yes, add it to the pattern. 132 if (auto D = dyn_cast<DeclRefExpr>(S)) { 133 if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl())) 134 addVariableOccurence(VD, D->getSourceRange()); 135 } 136 137 // Recursively check all children of the given statement. 138 for (const Stmt *Child : S->children()) { 139 addVariables(Child); 140 } 141 } 142 143 public: 144 /// \brief Creates an VariablePattern object with information about the given 145 /// StmtSequence. 146 VariablePattern(const StmtSequence &Sequence) { 147 for (const Stmt *S : Sequence) 148 addVariables(S); 149 } 150 151 /// \brief Counts the differences between this pattern and the given one. 152 /// \param Other The given VariablePattern to compare with. 153 /// \param FirstMismatch Output parameter that will be filled with information 154 /// about the first difference between the two patterns. This parameter 155 /// can be a nullptr, in which case it will be ignored. 156 /// \return Returns the number of differences between the pattern this object 157 /// is following and the given VariablePattern. 158 /// 159 /// For example, the following statements all have the same pattern and this 160 /// function would return zero: 161 /// 162 /// if (a < b) return a; return b; 163 /// if (x < y) return x; return y; 164 /// if (u2 < u1) return u2; return u1; 165 /// 166 /// But the following statement has a different pattern (note the changed 167 /// variables in the return statements) and would have two differences when 168 /// compared with one of the statements above. 169 /// 170 /// if (a < b) return b; return a; 171 /// 172 /// This function should only be called if the related statements of the given 173 /// pattern and the statements of this objects are clones of each other. 174 unsigned countPatternDifferences( 175 const VariablePattern &Other, 176 CloneDetector::SuspiciousClonePair *FirstMismatch = nullptr) { 177 unsigned NumberOfDifferences = 0; 178 179 assert(Other.Occurences.size() == Occurences.size()); 180 for (unsigned i = 0; i < Occurences.size(); ++i) { 181 auto ThisOccurence = Occurences[i]; 182 auto OtherOccurence = Other.Occurences[i]; 183 if (ThisOccurence.KindID == OtherOccurence.KindID) 184 continue; 185 186 ++NumberOfDifferences; 187 188 // If FirstMismatch is not a nullptr, we need to store information about 189 // the first difference between the two patterns. 190 if (FirstMismatch == nullptr) 191 continue; 192 193 // Only proceed if we just found the first difference as we only store 194 // information about the first difference. 195 if (NumberOfDifferences != 1) 196 continue; 197 198 const VarDecl *FirstSuggestion = nullptr; 199 // If there is a variable available in the list of referenced variables 200 // which wouldn't break the pattern if it is used in place of the 201 // current variable, we provide this variable as the suggested fix. 202 if (OtherOccurence.KindID < Variables.size()) 203 FirstSuggestion = Variables[OtherOccurence.KindID]; 204 205 // Store information about the first clone. 206 FirstMismatch->FirstCloneInfo = 207 CloneDetector::SuspiciousClonePair::SuspiciousCloneInfo( 208 Variables[ThisOccurence.KindID], ThisOccurence.Range, 209 FirstSuggestion); 210 211 // Same as above but with the other clone. We do this for both clones as 212 // we don't know which clone is the one containing the unintended 213 // pattern error. 214 const VarDecl *SecondSuggestion = nullptr; 215 if (ThisOccurence.KindID < Other.Variables.size()) 216 SecondSuggestion = Other.Variables[ThisOccurence.KindID]; 217 218 // Store information about the second clone. 219 FirstMismatch->SecondCloneInfo = 220 CloneDetector::SuspiciousClonePair::SuspiciousCloneInfo( 221 Variables[ThisOccurence.KindID], OtherOccurence.Range, 222 SecondSuggestion); 223 224 // SuspiciousClonePair guarantees that the first clone always has a 225 // suggested variable associated with it. As we know that one of the two 226 // clones in the pair always has suggestion, we swap the two clones 227 // in case the first clone has no suggested variable which means that 228 // the second clone has a suggested variable and should be first. 229 if (!FirstMismatch->FirstCloneInfo.Suggestion) 230 std::swap(FirstMismatch->FirstCloneInfo, 231 FirstMismatch->SecondCloneInfo); 232 233 // This ensures that we always have at least one suggestion in a pair. 234 assert(FirstMismatch->FirstCloneInfo.Suggestion); 235 } 236 237 return NumberOfDifferences; 238 } 239 }; 240 } 241 242 namespace { 243 /// \brief Collects the data of a single Stmt. 244 /// 245 /// This class defines what a code clone is: If it collects for two statements 246 /// the same data, then those two statements are considered to be clones of each 247 /// other. 248 class StmtDataCollector : public ConstStmtVisitor<StmtDataCollector> { 249 250 ASTContext &Context; 251 std::vector<CloneDetector::DataPiece> &CollectedData; 252 253 public: 254 /// \brief Collects data of the given Stmt. 255 /// \param S The given statement. 256 /// \param Context The ASTContext of S. 257 /// \param D The given data vector to which all collected data is appended. 258 StmtDataCollector(const Stmt *S, ASTContext &Context, 259 std::vector<CloneDetector::DataPiece> &D) 260 : Context(Context), CollectedData(D) { 261 Visit(S); 262 } 263 264 // Below are utility methods for appending different data to the vector. 265 266 void addData(CloneDetector::DataPiece Integer) { 267 CollectedData.push_back(Integer); 268 } 269 270 // FIXME: The functions below add long strings to the data vector which are 271 // probably not good for performance. Replace the strings with pointer values 272 // or a some other unique integer. 273 274 void addData(llvm::StringRef Str) { 275 if (Str.empty()) 276 return; 277 278 const size_t OldSize = CollectedData.size(); 279 280 const size_t PieceSize = sizeof(CloneDetector::DataPiece); 281 // Calculate how many vector units we need to accomodate all string bytes. 282 size_t RoundedUpPieceNumber = (Str.size() + PieceSize - 1) / PieceSize; 283 // Allocate space for the string in the data vector. 284 CollectedData.resize(CollectedData.size() + RoundedUpPieceNumber); 285 286 // Copy the string to the allocated space at the end of the vector. 287 std::memcpy(CollectedData.data() + OldSize, Str.data(), Str.size()); 288 } 289 290 void addData(const QualType &QT) { addData(QT.getAsString()); } 291 292 // The functions below collect the class specific data of each Stmt subclass. 293 294 // Utility macro for defining a visit method for a given class. This method 295 // calls back to the ConstStmtVisitor to visit all parent classes. 296 #define DEF_ADD_DATA(CLASS, CODE) \ 297 void Visit##CLASS(const CLASS *S) { \ 298 CODE; \ 299 ConstStmtVisitor<StmtDataCollector>::Visit##CLASS(S); \ 300 } 301 302 DEF_ADD_DATA(Stmt, { addData(S->getStmtClass()); }) 303 DEF_ADD_DATA(Expr, { addData(S->getType()); }) 304 305 //--- Builtin functionality ----------------------------------------------// 306 DEF_ADD_DATA(ArrayTypeTraitExpr, { addData(S->getTrait()); }) 307 DEF_ADD_DATA(ExpressionTraitExpr, { addData(S->getTrait()); }) 308 DEF_ADD_DATA(PredefinedExpr, { addData(S->getIdentType()); }) 309 DEF_ADD_DATA(TypeTraitExpr, { 310 addData(S->getTrait()); 311 for (unsigned i = 0; i < S->getNumArgs(); ++i) 312 addData(S->getArg(i)->getType()); 313 }) 314 315 //--- Calls --------------------------------------------------------------// 316 DEF_ADD_DATA(CallExpr, { 317 // Function pointers don't have a callee and we just skip hashing it. 318 if (S->getDirectCallee()) 319 addData(S->getDirectCallee()->getQualifiedNameAsString()); 320 }) 321 322 //--- Exceptions ---------------------------------------------------------// 323 DEF_ADD_DATA(CXXCatchStmt, { addData(S->getCaughtType()); }) 324 325 //--- C++ OOP Stmts ------------------------------------------------------// 326 DEF_ADD_DATA(CXXDeleteExpr, { 327 addData(S->isArrayFormAsWritten()); 328 addData(S->isGlobalDelete()); 329 }) 330 331 //--- Casts --------------------------------------------------------------// 332 DEF_ADD_DATA(ObjCBridgedCastExpr, { addData(S->getBridgeKind()); }) 333 334 //--- Miscellaneous Exprs ------------------------------------------------// 335 DEF_ADD_DATA(BinaryOperator, { addData(S->getOpcode()); }) 336 DEF_ADD_DATA(UnaryOperator, { addData(S->getOpcode()); }) 337 338 //--- Control flow -------------------------------------------------------// 339 DEF_ADD_DATA(GotoStmt, { addData(S->getLabel()->getName()); }) 340 DEF_ADD_DATA(IndirectGotoStmt, { 341 if (S->getConstantTarget()) 342 addData(S->getConstantTarget()->getName()); 343 }) 344 DEF_ADD_DATA(LabelStmt, { addData(S->getDecl()->getName()); }) 345 DEF_ADD_DATA(MSDependentExistsStmt, { addData(S->isIfExists()); }) 346 DEF_ADD_DATA(AddrLabelExpr, { addData(S->getLabel()->getName()); }) 347 348 //--- Objective-C --------------------------------------------------------// 349 DEF_ADD_DATA(ObjCIndirectCopyRestoreExpr, { addData(S->shouldCopy()); }) 350 DEF_ADD_DATA(ObjCPropertyRefExpr, { 351 addData(S->isSuperReceiver()); 352 addData(S->isImplicitProperty()); 353 }) 354 DEF_ADD_DATA(ObjCAtCatchStmt, { addData(S->hasEllipsis()); }) 355 356 //--- Miscellaneous Stmts ------------------------------------------------// 357 DEF_ADD_DATA(CXXFoldExpr, { 358 addData(S->isRightFold()); 359 addData(S->getOperator()); 360 }) 361 DEF_ADD_DATA(GenericSelectionExpr, { 362 for (unsigned i = 0; i < S->getNumAssocs(); ++i) { 363 addData(S->getAssocType(i)); 364 } 365 }) 366 DEF_ADD_DATA(LambdaExpr, { 367 for (const LambdaCapture &C : S->captures()) { 368 addData(C.isPackExpansion()); 369 addData(C.getCaptureKind()); 370 if (C.capturesVariable()) 371 addData(C.getCapturedVar()->getType()); 372 } 373 addData(S->isGenericLambda()); 374 addData(S->isMutable()); 375 }) 376 DEF_ADD_DATA(DeclStmt, { 377 auto numDecls = std::distance(S->decl_begin(), S->decl_end()); 378 addData(static_cast<CloneDetector::DataPiece>(numDecls)); 379 for (const Decl *D : S->decls()) { 380 if (const VarDecl *VD = dyn_cast<VarDecl>(D)) { 381 addData(VD->getType()); 382 } 383 } 384 }) 385 DEF_ADD_DATA(AsmStmt, { 386 addData(S->isSimple()); 387 addData(S->isVolatile()); 388 addData(S->generateAsmString(Context)); 389 for (unsigned i = 0; i < S->getNumInputs(); ++i) { 390 addData(S->getInputConstraint(i)); 391 } 392 for (unsigned i = 0; i < S->getNumOutputs(); ++i) { 393 addData(S->getOutputConstraint(i)); 394 } 395 for (unsigned i = 0; i < S->getNumClobbers(); ++i) { 396 addData(S->getClobber(i)); 397 } 398 }) 399 DEF_ADD_DATA(AttributedStmt, { 400 for (const Attr *A : S->getAttrs()) { 401 addData(std::string(A->getSpelling())); 402 } 403 }) 404 }; 405 } // end anonymous namespace 406 407 namespace { 408 /// Generates CloneSignatures for a set of statements and stores the results in 409 /// a CloneDetector object. 410 class CloneSignatureGenerator { 411 412 CloneDetector &CD; 413 ASTContext &Context; 414 415 /// \brief Generates CloneSignatures for all statements in the given statement 416 /// tree and stores them in the CloneDetector. 417 /// 418 /// \param S The root of the given statement tree. 419 /// \return The CloneSignature of the root statement. 420 CloneDetector::CloneSignature generateSignatures(const Stmt *S) { 421 // Create an empty signature that will be filled in this method. 422 CloneDetector::CloneSignature Signature; 423 424 // Collect all relevant data from S and put it into the empty signature. 425 StmtDataCollector(S, Context, Signature.Data); 426 427 // Storage for the signatures of the direct child statements. This is only 428 // needed if the current statement is a CompoundStmt. 429 std::vector<CloneDetector::CloneSignature> ChildSignatures; 430 const CompoundStmt *CS = dyn_cast<const CompoundStmt>(S); 431 432 // The signature of a statement includes the signatures of its children. 433 // Therefore we create the signatures for every child and add them to the 434 // current signature. 435 for (const Stmt *Child : S->children()) { 436 // Some statements like 'if' can have nullptr children that we will skip. 437 if (!Child) 438 continue; 439 440 // Recursive call to create the signature of the child statement. This 441 // will also create and store all clone groups in this child statement. 442 auto ChildSignature = generateSignatures(Child); 443 444 // Add the collected data to the signature of the current statement. 445 Signature.add(ChildSignature); 446 447 // If the current statement is a CompoundStatement, we need to store the 448 // signature for the generation of the sub-sequences. 449 if (CS) 450 ChildSignatures.push_back(ChildSignature); 451 } 452 453 // If the current statement is a CompoundStmt, we also need to create the 454 // clone groups from the sub-sequences inside the children. 455 if (CS) 456 handleSubSequences(CS, ChildSignatures); 457 458 // Save the signature for the current statement in the CloneDetector object. 459 CD.add(StmtSequence(S, Context), Signature); 460 461 return Signature; 462 } 463 464 /// \brief Adds all possible sub-sequences in the child array of the given 465 /// CompoundStmt to the CloneDetector. 466 /// \param CS The given CompoundStmt. 467 /// \param ChildSignatures A list of calculated signatures for each child in 468 /// the given CompoundStmt. 469 void handleSubSequences( 470 const CompoundStmt *CS, 471 const std::vector<CloneDetector::CloneSignature> &ChildSignatures) { 472 473 // FIXME: This function has quadratic runtime right now. Check if skipping 474 // this function for too long CompoundStmts is an option. 475 476 // The length of the sub-sequence. We don't need to handle sequences with 477 // the length 1 as they are already handled in CollectData(). 478 for (unsigned Length = 2; Length <= CS->size(); ++Length) { 479 // The start index in the body of the CompoundStmt. We increase the 480 // position until the end of the sub-sequence reaches the end of the 481 // CompoundStmt body. 482 for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) { 483 // Create an empty signature and add the signatures of all selected 484 // child statements to it. 485 CloneDetector::CloneSignature SubSignature; 486 487 for (unsigned i = Pos; i < Pos + Length; ++i) { 488 SubSignature.add(ChildSignatures[i]); 489 } 490 491 // Save the signature together with the information about what children 492 // sequence we selected. 493 CD.add(StmtSequence(CS, Context, Pos, Pos + Length), SubSignature); 494 } 495 } 496 } 497 498 public: 499 explicit CloneSignatureGenerator(CloneDetector &CD, ASTContext &Context) 500 : CD(CD), Context(Context) {} 501 502 /// \brief Generates signatures for all statements in the given function body. 503 void consumeCodeBody(const Stmt *S) { generateSignatures(S); } 504 }; 505 } // end anonymous namespace 506 507 void CloneDetector::analyzeCodeBody(const Decl *D) { 508 assert(D); 509 assert(D->hasBody()); 510 CloneSignatureGenerator Generator(*this, D->getASTContext()); 511 Generator.consumeCodeBody(D->getBody()); 512 } 513 514 void CloneDetector::add(const StmtSequence &S, 515 const CloneSignature &Signature) { 516 // StringMap only works with StringRefs, so we create one for our data vector. 517 auto &Data = Signature.Data; 518 StringRef DataRef = StringRef(reinterpret_cast<const char *>(Data.data()), 519 Data.size() * sizeof(unsigned)); 520 521 // Search with the help of the signature if we already have encountered a 522 // clone of the given StmtSequence. 523 auto I = CloneGroupIndexes.find(DataRef); 524 if (I == CloneGroupIndexes.end()) { 525 // We haven't found an existing clone group, so we create a new clone group 526 // for this StmtSequence and store the index of it in our search map. 527 CloneGroupIndexes[DataRef] = CloneGroups.size(); 528 CloneGroups.emplace_back(S, Signature.Complexity); 529 return; 530 } 531 532 // We have found an existing clone group and can expand it with the given 533 // StmtSequence. 534 CloneGroups[I->getValue()].Sequences.push_back(S); 535 } 536 537 namespace { 538 /// \brief Returns true if and only if \p Stmt contains at least one other 539 /// sequence in the \p Group. 540 bool containsAnyInGroup(StmtSequence &Stmt, CloneDetector::CloneGroup &Group) { 541 for (StmtSequence &GroupStmt : Group.Sequences) { 542 if (Stmt.contains(GroupStmt)) 543 return true; 544 } 545 return false; 546 } 547 548 /// \brief Returns true if and only if all sequences in \p OtherGroup are 549 /// contained by a sequence in \p Group. 550 bool containsGroup(CloneDetector::CloneGroup &Group, 551 CloneDetector::CloneGroup &OtherGroup) { 552 // We have less sequences in the current group than we have in the other, 553 // so we will never fulfill the requirement for returning true. This is only 554 // possible because we know that a sequence in Group can contain at most 555 // one sequence in OtherGroup. 556 if (Group.Sequences.size() < OtherGroup.Sequences.size()) 557 return false; 558 559 for (StmtSequence &Stmt : Group.Sequences) { 560 if (!containsAnyInGroup(Stmt, OtherGroup)) 561 return false; 562 } 563 return true; 564 } 565 } // end anonymous namespace 566 567 /// \brief Finds all actual clone groups in a single group of presumed clones. 568 /// \param Result Output parameter to which all found groups are added. Every 569 /// clone in a group that was added this way follows the same 570 /// variable pattern as the other clones in its group. 571 /// \param Group A group of clones. The clones are allowed to have a different 572 /// variable pattern. 573 static void createCloneGroups(std::vector<CloneDetector::CloneGroup> &Result, 574 const CloneDetector::CloneGroup &Group) { 575 // We remove the Sequences one by one, so a list is more appropriate. 576 std::list<StmtSequence> UnassignedSequences(Group.Sequences.begin(), 577 Group.Sequences.end()); 578 579 // Search for clones as long as there could be clones in UnassignedSequences. 580 while (UnassignedSequences.size() > 1) { 581 582 // Pick the first Sequence as a protoype for a new clone group. 583 StmtSequence Prototype = UnassignedSequences.front(); 584 UnassignedSequences.pop_front(); 585 586 CloneDetector::CloneGroup FilteredGroup(Prototype, Group.Complexity); 587 588 // Analyze the variable pattern of the prototype. Every other StmtSequence 589 // needs to have the same pattern to get into the new clone group. 590 VariablePattern PrototypeFeatures(Prototype); 591 592 // Search all remaining StmtSequences for an identical variable pattern 593 // and assign them to our new clone group. 594 auto I = UnassignedSequences.begin(), E = UnassignedSequences.end(); 595 while (I != E) { 596 597 if (VariablePattern(*I).countPatternDifferences(PrototypeFeatures) == 0) { 598 FilteredGroup.Sequences.push_back(*I); 599 I = UnassignedSequences.erase(I); 600 continue; 601 } 602 ++I; 603 } 604 605 // Add a valid clone group to the list of found clone groups. 606 if (!FilteredGroup.isValid()) 607 continue; 608 609 Result.push_back(FilteredGroup); 610 } 611 } 612 613 void CloneDetector::findClones(std::vector<CloneGroup> &Result, 614 unsigned MinGroupComplexity, 615 bool CheckPatterns) { 616 // Add every valid clone group that fulfills the complexity requirement. 617 for (const CloneGroup &Group : CloneGroups) { 618 if (Group.isValid() && Group.Complexity >= MinGroupComplexity) { 619 if (CheckPatterns) 620 createCloneGroups(Result, Group); 621 else 622 Result.push_back(Group); 623 } 624 } 625 626 std::vector<unsigned> IndexesToRemove; 627 628 // Compare every group in the result with the rest. If one groups contains 629 // another group, we only need to return the bigger group. 630 // Note: This doesn't scale well, so if possible avoid calling any heavy 631 // function from this loop to minimize the performance impact. 632 for (unsigned i = 0; i < Result.size(); ++i) { 633 for (unsigned j = 0; j < Result.size(); ++j) { 634 // Don't compare a group with itself. 635 if (i == j) 636 continue; 637 638 if (containsGroup(Result[j], Result[i])) { 639 IndexesToRemove.push_back(i); 640 break; 641 } 642 } 643 } 644 645 // Erasing a list of indexes from the vector should be done with decreasing 646 // indexes. As IndexesToRemove is constructed with increasing values, we just 647 // reverse iterate over it to get the desired order. 648 for (auto I = IndexesToRemove.rbegin(); I != IndexesToRemove.rend(); ++I) { 649 Result.erase(Result.begin() + *I); 650 } 651 } 652 653 void CloneDetector::findSuspiciousClones( 654 std::vector<CloneDetector::SuspiciousClonePair> &Result, 655 unsigned MinGroupComplexity) { 656 std::vector<CloneGroup> Clones; 657 // Reuse the normal search for clones but specify that the clone groups don't 658 // need to have a common referenced variable pattern so that we can manually 659 // search for the kind of pattern errors this function is supposed to find. 660 findClones(Clones, MinGroupComplexity, false); 661 662 for (const CloneGroup &Group : Clones) { 663 for (unsigned i = 0; i < Group.Sequences.size(); ++i) { 664 VariablePattern PatternA(Group.Sequences[i]); 665 666 for (unsigned j = i + 1; j < Group.Sequences.size(); ++j) { 667 VariablePattern PatternB(Group.Sequences[j]); 668 669 CloneDetector::SuspiciousClonePair ClonePair; 670 // For now, we only report clones which break the variable pattern just 671 // once because multiple differences in a pattern are an indicator that 672 // those differences are maybe intended (e.g. because it's actually 673 // a different algorithm). 674 // TODO: In very big clones even multiple variables can be unintended, 675 // so replacing this number with a percentage could better handle such 676 // cases. On the other hand it could increase the false-positive rate 677 // for all clones if the percentage is too high. 678 if (PatternA.countPatternDifferences(PatternB, &ClonePair) == 1) { 679 Result.push_back(ClonePair); 680 break; 681 } 682 } 683 } 684 } 685 } 686