1 //===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 ///  This file implements classes for searching and anlyzing source code clones.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "clang/Analysis/CloneDetection.h"
15 
16 #include "clang/AST/ASTContext.h"
17 #include "clang/AST/RecursiveASTVisitor.h"
18 #include "clang/AST/Stmt.h"
19 #include "clang/AST/StmtVisitor.h"
20 #include "llvm/ADT/StringRef.h"
21 
22 using namespace clang;
23 
24 StmtSequence::StmtSequence(const CompoundStmt *Stmt, ASTContext &Context,
25                            unsigned StartIndex, unsigned EndIndex)
26     : S(Stmt), Context(&Context), StartIndex(StartIndex), EndIndex(EndIndex) {
27   assert(Stmt && "Stmt must not be a nullptr");
28   assert(StartIndex < EndIndex && "Given array should not be empty");
29   assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt");
30 }
31 
32 StmtSequence::StmtSequence(const Stmt *Stmt, ASTContext &Context)
33     : S(Stmt), Context(&Context), StartIndex(0), EndIndex(0) {}
34 
35 StmtSequence::StmtSequence()
36     : S(nullptr), Context(nullptr), StartIndex(0), EndIndex(0) {}
37 
38 bool StmtSequence::contains(const StmtSequence &Other) const {
39   // If both sequences reside in different translation units, they can never
40   // contain each other.
41   if (Context != Other.Context)
42     return false;
43 
44   const SourceManager &SM = Context->getSourceManager();
45 
46   // Otherwise check if the start and end locations of the current sequence
47   // surround the other sequence.
48   bool StartIsInBounds =
49       SM.isBeforeInTranslationUnit(getStartLoc(), Other.getStartLoc()) ||
50       getStartLoc() == Other.getStartLoc();
51   if (!StartIsInBounds)
52     return false;
53 
54   bool EndIsInBounds =
55       SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) ||
56       Other.getEndLoc() == getEndLoc();
57   return EndIsInBounds;
58 }
59 
60 StmtSequence::iterator StmtSequence::begin() const {
61   if (!holdsSequence()) {
62     return &S;
63   }
64   auto CS = cast<CompoundStmt>(S);
65   return CS->body_begin() + StartIndex;
66 }
67 
68 StmtSequence::iterator StmtSequence::end() const {
69   if (!holdsSequence()) {
70     return reinterpret_cast<StmtSequence::iterator>(&S) + 1;
71   }
72   auto CS = cast<CompoundStmt>(S);
73   return CS->body_begin() + EndIndex;
74 }
75 
76 SourceLocation StmtSequence::getStartLoc() const {
77   return front()->getLocStart();
78 }
79 
80 SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); }
81 
82 namespace {
83 
84 /// \brief Analyzes the pattern of the referenced variables in a statement.
85 class VariablePattern {
86 
87   /// \brief Describes an occurence of a variable reference in a statement.
88   struct VariableOccurence {
89     /// The index of the associated VarDecl in the Variables vector.
90     size_t KindID;
91     /// The source range in the code where the variable was referenced.
92     SourceRange Range;
93 
94     VariableOccurence(size_t KindID, SourceRange Range)
95         : KindID(KindID), Range(Range) {}
96   };
97 
98   /// All occurences of referenced variables in the order of appearance.
99   std::vector<VariableOccurence> Occurences;
100   /// List of referenced variables in the order of appearance.
101   /// Every item in this list is unique.
102   std::vector<const VarDecl *> Variables;
103 
104   /// \brief Adds a new variable referenced to this pattern.
105   /// \param VarDecl The declaration of the variable that is referenced.
106   /// \param Range The SourceRange where this variable is referenced.
107   void addVariableOccurence(const VarDecl *VarDecl, SourceRange Range) {
108     // First check if we already reference this variable
109     for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) {
110       if (Variables[KindIndex] == VarDecl) {
111         // If yes, add a new occurence that points to the existing entry in
112         // the Variables vector.
113         Occurences.emplace_back(KindIndex, Range);
114         return;
115       }
116     }
117     // If this variable wasn't already referenced, add it to the list of
118     // referenced variables and add a occurence that points to this new entry.
119     Occurences.emplace_back(Variables.size(), Range);
120     Variables.push_back(VarDecl);
121   }
122 
123   /// \brief Adds each referenced variable from the given statement.
124   void addVariables(const Stmt *S) {
125     // Sometimes we get a nullptr (such as from IfStmts which often have nullptr
126     // children). We skip such statements as they don't reference any
127     // variables.
128     if (!S)
129       return;
130 
131     // Check if S is a reference to a variable. If yes, add it to the pattern.
132     if (auto D = dyn_cast<DeclRefExpr>(S)) {
133       if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl()))
134         addVariableOccurence(VD, D->getSourceRange());
135     }
136 
137     // Recursively check all children of the given statement.
138     for (const Stmt *Child : S->children()) {
139       addVariables(Child);
140     }
141   }
142 
143 public:
144   /// \brief Creates an VariablePattern object with information about the given
145   ///        StmtSequence.
146   VariablePattern(const StmtSequence &Sequence) {
147     for (const Stmt *S : Sequence)
148       addVariables(S);
149   }
150 
151   /// \brief Counts the differences between this pattern and the given one.
152   /// \param Other The given VariablePattern to compare with.
153   /// \param FirstMismatch Output parameter that will be filled with information
154   ///        about the first difference between the two patterns. This parameter
155   ///        can be a nullptr, in which case it will be ignored.
156   /// \return Returns the number of differences between the pattern this object
157   ///         is following and the given VariablePattern.
158   ///
159   /// For example, the following statements all have the same pattern and this
160   /// function would return zero:
161   ///
162   ///   if (a < b) return a; return b;
163   ///   if (x < y) return x; return y;
164   ///   if (u2 < u1) return u2; return u1;
165   ///
166   /// But the following statement has a different pattern (note the changed
167   /// variables in the return statements) and would have two differences when
168   /// compared with one of the statements above.
169   ///
170   ///   if (a < b) return b; return a;
171   ///
172   /// This function should only be called if the related statements of the given
173   /// pattern and the statements of this objects are clones of each other.
174   unsigned countPatternDifferences(
175       const VariablePattern &Other,
176       CloneDetector::SuspiciousClonePair *FirstMismatch = nullptr) {
177     unsigned NumberOfDifferences = 0;
178 
179     assert(Other.Occurences.size() == Occurences.size());
180     for (unsigned i = 0; i < Occurences.size(); ++i) {
181       auto ThisOccurence = Occurences[i];
182       auto OtherOccurence = Other.Occurences[i];
183       if (ThisOccurence.KindID == OtherOccurence.KindID)
184         continue;
185 
186       ++NumberOfDifferences;
187 
188       // If FirstMismatch is not a nullptr, we need to store information about
189       // the first difference between the two patterns.
190       if (FirstMismatch == nullptr)
191         continue;
192 
193       // Only proceed if we just found the first difference as we only store
194       // information about the first difference.
195       if (NumberOfDifferences != 1)
196         continue;
197 
198       const VarDecl *FirstSuggestion = nullptr;
199       // If there is a variable available in the list of referenced variables
200       // which wouldn't break the pattern if it is used in place of the
201       // current variable, we provide this variable as the suggested fix.
202       if (OtherOccurence.KindID < Variables.size())
203         FirstSuggestion = Variables[OtherOccurence.KindID];
204 
205       // Store information about the first clone.
206       FirstMismatch->FirstCloneInfo =
207           CloneDetector::SuspiciousClonePair::SuspiciousCloneInfo(
208               Variables[ThisOccurence.KindID], ThisOccurence.Range,
209               FirstSuggestion);
210 
211       // Same as above but with the other clone. We do this for both clones as
212       // we don't know which clone is the one containing the unintended
213       // pattern error.
214       const VarDecl *SecondSuggestion = nullptr;
215       if (ThisOccurence.KindID < Other.Variables.size())
216         SecondSuggestion = Other.Variables[ThisOccurence.KindID];
217 
218       // Store information about the second clone.
219       FirstMismatch->SecondCloneInfo =
220           CloneDetector::SuspiciousClonePair::SuspiciousCloneInfo(
221               Variables[ThisOccurence.KindID], OtherOccurence.Range,
222               SecondSuggestion);
223 
224       // SuspiciousClonePair guarantees that the first clone always has a
225       // suggested variable associated with it. As we know that one of the two
226       // clones in the pair always has suggestion, we swap the two clones
227       // in case the first clone has no suggested variable which means that
228       // the second clone has a suggested variable and should be first.
229       if (!FirstMismatch->FirstCloneInfo.Suggestion)
230         std::swap(FirstMismatch->FirstCloneInfo,
231                   FirstMismatch->SecondCloneInfo);
232 
233       // This ensures that we always have at least one suggestion in a pair.
234       assert(FirstMismatch->FirstCloneInfo.Suggestion);
235     }
236 
237     return NumberOfDifferences;
238   }
239 };
240 }
241 
242 namespace {
243 /// \brief Collects the data of a single Stmt.
244 ///
245 /// This class defines what a code clone is: If it collects for two statements
246 /// the same data, then those two statements are considered to be clones of each
247 /// other.
248 class StmtDataCollector : public ConstStmtVisitor<StmtDataCollector> {
249 
250   ASTContext &Context;
251   std::vector<CloneDetector::DataPiece> &CollectedData;
252 
253 public:
254   /// \brief Collects data of the given Stmt.
255   /// \param S The given statement.
256   /// \param Context The ASTContext of S.
257   /// \param D The given data vector to which all collected data is appended.
258   StmtDataCollector(const Stmt *S, ASTContext &Context,
259                     std::vector<CloneDetector::DataPiece> &D)
260       : Context(Context), CollectedData(D) {
261     Visit(S);
262   }
263 
264   // Below are utility methods for appending different data to the vector.
265 
266   void addData(CloneDetector::DataPiece Integer) {
267     CollectedData.push_back(Integer);
268   }
269 
270   // FIXME: The functions below add long strings to the data vector which are
271   // probably not good for performance. Replace the strings with pointer values
272   // or a some other unique integer.
273 
274   void addData(llvm::StringRef Str) {
275     if (Str.empty())
276       return;
277 
278     const size_t OldSize = CollectedData.size();
279 
280     const size_t PieceSize = sizeof(CloneDetector::DataPiece);
281     // Calculate how many vector units we need to accomodate all string bytes.
282     size_t RoundedUpPieceNumber = (Str.size() + PieceSize - 1) / PieceSize;
283     // Allocate space for the string in the data vector.
284     CollectedData.resize(CollectedData.size() + RoundedUpPieceNumber);
285 
286     // Copy the string to the allocated space at the end of the vector.
287     std::memcpy(CollectedData.data() + OldSize, Str.data(), Str.size());
288   }
289 
290   void addData(const QualType &QT) { addData(QT.getAsString()); }
291 
292 // The functions below collect the class specific data of each Stmt subclass.
293 
294 // Utility macro for defining a visit method for a given class. This method
295 // calls back to the ConstStmtVisitor to visit all parent classes.
296 #define DEF_ADD_DATA(CLASS, CODE)                                              \
297   void Visit##CLASS(const CLASS *S) {                                          \
298     CODE;                                                                      \
299     ConstStmtVisitor<StmtDataCollector>::Visit##CLASS(S);                      \
300   }
301 
302   DEF_ADD_DATA(Stmt, { addData(S->getStmtClass()); })
303   DEF_ADD_DATA(Expr, { addData(S->getType()); })
304 
305   //--- Builtin functionality ----------------------------------------------//
306   DEF_ADD_DATA(ArrayTypeTraitExpr, { addData(S->getTrait()); })
307   DEF_ADD_DATA(ExpressionTraitExpr, { addData(S->getTrait()); })
308   DEF_ADD_DATA(PredefinedExpr, { addData(S->getIdentType()); })
309   DEF_ADD_DATA(TypeTraitExpr, {
310     addData(S->getTrait());
311     for (unsigned i = 0; i < S->getNumArgs(); ++i)
312       addData(S->getArg(i)->getType());
313   })
314 
315   //--- Calls --------------------------------------------------------------//
316   DEF_ADD_DATA(CallExpr, {
317     // Function pointers don't have a callee and we just skip hashing it.
318     if (S->getDirectCallee())
319       addData(S->getDirectCallee()->getQualifiedNameAsString());
320   })
321 
322   //--- Exceptions ---------------------------------------------------------//
323   DEF_ADD_DATA(CXXCatchStmt, { addData(S->getCaughtType()); })
324 
325   //--- C++ OOP Stmts ------------------------------------------------------//
326   DEF_ADD_DATA(CXXDeleteExpr, {
327     addData(S->isArrayFormAsWritten());
328     addData(S->isGlobalDelete());
329   })
330 
331   //--- Casts --------------------------------------------------------------//
332   DEF_ADD_DATA(ObjCBridgedCastExpr, { addData(S->getBridgeKind()); })
333 
334   //--- Miscellaneous Exprs ------------------------------------------------//
335   DEF_ADD_DATA(BinaryOperator, { addData(S->getOpcode()); })
336   DEF_ADD_DATA(UnaryOperator, { addData(S->getOpcode()); })
337 
338   //--- Control flow -------------------------------------------------------//
339   DEF_ADD_DATA(GotoStmt, { addData(S->getLabel()->getName()); })
340   DEF_ADD_DATA(IndirectGotoStmt, {
341     if (S->getConstantTarget())
342       addData(S->getConstantTarget()->getName());
343   })
344   DEF_ADD_DATA(LabelStmt, { addData(S->getDecl()->getName()); })
345   DEF_ADD_DATA(MSDependentExistsStmt, { addData(S->isIfExists()); })
346   DEF_ADD_DATA(AddrLabelExpr, { addData(S->getLabel()->getName()); })
347 
348   //--- Objective-C --------------------------------------------------------//
349   DEF_ADD_DATA(ObjCIndirectCopyRestoreExpr, { addData(S->shouldCopy()); })
350   DEF_ADD_DATA(ObjCPropertyRefExpr, {
351     addData(S->isSuperReceiver());
352     addData(S->isImplicitProperty());
353   })
354   DEF_ADD_DATA(ObjCAtCatchStmt, { addData(S->hasEllipsis()); })
355 
356   //--- Miscellaneous Stmts ------------------------------------------------//
357   DEF_ADD_DATA(CXXFoldExpr, {
358     addData(S->isRightFold());
359     addData(S->getOperator());
360   })
361   DEF_ADD_DATA(GenericSelectionExpr, {
362     for (unsigned i = 0; i < S->getNumAssocs(); ++i) {
363       addData(S->getAssocType(i));
364     }
365   })
366   DEF_ADD_DATA(LambdaExpr, {
367     for (const LambdaCapture &C : S->captures()) {
368       addData(C.isPackExpansion());
369       addData(C.getCaptureKind());
370       if (C.capturesVariable())
371         addData(C.getCapturedVar()->getType());
372     }
373     addData(S->isGenericLambda());
374     addData(S->isMutable());
375   })
376   DEF_ADD_DATA(DeclStmt, {
377     auto numDecls = std::distance(S->decl_begin(), S->decl_end());
378     addData(static_cast<CloneDetector::DataPiece>(numDecls));
379     for (const Decl *D : S->decls()) {
380       if (const VarDecl *VD = dyn_cast<VarDecl>(D)) {
381         addData(VD->getType());
382       }
383     }
384   })
385   DEF_ADD_DATA(AsmStmt, {
386     addData(S->isSimple());
387     addData(S->isVolatile());
388     addData(S->generateAsmString(Context));
389     for (unsigned i = 0; i < S->getNumInputs(); ++i) {
390       addData(S->getInputConstraint(i));
391     }
392     for (unsigned i = 0; i < S->getNumOutputs(); ++i) {
393       addData(S->getOutputConstraint(i));
394     }
395     for (unsigned i = 0; i < S->getNumClobbers(); ++i) {
396       addData(S->getClobber(i));
397     }
398   })
399   DEF_ADD_DATA(AttributedStmt, {
400     for (const Attr *A : S->getAttrs()) {
401       addData(std::string(A->getSpelling()));
402     }
403   })
404 };
405 } // end anonymous namespace
406 
407 namespace {
408 /// Generates CloneSignatures for a set of statements and stores the results in
409 /// a CloneDetector object.
410 class CloneSignatureGenerator {
411 
412   CloneDetector &CD;
413   ASTContext &Context;
414 
415   /// \brief Generates CloneSignatures for all statements in the given statement
416   /// tree and stores them in the CloneDetector.
417   ///
418   /// \param S The root of the given statement tree.
419   /// \return The CloneSignature of the root statement.
420   CloneDetector::CloneSignature generateSignatures(const Stmt *S) {
421     // Create an empty signature that will be filled in this method.
422     CloneDetector::CloneSignature Signature;
423 
424     // Collect all relevant data from S and put it into the empty signature.
425     StmtDataCollector(S, Context, Signature.Data);
426 
427     // Storage for the signatures of the direct child statements. This is only
428     // needed if the current statement is a CompoundStmt.
429     std::vector<CloneDetector::CloneSignature> ChildSignatures;
430     const CompoundStmt *CS = dyn_cast<const CompoundStmt>(S);
431 
432     // The signature of a statement includes the signatures of its children.
433     // Therefore we create the signatures for every child and add them to the
434     // current signature.
435     for (const Stmt *Child : S->children()) {
436       // Some statements like 'if' can have nullptr children that we will skip.
437       if (!Child)
438         continue;
439 
440       // Recursive call to create the signature of the child statement. This
441       // will also create and store all clone groups in this child statement.
442       auto ChildSignature = generateSignatures(Child);
443 
444       // Add the collected data to the signature of the current statement.
445       Signature.add(ChildSignature);
446 
447       // If the current statement is a CompoundStatement, we need to store the
448       // signature for the generation of the sub-sequences.
449       if (CS)
450         ChildSignatures.push_back(ChildSignature);
451     }
452 
453     // If the current statement is a CompoundStmt, we also need to create the
454     // clone groups from the sub-sequences inside the children.
455     if (CS)
456       handleSubSequences(CS, ChildSignatures);
457 
458     // Save the signature for the current statement in the CloneDetector object.
459     CD.add(StmtSequence(S, Context), Signature);
460 
461     return Signature;
462   }
463 
464   /// \brief Adds all possible sub-sequences in the child array of the given
465   ///        CompoundStmt to the CloneDetector.
466   /// \param CS The given CompoundStmt.
467   /// \param ChildSignatures A list of calculated signatures for each child in
468   ///                        the given CompoundStmt.
469   void handleSubSequences(
470       const CompoundStmt *CS,
471       const std::vector<CloneDetector::CloneSignature> &ChildSignatures) {
472 
473     // FIXME: This function has quadratic runtime right now. Check if skipping
474     // this function for too long CompoundStmts is an option.
475 
476     // The length of the sub-sequence. We don't need to handle sequences with
477     // the length 1 as they are already handled in CollectData().
478     for (unsigned Length = 2; Length <= CS->size(); ++Length) {
479       // The start index in the body of the CompoundStmt. We increase the
480       // position until the end of the sub-sequence reaches the end of the
481       // CompoundStmt body.
482       for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) {
483         // Create an empty signature and add the signatures of all selected
484         // child statements to it.
485         CloneDetector::CloneSignature SubSignature;
486 
487         for (unsigned i = Pos; i < Pos + Length; ++i) {
488           SubSignature.add(ChildSignatures[i]);
489         }
490 
491         // Save the signature together with the information about what children
492         // sequence we selected.
493         CD.add(StmtSequence(CS, Context, Pos, Pos + Length), SubSignature);
494       }
495     }
496   }
497 
498 public:
499   explicit CloneSignatureGenerator(CloneDetector &CD, ASTContext &Context)
500       : CD(CD), Context(Context) {}
501 
502   /// \brief Generates signatures for all statements in the given function body.
503   void consumeCodeBody(const Stmt *S) { generateSignatures(S); }
504 };
505 } // end anonymous namespace
506 
507 void CloneDetector::analyzeCodeBody(const Decl *D) {
508   assert(D);
509   assert(D->hasBody());
510   CloneSignatureGenerator Generator(*this, D->getASTContext());
511   Generator.consumeCodeBody(D->getBody());
512 }
513 
514 void CloneDetector::add(const StmtSequence &S,
515                         const CloneSignature &Signature) {
516   // StringMap only works with StringRefs, so we create one for our data vector.
517   auto &Data = Signature.Data;
518   StringRef DataRef = StringRef(reinterpret_cast<const char *>(Data.data()),
519                                 Data.size() * sizeof(unsigned));
520 
521   // Search with the help of the signature if we already have encountered a
522   // clone of the given StmtSequence.
523   auto I = CloneGroupIndexes.find(DataRef);
524   if (I == CloneGroupIndexes.end()) {
525     // We haven't found an existing clone group, so we create a new clone group
526     // for this StmtSequence and store the index of it in our search map.
527     CloneGroupIndexes[DataRef] = CloneGroups.size();
528     CloneGroups.emplace_back(S, Signature.Complexity);
529     return;
530   }
531 
532   // We have found an existing clone group and can expand it with the given
533   // StmtSequence.
534   CloneGroups[I->getValue()].Sequences.push_back(S);
535 }
536 
537 namespace {
538 /// \brief Returns true if and only if \p Stmt contains at least one other
539 /// sequence in the \p Group.
540 bool containsAnyInGroup(StmtSequence &Stmt, CloneDetector::CloneGroup &Group) {
541   for (StmtSequence &GroupStmt : Group.Sequences) {
542     if (Stmt.contains(GroupStmt))
543       return true;
544   }
545   return false;
546 }
547 
548 /// \brief Returns true if and only if all sequences in \p OtherGroup are
549 /// contained by a sequence in \p Group.
550 bool containsGroup(CloneDetector::CloneGroup &Group,
551                    CloneDetector::CloneGroup &OtherGroup) {
552   // We have less sequences in the current group than we have in the other,
553   // so we will never fulfill the requirement for returning true. This is only
554   // possible because we know that a sequence in Group can contain at most
555   // one sequence in OtherGroup.
556   if (Group.Sequences.size() < OtherGroup.Sequences.size())
557     return false;
558 
559   for (StmtSequence &Stmt : Group.Sequences) {
560     if (!containsAnyInGroup(Stmt, OtherGroup))
561       return false;
562   }
563   return true;
564 }
565 } // end anonymous namespace
566 
567 /// \brief Finds all actual clone groups in a single group of presumed clones.
568 /// \param Result Output parameter to which all found groups are added. Every
569 ///               clone in a group that was added this way follows the same
570 ///               variable pattern as the other clones in its group.
571 /// \param Group A group of clones. The clones are allowed to have a different
572 ///              variable pattern.
573 static void createCloneGroups(std::vector<CloneDetector::CloneGroup> &Result,
574                               const CloneDetector::CloneGroup &Group) {
575   // We remove the Sequences one by one, so a list is more appropriate.
576   std::list<StmtSequence> UnassignedSequences(Group.Sequences.begin(),
577                                               Group.Sequences.end());
578 
579   // Search for clones as long as there could be clones in UnassignedSequences.
580   while (UnassignedSequences.size() > 1) {
581 
582     // Pick the first Sequence as a protoype for a new clone group.
583     StmtSequence Prototype = UnassignedSequences.front();
584     UnassignedSequences.pop_front();
585 
586     CloneDetector::CloneGroup FilteredGroup(Prototype, Group.Complexity);
587 
588     // Analyze the variable pattern of the prototype. Every other StmtSequence
589     // needs to have the same pattern to get into the new clone group.
590     VariablePattern PrototypeFeatures(Prototype);
591 
592     // Search all remaining StmtSequences for an identical variable pattern
593     // and assign them to our new clone group.
594     auto I = UnassignedSequences.begin(), E = UnassignedSequences.end();
595     while (I != E) {
596 
597       if (VariablePattern(*I).countPatternDifferences(PrototypeFeatures) == 0) {
598         FilteredGroup.Sequences.push_back(*I);
599         I = UnassignedSequences.erase(I);
600         continue;
601       }
602       ++I;
603     }
604 
605     // Add a valid clone group to the list of found clone groups.
606     if (!FilteredGroup.isValid())
607       continue;
608 
609     Result.push_back(FilteredGroup);
610   }
611 }
612 
613 void CloneDetector::findClones(std::vector<CloneGroup> &Result,
614                                unsigned MinGroupComplexity,
615                                bool CheckPatterns) {
616   // Add every valid clone group that fulfills the complexity requirement.
617   for (const CloneGroup &Group : CloneGroups) {
618     if (Group.isValid() && Group.Complexity >= MinGroupComplexity) {
619       if (CheckPatterns)
620         createCloneGroups(Result, Group);
621       else
622         Result.push_back(Group);
623     }
624   }
625 
626   std::vector<unsigned> IndexesToRemove;
627 
628   // Compare every group in the result with the rest. If one groups contains
629   // another group, we only need to return the bigger group.
630   // Note: This doesn't scale well, so if possible avoid calling any heavy
631   // function from this loop to minimize the performance impact.
632   for (unsigned i = 0; i < Result.size(); ++i) {
633     for (unsigned j = 0; j < Result.size(); ++j) {
634       // Don't compare a group with itself.
635       if (i == j)
636         continue;
637 
638       if (containsGroup(Result[j], Result[i])) {
639         IndexesToRemove.push_back(i);
640         break;
641       }
642     }
643   }
644 
645   // Erasing a list of indexes from the vector should be done with decreasing
646   // indexes. As IndexesToRemove is constructed with increasing values, we just
647   // reverse iterate over it to get the desired order.
648   for (auto I = IndexesToRemove.rbegin(); I != IndexesToRemove.rend(); ++I) {
649     Result.erase(Result.begin() + *I);
650   }
651 }
652 
653 void CloneDetector::findSuspiciousClones(
654     std::vector<CloneDetector::SuspiciousClonePair> &Result,
655     unsigned MinGroupComplexity) {
656   std::vector<CloneGroup> Clones;
657   // Reuse the normal search for clones but specify that the clone groups don't
658   // need to have a common referenced variable pattern so that we can manually
659   // search for the kind of pattern errors this function is supposed to find.
660   findClones(Clones, MinGroupComplexity, false);
661 
662   for (const CloneGroup &Group : Clones) {
663     for (unsigned i = 0; i < Group.Sequences.size(); ++i) {
664       VariablePattern PatternA(Group.Sequences[i]);
665 
666       for (unsigned j = i + 1; j < Group.Sequences.size(); ++j) {
667         VariablePattern PatternB(Group.Sequences[j]);
668 
669         CloneDetector::SuspiciousClonePair ClonePair;
670         // For now, we only report clones which break the variable pattern just
671         // once because multiple differences in a pattern are an indicator that
672         // those differences are maybe intended (e.g. because it's actually
673         // a different algorithm).
674         // TODO: In very big clones even multiple variables can be unintended,
675         // so replacing this number with a percentage could better handle such
676         // cases. On the other hand it could increase the false-positive rate
677         // for all clones if the percentage is too high.
678         if (PatternA.countPatternDifferences(PatternB, &ClonePair) == 1) {
679           Result.push_back(ClonePair);
680           break;
681         }
682       }
683     }
684   }
685 }
686