1 //===- IRSimilarityIdentifier.cpp - Find similarity in a module -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // Implementation file for the IRSimilarityIdentifier for identifying
11 // similarities in IR including the IRInstructionMapper.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/Analysis/IRSimilarityIdentifier.h"
16 #include "llvm/ADT/DenseMap.h"
17 #include "llvm/IR/Intrinsics.h"
18 #include "llvm/IR/Operator.h"
19 #include "llvm/IR/User.h"
20 #include "llvm/InitializePasses.h"
21 #include "llvm/Support/SuffixTree.h"
22 
23 using namespace llvm;
24 using namespace IRSimilarity;
25 
26 IRInstructionData::IRInstructionData(Instruction &I, bool Legality,
27                                      IRInstructionDataList &IDList)
28     : Inst(&I), Legal(Legality), IDL(&IDList) {
29   // We check for whether we have a comparison instruction.  If it is, we
30   // find the "less than" version of the predicate for consistency for
31   // comparison instructions throught the program.
32   if (CmpInst *C = dyn_cast<CmpInst>(&I)) {
33     CmpInst::Predicate Predicate = predicateForConsistency(C);
34     if (Predicate != C->getPredicate())
35       RevisedPredicate = Predicate;
36   }
37 
38   // Here we collect the operands and their types for determining whether
39   // the structure of the operand use matches between two different candidates.
40   for (Use &OI : I.operands()) {
41     if (isa<CmpInst>(I) && RevisedPredicate.hasValue()) {
42       // If we have a CmpInst where the predicate is reversed, it means the
43       // operands must be reversed as well.
44       OperVals.insert(OperVals.begin(), OI.get());
45       continue;
46     }
47 
48     OperVals.push_back(OI.get());
49   }
50 }
51 
52 CmpInst::Predicate IRInstructionData::predicateForConsistency(CmpInst *CI) {
53   switch (CI->getPredicate()) {
54   case CmpInst::FCMP_OGT:
55   case CmpInst::FCMP_UGT:
56   case CmpInst::FCMP_OGE:
57   case CmpInst::FCMP_UGE:
58   case CmpInst::ICMP_SGT:
59   case CmpInst::ICMP_UGT:
60   case CmpInst::ICMP_SGE:
61   case CmpInst::ICMP_UGE:
62     return CI->getSwappedPredicate();
63   default:
64     return CI->getPredicate();
65   }
66 }
67 
68 CmpInst::Predicate IRInstructionData::getPredicate() const {
69   assert(isa<CmpInst>(Inst) &&
70          "Can only get a predicate from a compare instruction");
71 
72   if (RevisedPredicate.hasValue())
73     return RevisedPredicate.getValue();
74 
75   return cast<CmpInst>(Inst)->getPredicate();
76 }
77 
78 static StringRef getCalledFunctionName(CallInst &CI) {
79   assert(CI.getCalledFunction() != nullptr && "Called Function is nullptr?");
80 
81   return CI.getCalledFunction()->getName();
82 }
83 
84 bool IRSimilarity::isClose(const IRInstructionData &A,
85                            const IRInstructionData &B) {
86 
87   if (!A.Legal || !B.Legal)
88     return false;
89 
90   // Check if we are performing the same sort of operation on the same types
91   // but not on the same values.
92   if (!A.Inst->isSameOperationAs(B.Inst)) {
93     // If there is a predicate, this means that either there is a swapped
94     // predicate, or that the types are different, we want to make sure that
95     // the predicates are equivalent via swapping.
96     if (isa<CmpInst>(A.Inst) && isa<CmpInst>(B.Inst)) {
97 
98       if (A.getPredicate() != B.getPredicate())
99         return false;
100 
101       // If the predicates are the same via swap, make sure that the types are
102       // still the same.
103       auto ZippedTypes = zip(A.OperVals, B.OperVals);
104 
105       return all_of(
106           ZippedTypes, [](std::tuple<llvm::Value *, llvm::Value *> R) {
107             return std::get<0>(R)->getType() == std::get<1>(R)->getType();
108           });
109     }
110 
111     return false;
112   }
113 
114   // Since any GEP Instruction operands after the first operand cannot be
115   // defined by a register, we must make sure that the operands after the first
116   // are the same in the two instructions
117   if (auto *GEP = dyn_cast<GetElementPtrInst>(A.Inst)) {
118     auto *OtherGEP = cast<GetElementPtrInst>(B.Inst);
119 
120     // If the instructions do not have the same inbounds restrictions, we do
121     // not consider them the same.
122     if (GEP->isInBounds() != OtherGEP->isInBounds())
123       return false;
124 
125     auto ZippedOperands = zip(GEP->indices(), OtherGEP->indices());
126 
127     auto ZIt = ZippedOperands.begin();
128 
129     // We increment here since we do not care about the first instruction,
130     // we only care about the following operands since they must be the
131     // exact same to be considered similar.
132     return std::all_of(++ZIt, ZippedOperands.end(),
133                        [](std::tuple<llvm::Use &, llvm::Use &> R) {
134                          return std::get<0>(R) == std::get<1>(R);
135                        });
136   }
137 
138   // If the instructions are functions, we make sure that the function name is
139   // the same.  We already know that the types are since is isSameOperationAs is
140   // true.
141   if (isa<CallInst>(A.Inst) && isa<CallInst>(B.Inst)) {
142     CallInst *CIA = cast<CallInst>(A.Inst);
143     CallInst *CIB = cast<CallInst>(B.Inst);
144     if (getCalledFunctionName(*CIA).compare(getCalledFunctionName(*CIB)) != 0)
145       return false;
146   }
147 
148   return true;
149 }
150 
151 // TODO: This is the same as the MachineOutliner, and should be consolidated
152 // into the same interface.
153 void IRInstructionMapper::convertToUnsignedVec(
154     BasicBlock &BB, std::vector<IRInstructionData *> &InstrList,
155     std::vector<unsigned> &IntegerMapping) {
156   BasicBlock::iterator It = BB.begin();
157 
158   std::vector<unsigned> IntegerMappingForBB;
159   std::vector<IRInstructionData *> InstrListForBB;
160 
161   HaveLegalRange = false;
162   CanCombineWithPrevInstr = false;
163   AddedIllegalLastTime = true;
164 
165   for (BasicBlock::iterator Et = BB.end(); It != Et; ++It) {
166     switch (InstClassifier.visit(*It)) {
167     case InstrType::Legal:
168       mapToLegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
169       break;
170     case InstrType::Illegal:
171       mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
172       break;
173     case InstrType::Invisible:
174       AddedIllegalLastTime = false;
175       break;
176     }
177   }
178 
179   if (HaveLegalRange) {
180     mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, true);
181     for_each(InstrListForBB,
182              [this](IRInstructionData *ID) { this->IDL->push_back(*ID); });
183     llvm::append_range(InstrList, InstrListForBB);
184     llvm::append_range(IntegerMapping, IntegerMappingForBB);
185   }
186 }
187 
188 // TODO: This is the same as the MachineOutliner, and should be consolidated
189 // into the same interface.
190 unsigned IRInstructionMapper::mapToLegalUnsigned(
191     BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
192     std::vector<IRInstructionData *> &InstrListForBB) {
193   // We added something legal, so we should unset the AddedLegalLastTime
194   // flag.
195   AddedIllegalLastTime = false;
196 
197   // If we have at least two adjacent legal instructions (which may have
198   // invisible instructions in between), remember that.
199   if (CanCombineWithPrevInstr)
200     HaveLegalRange = true;
201   CanCombineWithPrevInstr = true;
202 
203   // Get the integer for this instruction or give it the current
204   // LegalInstrNumber.
205   IRInstructionData *ID = allocateIRInstructionData(*It, true, *IDL);
206   InstrListForBB.push_back(ID);
207 
208   // Add to the instruction list
209   bool WasInserted;
210   DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>::iterator
211       ResultIt;
212   std::tie(ResultIt, WasInserted) =
213       InstructionIntegerMap.insert(std::make_pair(ID, LegalInstrNumber));
214   unsigned INumber = ResultIt->second;
215 
216   // There was an insertion.
217   if (WasInserted)
218     LegalInstrNumber++;
219 
220   IntegerMappingForBB.push_back(INumber);
221 
222   // Make sure we don't overflow or use any integers reserved by the DenseMap.
223   assert(LegalInstrNumber < IllegalInstrNumber &&
224          "Instruction mapping overflow!");
225 
226   assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
227          "Tried to assign DenseMap tombstone or empty key to instruction.");
228   assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
229          "Tried to assign DenseMap tombstone or empty key to instruction.");
230 
231   return INumber;
232 }
233 
234 IRInstructionData *
235 IRInstructionMapper::allocateIRInstructionData(Instruction &I, bool Legality,
236                                                IRInstructionDataList &IDL) {
237   return new (InstDataAllocator->Allocate()) IRInstructionData(I, Legality, IDL);
238 }
239 
240 IRInstructionDataList *
241 IRInstructionMapper::allocateIRInstructionDataList() {
242   return new (IDLAllocator->Allocate()) IRInstructionDataList();
243 }
244 
245 // TODO: This is the same as the MachineOutliner, and should be consolidated
246 // into the same interface.
247 unsigned IRInstructionMapper::mapToIllegalUnsigned(
248     BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
249     std::vector<IRInstructionData *> &InstrListForBB, bool End) {
250   // Can't combine an illegal instruction. Set the flag.
251   CanCombineWithPrevInstr = false;
252 
253   // Only add one illegal number per range of legal numbers.
254   if (AddedIllegalLastTime)
255     return IllegalInstrNumber;
256 
257   IRInstructionData *ID = nullptr;
258   if (!End)
259     ID = allocateIRInstructionData(*It, false, *IDL);
260   InstrListForBB.push_back(ID);
261 
262   // Remember that we added an illegal number last time.
263   AddedIllegalLastTime = true;
264   unsigned INumber = IllegalInstrNumber;
265   IntegerMappingForBB.push_back(IllegalInstrNumber--);
266 
267   assert(LegalInstrNumber < IllegalInstrNumber &&
268          "Instruction mapping overflow!");
269 
270   assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
271          "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
272 
273   assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
274          "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
275 
276   return INumber;
277 }
278 
279 IRSimilarityCandidate::IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
280                                              IRInstructionData *FirstInstIt,
281                                              IRInstructionData *LastInstIt)
282     : StartIdx(StartIdx), Len(Len) {
283 
284   assert(FirstInstIt != nullptr && "Instruction is nullptr!");
285   assert(LastInstIt != nullptr && "Instruction is nullptr!");
286   assert(StartIdx + Len > StartIdx &&
287          "Overflow for IRSimilarityCandidate range?");
288   assert(Len - 1 == static_cast<unsigned>(std::distance(
289                         iterator(FirstInstIt), iterator(LastInstIt))) &&
290          "Length of the first and last IRInstructionData do not match the "
291          "given length");
292 
293   // We iterate over the given instructions, and map each unique value
294   // to a unique number in the IRSimilarityCandidate ValueToNumber and
295   // NumberToValue maps.  A constant get its own value globally, the individual
296   // uses of the constants are not considered to be unique.
297   //
298   // IR:                    Mapping Added:
299   // %add1 = add i32 %a, c1    %add1 -> 3, %a -> 1, c1 -> 2
300   // %add2 = add i32 %a, %1    %add2 -> 4
301   // %add3 = add i32 c2, c1    %add3 -> 6, c2 -> 5
302   //
303   // when replace with global values, starting from 1, would be
304   //
305   // 3 = add i32 1, 2
306   // 4 = add i32 1, 3
307   // 6 = add i32 5, 2
308   unsigned LocalValNumber = 1;
309   IRInstructionDataList::iterator ID = iterator(*FirstInstIt);
310   for (unsigned Loc = StartIdx; Loc < StartIdx + Len; Loc++, ID++) {
311     // Map the operand values to an unsigned integer if it does not already
312     // have an unsigned integer assigned to it.
313     for (Value *Arg : ID->OperVals)
314       if (ValueToNumber.find(Arg) == ValueToNumber.end()) {
315         ValueToNumber.try_emplace(Arg, LocalValNumber);
316         NumberToValue.try_emplace(LocalValNumber, Arg);
317         LocalValNumber++;
318       }
319 
320     // Mapping the instructions to an unsigned integer if it is not already
321     // exist in the mapping.
322     if (ValueToNumber.find(ID->Inst) == ValueToNumber.end()) {
323       ValueToNumber.try_emplace(ID->Inst, LocalValNumber);
324       NumberToValue.try_emplace(LocalValNumber, ID->Inst);
325       LocalValNumber++;
326     }
327   }
328 
329   // Setting the first and last instruction data pointers for the candidate.  If
330   // we got through the entire for loop without hitting an assert, we know
331   // that both of these instructions are not nullptrs.
332   FirstInst = FirstInstIt;
333   LastInst = LastInstIt;
334 }
335 
336 bool IRSimilarityCandidate::isSimilar(const IRSimilarityCandidate &A,
337                                       const IRSimilarityCandidate &B) {
338   if (A.getLength() != B.getLength())
339     return false;
340 
341   auto InstrDataForBoth =
342       zip(make_range(A.begin(), A.end()), make_range(B.begin(), B.end()));
343 
344   return all_of(InstrDataForBoth,
345                 [](std::tuple<IRInstructionData &, IRInstructionData &> R) {
346                   IRInstructionData &A = std::get<0>(R);
347                   IRInstructionData &B = std::get<1>(R);
348                   if (!A.Legal || !B.Legal)
349                     return false;
350                   return isClose(A, B);
351                 });
352 }
353 
354 /// Determine if one or more of the assigned global value numbers for the
355 /// operands in \p TargetValueNumbers is in the current mapping set for operand
356 /// numbers in \p SourceOperands.  The set of possible corresponding global
357 /// value numbers are replaced with the most recent version of compatible
358 /// values.
359 ///
360 /// \param [in] SourceValueToNumberMapping - The mapping of a Value to global
361 /// value number for the source IRInstructionCandidate.
362 /// \param [in, out] CurrentSrcTgtNumberMapping - The current mapping of source
363 /// IRSimilarityCandidate global value numbers to a set of possible numbers in
364 /// the target.
365 /// \param [in] SourceOperands - The operands in the original
366 /// IRSimilarityCandidate in the current instruction.
367 /// \param [in] TargetValueNumbers - The global value numbers of the operands in
368 /// the corresponding Instruction in the other IRSimilarityCandidate.
369 /// \returns true if there exists a possible mapping between the source
370 /// Instruction operands and the target Instruction operands, and false if not.
371 static bool checkNumberingAndReplaceCommutative(
372   const DenseMap<Value *, unsigned> &SourceValueToNumberMapping,
373   DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
374   ArrayRef<Value *> &SourceOperands,
375   DenseSet<unsigned> &TargetValueNumbers){
376 
377   DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
378 
379   unsigned ArgVal;
380   bool WasInserted;
381 
382   // Iterate over the operands in the source IRSimilarityCandidate to determine
383   // whether there exists an operand in the other IRSimilarityCandidate that
384   // creates a valid mapping of Value to Value between the
385   // IRSimilarityCaniddates.
386   for (Value *V : SourceOperands) {
387     ArgVal = SourceValueToNumberMapping.find(V)->second;
388 
389     std::tie(ValueMappingIt, WasInserted) = CurrentSrcTgtNumberMapping.insert(
390         std::make_pair(ArgVal, TargetValueNumbers));
391 
392     // Instead of finding a current mapping, we inserted a set.  This means a
393     // mapping did not exist for the source Instruction operand, it has no
394     // current constraints we need to check.
395     if (WasInserted)
396       continue;
397 
398     // If a mapping already exists for the source operand to the values in the
399     // other IRSimilarityCandidate we need to iterate over the items in other
400     // IRSimilarityCandidate's Instruction to determine whether there is a valid
401     // mapping of Value to Value.
402     DenseSet<unsigned> NewSet;
403     for (unsigned &Curr : ValueMappingIt->second)
404       // If we can find the value in the mapping, we add it to the new set.
405       if (TargetValueNumbers.contains(Curr))
406         NewSet.insert(Curr);
407 
408     // If we could not find a Value, return 0.
409     if (NewSet.empty())
410       return false;
411 
412     // Otherwise replace the old mapping with the newly constructed one.
413     if (NewSet.size() != ValueMappingIt->second.size())
414       ValueMappingIt->second.swap(NewSet);
415 
416     // We have reached no conclusions about the mapping, and cannot remove
417     // any items from the other operands, so we move to check the next operand.
418     if (ValueMappingIt->second.size() != 1)
419       continue;
420 
421 
422     unsigned ValToRemove = *ValueMappingIt->second.begin();
423     // When there is only one item left in the mapping for and operand, remove
424     // the value from the other operands.  If it results in there being no
425     // mapping, return false, it means the mapping is wrong
426     for (Value *InnerV : SourceOperands) {
427       if (V == InnerV)
428         continue;
429 
430       unsigned InnerVal = SourceValueToNumberMapping.find(InnerV)->second;
431       ValueMappingIt = CurrentSrcTgtNumberMapping.find(InnerVal);
432       if (ValueMappingIt == CurrentSrcTgtNumberMapping.end())
433         continue;
434 
435       ValueMappingIt->second.erase(ValToRemove);
436       if (ValueMappingIt->second.empty())
437         return false;
438     }
439   }
440 
441   return true;
442 }
443 
444 /// Determine if operand number \p TargetArgVal is in the current mapping set
445 /// for operand number \p SourceArgVal.
446 ///
447 /// \param [in, out] CurrentSrcTgtNumberMapping current mapping of global
448 /// value numbers from source IRSimilarityCandidate to target
449 /// IRSimilarityCandidate.
450 /// \param [in] SourceArgVal The global value number for an operand in the
451 /// in the original candidate.
452 /// \param [in] TargetArgVal The global value number for the corresponding
453 /// operand in the other candidate.
454 /// \returns True if there exists a mapping and false if not.
455 bool checkNumberingAndReplace(
456     DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
457     unsigned SourceArgVal, unsigned TargetArgVal) {
458   // We are given two unsigned integers representing the global values of
459   // the operands in different IRSimilarityCandidates and a current mapping
460   // between the two.
461   //
462   // Source Operand GVN: 1
463   // Target Operand GVN: 2
464   // CurrentMapping: {1: {1, 2}}
465   //
466   // Since we have mapping, and the target operand is contained in the set, we
467   // update it to:
468   // CurrentMapping: {1: {2}}
469   // and can return true. But, if the mapping was
470   // CurrentMapping: {1: {3}}
471   // we would return false.
472 
473   bool WasInserted;
474   DenseMap<unsigned, DenseSet<unsigned>>::iterator Val;
475 
476   std::tie(Val, WasInserted) = CurrentSrcTgtNumberMapping.insert(
477       std::make_pair(SourceArgVal, DenseSet<unsigned>({TargetArgVal})));
478 
479   // If we created a new mapping, then we are done.
480   if (WasInserted)
481     return true;
482 
483   // If there is more than one option in the mapping set, and the target value
484   // is included in the mapping set replace that set with one that only includes
485   // the target value, as it is the only valid mapping via the non commutative
486   // instruction.
487 
488   DenseSet<unsigned> &TargetSet = Val->second;
489   if (TargetSet.size() > 1 && TargetSet.contains(TargetArgVal)) {
490     TargetSet.clear();
491     TargetSet.insert(TargetArgVal);
492     return true;
493   }
494 
495   // Return true if we can find the value in the set.
496   return TargetSet.contains(TargetArgVal);
497 }
498 
499 bool IRSimilarityCandidate::compareNonCommutativeOperandMapping(
500     OperandMapping A, OperandMapping B) {
501   // Iterators to keep track of where we are in the operands for each
502   // Instruction.
503   ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
504   ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
505   unsigned OperandLength = A.OperVals.size();
506 
507   // For each operand, get the value numbering and ensure it is consistent.
508   for (unsigned Idx = 0; Idx < OperandLength; Idx++, VItA++, VItB++) {
509     unsigned OperValA = A.IRSC.ValueToNumber.find(*VItA)->second;
510     unsigned OperValB = B.IRSC.ValueToNumber.find(*VItB)->second;
511 
512     // Attempt to add a set with only the target value.  If there is no mapping
513     // we can create it here.
514     //
515     // For an instruction like a subtraction:
516     // IRSimilarityCandidateA:  IRSimilarityCandidateB:
517     // %resultA = sub %a, %b    %resultB = sub %d, %e
518     //
519     // We map %a -> %d and %b -> %e.
520     //
521     // And check to see whether their mapping is consistent in
522     // checkNumberingAndReplace.
523 
524     if (!checkNumberingAndReplace(A.ValueNumberMapping, OperValA, OperValB))
525       return false;
526 
527     if (!checkNumberingAndReplace(B.ValueNumberMapping, OperValB, OperValA))
528       return false;
529   }
530   return true;
531 }
532 
533 bool IRSimilarityCandidate::compareCommutativeOperandMapping(
534     OperandMapping A, OperandMapping B) {
535   DenseSet<unsigned> ValueNumbersA;
536   DenseSet<unsigned> ValueNumbersB;
537 
538   ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
539   ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
540   unsigned OperandLength = A.OperVals.size();
541 
542   // Find the value number sets for the operands.
543   for (unsigned Idx = 0; Idx < OperandLength;
544        Idx++, VItA++, VItB++) {
545     ValueNumbersA.insert(A.IRSC.ValueToNumber.find(*VItA)->second);
546     ValueNumbersB.insert(B.IRSC.ValueToNumber.find(*VItB)->second);
547   }
548 
549   // Iterate over the operands in the first IRSimilarityCandidate and make sure
550   // there exists a possible mapping with the operands in the second
551   // IRSimilarityCandidate.
552   if (!checkNumberingAndReplaceCommutative(A.IRSC.ValueToNumber,
553                                            A.ValueNumberMapping, A.OperVals,
554                                            ValueNumbersB))
555     return false;
556 
557   // Iterate over the operands in the second IRSimilarityCandidate and make sure
558   // there exists a possible mapping with the operands in the first
559   // IRSimilarityCandidate.
560   if (!checkNumberingAndReplaceCommutative(B.IRSC.ValueToNumber,
561                                            B.ValueNumberMapping, B.OperVals,
562                                            ValueNumbersA))
563     return false;
564 
565   return true;
566 }
567 
568 bool IRSimilarityCandidate::compareStructure(const IRSimilarityCandidate &A,
569                                              const IRSimilarityCandidate &B) {
570   if (A.getLength() != B.getLength())
571     return false;
572 
573   if (A.ValueToNumber.size() != B.ValueToNumber.size())
574     return false;
575 
576   iterator ItA = A.begin();
577   iterator ItB = B.begin();
578 
579   // These sets create a create a mapping between the values in one candidate
580   // to values in the other candidate.  If we create a set with one element,
581   // and that same element maps to the original element in the candidate
582   // we have a good mapping.
583   DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingA;
584   DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingB;
585   DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
586 
587   bool WasInserted;
588 
589   // Iterate over the instructions contained in each candidate
590   unsigned SectionLength = A.getStartIdx() + A.getLength();
591   for (unsigned Loc = A.getStartIdx(); Loc < SectionLength;
592        ItA++, ItB++, Loc++) {
593     // Make sure the instructions are similar to one another.
594     if (!isClose(*ItA, *ItB))
595       return false;
596 
597     Instruction *IA = ItA->Inst;
598     Instruction *IB = ItB->Inst;
599 
600     if (!ItA->Legal || !ItB->Legal)
601       return false;
602 
603     // Get the operand sets for the instructions.
604     ArrayRef<Value *> OperValsA = ItA->OperVals;
605     ArrayRef<Value *> OperValsB = ItB->OperVals;
606 
607     unsigned InstValA = A.ValueToNumber.find(IA)->second;
608     unsigned InstValB = B.ValueToNumber.find(IB)->second;
609 
610     // Ensure that the mappings for the instructions exists.
611     std::tie(ValueMappingIt, WasInserted) = ValueNumberMappingA.insert(
612         std::make_pair(InstValA, DenseSet<unsigned>({InstValB})));
613     if (!WasInserted && !ValueMappingIt->second.contains(InstValB))
614       return false;
615 
616     std::tie(ValueMappingIt, WasInserted) = ValueNumberMappingB.insert(
617         std::make_pair(InstValB, DenseSet<unsigned>({InstValA})));
618     if (!WasInserted && !ValueMappingIt->second.contains(InstValA))
619       return false;
620 
621     // We have different paths for commutative instructions and non-commutative
622     // instructions since commutative instructions could allow multiple mappings
623     // to certain values.
624     if (IA->isCommutative() && !isa<FPMathOperator>(IA)) {
625       if (!compareCommutativeOperandMapping(
626               {A, OperValsA, ValueNumberMappingA},
627               {B, OperValsB, ValueNumberMappingB}))
628         return false;
629       continue;
630     }
631 
632     // Handle the non-commutative cases.
633     if (!compareNonCommutativeOperandMapping(
634             {A, OperValsA, ValueNumberMappingA},
635             {B, OperValsB, ValueNumberMappingB}))
636       return false;
637   }
638   return true;
639 }
640 
641 bool IRSimilarityCandidate::overlap(const IRSimilarityCandidate &A,
642                                     const IRSimilarityCandidate &B) {
643   auto DoesOverlap = [](const IRSimilarityCandidate &X,
644                         const IRSimilarityCandidate &Y) {
645     // Check:
646     // XXXXXX        X starts before Y ends
647     //      YYYYYYY  Y starts after X starts
648     return X.StartIdx <= Y.getEndIdx() && Y.StartIdx >= X.StartIdx;
649   };
650 
651   return DoesOverlap(A, B) || DoesOverlap(B, A);
652 }
653 
654 void IRSimilarityIdentifier::populateMapper(
655     Module &M, std::vector<IRInstructionData *> &InstrList,
656     std::vector<unsigned> &IntegerMapping) {
657 
658   std::vector<IRInstructionData *> InstrListForModule;
659   std::vector<unsigned> IntegerMappingForModule;
660   // Iterate over the functions in the module to map each Instruction in each
661   // BasicBlock to an unsigned integer.
662   for (Function &F : M) {
663 
664     if (F.empty())
665       continue;
666 
667     for (BasicBlock &BB : F) {
668 
669       if (BB.sizeWithoutDebug() < 2)
670         continue;
671 
672       // BB has potential to have similarity since it has a size greater than 2
673       // and can therefore match other regions greater than 2. Map it to a list
674       // of unsigned integers.
675       Mapper.convertToUnsignedVec(BB, InstrListForModule,
676                                   IntegerMappingForModule);
677     }
678   }
679 
680   // Insert the InstrListForModule at the end of the overall InstrList so that
681   // we can have a long InstrList for the entire set of Modules being analyzed.
682   llvm::append_range(InstrList, InstrListForModule);
683   // Do the same as above, but for IntegerMapping.
684   llvm::append_range(IntegerMapping, IntegerMappingForModule);
685 }
686 
687 void IRSimilarityIdentifier::populateMapper(
688     ArrayRef<std::unique_ptr<Module>> &Modules,
689     std::vector<IRInstructionData *> &InstrList,
690     std::vector<unsigned> &IntegerMapping) {
691 
692   // Iterate over, and map the instructions in each module.
693   for (const std::unique_ptr<Module> &M : Modules)
694     populateMapper(*M, InstrList, IntegerMapping);
695 }
696 
697 /// From a repeated subsequence, find all the different instances of the
698 /// subsequence from the \p InstrList, and create an IRSimilarityCandidate from
699 /// the IRInstructionData in subsequence.
700 ///
701 /// \param [in] Mapper - The instruction mapper for sanity checks.
702 /// \param [in] InstrList - The vector that holds the instruction data.
703 /// \param [in] IntegerMapping - The vector that holds the mapped integers.
704 /// \param [out] CandsForRepSubstring - The vector to store the generated
705 /// IRSimilarityCandidates.
706 static void createCandidatesFromSuffixTree(
707     IRInstructionMapper Mapper, std::vector<IRInstructionData *> &InstrList,
708     std::vector<unsigned> &IntegerMapping, SuffixTree::RepeatedSubstring &RS,
709     std::vector<IRSimilarityCandidate> &CandsForRepSubstring) {
710 
711   unsigned StringLen = RS.Length;
712 
713   // Create an IRSimilarityCandidate for instance of this subsequence \p RS.
714   for (const unsigned &StartIdx : RS.StartIndices) {
715     unsigned EndIdx = StartIdx + StringLen - 1;
716 
717     // Check that this subsequence does not contain an illegal instruction.
718     bool ContainsIllegal = false;
719     for (unsigned CurrIdx = StartIdx; CurrIdx <= EndIdx; CurrIdx++) {
720       unsigned Key = IntegerMapping[CurrIdx];
721       if (Key > Mapper.IllegalInstrNumber) {
722         ContainsIllegal = true;
723         break;
724       }
725     }
726 
727     // If we have an illegal instruction, we should not create an
728     // IRSimilarityCandidate for this region.
729     if (ContainsIllegal)
730       continue;
731 
732     // We are getting iterators to the instructions in this region of code
733     // by advancing the start and end indices from the start of the
734     // InstrList.
735     std::vector<IRInstructionData *>::iterator StartIt = InstrList.begin();
736     std::advance(StartIt, StartIdx);
737     std::vector<IRInstructionData *>::iterator EndIt = InstrList.begin();
738     std::advance(EndIt, EndIdx);
739 
740     CandsForRepSubstring.emplace_back(StartIdx, StringLen, *StartIt, *EndIt);
741   }
742 }
743 
744 /// From the list of IRSimilarityCandidates, perform a comparison between each
745 /// IRSimilarityCandidate to determine if there are overlapping
746 /// IRInstructionData, or if they do not have the same structure.
747 ///
748 /// \param [in] CandsForRepSubstring - The vector containing the
749 /// IRSimilarityCandidates.
750 /// \param [out] StructuralGroups - the mapping of unsigned integers to vector
751 /// of IRSimilarityCandidates where each of the IRSimilarityCandidates in the
752 /// vector are structurally similar to one another.
753 static void findCandidateStructures(
754     std::vector<IRSimilarityCandidate> &CandsForRepSubstring,
755     DenseMap<unsigned, SimilarityGroup> &StructuralGroups) {
756   std::vector<IRSimilarityCandidate>::iterator CandIt, CandEndIt, InnerCandIt,
757       InnerCandEndIt;
758 
759   // IRSimilarityCandidates each have a structure for operand use.  It is
760   // possible that two instances of the same subsequences have different
761   // structure. Each type of structure found is assigned a number.  This
762   // DenseMap maps an IRSimilarityCandidate to which type of similarity
763   // discovered it fits within.
764   DenseMap<IRSimilarityCandidate *, unsigned> CandToGroup;
765 
766   // Find the compatibility from each candidate to the others to determine
767   // which candidates overlap and which have the same structure by mapping
768   // each structure to a different group.
769   bool SameStructure;
770   bool Inserted;
771   unsigned CurrentGroupNum = 0;
772   unsigned OuterGroupNum;
773   DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupIt;
774   DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupItInner;
775   DenseMap<unsigned, SimilarityGroup>::iterator CurrentGroupPair;
776 
777   // Iterate over the candidates to determine its structural and overlapping
778   // compatibility with other instructions
779   for (CandIt = CandsForRepSubstring.begin(),
780       CandEndIt = CandsForRepSubstring.end();
781        CandIt != CandEndIt; CandIt++) {
782 
783     // Determine if it has an assigned structural group already.
784     CandToGroupIt = CandToGroup.find(&*CandIt);
785     if (CandToGroupIt == CandToGroup.end()) {
786       // If not, we assign it one, and add it to our mapping.
787       std::tie(CandToGroupIt, Inserted) =
788           CandToGroup.insert(std::make_pair(&*CandIt, CurrentGroupNum++));
789     }
790 
791     // Get the structural group number from the iterator.
792     OuterGroupNum = CandToGroupIt->second;
793 
794     // Check if we already have a list of IRSimilarityCandidates for the current
795     // structural group.  Create one if one does not exist.
796     CurrentGroupPair = StructuralGroups.find(OuterGroupNum);
797     if (CurrentGroupPair == StructuralGroups.end())
798       std::tie(CurrentGroupPair, Inserted) = StructuralGroups.insert(
799           std::make_pair(OuterGroupNum, SimilarityGroup({*CandIt})));
800 
801     // Iterate over the IRSimilarityCandidates following the current
802     // IRSimilarityCandidate in the list to determine whether the two
803     // IRSimilarityCandidates are compatible.  This is so we do not repeat pairs
804     // of IRSimilarityCandidates.
805     for (InnerCandIt = std::next(CandIt),
806         InnerCandEndIt = CandsForRepSubstring.end();
807          InnerCandIt != InnerCandEndIt; InnerCandIt++) {
808 
809       // We check if the inner item has a group already, if it does, we skip it.
810       CandToGroupItInner = CandToGroup.find(&*InnerCandIt);
811       if (CandToGroupItInner != CandToGroup.end())
812         continue;
813 
814       // Otherwise we determine if they have the same structure and add it to
815       // vector if they match.
816       SameStructure =
817           IRSimilarityCandidate::compareStructure(*CandIt, *InnerCandIt);
818       if (!SameStructure)
819         continue;
820 
821       CandToGroup.insert(std::make_pair(&*InnerCandIt, OuterGroupNum));
822       CurrentGroupPair->second.push_back(*InnerCandIt);
823     }
824   }
825 }
826 
827 void IRSimilarityIdentifier::findCandidates(
828     std::vector<IRInstructionData *> &InstrList,
829     std::vector<unsigned> &IntegerMapping) {
830   SuffixTree ST(IntegerMapping);
831 
832   std::vector<IRSimilarityCandidate> CandsForRepSubstring;
833   std::vector<SimilarityGroup> NewCandidateGroups;
834 
835   DenseMap<unsigned, SimilarityGroup> StructuralGroups;
836 
837   // Iterate over the subsequences found by the Suffix Tree to create
838   // IRSimilarityCandidates for each repeated subsequence and determine which
839   // instances are structurally similar to one another.
840   for (auto It = ST.begin(), Et = ST.end(); It != Et; ++It) {
841     createCandidatesFromSuffixTree(Mapper, InstrList, IntegerMapping, *It,
842                                    CandsForRepSubstring);
843 
844     if (CandsForRepSubstring.size() < 2)
845       continue;
846 
847     findCandidateStructures(CandsForRepSubstring, StructuralGroups);
848     for (std::pair<unsigned, SimilarityGroup> &Group : StructuralGroups)
849       // We only add the group if it contains more than one
850       // IRSimilarityCandidate.  If there is only one, that means there is no
851       // other repeated subsequence with the same structure.
852       if (Group.second.size() > 1)
853         SimilarityCandidates->push_back(Group.second);
854 
855     CandsForRepSubstring.clear();
856     StructuralGroups.clear();
857     NewCandidateGroups.clear();
858   }
859 }
860 
861 SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(
862     ArrayRef<std::unique_ptr<Module>> Modules) {
863   resetSimilarityCandidates();
864 
865   std::vector<IRInstructionData *> InstrList;
866   std::vector<unsigned> IntegerMapping;
867 
868   populateMapper(Modules, InstrList, IntegerMapping);
869   findCandidates(InstrList, IntegerMapping);
870 
871   return SimilarityCandidates.getValue();
872 }
873 
874 SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) {
875   resetSimilarityCandidates();
876 
877   std::vector<IRInstructionData *> InstrList;
878   std::vector<unsigned> IntegerMapping;
879 
880   populateMapper(M, InstrList, IntegerMapping);
881   findCandidates(InstrList, IntegerMapping);
882 
883   return SimilarityCandidates.getValue();
884 }
885 
886 INITIALIZE_PASS(IRSimilarityIdentifierWrapperPass, "ir-similarity-identifier",
887                 "ir-similarity-identifier", false, true)
888 
889 IRSimilarityIdentifierWrapperPass::IRSimilarityIdentifierWrapperPass()
890     : ModulePass(ID) {
891   initializeIRSimilarityIdentifierWrapperPassPass(
892       *PassRegistry::getPassRegistry());
893 }
894 
895 bool IRSimilarityIdentifierWrapperPass::doInitialization(Module &M) {
896   IRSI.reset(new IRSimilarityIdentifier(M));
897   return false;
898 }
899 
900 bool IRSimilarityIdentifierWrapperPass::doFinalization(Module &M) {
901   IRSI.reset();
902   return false;
903 }
904 
905 bool IRSimilarityIdentifierWrapperPass::runOnModule(Module &M) {
906   // All the real work is done in the constructor for the pass.
907   IRSI.reset(new IRSimilarityIdentifier(M));
908   return false;
909 }
910 
911 AnalysisKey IRSimilarityAnalysis::Key;
912 IRSimilarityIdentifier IRSimilarityAnalysis::run(Module &M,
913                                                ModuleAnalysisManager &) {
914 
915   return IRSimilarityIdentifier(M);
916 }
917 
918 PreservedAnalyses
919 IRSimilarityAnalysisPrinterPass::run(Module &M, ModuleAnalysisManager &AM) {
920   IRSimilarityIdentifier &IRSI = AM.getResult<IRSimilarityAnalysis>(M);
921   Optional<SimilarityGroupList> &SimilarityCandidatesOpt = IRSI.getSimilarity();
922 
923   for (std::vector<IRSimilarityCandidate> &CandVec : *SimilarityCandidatesOpt) {
924     OS << CandVec.size() << " candidates of length "
925        << CandVec.begin()->getLength() << ".  Found in: \n";
926     for (IRSimilarityCandidate &Cand : CandVec) {
927       OS << "  Function: " << Cand.front()->Inst->getFunction()->getName().str()
928          << ",  Basic Block: ";
929       if (Cand.front()->Inst->getParent()->getName().str() == "")
930         OS << "(unnamed)\n";
931       else
932         OS << Cand.front()->Inst->getParent()->getName().str() << "\n";
933     }
934   }
935 
936   return PreservedAnalyses::all();
937 }
938 
939 char IRSimilarityIdentifierWrapperPass::ID = 0;
940