1 //===- IRSimilarityIdentifier.cpp - Find similarity in a module -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // Implementation file for the IRSimilarityIdentifier for identifying
11 // similarities in IR including the IRInstructionMapper.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/Analysis/IRSimilarityIdentifier.h"
16 #include "llvm/ADT/DenseMap.h"
17 #include "llvm/IR/Intrinsics.h"
18 #include "llvm/IR/Operator.h"
19 #include "llvm/IR/User.h"
20 #include "llvm/InitializePasses.h"
21 #include "llvm/Support/SuffixTree.h"
22 
23 using namespace llvm;
24 using namespace IRSimilarity;
25 
26 IRInstructionData::IRInstructionData(Instruction &I, bool Legality,
27                                      IRInstructionDataList &IDList)
28     : Inst(&I), Legal(Legality), IDL(&IDList) {
29   // We check for whether we have a comparison instruction.  If it is, we
30   // find the "less than" version of the predicate for consistency for
31   // comparison instructions throught the program.
32   if (CmpInst *C = dyn_cast<CmpInst>(&I)) {
33     CmpInst::Predicate Predicate = predicateForConsistency(C);
34     if (Predicate != C->getPredicate())
35       RevisedPredicate = Predicate;
36   }
37 
38   // Here we collect the operands and their types for determining whether
39   // the structure of the operand use matches between two different candidates.
40   for (Use &OI : I.operands()) {
41     if (isa<CmpInst>(I) && RevisedPredicate.hasValue()) {
42       // If we have a CmpInst where the predicate is reversed, it means the
43       // operands must be reversed as well.
44       OperVals.insert(OperVals.begin(), OI.get());
45       continue;
46     }
47 
48     OperVals.push_back(OI.get());
49   }
50 }
51 
52 CmpInst::Predicate IRInstructionData::predicateForConsistency(CmpInst *CI) {
53   switch (CI->getPredicate()) {
54   case CmpInst::FCMP_OGT:
55   case CmpInst::FCMP_UGT:
56   case CmpInst::FCMP_OGE:
57   case CmpInst::FCMP_UGE:
58   case CmpInst::ICMP_SGT:
59   case CmpInst::ICMP_UGT:
60   case CmpInst::ICMP_SGE:
61   case CmpInst::ICMP_UGE:
62     return CI->getSwappedPredicate();
63   default:
64     return CI->getPredicate();
65   }
66 }
67 
68 CmpInst::Predicate IRInstructionData::getPredicate() const {
69   assert(isa<CmpInst>(Inst) &&
70          "Can only get a predicate from a compare instruction");
71 
72   if (RevisedPredicate.hasValue())
73     return RevisedPredicate.getValue();
74 
75   return cast<CmpInst>(Inst)->getPredicate();
76 }
77 
78 static StringRef getCalledFunctionName(CallInst &CI) {
79   assert(CI.getCalledFunction() != nullptr && "Called Function is nullptr?");
80 
81   return CI.getCalledFunction()->getName();
82 }
83 
84 bool IRSimilarity::isClose(const IRInstructionData &A,
85                            const IRInstructionData &B) {
86 
87   if (!A.Legal || !B.Legal)
88     return false;
89 
90   // Check if we are performing the same sort of operation on the same types
91   // but not on the same values.
92   if (!A.Inst->isSameOperationAs(B.Inst)) {
93     // If there is a predicate, this means that either there is a swapped
94     // predicate, or that the types are different, we want to make sure that
95     // the predicates are equivalent via swapping.
96     if (isa<CmpInst>(A.Inst) && isa<CmpInst>(B.Inst)) {
97 
98       if (A.getPredicate() != B.getPredicate())
99         return false;
100 
101       // If the predicates are the same via swap, make sure that the types are
102       // still the same.
103       auto ZippedTypes = zip(A.OperVals, B.OperVals);
104 
105       return all_of(
106           ZippedTypes, [](std::tuple<llvm::Value *, llvm::Value *> R) {
107             return std::get<0>(R)->getType() == std::get<1>(R)->getType();
108           });
109     }
110 
111     return false;
112   }
113 
114   // Since any GEP Instruction operands after the first operand cannot be
115   // defined by a register, we must make sure that the operands after the first
116   // are the same in the two instructions
117   if (auto *GEP = dyn_cast<GetElementPtrInst>(A.Inst)) {
118     auto *OtherGEP = cast<GetElementPtrInst>(B.Inst);
119 
120     // If the instructions do not have the same inbounds restrictions, we do
121     // not consider them the same.
122     if (GEP->isInBounds() != OtherGEP->isInBounds())
123       return false;
124 
125     auto ZippedOperands = zip(GEP->indices(), OtherGEP->indices());
126 
127     // We increment here since we do not care about the first instruction,
128     // we only care about the following operands since they must be the
129     // exact same to be considered similar.
130     return all_of(drop_begin(ZippedOperands),
131                   [](std::tuple<llvm::Use &, llvm::Use &> R) {
132                     return std::get<0>(R) == std::get<1>(R);
133                   });
134   }
135 
136   // If the instructions are functions, we make sure that the function name is
137   // the same.  We already know that the types are since is isSameOperationAs is
138   // true.
139   if (isa<CallInst>(A.Inst) && isa<CallInst>(B.Inst)) {
140     CallInst *CIA = cast<CallInst>(A.Inst);
141     CallInst *CIB = cast<CallInst>(B.Inst);
142     if (getCalledFunctionName(*CIA).compare(getCalledFunctionName(*CIB)) != 0)
143       return false;
144   }
145 
146   return true;
147 }
148 
149 // TODO: This is the same as the MachineOutliner, and should be consolidated
150 // into the same interface.
151 void IRInstructionMapper::convertToUnsignedVec(
152     BasicBlock &BB, std::vector<IRInstructionData *> &InstrList,
153     std::vector<unsigned> &IntegerMapping) {
154   BasicBlock::iterator It = BB.begin();
155 
156   std::vector<unsigned> IntegerMappingForBB;
157   std::vector<IRInstructionData *> InstrListForBB;
158 
159   HaveLegalRange = false;
160   CanCombineWithPrevInstr = false;
161   AddedIllegalLastTime = true;
162 
163   for (BasicBlock::iterator Et = BB.end(); It != Et; ++It) {
164     switch (InstClassifier.visit(*It)) {
165     case InstrType::Legal:
166       mapToLegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
167       break;
168     case InstrType::Illegal:
169       mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
170       break;
171     case InstrType::Invisible:
172       AddedIllegalLastTime = false;
173       break;
174     }
175   }
176 
177   if (HaveLegalRange) {
178     mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, true);
179     for (IRInstructionData *ID : InstrListForBB)
180       this->IDL->push_back(*ID);
181     llvm::append_range(InstrList, InstrListForBB);
182     llvm::append_range(IntegerMapping, IntegerMappingForBB);
183   }
184 }
185 
186 // TODO: This is the same as the MachineOutliner, and should be consolidated
187 // into the same interface.
188 unsigned IRInstructionMapper::mapToLegalUnsigned(
189     BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
190     std::vector<IRInstructionData *> &InstrListForBB) {
191   // We added something legal, so we should unset the AddedLegalLastTime
192   // flag.
193   AddedIllegalLastTime = false;
194 
195   // If we have at least two adjacent legal instructions (which may have
196   // invisible instructions in between), remember that.
197   if (CanCombineWithPrevInstr)
198     HaveLegalRange = true;
199   CanCombineWithPrevInstr = true;
200 
201   // Get the integer for this instruction or give it the current
202   // LegalInstrNumber.
203   IRInstructionData *ID = allocateIRInstructionData(*It, true, *IDL);
204   InstrListForBB.push_back(ID);
205 
206   // Add to the instruction list
207   bool WasInserted;
208   DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>::iterator
209       ResultIt;
210   std::tie(ResultIt, WasInserted) =
211       InstructionIntegerMap.insert(std::make_pair(ID, LegalInstrNumber));
212   unsigned INumber = ResultIt->second;
213 
214   // There was an insertion.
215   if (WasInserted)
216     LegalInstrNumber++;
217 
218   IntegerMappingForBB.push_back(INumber);
219 
220   // Make sure we don't overflow or use any integers reserved by the DenseMap.
221   assert(LegalInstrNumber < IllegalInstrNumber &&
222          "Instruction mapping overflow!");
223 
224   assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
225          "Tried to assign DenseMap tombstone or empty key to instruction.");
226   assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
227          "Tried to assign DenseMap tombstone or empty key to instruction.");
228 
229   return INumber;
230 }
231 
232 IRInstructionData *
233 IRInstructionMapper::allocateIRInstructionData(Instruction &I, bool Legality,
234                                                IRInstructionDataList &IDL) {
235   return new (InstDataAllocator->Allocate()) IRInstructionData(I, Legality, IDL);
236 }
237 
238 IRInstructionDataList *
239 IRInstructionMapper::allocateIRInstructionDataList() {
240   return new (IDLAllocator->Allocate()) IRInstructionDataList();
241 }
242 
243 // TODO: This is the same as the MachineOutliner, and should be consolidated
244 // into the same interface.
245 unsigned IRInstructionMapper::mapToIllegalUnsigned(
246     BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
247     std::vector<IRInstructionData *> &InstrListForBB, bool End) {
248   // Can't combine an illegal instruction. Set the flag.
249   CanCombineWithPrevInstr = false;
250 
251   // Only add one illegal number per range of legal numbers.
252   if (AddedIllegalLastTime)
253     return IllegalInstrNumber;
254 
255   IRInstructionData *ID = nullptr;
256   if (!End)
257     ID = allocateIRInstructionData(*It, false, *IDL);
258   InstrListForBB.push_back(ID);
259 
260   // Remember that we added an illegal number last time.
261   AddedIllegalLastTime = true;
262   unsigned INumber = IllegalInstrNumber;
263   IntegerMappingForBB.push_back(IllegalInstrNumber--);
264 
265   assert(LegalInstrNumber < IllegalInstrNumber &&
266          "Instruction mapping overflow!");
267 
268   assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
269          "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
270 
271   assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
272          "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
273 
274   return INumber;
275 }
276 
277 IRSimilarityCandidate::IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
278                                              IRInstructionData *FirstInstIt,
279                                              IRInstructionData *LastInstIt)
280     : StartIdx(StartIdx), Len(Len) {
281 
282   assert(FirstInstIt != nullptr && "Instruction is nullptr!");
283   assert(LastInstIt != nullptr && "Instruction is nullptr!");
284   assert(StartIdx + Len > StartIdx &&
285          "Overflow for IRSimilarityCandidate range?");
286   assert(Len - 1 == static_cast<unsigned>(std::distance(
287                         iterator(FirstInstIt), iterator(LastInstIt))) &&
288          "Length of the first and last IRInstructionData do not match the "
289          "given length");
290 
291   // We iterate over the given instructions, and map each unique value
292   // to a unique number in the IRSimilarityCandidate ValueToNumber and
293   // NumberToValue maps.  A constant get its own value globally, the individual
294   // uses of the constants are not considered to be unique.
295   //
296   // IR:                    Mapping Added:
297   // %add1 = add i32 %a, c1    %add1 -> 3, %a -> 1, c1 -> 2
298   // %add2 = add i32 %a, %1    %add2 -> 4
299   // %add3 = add i32 c2, c1    %add3 -> 6, c2 -> 5
300   //
301   // when replace with global values, starting from 1, would be
302   //
303   // 3 = add i32 1, 2
304   // 4 = add i32 1, 3
305   // 6 = add i32 5, 2
306   unsigned LocalValNumber = 1;
307   IRInstructionDataList::iterator ID = iterator(*FirstInstIt);
308   for (unsigned Loc = StartIdx; Loc < StartIdx + Len; Loc++, ID++) {
309     // Map the operand values to an unsigned integer if it does not already
310     // have an unsigned integer assigned to it.
311     for (Value *Arg : ID->OperVals)
312       if (ValueToNumber.find(Arg) == ValueToNumber.end()) {
313         ValueToNumber.try_emplace(Arg, LocalValNumber);
314         NumberToValue.try_emplace(LocalValNumber, Arg);
315         LocalValNumber++;
316       }
317 
318     // Mapping the instructions to an unsigned integer if it is not already
319     // exist in the mapping.
320     if (ValueToNumber.find(ID->Inst) == ValueToNumber.end()) {
321       ValueToNumber.try_emplace(ID->Inst, LocalValNumber);
322       NumberToValue.try_emplace(LocalValNumber, ID->Inst);
323       LocalValNumber++;
324     }
325   }
326 
327   // Setting the first and last instruction data pointers for the candidate.  If
328   // we got through the entire for loop without hitting an assert, we know
329   // that both of these instructions are not nullptrs.
330   FirstInst = FirstInstIt;
331   LastInst = LastInstIt;
332 }
333 
334 bool IRSimilarityCandidate::isSimilar(const IRSimilarityCandidate &A,
335                                       const IRSimilarityCandidate &B) {
336   if (A.getLength() != B.getLength())
337     return false;
338 
339   auto InstrDataForBoth =
340       zip(make_range(A.begin(), A.end()), make_range(B.begin(), B.end()));
341 
342   return all_of(InstrDataForBoth,
343                 [](std::tuple<IRInstructionData &, IRInstructionData &> R) {
344                   IRInstructionData &A = std::get<0>(R);
345                   IRInstructionData &B = std::get<1>(R);
346                   if (!A.Legal || !B.Legal)
347                     return false;
348                   return isClose(A, B);
349                 });
350 }
351 
352 /// Determine if one or more of the assigned global value numbers for the
353 /// operands in \p TargetValueNumbers is in the current mapping set for operand
354 /// numbers in \p SourceOperands.  The set of possible corresponding global
355 /// value numbers are replaced with the most recent version of compatible
356 /// values.
357 ///
358 /// \param [in] SourceValueToNumberMapping - The mapping of a Value to global
359 /// value number for the source IRInstructionCandidate.
360 /// \param [in, out] CurrentSrcTgtNumberMapping - The current mapping of source
361 /// IRSimilarityCandidate global value numbers to a set of possible numbers in
362 /// the target.
363 /// \param [in] SourceOperands - The operands in the original
364 /// IRSimilarityCandidate in the current instruction.
365 /// \param [in] TargetValueNumbers - The global value numbers of the operands in
366 /// the corresponding Instruction in the other IRSimilarityCandidate.
367 /// \returns true if there exists a possible mapping between the source
368 /// Instruction operands and the target Instruction operands, and false if not.
369 static bool checkNumberingAndReplaceCommutative(
370   const DenseMap<Value *, unsigned> &SourceValueToNumberMapping,
371   DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
372   ArrayRef<Value *> &SourceOperands,
373   DenseSet<unsigned> &TargetValueNumbers){
374 
375   DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
376 
377   unsigned ArgVal;
378   bool WasInserted;
379 
380   // Iterate over the operands in the source IRSimilarityCandidate to determine
381   // whether there exists an operand in the other IRSimilarityCandidate that
382   // creates a valid mapping of Value to Value between the
383   // IRSimilarityCaniddates.
384   for (Value *V : SourceOperands) {
385     ArgVal = SourceValueToNumberMapping.find(V)->second;
386 
387     std::tie(ValueMappingIt, WasInserted) = CurrentSrcTgtNumberMapping.insert(
388         std::make_pair(ArgVal, TargetValueNumbers));
389 
390     // Instead of finding a current mapping, we inserted a set.  This means a
391     // mapping did not exist for the source Instruction operand, it has no
392     // current constraints we need to check.
393     if (WasInserted)
394       continue;
395 
396     // If a mapping already exists for the source operand to the values in the
397     // other IRSimilarityCandidate we need to iterate over the items in other
398     // IRSimilarityCandidate's Instruction to determine whether there is a valid
399     // mapping of Value to Value.
400     DenseSet<unsigned> NewSet;
401     for (unsigned &Curr : ValueMappingIt->second)
402       // If we can find the value in the mapping, we add it to the new set.
403       if (TargetValueNumbers.contains(Curr))
404         NewSet.insert(Curr);
405 
406     // If we could not find a Value, return 0.
407     if (NewSet.empty())
408       return false;
409 
410     // Otherwise replace the old mapping with the newly constructed one.
411     if (NewSet.size() != ValueMappingIt->second.size())
412       ValueMappingIt->second.swap(NewSet);
413 
414     // We have reached no conclusions about the mapping, and cannot remove
415     // any items from the other operands, so we move to check the next operand.
416     if (ValueMappingIt->second.size() != 1)
417       continue;
418 
419 
420     unsigned ValToRemove = *ValueMappingIt->second.begin();
421     // When there is only one item left in the mapping for and operand, remove
422     // the value from the other operands.  If it results in there being no
423     // mapping, return false, it means the mapping is wrong
424     for (Value *InnerV : SourceOperands) {
425       if (V == InnerV)
426         continue;
427 
428       unsigned InnerVal = SourceValueToNumberMapping.find(InnerV)->second;
429       ValueMappingIt = CurrentSrcTgtNumberMapping.find(InnerVal);
430       if (ValueMappingIt == CurrentSrcTgtNumberMapping.end())
431         continue;
432 
433       ValueMappingIt->second.erase(ValToRemove);
434       if (ValueMappingIt->second.empty())
435         return false;
436     }
437   }
438 
439   return true;
440 }
441 
442 /// Determine if operand number \p TargetArgVal is in the current mapping set
443 /// for operand number \p SourceArgVal.
444 ///
445 /// \param [in, out] CurrentSrcTgtNumberMapping current mapping of global
446 /// value numbers from source IRSimilarityCandidate to target
447 /// IRSimilarityCandidate.
448 /// \param [in] SourceArgVal The global value number for an operand in the
449 /// in the original candidate.
450 /// \param [in] TargetArgVal The global value number for the corresponding
451 /// operand in the other candidate.
452 /// \returns True if there exists a mapping and false if not.
453 bool checkNumberingAndReplace(
454     DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
455     unsigned SourceArgVal, unsigned TargetArgVal) {
456   // We are given two unsigned integers representing the global values of
457   // the operands in different IRSimilarityCandidates and a current mapping
458   // between the two.
459   //
460   // Source Operand GVN: 1
461   // Target Operand GVN: 2
462   // CurrentMapping: {1: {1, 2}}
463   //
464   // Since we have mapping, and the target operand is contained in the set, we
465   // update it to:
466   // CurrentMapping: {1: {2}}
467   // and can return true. But, if the mapping was
468   // CurrentMapping: {1: {3}}
469   // we would return false.
470 
471   bool WasInserted;
472   DenseMap<unsigned, DenseSet<unsigned>>::iterator Val;
473 
474   std::tie(Val, WasInserted) = CurrentSrcTgtNumberMapping.insert(
475       std::make_pair(SourceArgVal, DenseSet<unsigned>({TargetArgVal})));
476 
477   // If we created a new mapping, then we are done.
478   if (WasInserted)
479     return true;
480 
481   // If there is more than one option in the mapping set, and the target value
482   // is included in the mapping set replace that set with one that only includes
483   // the target value, as it is the only valid mapping via the non commutative
484   // instruction.
485 
486   DenseSet<unsigned> &TargetSet = Val->second;
487   if (TargetSet.size() > 1 && TargetSet.contains(TargetArgVal)) {
488     TargetSet.clear();
489     TargetSet.insert(TargetArgVal);
490     return true;
491   }
492 
493   // Return true if we can find the value in the set.
494   return TargetSet.contains(TargetArgVal);
495 }
496 
497 bool IRSimilarityCandidate::compareNonCommutativeOperandMapping(
498     OperandMapping A, OperandMapping B) {
499   // Iterators to keep track of where we are in the operands for each
500   // Instruction.
501   ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
502   ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
503   unsigned OperandLength = A.OperVals.size();
504 
505   // For each operand, get the value numbering and ensure it is consistent.
506   for (unsigned Idx = 0; Idx < OperandLength; Idx++, VItA++, VItB++) {
507     unsigned OperValA = A.IRSC.ValueToNumber.find(*VItA)->second;
508     unsigned OperValB = B.IRSC.ValueToNumber.find(*VItB)->second;
509 
510     // Attempt to add a set with only the target value.  If there is no mapping
511     // we can create it here.
512     //
513     // For an instruction like a subtraction:
514     // IRSimilarityCandidateA:  IRSimilarityCandidateB:
515     // %resultA = sub %a, %b    %resultB = sub %d, %e
516     //
517     // We map %a -> %d and %b -> %e.
518     //
519     // And check to see whether their mapping is consistent in
520     // checkNumberingAndReplace.
521 
522     if (!checkNumberingAndReplace(A.ValueNumberMapping, OperValA, OperValB))
523       return false;
524 
525     if (!checkNumberingAndReplace(B.ValueNumberMapping, OperValB, OperValA))
526       return false;
527   }
528   return true;
529 }
530 
531 bool IRSimilarityCandidate::compareCommutativeOperandMapping(
532     OperandMapping A, OperandMapping B) {
533   DenseSet<unsigned> ValueNumbersA;
534   DenseSet<unsigned> ValueNumbersB;
535 
536   ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
537   ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
538   unsigned OperandLength = A.OperVals.size();
539 
540   // Find the value number sets for the operands.
541   for (unsigned Idx = 0; Idx < OperandLength;
542        Idx++, VItA++, VItB++) {
543     ValueNumbersA.insert(A.IRSC.ValueToNumber.find(*VItA)->second);
544     ValueNumbersB.insert(B.IRSC.ValueToNumber.find(*VItB)->second);
545   }
546 
547   // Iterate over the operands in the first IRSimilarityCandidate and make sure
548   // there exists a possible mapping with the operands in the second
549   // IRSimilarityCandidate.
550   if (!checkNumberingAndReplaceCommutative(A.IRSC.ValueToNumber,
551                                            A.ValueNumberMapping, A.OperVals,
552                                            ValueNumbersB))
553     return false;
554 
555   // Iterate over the operands in the second IRSimilarityCandidate and make sure
556   // there exists a possible mapping with the operands in the first
557   // IRSimilarityCandidate.
558   if (!checkNumberingAndReplaceCommutative(B.IRSC.ValueToNumber,
559                                            B.ValueNumberMapping, B.OperVals,
560                                            ValueNumbersA))
561     return false;
562 
563   return true;
564 }
565 
566 bool IRSimilarityCandidate::compareStructure(const IRSimilarityCandidate &A,
567                                              const IRSimilarityCandidate &B) {
568   DenseMap<unsigned, DenseSet<unsigned>> MappingA;
569   DenseMap<unsigned, DenseSet<unsigned>> MappingB;
570   return IRSimilarityCandidate::compareStructure(A, B, MappingA, MappingB);
571 }
572 
573 bool IRSimilarityCandidate::compareStructure(
574     const IRSimilarityCandidate &A, const IRSimilarityCandidate &B,
575     DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
576     DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB) {
577   if (A.getLength() != B.getLength())
578     return false;
579 
580   if (A.ValueToNumber.size() != B.ValueToNumber.size())
581     return false;
582 
583   iterator ItA = A.begin();
584   iterator ItB = B.begin();
585 
586   // These ValueNumber Mapping sets create a create a mapping between the values
587   // in one candidate to values in the other candidate.  If we create a set with
588   // one element, and that same element maps to the original element in the
589   // candidate we have a good mapping.
590   DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
591 
592 
593   // Iterate over the instructions contained in each candidate
594   unsigned SectionLength = A.getStartIdx() + A.getLength();
595   for (unsigned Loc = A.getStartIdx(); Loc < SectionLength;
596        ItA++, ItB++, Loc++) {
597     // Make sure the instructions are similar to one another.
598     if (!isClose(*ItA, *ItB))
599       return false;
600 
601     Instruction *IA = ItA->Inst;
602     Instruction *IB = ItB->Inst;
603 
604     if (!ItA->Legal || !ItB->Legal)
605       return false;
606 
607     // Get the operand sets for the instructions.
608     ArrayRef<Value *> OperValsA = ItA->OperVals;
609     ArrayRef<Value *> OperValsB = ItB->OperVals;
610 
611     unsigned InstValA = A.ValueToNumber.find(IA)->second;
612     unsigned InstValB = B.ValueToNumber.find(IB)->second;
613 
614     bool WasInserted;
615     // Ensure that the mappings for the instructions exists.
616     std::tie(ValueMappingIt, WasInserted) = ValueNumberMappingA.insert(
617         std::make_pair(InstValA, DenseSet<unsigned>({InstValB})));
618     if (!WasInserted && !ValueMappingIt->second.contains(InstValB))
619       return false;
620 
621     std::tie(ValueMappingIt, WasInserted) = ValueNumberMappingB.insert(
622         std::make_pair(InstValB, DenseSet<unsigned>({InstValA})));
623     if (!WasInserted && !ValueMappingIt->second.contains(InstValA))
624       return false;
625 
626     // We have different paths for commutative instructions and non-commutative
627     // instructions since commutative instructions could allow multiple mappings
628     // to certain values.
629     if (IA->isCommutative() && !isa<FPMathOperator>(IA)) {
630       if (!compareCommutativeOperandMapping(
631               {A, OperValsA, ValueNumberMappingA},
632               {B, OperValsB, ValueNumberMappingB}))
633         return false;
634       continue;
635     }
636 
637     // Handle the non-commutative cases.
638     if (!compareNonCommutativeOperandMapping(
639             {A, OperValsA, ValueNumberMappingA},
640             {B, OperValsB, ValueNumberMappingB}))
641       return false;
642   }
643   return true;
644 }
645 
646 bool IRSimilarityCandidate::overlap(const IRSimilarityCandidate &A,
647                                     const IRSimilarityCandidate &B) {
648   auto DoesOverlap = [](const IRSimilarityCandidate &X,
649                         const IRSimilarityCandidate &Y) {
650     // Check:
651     // XXXXXX        X starts before Y ends
652     //      YYYYYYY  Y starts after X starts
653     return X.StartIdx <= Y.getEndIdx() && Y.StartIdx >= X.StartIdx;
654   };
655 
656   return DoesOverlap(A, B) || DoesOverlap(B, A);
657 }
658 
659 void IRSimilarityIdentifier::populateMapper(
660     Module &M, std::vector<IRInstructionData *> &InstrList,
661     std::vector<unsigned> &IntegerMapping) {
662 
663   std::vector<IRInstructionData *> InstrListForModule;
664   std::vector<unsigned> IntegerMappingForModule;
665   // Iterate over the functions in the module to map each Instruction in each
666   // BasicBlock to an unsigned integer.
667   for (Function &F : M) {
668 
669     if (F.empty())
670       continue;
671 
672     for (BasicBlock &BB : F) {
673 
674       if (BB.sizeWithoutDebug() < 2)
675         continue;
676 
677       // BB has potential to have similarity since it has a size greater than 2
678       // and can therefore match other regions greater than 2. Map it to a list
679       // of unsigned integers.
680       Mapper.convertToUnsignedVec(BB, InstrListForModule,
681                                   IntegerMappingForModule);
682     }
683   }
684 
685   // Insert the InstrListForModule at the end of the overall InstrList so that
686   // we can have a long InstrList for the entire set of Modules being analyzed.
687   llvm::append_range(InstrList, InstrListForModule);
688   // Do the same as above, but for IntegerMapping.
689   llvm::append_range(IntegerMapping, IntegerMappingForModule);
690 }
691 
692 void IRSimilarityIdentifier::populateMapper(
693     ArrayRef<std::unique_ptr<Module>> &Modules,
694     std::vector<IRInstructionData *> &InstrList,
695     std::vector<unsigned> &IntegerMapping) {
696 
697   // Iterate over, and map the instructions in each module.
698   for (const std::unique_ptr<Module> &M : Modules)
699     populateMapper(*M, InstrList, IntegerMapping);
700 }
701 
702 /// From a repeated subsequence, find all the different instances of the
703 /// subsequence from the \p InstrList, and create an IRSimilarityCandidate from
704 /// the IRInstructionData in subsequence.
705 ///
706 /// \param [in] Mapper - The instruction mapper for sanity checks.
707 /// \param [in] InstrList - The vector that holds the instruction data.
708 /// \param [in] IntegerMapping - The vector that holds the mapped integers.
709 /// \param [out] CandsForRepSubstring - The vector to store the generated
710 /// IRSimilarityCandidates.
711 static void createCandidatesFromSuffixTree(
712     const IRInstructionMapper& Mapper, std::vector<IRInstructionData *> &InstrList,
713     std::vector<unsigned> &IntegerMapping, SuffixTree::RepeatedSubstring &RS,
714     std::vector<IRSimilarityCandidate> &CandsForRepSubstring) {
715 
716   unsigned StringLen = RS.Length;
717 
718   // Create an IRSimilarityCandidate for instance of this subsequence \p RS.
719   for (const unsigned &StartIdx : RS.StartIndices) {
720     unsigned EndIdx = StartIdx + StringLen - 1;
721 
722     // Check that this subsequence does not contain an illegal instruction.
723     bool ContainsIllegal = false;
724     for (unsigned CurrIdx = StartIdx; CurrIdx <= EndIdx; CurrIdx++) {
725       unsigned Key = IntegerMapping[CurrIdx];
726       if (Key > Mapper.IllegalInstrNumber) {
727         ContainsIllegal = true;
728         break;
729       }
730     }
731 
732     // If we have an illegal instruction, we should not create an
733     // IRSimilarityCandidate for this region.
734     if (ContainsIllegal)
735       continue;
736 
737     // We are getting iterators to the instructions in this region of code
738     // by advancing the start and end indices from the start of the
739     // InstrList.
740     std::vector<IRInstructionData *>::iterator StartIt = InstrList.begin();
741     std::advance(StartIt, StartIdx);
742     std::vector<IRInstructionData *>::iterator EndIt = InstrList.begin();
743     std::advance(EndIt, EndIdx);
744 
745     CandsForRepSubstring.emplace_back(StartIdx, StringLen, *StartIt, *EndIt);
746   }
747 }
748 
749 void IRSimilarityCandidate::createCanonicalRelationFrom(
750     IRSimilarityCandidate &SourceCand,
751     DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping,
752     DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping) {
753   assert(SourceCand.CanonNumToNumber.size() != 0 &&
754          "Base canonical relationship is empty!");
755   assert(SourceCand.NumberToCanonNum.size() != 0 &&
756          "Base canonical relationship is empty!");
757 
758   assert(CanonNumToNumber.size() == 0 && "Canonical Relationship is non-empty");
759   assert(NumberToCanonNum.size() == 0 && "Canonical Relationship is non-empty");
760 
761   DenseSet<unsigned> UsedGVNs;
762   // Iterate over the mappings provided from this candidate to SourceCand.  We
763   // are then able to map the GVN in this candidate to the same canonical number
764   // given to the corresponding GVN in SourceCand.
765   for (std::pair<unsigned, DenseSet<unsigned>> &GVNMapping : ToSourceMapping) {
766     unsigned SourceGVN = GVNMapping.first;
767 
768     assert(GVNMapping.second.size() != 0 && "Possible GVNs is 0!");
769 
770     unsigned ResultGVN;
771     // We need special handling if we have more than one potential value.  This
772     // means that there are at least two GVNs that could correspond to this GVN.
773     // This could lead to potential swapping later on, so we make a decision
774     // here to ensure a one-to-one mapping.
775     if (GVNMapping.second.size() > 1) {
776       bool Found = false;
777       for (unsigned Val : GVNMapping.second) {
778         // We make sure the target value number hasn't already been reserved.
779         if (UsedGVNs.find(Val) != UsedGVNs.end())
780           continue;
781 
782         // We make sure that the opposite mapping is still consistent.
783         DenseMap<unsigned, DenseSet<unsigned>>::iterator It =
784             FromSourceMapping.find(Val);
785 
786         if (It->second.find(SourceGVN) == It->second.end())
787           continue;
788 
789         // We pick the first item that satisfies these conditions.
790         Found = true;
791         ResultGVN = Val;
792         break;
793       }
794 
795       assert(Found && "Could not find matching value for source GVN");
796       (void)Found;
797 
798     } else
799       ResultGVN = *GVNMapping.second.begin();
800 
801     // Whatever GVN is found, we mark it as used.
802     UsedGVNs.insert(ResultGVN);
803 
804     unsigned CanonNum = *SourceCand.getCanonicalNum(ResultGVN);
805     CanonNumToNumber.insert(std::make_pair(CanonNum, SourceGVN));
806     NumberToCanonNum.insert(std::make_pair(SourceGVN, CanonNum));
807   }
808 }
809 
810 void IRSimilarityCandidate::createCanonicalMappingFor(
811     IRSimilarityCandidate &CurrCand) {
812   assert(CurrCand.CanonNumToNumber.size() == 0 &&
813          "Canonical Relationship is non-empty");
814   assert(CurrCand.NumberToCanonNum.size() == 0 &&
815          "Canonical Relationship is non-empty");
816 
817   unsigned CanonNum = 0;
818   // Iterate over the value numbers found, the order does not matter in this
819   // case.
820   for (std::pair<unsigned, Value *> &NumToVal : CurrCand.NumberToValue) {
821     CurrCand.NumberToCanonNum.insert(std::make_pair(NumToVal.first, CanonNum));
822     CurrCand.CanonNumToNumber.insert(std::make_pair(CanonNum, NumToVal.first));
823     CanonNum++;
824   }
825 }
826 
827 /// From the list of IRSimilarityCandidates, perform a comparison between each
828 /// IRSimilarityCandidate to determine if there are overlapping
829 /// IRInstructionData, or if they do not have the same structure.
830 ///
831 /// \param [in] CandsForRepSubstring - The vector containing the
832 /// IRSimilarityCandidates.
833 /// \param [out] StructuralGroups - the mapping of unsigned integers to vector
834 /// of IRSimilarityCandidates where each of the IRSimilarityCandidates in the
835 /// vector are structurally similar to one another.
836 static void findCandidateStructures(
837     std::vector<IRSimilarityCandidate> &CandsForRepSubstring,
838     DenseMap<unsigned, SimilarityGroup> &StructuralGroups) {
839   std::vector<IRSimilarityCandidate>::iterator CandIt, CandEndIt, InnerCandIt,
840       InnerCandEndIt;
841 
842   // IRSimilarityCandidates each have a structure for operand use.  It is
843   // possible that two instances of the same subsequences have different
844   // structure. Each type of structure found is assigned a number.  This
845   // DenseMap maps an IRSimilarityCandidate to which type of similarity
846   // discovered it fits within.
847   DenseMap<IRSimilarityCandidate *, unsigned> CandToGroup;
848 
849   // Find the compatibility from each candidate to the others to determine
850   // which candidates overlap and which have the same structure by mapping
851   // each structure to a different group.
852   bool SameStructure;
853   bool Inserted;
854   unsigned CurrentGroupNum = 0;
855   unsigned OuterGroupNum;
856   DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupIt;
857   DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupItInner;
858   DenseMap<unsigned, SimilarityGroup>::iterator CurrentGroupPair;
859 
860   // Iterate over the candidates to determine its structural and overlapping
861   // compatibility with other instructions
862   DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingA;
863   DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingB;
864   for (CandIt = CandsForRepSubstring.begin(),
865       CandEndIt = CandsForRepSubstring.end();
866        CandIt != CandEndIt; CandIt++) {
867 
868     // Determine if it has an assigned structural group already.
869     CandToGroupIt = CandToGroup.find(&*CandIt);
870     if (CandToGroupIt == CandToGroup.end()) {
871       // If not, we assign it one, and add it to our mapping.
872       std::tie(CandToGroupIt, Inserted) =
873           CandToGroup.insert(std::make_pair(&*CandIt, CurrentGroupNum++));
874     }
875 
876     // Get the structural group number from the iterator.
877     OuterGroupNum = CandToGroupIt->second;
878 
879     // Check if we already have a list of IRSimilarityCandidates for the current
880     // structural group.  Create one if one does not exist.
881     CurrentGroupPair = StructuralGroups.find(OuterGroupNum);
882     if (CurrentGroupPair == StructuralGroups.end()) {
883       IRSimilarityCandidate::createCanonicalMappingFor(*CandIt);
884       std::tie(CurrentGroupPair, Inserted) = StructuralGroups.insert(
885           std::make_pair(OuterGroupNum, SimilarityGroup({*CandIt})));
886     }
887 
888     // Iterate over the IRSimilarityCandidates following the current
889     // IRSimilarityCandidate in the list to determine whether the two
890     // IRSimilarityCandidates are compatible.  This is so we do not repeat pairs
891     // of IRSimilarityCandidates.
892     for (InnerCandIt = std::next(CandIt),
893         InnerCandEndIt = CandsForRepSubstring.end();
894          InnerCandIt != InnerCandEndIt; InnerCandIt++) {
895 
896       // We check if the inner item has a group already, if it does, we skip it.
897       CandToGroupItInner = CandToGroup.find(&*InnerCandIt);
898       if (CandToGroupItInner != CandToGroup.end())
899         continue;
900 
901       // Otherwise we determine if they have the same structure and add it to
902       // vector if they match.
903       ValueNumberMappingA.clear();
904       ValueNumberMappingB.clear();
905       SameStructure = IRSimilarityCandidate::compareStructure(
906           *CandIt, *InnerCandIt, ValueNumberMappingA, ValueNumberMappingB);
907       if (!SameStructure)
908         continue;
909 
910       InnerCandIt->createCanonicalRelationFrom(*CandIt, ValueNumberMappingA,
911                                                ValueNumberMappingB);
912       CandToGroup.insert(std::make_pair(&*InnerCandIt, OuterGroupNum));
913       CurrentGroupPair->second.push_back(*InnerCandIt);
914     }
915   }
916 }
917 
918 void IRSimilarityIdentifier::findCandidates(
919     std::vector<IRInstructionData *> &InstrList,
920     std::vector<unsigned> &IntegerMapping) {
921   SuffixTree ST(IntegerMapping);
922 
923   std::vector<IRSimilarityCandidate> CandsForRepSubstring;
924   std::vector<SimilarityGroup> NewCandidateGroups;
925 
926   DenseMap<unsigned, SimilarityGroup> StructuralGroups;
927 
928   // Iterate over the subsequences found by the Suffix Tree to create
929   // IRSimilarityCandidates for each repeated subsequence and determine which
930   // instances are structurally similar to one another.
931   for (SuffixTree::RepeatedSubstring &RS : ST) {
932     createCandidatesFromSuffixTree(Mapper, InstrList, IntegerMapping, RS,
933                                    CandsForRepSubstring);
934 
935     if (CandsForRepSubstring.size() < 2)
936       continue;
937 
938     findCandidateStructures(CandsForRepSubstring, StructuralGroups);
939     for (std::pair<unsigned, SimilarityGroup> &Group : StructuralGroups)
940       // We only add the group if it contains more than one
941       // IRSimilarityCandidate.  If there is only one, that means there is no
942       // other repeated subsequence with the same structure.
943       if (Group.second.size() > 1)
944         SimilarityCandidates->push_back(Group.second);
945 
946     CandsForRepSubstring.clear();
947     StructuralGroups.clear();
948     NewCandidateGroups.clear();
949   }
950 }
951 
952 SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(
953     ArrayRef<std::unique_ptr<Module>> Modules) {
954   resetSimilarityCandidates();
955 
956   std::vector<IRInstructionData *> InstrList;
957   std::vector<unsigned> IntegerMapping;
958 
959   populateMapper(Modules, InstrList, IntegerMapping);
960   findCandidates(InstrList, IntegerMapping);
961 
962   return SimilarityCandidates.getValue();
963 }
964 
965 SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) {
966   resetSimilarityCandidates();
967 
968   std::vector<IRInstructionData *> InstrList;
969   std::vector<unsigned> IntegerMapping;
970 
971   populateMapper(M, InstrList, IntegerMapping);
972   findCandidates(InstrList, IntegerMapping);
973 
974   return SimilarityCandidates.getValue();
975 }
976 
977 INITIALIZE_PASS(IRSimilarityIdentifierWrapperPass, "ir-similarity-identifier",
978                 "ir-similarity-identifier", false, true)
979 
980 IRSimilarityIdentifierWrapperPass::IRSimilarityIdentifierWrapperPass()
981     : ModulePass(ID) {
982   initializeIRSimilarityIdentifierWrapperPassPass(
983       *PassRegistry::getPassRegistry());
984 }
985 
986 bool IRSimilarityIdentifierWrapperPass::doInitialization(Module &M) {
987   IRSI.reset(new IRSimilarityIdentifier());
988   return false;
989 }
990 
991 bool IRSimilarityIdentifierWrapperPass::doFinalization(Module &M) {
992   IRSI.reset();
993   return false;
994 }
995 
996 bool IRSimilarityIdentifierWrapperPass::runOnModule(Module &M) {
997   IRSI->findSimilarity(M);
998   return false;
999 }
1000 
1001 AnalysisKey IRSimilarityAnalysis::Key;
1002 IRSimilarityIdentifier IRSimilarityAnalysis::run(Module &M,
1003                                                ModuleAnalysisManager &) {
1004 
1005   auto IRSI = IRSimilarityIdentifier();
1006   IRSI.findSimilarity(M);
1007   return IRSI;
1008 }
1009 
1010 PreservedAnalyses
1011 IRSimilarityAnalysisPrinterPass::run(Module &M, ModuleAnalysisManager &AM) {
1012   IRSimilarityIdentifier &IRSI = AM.getResult<IRSimilarityAnalysis>(M);
1013   Optional<SimilarityGroupList> &SimilarityCandidatesOpt = IRSI.getSimilarity();
1014 
1015   for (std::vector<IRSimilarityCandidate> &CandVec : *SimilarityCandidatesOpt) {
1016     OS << CandVec.size() << " candidates of length "
1017        << CandVec.begin()->getLength() << ".  Found in: \n";
1018     for (IRSimilarityCandidate &Cand : CandVec) {
1019       OS << "  Function: " << Cand.front()->Inst->getFunction()->getName().str()
1020          << ", Basic Block: ";
1021       if (Cand.front()->Inst->getParent()->getName().str() == "")
1022         OS << "(unnamed)";
1023       else
1024         OS << Cand.front()->Inst->getParent()->getName().str();
1025       OS << "\n    Start Instruction: ";
1026       Cand.frontInstruction()->print(OS);
1027       OS << "\n      End Instruction: ";
1028       Cand.backInstruction()->print(OS);
1029       OS << "\n";
1030     }
1031   }
1032 
1033   return PreservedAnalyses::all();
1034 }
1035 
1036 char IRSimilarityIdentifierWrapperPass::ID = 0;
1037