1 //===- IRSimilarityIdentifier.cpp - Find similarity in a module -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // Implementation file for the IRSimilarityIdentifier for identifying
11 // similarities in IR including the IRInstructionMapper.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/Analysis/IRSimilarityIdentifier.h"
16 #include "llvm/ADT/DenseMap.h"
17 #include "llvm/IR/Intrinsics.h"
18 #include "llvm/IR/Operator.h"
19 #include "llvm/IR/User.h"
20 #include "llvm/InitializePasses.h"
21 #include "llvm/Support/SuffixTree.h"
22 
23 using namespace llvm;
24 using namespace IRSimilarity;
25 
26 IRInstructionData::IRInstructionData(Instruction &I, bool Legality,
27                                      IRInstructionDataList &IDList)
28     : Inst(&I), Legal(Legality), IDL(&IDList) {
29   // We check for whether we have a comparison instruction.  If it is, we
30   // find the "less than" version of the predicate for consistency for
31   // comparison instructions throught the program.
32   if (CmpInst *C = dyn_cast<CmpInst>(&I)) {
33     CmpInst::Predicate Predicate = predicateForConsistency(C);
34     if (Predicate != C->getPredicate())
35       RevisedPredicate = Predicate;
36   }
37 
38   // Here we collect the operands and their types for determining whether
39   // the structure of the operand use matches between two different candidates.
40   for (Use &OI : I.operands()) {
41     if (isa<CmpInst>(I) && RevisedPredicate.hasValue()) {
42       // If we have a CmpInst where the predicate is reversed, it means the
43       // operands must be reversed as well.
44       OperVals.insert(OperVals.begin(), OI.get());
45       continue;
46     }
47 
48     OperVals.push_back(OI.get());
49   }
50 }
51 
52 CmpInst::Predicate IRInstructionData::predicateForConsistency(CmpInst *CI) {
53   switch (CI->getPredicate()) {
54   case CmpInst::FCMP_OGT:
55   case CmpInst::FCMP_UGT:
56   case CmpInst::FCMP_OGE:
57   case CmpInst::FCMP_UGE:
58   case CmpInst::ICMP_SGT:
59   case CmpInst::ICMP_UGT:
60   case CmpInst::ICMP_SGE:
61   case CmpInst::ICMP_UGE:
62     return CI->getSwappedPredicate();
63   default:
64     return CI->getPredicate();
65   }
66 }
67 
68 CmpInst::Predicate IRInstructionData::getPredicate() const {
69   assert(isa<CmpInst>(Inst) &&
70          "Can only get a predicate from a compare instruction");
71 
72   if (RevisedPredicate.hasValue())
73     return RevisedPredicate.getValue();
74 
75   return cast<CmpInst>(Inst)->getPredicate();
76 }
77 
78 bool IRSimilarity::isClose(const IRInstructionData &A,
79                            const IRInstructionData &B) {
80 
81   if (!A.Legal || !B.Legal)
82     return false;
83 
84   // Check if we are performing the same sort of operation on the same types
85   // but not on the same values.
86   if (A.Inst->isSameOperationAs(B.Inst))
87     return true;
88 
89   // If there is a predicate, this means that either there is a swapped
90   // predicate, or that the types are different, we want to make sure that
91   // the predicates are equivalent via swapping.
92   if (isa<CmpInst>(A.Inst) && isa<CmpInst>(B.Inst)) {
93 
94     if (A.getPredicate() != B.getPredicate())
95       return false;
96 
97     // If the predicates are the same via swap, make sure that the types are
98     // still the same.
99     auto ZippedTypes = zip(A.OperVals, B.OperVals);
100 
101     return all_of(ZippedTypes, [](std::tuple<llvm::Value *, llvm::Value *> R) {
102       return std::get<0>(R)->getType() == std::get<1>(R)->getType();
103     });
104   }
105 
106   return false;
107 }
108 
109 // TODO: This is the same as the MachineOutliner, and should be consolidated
110 // into the same interface.
111 void IRInstructionMapper::convertToUnsignedVec(
112     BasicBlock &BB, std::vector<IRInstructionData *> &InstrList,
113     std::vector<unsigned> &IntegerMapping) {
114   BasicBlock::iterator It = BB.begin();
115 
116   std::vector<unsigned> IntegerMappingForBB;
117   std::vector<IRInstructionData *> InstrListForBB;
118 
119   HaveLegalRange = false;
120   CanCombineWithPrevInstr = false;
121   AddedIllegalLastTime = true;
122 
123   for (BasicBlock::iterator Et = BB.end(); It != Et; ++It) {
124     switch (InstClassifier.visit(*It)) {
125     case InstrType::Legal:
126       mapToLegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
127       break;
128     case InstrType::Illegal:
129       mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB);
130       break;
131     case InstrType::Invisible:
132       AddedIllegalLastTime = false;
133       break;
134     }
135   }
136 
137   if (HaveLegalRange) {
138     mapToIllegalUnsigned(It, IntegerMappingForBB, InstrListForBB, true);
139     for_each(InstrListForBB,
140              [this](IRInstructionData *ID) { this->IDL->push_back(*ID); });
141     llvm::append_range(InstrList, InstrListForBB);
142     llvm::append_range(IntegerMapping, IntegerMappingForBB);
143   }
144 }
145 
146 // TODO: This is the same as the MachineOutliner, and should be consolidated
147 // into the same interface.
148 unsigned IRInstructionMapper::mapToLegalUnsigned(
149     BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
150     std::vector<IRInstructionData *> &InstrListForBB) {
151   // We added something legal, so we should unset the AddedLegalLastTime
152   // flag.
153   AddedIllegalLastTime = false;
154 
155   // If we have at least two adjacent legal instructions (which may have
156   // invisible instructions in between), remember that.
157   if (CanCombineWithPrevInstr)
158     HaveLegalRange = true;
159   CanCombineWithPrevInstr = true;
160 
161   // Get the integer for this instruction or give it the current
162   // LegalInstrNumber.
163   IRInstructionData *ID = allocateIRInstructionData(*It, true, *IDL);
164   InstrListForBB.push_back(ID);
165 
166   // Add to the instruction list
167   bool WasInserted;
168   DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>::iterator
169       ResultIt;
170   std::tie(ResultIt, WasInserted) =
171       InstructionIntegerMap.insert(std::make_pair(ID, LegalInstrNumber));
172   unsigned INumber = ResultIt->second;
173 
174   // There was an insertion.
175   if (WasInserted)
176     LegalInstrNumber++;
177 
178   IntegerMappingForBB.push_back(INumber);
179 
180   // Make sure we don't overflow or use any integers reserved by the DenseMap.
181   assert(LegalInstrNumber < IllegalInstrNumber &&
182          "Instruction mapping overflow!");
183 
184   assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
185          "Tried to assign DenseMap tombstone or empty key to instruction.");
186   assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
187          "Tried to assign DenseMap tombstone or empty key to instruction.");
188 
189   return INumber;
190 }
191 
192 IRInstructionData *
193 IRInstructionMapper::allocateIRInstructionData(Instruction &I, bool Legality,
194                                                IRInstructionDataList &IDL) {
195   return new (InstDataAllocator->Allocate()) IRInstructionData(I, Legality, IDL);
196 }
197 
198 IRInstructionDataList *
199 IRInstructionMapper::allocateIRInstructionDataList() {
200   return new (IDLAllocator->Allocate()) IRInstructionDataList();
201 }
202 
203 // TODO: This is the same as the MachineOutliner, and should be consolidated
204 // into the same interface.
205 unsigned IRInstructionMapper::mapToIllegalUnsigned(
206     BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
207     std::vector<IRInstructionData *> &InstrListForBB, bool End) {
208   // Can't combine an illegal instruction. Set the flag.
209   CanCombineWithPrevInstr = false;
210 
211   // Only add one illegal number per range of legal numbers.
212   if (AddedIllegalLastTime)
213     return IllegalInstrNumber;
214 
215   IRInstructionData *ID = nullptr;
216   if (!End)
217     ID = allocateIRInstructionData(*It, false, *IDL);
218   InstrListForBB.push_back(ID);
219 
220   // Remember that we added an illegal number last time.
221   AddedIllegalLastTime = true;
222   unsigned INumber = IllegalInstrNumber;
223   IntegerMappingForBB.push_back(IllegalInstrNumber--);
224 
225   assert(LegalInstrNumber < IllegalInstrNumber &&
226          "Instruction mapping overflow!");
227 
228   assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
229          "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
230 
231   assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
232          "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
233 
234   return INumber;
235 }
236 
237 IRSimilarityCandidate::IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
238                                              IRInstructionData *FirstInstIt,
239                                              IRInstructionData *LastInstIt)
240     : StartIdx(StartIdx), Len(Len) {
241 
242   assert(FirstInstIt != nullptr && "Instruction is nullptr!");
243   assert(LastInstIt != nullptr && "Instruction is nullptr!");
244   assert(StartIdx + Len > StartIdx &&
245          "Overflow for IRSimilarityCandidate range?");
246   assert(Len - 1 == static_cast<unsigned>(std::distance(
247                         iterator(FirstInstIt), iterator(LastInstIt))) &&
248          "Length of the first and last IRInstructionData do not match the "
249          "given length");
250 
251   // We iterate over the given instructions, and map each unique value
252   // to a unique number in the IRSimilarityCandidate ValueToNumber and
253   // NumberToValue maps.  A constant get its own value globally, the individual
254   // uses of the constants are not considered to be unique.
255   //
256   // IR:                    Mapping Added:
257   // %add1 = add i32 %a, c1    %add1 -> 3, %a -> 1, c1 -> 2
258   // %add2 = add i32 %a, %1    %add2 -> 4
259   // %add3 = add i32 c2, c1    %add3 -> 6, c2 -> 5
260   //
261   // when replace with global values, starting from 1, would be
262   //
263   // 3 = add i32 1, 2
264   // 4 = add i32 1, 3
265   // 6 = add i32 5, 2
266   unsigned LocalValNumber = 1;
267   IRInstructionDataList::iterator ID = iterator(*FirstInstIt);
268   for (unsigned Loc = StartIdx; Loc < StartIdx + Len; Loc++, ID++) {
269     // Map the operand values to an unsigned integer if it does not already
270     // have an unsigned integer assigned to it.
271     for (Value *Arg : ID->OperVals)
272       if (ValueToNumber.find(Arg) == ValueToNumber.end()) {
273         ValueToNumber.try_emplace(Arg, LocalValNumber);
274         NumberToValue.try_emplace(LocalValNumber, Arg);
275         LocalValNumber++;
276       }
277 
278     // Mapping the instructions to an unsigned integer if it is not already
279     // exist in the mapping.
280     if (ValueToNumber.find(ID->Inst) == ValueToNumber.end()) {
281       ValueToNumber.try_emplace(ID->Inst, LocalValNumber);
282       NumberToValue.try_emplace(LocalValNumber, ID->Inst);
283       LocalValNumber++;
284     }
285   }
286 
287   // Setting the first and last instruction data pointers for the candidate.  If
288   // we got through the entire for loop without hitting an assert, we know
289   // that both of these instructions are not nullptrs.
290   FirstInst = FirstInstIt;
291   LastInst = LastInstIt;
292 }
293 
294 bool IRSimilarityCandidate::isSimilar(const IRSimilarityCandidate &A,
295                                       const IRSimilarityCandidate &B) {
296   if (A.getLength() != B.getLength())
297     return false;
298 
299   auto InstrDataForBoth =
300       zip(make_range(A.begin(), A.end()), make_range(B.begin(), B.end()));
301 
302   return all_of(InstrDataForBoth,
303                 [](std::tuple<IRInstructionData &, IRInstructionData &> R) {
304                   IRInstructionData &A = std::get<0>(R);
305                   IRInstructionData &B = std::get<1>(R);
306                   if (!A.Legal || !B.Legal)
307                     return false;
308                   return isClose(A, B);
309                 });
310 }
311 
312 /// Determine if one or more of the assigned global value numbers for the
313 /// operands in \p TargetValueNumbers is in the current mapping set for operand
314 /// numbers in \p SourceOperands.  The set of possible corresponding global
315 /// value numbers are replaced with the most recent version of compatible
316 /// values.
317 ///
318 /// \param [in] SourceValueToNumberMapping - The mapping of a Value to global
319 /// value number for the source IRInstructionCandidate.
320 /// \param [in, out] CurrentSrcTgtNumberMapping - The current mapping of source
321 /// IRSimilarityCandidate global value numbers to a set of possible numbers in
322 /// the target.
323 /// \param [in] SourceOperands - The operands in the original
324 /// IRSimilarityCandidate in the current instruction.
325 /// \param [in] TargetValueNumbers - The global value numbers of the operands in
326 /// the corresponding Instruction in the other IRSimilarityCandidate.
327 /// \returns true if there exists a possible mapping between the source
328 /// Instruction operands and the target Instruction operands, and false if not.
329 static bool checkNumberingAndReplaceCommutative(
330   const DenseMap<Value *, unsigned> &SourceValueToNumberMapping,
331   DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
332   ArrayRef<Value *> &SourceOperands,
333   DenseSet<unsigned> &TargetValueNumbers){
334 
335   DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
336 
337   unsigned ArgVal;
338   bool WasInserted;
339 
340   // Iterate over the operands in the source IRSimilarityCandidate to determine
341   // whether there exists an operand in the other IRSimilarityCandidate that
342   // creates a valid mapping of Value to Value between the
343   // IRSimilarityCaniddates.
344   for (Value *V : SourceOperands) {
345     ArgVal = SourceValueToNumberMapping.find(V)->second;
346 
347     std::tie(ValueMappingIt, WasInserted) = CurrentSrcTgtNumberMapping.insert(
348         std::make_pair(ArgVal, TargetValueNumbers));
349 
350     // Instead of finding a current mapping, we inserted a set.  This means a
351     // mapping did not exist for the source Instruction operand, it has no
352     // current constraints we need to check.
353     if (WasInserted)
354       continue;
355 
356     // If a mapping already exists for the source operand to the values in the
357     // other IRSimilarityCandidate we need to iterate over the items in other
358     // IRSimilarityCandidate's Instruction to determine whether there is a valid
359     // mapping of Value to Value.
360     DenseSet<unsigned> NewSet;
361     for (unsigned &Curr : ValueMappingIt->second)
362       // If we can find the value in the mapping, we add it to the new set.
363       if (TargetValueNumbers.find(Curr) != TargetValueNumbers.end())
364         NewSet.insert(Curr);
365 
366     // If we could not find a Value, return 0.
367     if (NewSet.empty())
368       return false;
369 
370     // Otherwise replace the old mapping with the newly constructed one.
371     if (NewSet.size() != ValueMappingIt->second.size())
372       ValueMappingIt->second.swap(NewSet);
373 
374     // We have reached no conclusions about the mapping, and cannot remove
375     // any items from the other operands, so we move to check the next operand.
376     if (ValueMappingIt->second.size() != 1)
377       continue;
378 
379 
380     unsigned ValToRemove = *ValueMappingIt->second.begin();
381     // When there is only one item left in the mapping for and operand, remove
382     // the value from the other operands.  If it results in there being no
383     // mapping, return false, it means the mapping is wrong
384     for (Value *InnerV : SourceOperands) {
385       if (V == InnerV)
386         continue;
387 
388       unsigned InnerVal = SourceValueToNumberMapping.find(InnerV)->second;
389       ValueMappingIt = CurrentSrcTgtNumberMapping.find(InnerVal);
390       if (ValueMappingIt == CurrentSrcTgtNumberMapping.end())
391         continue;
392 
393       ValueMappingIt->second.erase(ValToRemove);
394       if (ValueMappingIt->second.empty())
395         return false;
396     }
397   }
398 
399   return true;
400 }
401 
402 /// Determine if operand number \p TargetArgVal is in the current mapping set
403 /// for operand number \p SourceArgVal.
404 ///
405 /// \param [in, out] CurrentSrcTgtNumberMapping current mapping of global
406 /// value numbers from source IRSimilarityCandidate to target
407 /// IRSimilarityCandidate.
408 /// \param [in] SourceArgVal The global value number for an operand in the
409 /// in the original candidate.
410 /// \param [in] TargetArgVal The global value number for the corresponding
411 /// operand in the other candidate.
412 /// \returns True if there exists a mapping and false if not.
413 bool checkNumberingAndReplace(
414     DenseMap<unsigned, DenseSet<unsigned>> &CurrentSrcTgtNumberMapping,
415     unsigned SourceArgVal, unsigned TargetArgVal) {
416   // We are given two unsigned integers representing the global values of
417   // the operands in different IRSimilarityCandidates and a current mapping
418   // between the two.
419   //
420   // Source Operand GVN: 1
421   // Target Operand GVN: 2
422   // CurrentMapping: {1: {1, 2}}
423   //
424   // Since we have mapping, and the target operand is contained in the set, we
425   // update it to:
426   // CurrentMapping: {1: {2}}
427   // and can return true. But, if the mapping was
428   // CurrentMapping: {1: {3}}
429   // we would return false.
430 
431   bool WasInserted;
432   DenseMap<unsigned, DenseSet<unsigned>>::iterator Val;
433 
434   std::tie(Val, WasInserted) = CurrentSrcTgtNumberMapping.insert(
435       std::make_pair(SourceArgVal, DenseSet<unsigned>({TargetArgVal})));
436 
437   // If we created a new mapping, then we are done.
438   if (WasInserted)
439     return true;
440 
441   // If there is more than one option in the mapping set, and the target value
442   // is included in the mapping set replace that set with one that only includes
443   // the target value, as it is the only valid mapping via the non commutative
444   // instruction.
445 
446   DenseSet<unsigned> &TargetSet = Val->second;
447   if (TargetSet.size() > 1 && TargetSet.contains(TargetArgVal)) {
448     TargetSet.clear();
449     TargetSet.insert(TargetArgVal);
450     return true;
451   }
452 
453   // Return true if we can find the value in the set.
454   return TargetSet.contains(TargetArgVal);
455 }
456 
457 bool IRSimilarityCandidate::compareNonCommutativeOperandMapping(
458     OperandMapping A, OperandMapping B) {
459   // Iterators to keep track of where we are in the operands for each
460   // Instruction.
461   ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
462   ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
463   unsigned OperandLength = A.OperVals.size();
464 
465   // For each operand, get the value numbering and ensure it is consistent.
466   for (unsigned Idx = 0; Idx < OperandLength; Idx++, VItA++, VItB++) {
467     unsigned OperValA = A.IRSC.ValueToNumber.find(*VItA)->second;
468     unsigned OperValB = B.IRSC.ValueToNumber.find(*VItB)->second;
469 
470     // Attempt to add a set with only the target value.  If there is no mapping
471     // we can create it here.
472     //
473     // For an instruction like a subtraction:
474     // IRSimilarityCandidateA:  IRSimilarityCandidateB:
475     // %resultA = sub %a, %b    %resultB = sub %d, %e
476     //
477     // We map %a -> %d and %b -> %e.
478     //
479     // And check to see whether their mapping is consistent in
480     // checkNumberingAndReplace.
481 
482     if (!checkNumberingAndReplace(A.ValueNumberMapping, OperValA, OperValB))
483       return false;
484 
485     if (!checkNumberingAndReplace(B.ValueNumberMapping, OperValB, OperValA))
486       return false;
487   }
488   return true;
489 }
490 
491 bool IRSimilarityCandidate::compareCommutativeOperandMapping(
492     OperandMapping A, OperandMapping B) {
493   DenseSet<unsigned> ValueNumbersA;
494   DenseSet<unsigned> ValueNumbersB;
495 
496   ArrayRef<Value *>::iterator VItA = A.OperVals.begin();
497   ArrayRef<Value *>::iterator VItB = B.OperVals.begin();
498   unsigned OperandLength = A.OperVals.size();
499 
500   // Find the value number sets for the operands.
501   for (unsigned Idx = 0; Idx < OperandLength;
502        Idx++, VItA++, VItB++) {
503     ValueNumbersA.insert(A.IRSC.ValueToNumber.find(*VItA)->second);
504     ValueNumbersB.insert(B.IRSC.ValueToNumber.find(*VItB)->second);
505   }
506 
507   // Iterate over the operands in the first IRSimilarityCandidate and make sure
508   // there exists a possible mapping with the operands in the second
509   // IRSimilarityCandidate.
510   if (!checkNumberingAndReplaceCommutative(A.IRSC.ValueToNumber,
511                                            A.ValueNumberMapping, A.OperVals,
512                                            ValueNumbersB))
513     return false;
514 
515   // Iterate over the operands in the second IRSimilarityCandidate and make sure
516   // there exists a possible mapping with the operands in the first
517   // IRSimilarityCandidate.
518   if (!checkNumberingAndReplaceCommutative(B.IRSC.ValueToNumber,
519                                            B.ValueNumberMapping, B.OperVals,
520                                            ValueNumbersA))
521     return false;
522 
523   return true;
524 }
525 
526 bool IRSimilarityCandidate::compareStructure(const IRSimilarityCandidate &A,
527                                              const IRSimilarityCandidate &B) {
528   if (A.getLength() != B.getLength())
529     return false;
530 
531   if (A.ValueToNumber.size() != B.ValueToNumber.size())
532     return false;
533 
534   iterator ItA = A.begin();
535   iterator ItB = B.begin();
536 
537   // These sets create a create a mapping between the values in one candidate
538   // to values in the other candidate.  If we create a set with one element,
539   // and that same element maps to the original element in the candidate
540   // we have a good mapping.
541   DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingA;
542   DenseMap<unsigned, DenseSet<unsigned>> ValueNumberMappingB;
543   DenseMap<unsigned, DenseSet<unsigned>>::iterator ValueMappingIt;
544 
545   bool WasInserted;
546 
547   // Iterate over the instructions contained in each candidate
548   unsigned SectionLength = A.getStartIdx() + A.getLength();
549   for (unsigned Loc = A.getStartIdx(); Loc < SectionLength;
550        ItA++, ItB++, Loc++) {
551     // Make sure the instructions are similar to one another.
552     if (!isClose(*ItA, *ItB))
553       return false;
554 
555     Instruction *IA = ItA->Inst;
556     Instruction *IB = ItB->Inst;
557 
558     if (!ItA->Legal || !ItB->Legal)
559       return false;
560 
561     // Get the operand sets for the instructions.
562     ArrayRef<Value *> OperValsA = ItA->OperVals;
563     ArrayRef<Value *> OperValsB = ItB->OperVals;
564 
565     unsigned InstValA = A.ValueToNumber.find(IA)->second;
566     unsigned InstValB = B.ValueToNumber.find(IB)->second;
567 
568     // Ensure that the mappings for the instructions exists.
569     std::tie(ValueMappingIt, WasInserted) = ValueNumberMappingA.insert(
570         std::make_pair(InstValA, DenseSet<unsigned>({InstValB})));
571     if (!WasInserted && !ValueMappingIt->second.contains(InstValB))
572       return false;
573 
574     std::tie(ValueMappingIt, WasInserted) = ValueNumberMappingB.insert(
575         std::make_pair(InstValB, DenseSet<unsigned>({InstValA})));
576     if (!WasInserted && !ValueMappingIt->second.contains(InstValA))
577       return false;
578 
579     // We have different paths for commutative instructions and non-commutative
580     // instructions since commutative instructions could allow multiple mappings
581     // to certain values.
582     if (IA->isCommutative() && !isa<FPMathOperator>(IA)) {
583       if (!compareCommutativeOperandMapping(
584               {A, OperValsA, ValueNumberMappingA},
585               {B, OperValsB, ValueNumberMappingB}))
586         return false;
587       continue;
588     }
589 
590     // Handle the non-commutative cases.
591     if (!compareNonCommutativeOperandMapping(
592             {A, OperValsA, ValueNumberMappingA},
593             {B, OperValsB, ValueNumberMappingB}))
594       return false;
595   }
596   return true;
597 }
598 
599 bool IRSimilarityCandidate::overlap(const IRSimilarityCandidate &A,
600                                     const IRSimilarityCandidate &B) {
601   auto DoesOverlap = [](const IRSimilarityCandidate &X,
602                         const IRSimilarityCandidate &Y) {
603     // Check:
604     // XXXXXX        X starts before Y ends
605     //      YYYYYYY  Y starts after X starts
606     return X.StartIdx <= Y.getEndIdx() && Y.StartIdx >= X.StartIdx;
607   };
608 
609   return DoesOverlap(A, B) || DoesOverlap(B, A);
610 }
611 
612 void IRSimilarityIdentifier::populateMapper(
613     Module &M, std::vector<IRInstructionData *> &InstrList,
614     std::vector<unsigned> &IntegerMapping) {
615 
616   std::vector<IRInstructionData *> InstrListForModule;
617   std::vector<unsigned> IntegerMappingForModule;
618   // Iterate over the functions in the module to map each Instruction in each
619   // BasicBlock to an unsigned integer.
620   for (Function &F : M) {
621 
622     if (F.empty())
623       continue;
624 
625     for (BasicBlock &BB : F) {
626 
627       if (BB.sizeWithoutDebug() < 2)
628         continue;
629 
630       // BB has potential to have similarity since it has a size greater than 2
631       // and can therefore match other regions greater than 2. Map it to a list
632       // of unsigned integers.
633       Mapper.convertToUnsignedVec(BB, InstrListForModule,
634                                   IntegerMappingForModule);
635     }
636   }
637 
638   // Insert the InstrListForModule at the end of the overall InstrList so that
639   // we can have a long InstrList for the entire set of Modules being analyzed.
640   llvm::append_range(InstrList, InstrListForModule);
641   // Do the same as above, but for IntegerMapping.
642   llvm::append_range(IntegerMapping, IntegerMappingForModule);
643 }
644 
645 void IRSimilarityIdentifier::populateMapper(
646     ArrayRef<std::unique_ptr<Module>> &Modules,
647     std::vector<IRInstructionData *> &InstrList,
648     std::vector<unsigned> &IntegerMapping) {
649 
650   // Iterate over, and map the instructions in each module.
651   for (const std::unique_ptr<Module> &M : Modules)
652     populateMapper(*M, InstrList, IntegerMapping);
653 }
654 
655 /// From a repeated subsequence, find all the different instances of the
656 /// subsequence from the \p InstrList, and create an IRSimilarityCandidate from
657 /// the IRInstructionData in subsequence.
658 ///
659 /// \param [in] Mapper - The instruction mapper for sanity checks.
660 /// \param [in] InstrList - The vector that holds the instruction data.
661 /// \param [in] IntegerMapping - The vector that holds the mapped integers.
662 /// \param [out] CandsForRepSubstring - The vector to store the generated
663 /// IRSimilarityCandidates.
664 static void createCandidatesFromSuffixTree(
665     IRInstructionMapper Mapper, std::vector<IRInstructionData *> &InstrList,
666     std::vector<unsigned> &IntegerMapping, SuffixTree::RepeatedSubstring &RS,
667     std::vector<IRSimilarityCandidate> &CandsForRepSubstring) {
668 
669   unsigned StringLen = RS.Length;
670 
671   // Create an IRSimilarityCandidate for instance of this subsequence \p RS.
672   for (const unsigned &StartIdx : RS.StartIndices) {
673     unsigned EndIdx = StartIdx + StringLen - 1;
674 
675     // Check that this subsequence does not contain an illegal instruction.
676     bool ContainsIllegal = false;
677     for (unsigned CurrIdx = StartIdx; CurrIdx <= EndIdx; CurrIdx++) {
678       unsigned Key = IntegerMapping[CurrIdx];
679       if (Key > Mapper.IllegalInstrNumber) {
680         ContainsIllegal = true;
681         break;
682       }
683     }
684 
685     // If we have an illegal instruction, we should not create an
686     // IRSimilarityCandidate for this region.
687     if (ContainsIllegal)
688       continue;
689 
690     // We are getting iterators to the instructions in this region of code
691     // by advancing the start and end indices from the start of the
692     // InstrList.
693     std::vector<IRInstructionData *>::iterator StartIt = InstrList.begin();
694     std::advance(StartIt, StartIdx);
695     std::vector<IRInstructionData *>::iterator EndIt = InstrList.begin();
696     std::advance(EndIt, EndIdx);
697 
698     CandsForRepSubstring.emplace_back(StartIdx, StringLen, *StartIt, *EndIt);
699   }
700 }
701 
702 /// From the list of IRSimilarityCandidates, perform a comparison between each
703 /// IRSimilarityCandidate to determine if there are overlapping
704 /// IRInstructionData, or if they do not have the same structure.
705 ///
706 /// \param [in] CandsForRepSubstring - The vector containing the
707 /// IRSimilarityCandidates.
708 /// \param [out] StructuralGroups - the mapping of unsigned integers to vector
709 /// of IRSimilarityCandidates where each of the IRSimilarityCandidates in the
710 /// vector are structurally similar to one another.
711 static void findCandidateStructures(
712     std::vector<IRSimilarityCandidate> &CandsForRepSubstring,
713     DenseMap<unsigned, SimilarityGroup> &StructuralGroups) {
714   std::vector<IRSimilarityCandidate>::iterator CandIt, CandEndIt, InnerCandIt,
715       InnerCandEndIt;
716 
717   // IRSimilarityCandidates each have a structure for operand use.  It is
718   // possible that two instances of the same subsequences have different
719   // structure. Each type of structure found is assigned a number.  This
720   // DenseMap maps an IRSimilarityCandidate to which type of similarity
721   // discovered it fits within.
722   DenseMap<IRSimilarityCandidate *, unsigned> CandToGroup;
723 
724   // Find the compatibility from each candidate to the others to determine
725   // which candidates overlap and which have the same structure by mapping
726   // each structure to a different group.
727   bool SameStructure;
728   bool Inserted;
729   unsigned CurrentGroupNum = 0;
730   unsigned OuterGroupNum;
731   DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupIt;
732   DenseMap<IRSimilarityCandidate *, unsigned>::iterator CandToGroupItInner;
733   DenseMap<unsigned, SimilarityGroup>::iterator CurrentGroupPair;
734 
735   // Iterate over the candidates to determine its structural and overlapping
736   // compatibility with other instructions
737   for (CandIt = CandsForRepSubstring.begin(),
738       CandEndIt = CandsForRepSubstring.end();
739        CandIt != CandEndIt; CandIt++) {
740 
741     // Determine if it has an assigned structural group already.
742     CandToGroupIt = CandToGroup.find(&*CandIt);
743     if (CandToGroupIt == CandToGroup.end()) {
744       // If not, we assign it one, and add it to our mapping.
745       std::tie(CandToGroupIt, Inserted) =
746           CandToGroup.insert(std::make_pair(&*CandIt, CurrentGroupNum++));
747     }
748 
749     // Get the structural group number from the iterator.
750     OuterGroupNum = CandToGroupIt->second;
751 
752     // Check if we already have a list of IRSimilarityCandidates for the current
753     // structural group.  Create one if one does not exist.
754     CurrentGroupPair = StructuralGroups.find(OuterGroupNum);
755     if (CurrentGroupPair == StructuralGroups.end())
756       std::tie(CurrentGroupPair, Inserted) = StructuralGroups.insert(
757           std::make_pair(OuterGroupNum, SimilarityGroup({*CandIt})));
758 
759     // Iterate over the IRSimilarityCandidates following the current
760     // IRSimilarityCandidate in the list to determine whether the two
761     // IRSimilarityCandidates are compatible.  This is so we do not repeat pairs
762     // of IRSimilarityCandidates.
763     for (InnerCandIt = std::next(CandIt),
764         InnerCandEndIt = CandsForRepSubstring.end();
765          InnerCandIt != InnerCandEndIt; InnerCandIt++) {
766 
767       // We check if the inner item has a group already, if it does, we skip it.
768       CandToGroupItInner = CandToGroup.find(&*InnerCandIt);
769       if (CandToGroupItInner != CandToGroup.end())
770         continue;
771 
772       // Otherwise we determine if they have the same structure and add it to
773       // vector if they match.
774       SameStructure =
775           IRSimilarityCandidate::compareStructure(*CandIt, *InnerCandIt);
776       if (!SameStructure)
777         continue;
778 
779       CandToGroup.insert(std::make_pair(&*InnerCandIt, OuterGroupNum));
780       CurrentGroupPair->second.push_back(*InnerCandIt);
781     }
782   }
783 }
784 
785 void IRSimilarityIdentifier::findCandidates(
786     std::vector<IRInstructionData *> &InstrList,
787     std::vector<unsigned> &IntegerMapping) {
788   SuffixTree ST(IntegerMapping);
789 
790   std::vector<IRSimilarityCandidate> CandsForRepSubstring;
791   std::vector<SimilarityGroup> NewCandidateGroups;
792 
793   DenseMap<unsigned, SimilarityGroup> StructuralGroups;
794 
795   // Iterate over the subsequences found by the Suffix Tree to create
796   // IRSimilarityCandidates for each repeated subsequence and determine which
797   // instances are structurally similar to one another.
798   for (auto It = ST.begin(), Et = ST.end(); It != Et; ++It) {
799     createCandidatesFromSuffixTree(Mapper, InstrList, IntegerMapping, *It,
800                                    CandsForRepSubstring);
801 
802     if (CandsForRepSubstring.size() < 2)
803       continue;
804 
805     findCandidateStructures(CandsForRepSubstring, StructuralGroups);
806     for (std::pair<unsigned, SimilarityGroup> &Group : StructuralGroups)
807       // We only add the group if it contains more than one
808       // IRSimilarityCandidate.  If there is only one, that means there is no
809       // other repeated subsequence with the same structure.
810       if (Group.second.size() > 1)
811         SimilarityCandidates->push_back(Group.second);
812 
813     CandsForRepSubstring.clear();
814     StructuralGroups.clear();
815     NewCandidateGroups.clear();
816   }
817 }
818 
819 SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(
820     ArrayRef<std::unique_ptr<Module>> Modules) {
821   resetSimilarityCandidates();
822 
823   std::vector<IRInstructionData *> InstrList;
824   std::vector<unsigned> IntegerMapping;
825 
826   populateMapper(Modules, InstrList, IntegerMapping);
827   findCandidates(InstrList, IntegerMapping);
828 
829   return SimilarityCandidates.getValue();
830 }
831 
832 SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) {
833   resetSimilarityCandidates();
834 
835   std::vector<IRInstructionData *> InstrList;
836   std::vector<unsigned> IntegerMapping;
837 
838   populateMapper(M, InstrList, IntegerMapping);
839   findCandidates(InstrList, IntegerMapping);
840 
841   return SimilarityCandidates.getValue();
842 }
843 
844 INITIALIZE_PASS(IRSimilarityIdentifierWrapperPass, "ir-similarity-identifier",
845                 "ir-similarity-identifier", false, true)
846 
847 IRSimilarityIdentifierWrapperPass::IRSimilarityIdentifierWrapperPass()
848     : ModulePass(ID) {
849   initializeIRSimilarityIdentifierWrapperPassPass(
850       *PassRegistry::getPassRegistry());
851 }
852 
853 bool IRSimilarityIdentifierWrapperPass::doInitialization(Module &M) {
854   IRSI.reset(new IRSimilarityIdentifier(M));
855   return false;
856 }
857 
858 bool IRSimilarityIdentifierWrapperPass::doFinalization(Module &M) {
859   IRSI.reset();
860   return false;
861 }
862 
863 bool IRSimilarityIdentifierWrapperPass::runOnModule(Module &M) {
864   // All the real work is done in the constructor for the pass.
865   IRSI.reset(new IRSimilarityIdentifier(M));
866   return false;
867 }
868 
869 AnalysisKey IRSimilarityAnalysis::Key;
870 IRSimilarityIdentifier IRSimilarityAnalysis::run(Module &M,
871                                                ModuleAnalysisManager &) {
872 
873   return IRSimilarityIdentifier(M);
874 }
875 
876 PreservedAnalyses
877 IRSimilarityAnalysisPrinterPass::run(Module &M, ModuleAnalysisManager &AM) {
878   IRSimilarityIdentifier &IRSI = AM.getResult<IRSimilarityAnalysis>(M);
879   Optional<SimilarityGroupList> &SimilarityCandidatesOpt = IRSI.getSimilarity();
880 
881   for (std::vector<IRSimilarityCandidate> &CandVec : *SimilarityCandidatesOpt) {
882     OS << CandVec.size() << " candidates of length "
883        << CandVec.begin()->getLength() << ".  Found in: \n";
884     for (IRSimilarityCandidate &Cand : CandVec) {
885       OS << "  Function: " << Cand.front()->Inst->getFunction()->getName().str()
886          << ",  Basic Block: ";
887       if (Cand.front()->Inst->getParent()->getName().str() == "")
888         OS << "(unnamed)\n";
889       else
890         OS << Cand.front()->Inst->getParent()->getName().str() << "\n";
891     }
892   }
893 
894   return PreservedAnalyses::all();
895 }
896 
897 char IRSimilarityIdentifierWrapperPass::ID = 0;
898