1 //===- LoopInterchange.cpp - Loop interchange pass------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This Pass handles loop interchange transform.
11 // This pass interchanges loops to provide a more cache-friendly memory access
12 // patterns.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "llvm/ADT/SmallVector.h"
17 #include "llvm/Analysis/AliasAnalysis.h"
18 #include "llvm/Analysis/AssumptionCache.h"
19 #include "llvm/Analysis/BlockFrequencyInfo.h"
20 #include "llvm/Analysis/CodeMetrics.h"
21 #include "llvm/Analysis/DependenceAnalysis.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/LoopIterator.h"
24 #include "llvm/Analysis/LoopPass.h"
25 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
26 #include "llvm/Analysis/ScalarEvolution.h"
27 #include "llvm/Analysis/ScalarEvolutionExpander.h"
28 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
29 #include "llvm/Analysis/TargetTransformInfo.h"
30 #include "llvm/Analysis/ValueTracking.h"
31 #include "llvm/IR/Dominators.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/IR/IRBuilder.h"
34 #include "llvm/IR/InstIterator.h"
35 #include "llvm/IR/IntrinsicInst.h"
36 #include "llvm/IR/Module.h"
37 #include "llvm/Pass.h"
38 #include "llvm/Support/Debug.h"
39 #include "llvm/Support/raw_ostream.h"
40 #include "llvm/Transforms/Scalar.h"
41 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
42 #include "llvm/Transforms/Utils/LoopUtils.h"
43 
44 using namespace llvm;
45 
46 #define DEBUG_TYPE "loop-interchange"
47 
48 static cl::opt<int> LoopInterchangeCostThreshold(
49     "loop-interchange-threshold", cl::init(0), cl::Hidden,
50     cl::desc("Interchange if you gain more than this number"));
51 
52 namespace {
53 
54 typedef SmallVector<Loop *, 8> LoopVector;
55 
56 // TODO: Check if we can use a sparse matrix here.
57 typedef std::vector<std::vector<char>> CharMatrix;
58 
59 // Maximum number of dependencies that can be handled in the dependency matrix.
60 static const unsigned MaxMemInstrCount = 100;
61 
62 // Maximum loop depth supported.
63 static const unsigned MaxLoopNestDepth = 10;
64 
65 struct LoopInterchange;
66 
67 #ifdef DUMP_DEP_MATRICIES
68 void printDepMatrix(CharMatrix &DepMatrix) {
69   for (auto I = DepMatrix.begin(), E = DepMatrix.end(); I != E; ++I) {
70     std::vector<char> Vec = *I;
71     for (auto II = Vec.begin(), EE = Vec.end(); II != EE; ++II)
72       DEBUG(dbgs() << *II << " ");
73     DEBUG(dbgs() << "\n");
74   }
75 }
76 #endif
77 
78 static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
79                                      Loop *L, DependenceInfo *DI) {
80   typedef SmallVector<Value *, 16> ValueVector;
81   ValueVector MemInstr;
82 
83   // For each block.
84   for (Loop::block_iterator BB = L->block_begin(), BE = L->block_end();
85        BB != BE; ++BB) {
86     // Scan the BB and collect legal loads and stores.
87     for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E;
88          ++I) {
89       if (!isa<Instruction>(I))
90         return false;
91       if (LoadInst *Ld = dyn_cast<LoadInst>(I)) {
92         if (!Ld->isSimple())
93           return false;
94         MemInstr.push_back(&*I);
95       } else if (StoreInst *St = dyn_cast<StoreInst>(I)) {
96         if (!St->isSimple())
97           return false;
98         MemInstr.push_back(&*I);
99       }
100     }
101   }
102 
103   DEBUG(dbgs() << "Found " << MemInstr.size()
104                << " Loads and Stores to analyze\n");
105 
106   ValueVector::iterator I, IE, J, JE;
107 
108   for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
109     for (J = I, JE = MemInstr.end(); J != JE; ++J) {
110       std::vector<char> Dep;
111       Instruction *Src = cast<Instruction>(*I);
112       Instruction *Dst = cast<Instruction>(*J);
113       if (Src == Dst)
114         continue;
115       // Ignore Input dependencies.
116       if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
117         continue;
118       // Track Output, Flow, and Anti dependencies.
119       if (auto D = DI->depends(Src, Dst, true)) {
120         assert(D->isOrdered() && "Expected an output, flow or anti dep.");
121         DEBUG(StringRef DepType =
122                   D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
123               dbgs() << "Found " << DepType
124                      << " dependency between Src and Dst\n"
125                      << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
126         unsigned Levels = D->getLevels();
127         char Direction;
128         for (unsigned II = 1; II <= Levels; ++II) {
129           const SCEV *Distance = D->getDistance(II);
130           const SCEVConstant *SCEVConst =
131               dyn_cast_or_null<SCEVConstant>(Distance);
132           if (SCEVConst) {
133             const ConstantInt *CI = SCEVConst->getValue();
134             if (CI->isNegative())
135               Direction = '<';
136             else if (CI->isZero())
137               Direction = '=';
138             else
139               Direction = '>';
140             Dep.push_back(Direction);
141           } else if (D->isScalar(II)) {
142             Direction = 'S';
143             Dep.push_back(Direction);
144           } else {
145             unsigned Dir = D->getDirection(II);
146             if (Dir == Dependence::DVEntry::LT ||
147                 Dir == Dependence::DVEntry::LE)
148               Direction = '<';
149             else if (Dir == Dependence::DVEntry::GT ||
150                      Dir == Dependence::DVEntry::GE)
151               Direction = '>';
152             else if (Dir == Dependence::DVEntry::EQ)
153               Direction = '=';
154             else
155               Direction = '*';
156             Dep.push_back(Direction);
157           }
158         }
159         while (Dep.size() != Level) {
160           Dep.push_back('I');
161         }
162 
163         DepMatrix.push_back(Dep);
164         if (DepMatrix.size() > MaxMemInstrCount) {
165           DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
166                        << " dependencies inside loop\n");
167           return false;
168         }
169       }
170     }
171   }
172 
173   // We don't have a DepMatrix to check legality return false.
174   if (DepMatrix.size() == 0)
175     return false;
176   return true;
177 }
178 
179 // A loop is moved from index 'from' to an index 'to'. Update the Dependence
180 // matrix by exchanging the two columns.
181 static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
182                                     unsigned ToIndx) {
183   unsigned numRows = DepMatrix.size();
184   for (unsigned i = 0; i < numRows; ++i) {
185     char TmpVal = DepMatrix[i][ToIndx];
186     DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx];
187     DepMatrix[i][FromIndx] = TmpVal;
188   }
189 }
190 
191 // Checks if outermost non '=','S'or'I' dependence in the dependence matrix is
192 // '>'
193 static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
194                                    unsigned Column) {
195   for (unsigned i = 0; i <= Column; ++i) {
196     if (DepMatrix[Row][i] == '<')
197       return false;
198     if (DepMatrix[Row][i] == '>')
199       return true;
200   }
201   // All dependencies were '=','S' or 'I'
202   return false;
203 }
204 
205 // Checks if no dependence exist in the dependency matrix in Row before Column.
206 static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
207                                  unsigned Column) {
208   for (unsigned i = 0; i < Column; ++i) {
209     if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' &&
210         DepMatrix[Row][i] != 'I')
211       return false;
212   }
213   return true;
214 }
215 
216 static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
217                                 unsigned OuterLoopId, char InnerDep,
218                                 char OuterDep) {
219 
220   if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
221     return false;
222 
223   if (InnerDep == OuterDep)
224     return true;
225 
226   // It is legal to interchange if and only if after interchange no row has a
227   // '>' direction as the leftmost non-'='.
228 
229   if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I')
230     return true;
231 
232   if (InnerDep == '<')
233     return true;
234 
235   if (InnerDep == '>') {
236     // If OuterLoopId represents outermost loop then interchanging will make the
237     // 1st dependency as '>'
238     if (OuterLoopId == 0)
239       return false;
240 
241     // If all dependencies before OuterloopId are '=','S'or 'I'. Then
242     // interchanging will result in this row having an outermost non '='
243     // dependency of '>'
244     if (!containsNoDependence(DepMatrix, Row, OuterLoopId))
245       return true;
246   }
247 
248   return false;
249 }
250 
251 // Checks if it is legal to interchange 2 loops.
252 // [Theorem] A permutation of the loops in a perfect nest is legal if and only
253 // if the direction matrix, after the same permutation is applied to its
254 // columns, has no ">" direction as the leftmost non-"=" direction in any row.
255 static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
256                                       unsigned InnerLoopId,
257                                       unsigned OuterLoopId) {
258 
259   unsigned NumRows = DepMatrix.size();
260   // For each row check if it is valid to interchange.
261   for (unsigned Row = 0; Row < NumRows; ++Row) {
262     char InnerDep = DepMatrix[Row][InnerLoopId];
263     char OuterDep = DepMatrix[Row][OuterLoopId];
264     if (InnerDep == '*' || OuterDep == '*')
265       return false;
266     if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep))
267       return false;
268   }
269   return true;
270 }
271 
272 static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) {
273 
274   DEBUG(dbgs() << "Calling populateWorklist on Func: "
275                << L.getHeader()->getParent()->getName() << " Loop: %"
276                << L.getHeader()->getName() << '\n');
277   LoopVector LoopList;
278   Loop *CurrentLoop = &L;
279   const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
280   while (!Vec->empty()) {
281     // The current loop has multiple subloops in it hence it is not tightly
282     // nested.
283     // Discard all loops above it added into Worklist.
284     if (Vec->size() != 1) {
285       LoopList.clear();
286       return;
287     }
288     LoopList.push_back(CurrentLoop);
289     CurrentLoop = Vec->front();
290     Vec = &CurrentLoop->getSubLoops();
291   }
292   LoopList.push_back(CurrentLoop);
293   V.push_back(std::move(LoopList));
294 }
295 
296 static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
297   PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
298   if (InnerIndexVar)
299     return InnerIndexVar;
300   if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
301     return nullptr;
302   for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
303     PHINode *PhiVar = cast<PHINode>(I);
304     Type *PhiTy = PhiVar->getType();
305     if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
306         !PhiTy->isPointerTy())
307       return nullptr;
308     const SCEVAddRecExpr *AddRec =
309         dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
310     if (!AddRec || !AddRec->isAffine())
311       continue;
312     const SCEV *Step = AddRec->getStepRecurrence(*SE);
313     if (!isa<SCEVConstant>(Step))
314       continue;
315     // Found the induction variable.
316     // FIXME: Handle loops with more than one induction variable. Note that,
317     // currently, legality makes sure we have only one induction variable.
318     return PhiVar;
319   }
320   return nullptr;
321 }
322 
323 /// LoopInterchangeLegality checks if it is legal to interchange the loop.
324 class LoopInterchangeLegality {
325 public:
326   LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
327                           LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA,
328                           OptimizationRemarkEmitter *ORE)
329       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
330         PreserveLCSSA(PreserveLCSSA), ORE(ORE), InnerLoopHasReduction(false) {}
331 
332   /// Check if the loops can be interchanged.
333   bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
334                            CharMatrix &DepMatrix);
335   /// Check if the loop structure is understood. We do not handle triangular
336   /// loops for now.
337   bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
338 
339   bool currentLimitations();
340 
341   bool hasInnerLoopReduction() { return InnerLoopHasReduction; }
342 
343 private:
344   bool tightlyNested(Loop *Outer, Loop *Inner);
345   bool containsUnsafeInstructionsInHeader(BasicBlock *BB);
346   bool areAllUsesReductions(Instruction *Ins, Loop *L);
347   bool containsUnsafeInstructionsInLatch(BasicBlock *BB);
348   bool findInductionAndReductions(Loop *L,
349                                   SmallVector<PHINode *, 8> &Inductions,
350                                   SmallVector<PHINode *, 8> &Reductions);
351   Loop *OuterLoop;
352   Loop *InnerLoop;
353 
354   ScalarEvolution *SE;
355   LoopInfo *LI;
356   DominatorTree *DT;
357   bool PreserveLCSSA;
358   /// Interface to emit optimization remarks.
359   OptimizationRemarkEmitter *ORE;
360 
361   bool InnerLoopHasReduction;
362 };
363 
364 /// LoopInterchangeProfitability checks if it is profitable to interchange the
365 /// loop.
366 class LoopInterchangeProfitability {
367 public:
368   LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
369                                OptimizationRemarkEmitter *ORE)
370       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
371 
372   /// Check if the loop interchange is profitable.
373   bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
374                     CharMatrix &DepMatrix);
375 
376 private:
377   int getInstrOrderCost();
378 
379   Loop *OuterLoop;
380   Loop *InnerLoop;
381 
382   /// Scev analysis.
383   ScalarEvolution *SE;
384   /// Interface to emit optimization remarks.
385   OptimizationRemarkEmitter *ORE;
386 };
387 
388 /// LoopInterchangeTransform interchanges the loop.
389 class LoopInterchangeTransform {
390 public:
391   LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
392                            LoopInfo *LI, DominatorTree *DT,
393                            BasicBlock *LoopNestExit,
394                            bool InnerLoopContainsReductions)
395       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
396         LoopExit(LoopNestExit),
397         InnerLoopHasReduction(InnerLoopContainsReductions) {}
398 
399   /// Interchange OuterLoop and InnerLoop.
400   bool transform();
401   void restructureLoops(Loop *InnerLoop, Loop *OuterLoop);
402   void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
403 
404 private:
405   void splitInnerLoopLatch(Instruction *);
406   void splitInnerLoopHeader();
407   bool adjustLoopLinks();
408   void adjustLoopPreheaders();
409   bool adjustLoopBranches();
410   void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred,
411                            BasicBlock *NewPred);
412 
413   Loop *OuterLoop;
414   Loop *InnerLoop;
415 
416   /// Scev analysis.
417   ScalarEvolution *SE;
418   LoopInfo *LI;
419   DominatorTree *DT;
420   BasicBlock *LoopExit;
421   bool InnerLoopHasReduction;
422 };
423 
424 // Main LoopInterchange Pass.
425 struct LoopInterchange : public FunctionPass {
426   static char ID;
427   ScalarEvolution *SE;
428   LoopInfo *LI;
429   DependenceInfo *DI;
430   DominatorTree *DT;
431   bool PreserveLCSSA;
432   /// Interface to emit optimization remarks.
433   OptimizationRemarkEmitter *ORE;
434 
435   LoopInterchange()
436       : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) {
437     initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
438   }
439 
440   void getAnalysisUsage(AnalysisUsage &AU) const override {
441     AU.addRequired<ScalarEvolutionWrapperPass>();
442     AU.addRequired<AAResultsWrapperPass>();
443     AU.addRequired<DominatorTreeWrapperPass>();
444     AU.addRequired<LoopInfoWrapperPass>();
445     AU.addRequired<DependenceAnalysisWrapperPass>();
446     AU.addRequiredID(LoopSimplifyID);
447     AU.addRequiredID(LCSSAID);
448     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
449   }
450 
451   bool runOnFunction(Function &F) override {
452     if (skipFunction(F))
453       return false;
454 
455     SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
456     LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
457     DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
458     auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
459     DT = DTWP ? &DTWP->getDomTree() : nullptr;
460     ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
461     PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
462 
463     // Build up a worklist of loop pairs to analyze.
464     SmallVector<LoopVector, 8> Worklist;
465 
466     for (Loop *L : *LI)
467       populateWorklist(*L, Worklist);
468 
469     DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n");
470     bool Changed = true;
471     while (!Worklist.empty()) {
472       LoopVector LoopList = Worklist.pop_back_val();
473       Changed = processLoopList(LoopList, F);
474     }
475     return Changed;
476   }
477 
478   bool isComputableLoopNest(LoopVector LoopList) {
479     for (Loop *L : LoopList) {
480       const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
481       if (ExitCountOuter == SE->getCouldNotCompute()) {
482         DEBUG(dbgs() << "Couldn't compute backedge count\n");
483         return false;
484       }
485       if (L->getNumBackEdges() != 1) {
486         DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
487         return false;
488       }
489       if (!L->getExitingBlock()) {
490         DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
491         return false;
492       }
493     }
494     return true;
495   }
496 
497   unsigned selectLoopForInterchange(const LoopVector &LoopList) {
498     // TODO: Add a better heuristic to select the loop to be interchanged based
499     // on the dependence matrix. Currently we select the innermost loop.
500     return LoopList.size() - 1;
501   }
502 
503   bool processLoopList(LoopVector LoopList, Function &F) {
504 
505     bool Changed = false;
506     unsigned LoopNestDepth = LoopList.size();
507     if (LoopNestDepth < 2) {
508       DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
509       return false;
510     }
511     if (LoopNestDepth > MaxLoopNestDepth) {
512       DEBUG(dbgs() << "Cannot handle loops of depth greater than "
513                    << MaxLoopNestDepth << "\n");
514       return false;
515     }
516     if (!isComputableLoopNest(LoopList)) {
517       DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
518       return false;
519     }
520 
521     DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth << "\n");
522 
523     CharMatrix DependencyMatrix;
524     Loop *OuterMostLoop = *(LoopList.begin());
525     if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
526                                   OuterMostLoop, DI)) {
527       DEBUG(dbgs() << "Populating dependency matrix failed\n");
528       return false;
529     }
530 #ifdef DUMP_DEP_MATRICIES
531     DEBUG(dbgs() << "Dependence before interchange\n");
532     printDepMatrix(DependencyMatrix);
533 #endif
534 
535     BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch();
536     BranchInst *OuterMostLoopLatchBI =
537         dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator());
538     if (!OuterMostLoopLatchBI)
539       return false;
540 
541     // Since we currently do not handle LCSSA PHI's any failure in loop
542     // condition will now branch to LoopNestExit.
543     // TODO: This should be removed once we handle LCSSA PHI nodes.
544 
545     // Get the Outermost loop exit.
546     BasicBlock *LoopNestExit;
547     if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader())
548       LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1);
549     else
550       LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0);
551 
552     if (isa<PHINode>(LoopNestExit->begin())) {
553       DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now "
554                       "since on failure all loops branch to loop nest exit.\n");
555       return false;
556     }
557 
558     unsigned SelecLoopId = selectLoopForInterchange(LoopList);
559     // Move the selected loop outwards to the best possible position.
560     for (unsigned i = SelecLoopId; i > 0; i--) {
561       bool Interchanged =
562           processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix);
563       if (!Interchanged)
564         return Changed;
565       // Loops interchanged reflect the same in LoopList
566       std::swap(LoopList[i - 1], LoopList[i]);
567 
568       // Update the DependencyMatrix
569       interChangeDependencies(DependencyMatrix, i, i - 1);
570       DT->recalculate(F);
571 #ifdef DUMP_DEP_MATRICIES
572       DEBUG(dbgs() << "Dependence after interchange\n");
573       printDepMatrix(DependencyMatrix);
574 #endif
575       Changed |= Interchanged;
576     }
577     return Changed;
578   }
579 
580   bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
581                    unsigned OuterLoopId, BasicBlock *LoopNestExit,
582                    std::vector<std::vector<char>> &DependencyMatrix) {
583 
584     DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
585                  << " and OuterLoopId = " << OuterLoopId << "\n");
586     Loop *InnerLoop = LoopList[InnerLoopId];
587     Loop *OuterLoop = LoopList[OuterLoopId];
588 
589     LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT,
590                                 PreserveLCSSA, ORE);
591     if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
592       DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");
593       return false;
594     }
595     DEBUG(dbgs() << "Loops are legal to interchange\n");
596     LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
597     if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
598       DEBUG(dbgs() << "Interchanging loops not profitable\n");
599       return false;
600     }
601 
602     ORE->emit(OptimizationRemark(DEBUG_TYPE, "Interchanged",
603                                  InnerLoop->getStartLoc(),
604                                  InnerLoop->getHeader())
605               << "Loop interchanged with enclosing loop.");
606 
607     LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
608                                  LoopNestExit, LIL.hasInnerLoopReduction());
609     LIT.transform();
610     DEBUG(dbgs() << "Loops interchanged\n");
611     return true;
612   }
613 };
614 
615 } // end of namespace
616 bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) {
617   return none_of(Ins->users(), [=](User *U) -> bool {
618     auto *UserIns = dyn_cast<PHINode>(U);
619     RecurrenceDescriptor RD;
620     return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD);
621   });
622 }
623 
624 bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
625     BasicBlock *BB) {
626   for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
627     // Load corresponding to reduction PHI's are safe while concluding if
628     // tightly nested.
629     if (LoadInst *L = dyn_cast<LoadInst>(I)) {
630       if (!areAllUsesReductions(L, InnerLoop))
631         return true;
632     } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
633       return true;
634   }
635   return false;
636 }
637 
638 bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch(
639     BasicBlock *BB) {
640   for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
641     // Stores corresponding to reductions are safe while concluding if tightly
642     // nested.
643     if (StoreInst *L = dyn_cast<StoreInst>(I)) {
644       if (!isa<PHINode>(L->getOperand(0)))
645         return true;
646     } else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
647       return true;
648   }
649   return false;
650 }
651 
652 bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
653   BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
654   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
655   BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
656 
657   DEBUG(dbgs() << "Checking if loops are tightly nested\n");
658 
659   // A perfectly nested loop will not have any branch in between the outer and
660   // inner block i.e. outer header will branch to either inner preheader and
661   // outerloop latch.
662   BranchInst *OuterLoopHeaderBI =
663       dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
664   if (!OuterLoopHeaderBI)
665     return false;
666 
667   for (unsigned i = 0, e = OuterLoopHeaderBI->getNumSuccessors(); i < e; ++i) {
668     if (OuterLoopHeaderBI->getSuccessor(i) != InnerLoopPreHeader &&
669         OuterLoopHeaderBI->getSuccessor(i) != OuterLoopLatch)
670       return false;
671   }
672 
673   DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
674   // We do not have any basic block in between now make sure the outer header
675   // and outer loop latch doesn't contain any unsafe instructions.
676   if (containsUnsafeInstructionsInHeader(OuterLoopHeader) ||
677       containsUnsafeInstructionsInLatch(OuterLoopLatch))
678     return false;
679 
680   DEBUG(dbgs() << "Loops are perfectly nested\n");
681   // We have a perfect loop nest.
682   return true;
683 }
684 
685 
686 bool LoopInterchangeLegality::isLoopStructureUnderstood(
687     PHINode *InnerInduction) {
688 
689   unsigned Num = InnerInduction->getNumOperands();
690   BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
691   for (unsigned i = 0; i < Num; ++i) {
692     Value *Val = InnerInduction->getOperand(i);
693     if (isa<Constant>(Val))
694       continue;
695     Instruction *I = dyn_cast<Instruction>(Val);
696     if (!I)
697       return false;
698     // TODO: Handle triangular loops.
699     // e.g. for(int i=0;i<N;i++)
700     //        for(int j=i;j<N;j++)
701     unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
702     if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
703             InnerLoopPreheader &&
704         !OuterLoop->isLoopInvariant(I)) {
705       return false;
706     }
707   }
708   return true;
709 }
710 
711 bool LoopInterchangeLegality::findInductionAndReductions(
712     Loop *L, SmallVector<PHINode *, 8> &Inductions,
713     SmallVector<PHINode *, 8> &Reductions) {
714   if (!L->getLoopLatch() || !L->getLoopPredecessor())
715     return false;
716   for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
717     RecurrenceDescriptor RD;
718     InductionDescriptor ID;
719     PHINode *PHI = cast<PHINode>(I);
720     if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID))
721       Inductions.push_back(PHI);
722     else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
723       Reductions.push_back(PHI);
724     else {
725       DEBUG(
726           dbgs() << "Failed to recognize PHI as an induction or reduction.\n");
727       return false;
728     }
729   }
730   return true;
731 }
732 
733 static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) {
734   for (auto I = Block->begin(); isa<PHINode>(I); ++I) {
735     PHINode *PHI = cast<PHINode>(I);
736     // Reduction lcssa phi will have only 1 incoming block that from loop latch.
737     if (PHI->getNumIncomingValues() > 1)
738       return false;
739     Instruction *Ins = dyn_cast<Instruction>(PHI->getIncomingValue(0));
740     if (!Ins)
741       return false;
742     // Incoming value for lcssa phi's in outer loop exit can only be inner loop
743     // exits lcssa phi else it would not be tightly nested.
744     if (!isa<PHINode>(Ins) && isOuterLoopExitBlock)
745       return false;
746   }
747   return true;
748 }
749 
750 static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock,
751                                          BasicBlock *LoopHeader) {
752   if (BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator())) {
753     unsigned Num = BI->getNumSuccessors();
754     assert(Num == 2);
755     for (unsigned i = 0; i < Num; ++i) {
756       if (BI->getSuccessor(i) == LoopHeader)
757         continue;
758       return BI->getSuccessor(i);
759     }
760   }
761   return nullptr;
762 }
763 
764 // This function indicates the current limitations in the transform as a result
765 // of which we do not proceed.
766 bool LoopInterchangeLegality::currentLimitations() {
767 
768   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
769   BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
770   BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
771   BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
772   BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
773 
774   PHINode *InnerInductionVar;
775   SmallVector<PHINode *, 8> Inductions;
776   SmallVector<PHINode *, 8> Reductions;
777   if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
778     DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes "
779                  << "are supported currently.\n");
780     ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
781                                        "UnsupportedPHIInner",
782                                        InnerLoop->getStartLoc(),
783                                        InnerLoop->getHeader())
784               << "Only inner loops with induction or reduction PHI nodes can be"
785                  " interchange currently.");
786     return true;
787   }
788 
789   // TODO: Currently we handle only loops with 1 induction variable.
790   if (Inductions.size() != 1) {
791     DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
792                  << "Failed to interchange due to current limitation\n");
793     ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
794                                        "MultiInductionInner",
795                                        InnerLoop->getStartLoc(),
796                                        InnerLoop->getHeader())
797               << "Only inner loops with 1 induction variable can be "
798                  "interchanged currently.");
799     return true;
800   }
801   if (Reductions.size() > 0)
802     InnerLoopHasReduction = true;
803 
804   InnerInductionVar = Inductions.pop_back_val();
805   Reductions.clear();
806   if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
807     DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes "
808                  << "are supported currently.\n");
809     ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
810                                        "UnsupportedPHIOuter",
811                                        OuterLoop->getStartLoc(),
812                                        OuterLoop->getHeader())
813               << "Only outer loops with induction or reduction PHI nodes can be"
814                  " interchanged currently.");
815     return true;
816   }
817 
818   // Outer loop cannot have reduction because then loops will not be tightly
819   // nested.
820   if (!Reductions.empty()) {
821     DEBUG(dbgs() << "Outer loops with reductions are not supported "
822                  << "currently.\n");
823     ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
824                                        "ReductionsOuter",
825                                        OuterLoop->getStartLoc(),
826                                        OuterLoop->getHeader())
827               << "Outer loops with reductions cannot be interchangeed "
828                  "currently.");
829     return true;
830   }
831   // TODO: Currently we handle only loops with 1 induction variable.
832   if (Inductions.size() != 1) {
833     DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
834                  << "supported currently.\n");
835     ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
836                                        "MultiIndutionOuter",
837                                        OuterLoop->getStartLoc(),
838                                        OuterLoop->getHeader())
839               << "Only outer loops with 1 induction variable can be "
840                  "interchanged currently.");
841     return true;
842   }
843 
844   // TODO: Triangular loops are not handled for now.
845   if (!isLoopStructureUnderstood(InnerInductionVar)) {
846     DEBUG(dbgs() << "Loop structure not understood by pass\n");
847     ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
848                                        "UnsupportedStructureInner",
849                                        InnerLoop->getStartLoc(),
850                                        InnerLoop->getHeader())
851               << "Inner loop structure not understood currently.");
852     return true;
853   }
854 
855   // TODO: We only handle LCSSA PHI's corresponding to reduction for now.
856   BasicBlock *LoopExitBlock =
857       getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
858   if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) {
859     DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
860     ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
861                                        "NoLCSSAPHIOuter",
862                                        OuterLoop->getStartLoc(),
863                                        OuterLoop->getHeader())
864               << "Only outer loops with LCSSA PHIs can be interchange "
865                  "currently.");
866     return true;
867   }
868 
869   LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
870   if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) {
871     DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
872     ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
873                                        "NoLCSSAPHIOuterInner",
874                                        InnerLoop->getStartLoc(),
875                                        InnerLoop->getHeader())
876               << "Only inner loops with LCSSA PHIs can be interchange "
877                  "currently.");
878     return true;
879   }
880 
881   // TODO: Current limitation: Since we split the inner loop latch at the point
882   // were induction variable is incremented (induction.next); We cannot have
883   // more than 1 user of induction.next since it would result in broken code
884   // after split.
885   // e.g.
886   // for(i=0;i<N;i++) {
887   //    for(j = 0;j<M;j++) {
888   //      A[j+1][i+2] = A[j][i]+k;
889   //  }
890   // }
891   Instruction *InnerIndexVarInc = nullptr;
892   if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
893     InnerIndexVarInc =
894         dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
895   else
896     InnerIndexVarInc =
897         dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
898 
899   if (!InnerIndexVarInc) {
900     DEBUG(dbgs() << "Did not find an instruction to increment the induction "
901                  << "variable.\n");
902     ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
903                                        "NoIncrementInInner",
904                                        InnerLoop->getStartLoc(),
905                                        InnerLoop->getHeader())
906               << "The inner loop does not increment the induction variable.");
907     return true;
908   }
909 
910   // Since we split the inner loop latch on this induction variable. Make sure
911   // we do not have any instruction between the induction variable and branch
912   // instruction.
913 
914   bool FoundInduction = false;
915   for (const Instruction &I : reverse(*InnerLoopLatch)) {
916     if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I))
917       continue;
918 
919     // We found an instruction. If this is not induction variable then it is not
920     // safe to split this loop latch.
921     if (!I.isIdenticalTo(InnerIndexVarInc)) {
922       DEBUG(dbgs() << "Found unsupported instructions between induction "
923                    << "variable increment and branch.\n");
924     ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
925                                        "UnsupportedInsBetweenInduction",
926                                        InnerLoop->getStartLoc(),
927                                        InnerLoop->getHeader())
928               << "Found unsupported instruction between induction variable "
929                  "increment and branch.");
930       return true;
931     }
932 
933     FoundInduction = true;
934     break;
935   }
936   // The loop latch ended and we didn't find the induction variable return as
937   // current limitation.
938   if (!FoundInduction) {
939     DEBUG(dbgs() << "Did not find the induction variable.\n");
940     ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
941                                        "NoIndutionVariable",
942                                        InnerLoop->getStartLoc(),
943                                        InnerLoop->getHeader())
944               << "Did not find the induction variable.");
945     return true;
946   }
947   return false;
948 }
949 
950 bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
951                                                   unsigned OuterLoopId,
952                                                   CharMatrix &DepMatrix) {
953 
954   if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
955     DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
956                  << " and OuterLoopId = " << OuterLoopId
957                  << " due to dependence\n");
958     ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
959                                        "Dependence",
960                                        InnerLoop->getStartLoc(),
961                                        InnerLoop->getHeader())
962               << "Cannot interchange loops due to dependences.");
963     return false;
964   }
965 
966   // Create unique Preheaders if we already do not have one.
967   BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
968   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
969 
970   // Create  a unique outer preheader -
971   // 1) If OuterLoop preheader is not present.
972   // 2) If OuterLoop Preheader is same as OuterLoop Header
973   // 3) If OuterLoop Preheader is same as Header of the previous loop.
974   // 4) If OuterLoop Preheader is Entry node.
975   if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() ||
976       isa<PHINode>(OuterLoopPreHeader->begin()) ||
977       !OuterLoopPreHeader->getUniquePredecessor()) {
978     OuterLoopPreHeader =
979         InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA);
980   }
981 
982   if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() ||
983       InnerLoopPreHeader == OuterLoop->getHeader()) {
984     InnerLoopPreHeader =
985         InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA);
986   }
987 
988   // TODO: The loops could not be interchanged due to current limitations in the
989   // transform module.
990   if (currentLimitations()) {
991     DEBUG(dbgs() << "Not legal because of current transform limitation\n");
992     return false;
993   }
994 
995   // Check if the loops are tightly nested.
996   if (!tightlyNested(OuterLoop, InnerLoop)) {
997     DEBUG(dbgs() << "Loops not tightly nested\n");
998     ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
999                                        "NotTightlyNested",
1000                                        InnerLoop->getStartLoc(),
1001                                        InnerLoop->getHeader())
1002               << "Cannot interchange loops because they are not tightly "
1003                  "nested.");
1004     return false;
1005   }
1006 
1007   return true;
1008 }
1009 
1010 int LoopInterchangeProfitability::getInstrOrderCost() {
1011   unsigned GoodOrder, BadOrder;
1012   BadOrder = GoodOrder = 0;
1013   for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end();
1014        BI != BE; ++BI) {
1015     for (Instruction &Ins : **BI) {
1016       if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
1017         unsigned NumOp = GEP->getNumOperands();
1018         bool FoundInnerInduction = false;
1019         bool FoundOuterInduction = false;
1020         for (unsigned i = 0; i < NumOp; ++i) {
1021           const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
1022           const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
1023           if (!AR)
1024             continue;
1025 
1026           // If we find the inner induction after an outer induction e.g.
1027           // for(int i=0;i<N;i++)
1028           //   for(int j=0;j<N;j++)
1029           //     A[i][j] = A[i-1][j-1]+k;
1030           // then it is a good order.
1031           if (AR->getLoop() == InnerLoop) {
1032             // We found an InnerLoop induction after OuterLoop induction. It is
1033             // a good order.
1034             FoundInnerInduction = true;
1035             if (FoundOuterInduction) {
1036               GoodOrder++;
1037               break;
1038             }
1039           }
1040           // If we find the outer induction after an inner induction e.g.
1041           // for(int i=0;i<N;i++)
1042           //   for(int j=0;j<N;j++)
1043           //     A[j][i] = A[j-1][i-1]+k;
1044           // then it is a bad order.
1045           if (AR->getLoop() == OuterLoop) {
1046             // We found an OuterLoop induction after InnerLoop induction. It is
1047             // a bad order.
1048             FoundOuterInduction = true;
1049             if (FoundInnerInduction) {
1050               BadOrder++;
1051               break;
1052             }
1053           }
1054         }
1055       }
1056     }
1057   }
1058   return GoodOrder - BadOrder;
1059 }
1060 
1061 static bool isProfitableForVectorization(unsigned InnerLoopId,
1062                                          unsigned OuterLoopId,
1063                                          CharMatrix &DepMatrix) {
1064   // TODO: Improve this heuristic to catch more cases.
1065   // If the inner loop is loop independent or doesn't carry any dependency it is
1066   // profitable to move this to outer position.
1067   unsigned Row = DepMatrix.size();
1068   for (unsigned i = 0; i < Row; ++i) {
1069     if (DepMatrix[i][InnerLoopId] != 'S' && DepMatrix[i][InnerLoopId] != 'I')
1070       return false;
1071     // TODO: We need to improve this heuristic.
1072     if (DepMatrix[i][OuterLoopId] != '=')
1073       return false;
1074   }
1075   // If outer loop has dependence and inner loop is loop independent then it is
1076   // profitable to interchange to enable parallelism.
1077   return true;
1078 }
1079 
1080 bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
1081                                                 unsigned OuterLoopId,
1082                                                 CharMatrix &DepMatrix) {
1083 
1084   // TODO: Add better profitability checks.
1085   // e.g
1086   // 1) Construct dependency matrix and move the one with no loop carried dep
1087   //    inside to enable vectorization.
1088 
1089   // This is rough cost estimation algorithm. It counts the good and bad order
1090   // of induction variables in the instruction and allows reordering if number
1091   // of bad orders is more than good.
1092   int Cost = getInstrOrderCost();
1093   DEBUG(dbgs() << "Cost = " << Cost << "\n");
1094   if (Cost < -LoopInterchangeCostThreshold)
1095     return true;
1096 
1097   // It is not profitable as per current cache profitability model. But check if
1098   // we can move this loop outside to improve parallelism.
1099   if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
1100     return true;
1101 
1102   ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
1103                                      "InterchangeNotProfitable",
1104                                      InnerLoop->getStartLoc(),
1105                                      InnerLoop->getHeader())
1106             << "Interchanging loops is too costly (cost="
1107             << ore::NV("Cost", Cost) << ", threshold="
1108             << ore::NV("Threshold", LoopInterchangeCostThreshold) <<
1109             ") and it does not improve parallelism.");
1110   return false;
1111 }
1112 
1113 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1114                                                Loop *InnerLoop) {
1115   for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E;
1116        ++I) {
1117     if (*I == InnerLoop) {
1118       OuterLoop->removeChildLoop(I);
1119       return;
1120     }
1121   }
1122   llvm_unreachable("Couldn't find loop");
1123 }
1124 
1125 void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop,
1126                                                 Loop *OuterLoop) {
1127   Loop *OuterLoopParent = OuterLoop->getParentLoop();
1128   if (OuterLoopParent) {
1129     // Remove the loop from its parent loop.
1130     removeChildLoop(OuterLoopParent, OuterLoop);
1131     removeChildLoop(OuterLoop, InnerLoop);
1132     OuterLoopParent->addChildLoop(InnerLoop);
1133   } else {
1134     removeChildLoop(OuterLoop, InnerLoop);
1135     LI->changeTopLevelLoop(OuterLoop, InnerLoop);
1136   }
1137 
1138   while (!InnerLoop->empty())
1139     OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin()));
1140 
1141   InnerLoop->addChildLoop(OuterLoop);
1142 }
1143 
1144 bool LoopInterchangeTransform::transform() {
1145   bool Transformed = false;
1146   Instruction *InnerIndexVar;
1147 
1148   if (InnerLoop->getSubLoops().size() == 0) {
1149     BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1150     DEBUG(dbgs() << "Calling Split Inner Loop\n");
1151     PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
1152     if (!InductionPHI) {
1153       DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1154       return false;
1155     }
1156 
1157     if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
1158       InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
1159     else
1160       InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
1161 
1162     //
1163     // Split at the place were the induction variable is
1164     // incremented/decremented.
1165     // TODO: This splitting logic may not work always. Fix this.
1166     splitInnerLoopLatch(InnerIndexVar);
1167     DEBUG(dbgs() << "splitInnerLoopLatch done\n");
1168 
1169     // Splits the inner loops phi nodes out into a separate basic block.
1170     splitInnerLoopHeader();
1171     DEBUG(dbgs() << "splitInnerLoopHeader done\n");
1172   }
1173 
1174   Transformed |= adjustLoopLinks();
1175   if (!Transformed) {
1176     DEBUG(dbgs() << "adjustLoopLinks failed\n");
1177     return false;
1178   }
1179 
1180   restructureLoops(InnerLoop, OuterLoop);
1181   return true;
1182 }
1183 
1184 void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) {
1185   BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1186   BasicBlock *InnerLoopLatchPred = InnerLoopLatch;
1187   InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI);
1188 }
1189 
1190 void LoopInterchangeTransform::splitInnerLoopHeader() {
1191 
1192   // Split the inner loop header out. Here make sure that the reduction PHI's
1193   // stay in the innerloop body.
1194   BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1195   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1196   if (InnerLoopHasReduction) {
1197     // FIXME: Check if the induction PHI will always be the first PHI.
1198     BasicBlock *New = InnerLoopHeader->splitBasicBlock(
1199         ++(InnerLoopHeader->begin()), InnerLoopHeader->getName() + ".split");
1200     if (LI)
1201       if (Loop *L = LI->getLoopFor(InnerLoopHeader))
1202         L->addBasicBlockToLoop(New, *LI);
1203 
1204     // Adjust Reduction PHI's in the block.
1205     SmallVector<PHINode *, 8> PHIVec;
1206     for (auto I = New->begin(); isa<PHINode>(I); ++I) {
1207       PHINode *PHI = dyn_cast<PHINode>(I);
1208       Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader);
1209       PHI->replaceAllUsesWith(V);
1210       PHIVec.push_back((PHI));
1211     }
1212     for (PHINode *P : PHIVec) {
1213       P->eraseFromParent();
1214     }
1215   } else {
1216     SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
1217   }
1218 
1219   DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & "
1220                   "InnerLoopHeader\n");
1221 }
1222 
1223 /// \brief Move all instructions except the terminator from FromBB right before
1224 /// InsertBefore
1225 static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
1226   auto &ToList = InsertBefore->getParent()->getInstList();
1227   auto &FromList = FromBB->getInstList();
1228 
1229   ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(),
1230                 FromBB->getTerminator()->getIterator());
1231 }
1232 
1233 void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock,
1234                                                    BasicBlock *OldPred,
1235                                                    BasicBlock *NewPred) {
1236   for (auto I = CurrBlock->begin(); isa<PHINode>(I); ++I) {
1237     PHINode *PHI = cast<PHINode>(I);
1238     unsigned Num = PHI->getNumIncomingValues();
1239     for (unsigned i = 0; i < Num; ++i) {
1240       if (PHI->getIncomingBlock(i) == OldPred)
1241         PHI->setIncomingBlock(i, NewPred);
1242     }
1243   }
1244 }
1245 
1246 bool LoopInterchangeTransform::adjustLoopBranches() {
1247 
1248   DEBUG(dbgs() << "adjustLoopBranches called\n");
1249   // Adjust the loop preheader
1250   BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1251   BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1252   BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1253   BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1254   BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1255   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1256   BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
1257   BasicBlock *InnerLoopLatchPredecessor =
1258       InnerLoopLatch->getUniquePredecessor();
1259   BasicBlock *InnerLoopLatchSuccessor;
1260   BasicBlock *OuterLoopLatchSuccessor;
1261 
1262   BranchInst *OuterLoopLatchBI =
1263       dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
1264   BranchInst *InnerLoopLatchBI =
1265       dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
1266   BranchInst *OuterLoopHeaderBI =
1267       dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
1268   BranchInst *InnerLoopHeaderBI =
1269       dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
1270 
1271   if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
1272       !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
1273       !InnerLoopHeaderBI)
1274     return false;
1275 
1276   BranchInst *InnerLoopLatchPredecessorBI =
1277       dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
1278   BranchInst *OuterLoopPredecessorBI =
1279       dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
1280 
1281   if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
1282     return false;
1283   BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
1284   if (!InnerLoopHeaderSuccessor)
1285     return false;
1286 
1287   // Adjust Loop Preheader and headers
1288 
1289   unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors();
1290   for (unsigned i = 0; i < NumSucc; ++i) {
1291     if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader)
1292       OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader);
1293   }
1294 
1295   NumSucc = OuterLoopHeaderBI->getNumSuccessors();
1296   for (unsigned i = 0; i < NumSucc; ++i) {
1297     if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch)
1298       OuterLoopHeaderBI->setSuccessor(i, LoopExit);
1299     else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader)
1300       OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor);
1301   }
1302 
1303   // Adjust reduction PHI's now that the incoming block has changed.
1304   updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader,
1305                       OuterLoopHeader);
1306 
1307   BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI);
1308   InnerLoopHeaderBI->eraseFromParent();
1309 
1310   // -------------Adjust loop latches-----------
1311   if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
1312     InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
1313   else
1314     InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
1315 
1316   NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors();
1317   for (unsigned i = 0; i < NumSucc; ++i) {
1318     if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch)
1319       InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor);
1320   }
1321 
1322   // Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with
1323   // the value and remove this PHI node from inner loop.
1324   SmallVector<PHINode *, 8> LcssaVec;
1325   for (auto I = InnerLoopLatchSuccessor->begin(); isa<PHINode>(I); ++I) {
1326     PHINode *LcssaPhi = cast<PHINode>(I);
1327     LcssaVec.push_back(LcssaPhi);
1328   }
1329   for (PHINode *P : LcssaVec) {
1330     Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch);
1331     P->replaceAllUsesWith(Incoming);
1332     P->eraseFromParent();
1333   }
1334 
1335   if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
1336     OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
1337   else
1338     OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
1339 
1340   if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor)
1341     InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor);
1342   else
1343     InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor);
1344 
1345   updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch);
1346 
1347   if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor) {
1348     OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch);
1349   } else {
1350     OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch);
1351   }
1352 
1353   return true;
1354 }
1355 void LoopInterchangeTransform::adjustLoopPreheaders() {
1356 
1357   // We have interchanged the preheaders so we need to interchange the data in
1358   // the preheader as well.
1359   // This is because the content of inner preheader was previously executed
1360   // inside the outer loop.
1361   BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1362   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1363   BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1364   BranchInst *InnerTermBI =
1365       cast<BranchInst>(InnerLoopPreHeader->getTerminator());
1366 
1367   // These instructions should now be executed inside the loop.
1368   // Move instruction into a new block after outer header.
1369   moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator());
1370   // These instructions were not executed previously in the loop so move them to
1371   // the older inner loop preheader.
1372   moveBBContents(OuterLoopPreHeader, InnerTermBI);
1373 }
1374 
1375 bool LoopInterchangeTransform::adjustLoopLinks() {
1376 
1377   // Adjust all branches in the inner and outer loop.
1378   bool Changed = adjustLoopBranches();
1379   if (Changed)
1380     adjustLoopPreheaders();
1381   return Changed;
1382 }
1383 
1384 char LoopInterchange::ID = 0;
1385 INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange",
1386                       "Interchanges loops for cache reuse", false, false)
1387 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
1388 INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
1389 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
1390 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
1391 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
1392 INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
1393 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
1394 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
1395 
1396 INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
1397                     "Interchanges loops for cache reuse", false, false)
1398 
1399 Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); }
1400