1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements loop unroll and jam as a routine, much like
11 // LoopUnroll.cpp implements loop unroll.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/ADT/SmallPtrSet.h"
16 #include "llvm/ADT/Statistic.h"
17 #include "llvm/Analysis/AssumptionCache.h"
18 #include "llvm/Analysis/DependenceAnalysis.h"
19 #include "llvm/Analysis/InstructionSimplify.h"
20 #include "llvm/Analysis/LoopAnalysisManager.h"
21 #include "llvm/Analysis/LoopIterator.h"
22 #include "llvm/Analysis/LoopPass.h"
23 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
24 #include "llvm/Analysis/ScalarEvolution.h"
25 #include "llvm/Analysis/ScalarEvolutionExpander.h"
26 #include "llvm/Analysis/Utils/Local.h"
27 #include "llvm/IR/BasicBlock.h"
28 #include "llvm/IR/DataLayout.h"
29 #include "llvm/IR/DebugInfoMetadata.h"
30 #include "llvm/IR/Dominators.h"
31 #include "llvm/IR/IntrinsicInst.h"
32 #include "llvm/IR/LLVMContext.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Support/raw_ostream.h"
35 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
36 #include "llvm/Transforms/Utils/Cloning.h"
37 #include "llvm/Transforms/Utils/LoopSimplify.h"
38 #include "llvm/Transforms/Utils/LoopUtils.h"
39 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
40 #include "llvm/Transforms/Utils/UnrollLoop.h"
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "loop-unroll-and-jam"
44 
45 STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
46 STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
47 
48 static bool containsBB(std::vector<BasicBlock *> &V, BasicBlock *BB) {
49   return std::find(V.begin(), V.end(), BB) != V.end();
50 }
51 
52 // Partition blocks in an outer/inner loop pair into blocks before and after
53 // the loop
54 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
55                                      std::vector<BasicBlock *> &ForeBlocks,
56                                      std::vector<BasicBlock *> &SubLoopBlocks,
57                                      std::vector<BasicBlock *> &AftBlocks,
58                                      DominatorTree *DT) {
59   BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
60   SubLoopBlocks = SubLoop->getBlocks();
61 
62   for (BasicBlock *BB : L->blocks()) {
63     if (!SubLoop->contains(BB)) {
64       if (DT->dominates(SubLoopLatch, BB))
65         AftBlocks.push_back(BB);
66       else
67         ForeBlocks.push_back(BB);
68     }
69   }
70 
71   // Check that all blocks in ForeBlocks together dominate the subloop
72   // TODO: This might ideally be done better with a dominator/postdominators.
73   BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
74   for (BasicBlock *BB : ForeBlocks) {
75     if (BB == SubLoopPreHeader)
76       continue;
77     TerminatorInst *TI = BB->getTerminator();
78     for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
79       if (!containsBB(ForeBlocks, TI->getSuccessor(i)))
80         return false;
81   }
82 
83   return true;
84 }
85 
86 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
87 static void
88 moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, BasicBlock *Latch,
89                                   Instruction *InsertLoc,
90                                   std::vector<BasicBlock *> &AftBlocks) {
91   // We need to ensure we move the instructions in the correct order,
92   // starting with the earliest required instruction and moving forward.
93   std::vector<Instruction *> Worklist;
94   std::vector<Instruction *> Visited;
95   for (auto &Phi : Header->phis()) {
96     Value *V = Phi.getIncomingValueForBlock(Latch);
97     if (Instruction *I = dyn_cast<Instruction>(V))
98       Worklist.push_back(I);
99   }
100 
101   while (!Worklist.empty()) {
102     Instruction *I = Worklist.back();
103     Worklist.pop_back();
104     if (!containsBB(AftBlocks, I->getParent()))
105       continue;
106 
107     Visited.push_back(I);
108     for (auto &U : I->operands())
109       if (Instruction *II = dyn_cast<Instruction>(U))
110         Worklist.push_back(II);
111   }
112 
113   // Move all instructions in program order to before the InsertLoc
114   BasicBlock *InsertLocBB = InsertLoc->getParent();
115   for (Instruction *I : reverse(Visited)) {
116     if (I->getParent() != InsertLocBB)
117       I->moveBefore(InsertLoc);
118   }
119 }
120 
121 /*
122   This method performs Unroll and Jam. For a simple loop like:
123   for (i = ..)
124     Fore(i)
125     for (j = ..)
126       SubLoop(i, j)
127     Aft(i)
128 
129   Instead of doing normal inner or outer unrolling, we do:
130   for (i = .., i+=2)
131     Fore(i)
132     Fore(i+1)
133     for (j = ..)
134       SubLoop(i, j)
135       SubLoop(i+1, j)
136     Aft(i)
137     Aft(i+1)
138 
  So the outer loop is essentially unrolled and then the inner loops are fused
140   ("jammed") together into a single loop. This can increase speed when there
141   are loads in SubLoop that are invariant to i, as they become shared between
142   the now jammed inner loops.
143 
  We do this by splitting the blocks in the loop into Fore, Subloop and Aft.
145   Fore blocks are those before the inner loop, Aft are those after. Normal
146   Unroll code is used to copy each of these sets of blocks and the results are
147   combined together into the final form above.
148 
149   isSafeToUnrollAndJam should be used prior to calling this to make sure the
  unrolling will be valid. Checking profitability is also advisable.
151 */
LoopUnrollResult
llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
                       unsigned TripMultiple, bool UnrollRemainder,
                       LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
                       AssumptionCache *AC, OptimizationRemarkEmitter *ORE) {

  // When we enter here we should have already checked that it is safe
  BasicBlock *Header = L->getHeader();
  assert(L->getSubLoops().size() == 1);
  Loop *SubLoop = *L->begin();

  // Don't enter the unroll code if there is nothing to do.
  if (TripCount == 0 && Count < 2) {
    LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
    return LoopUnrollResult::Unmodified;
  }

  assert(Count > 0);
  assert(TripMultiple > 0);
  assert(TripCount == 0 || TripCount % TripMultiple == 0);

  // Are we eliminating the loop control altogether?
  bool CompletelyUnroll = (Count == TripCount);

  // We use the runtime remainder in cases where we don't know trip multiple
  if (TripMultiple == 1 || TripMultiple % Count != 0) {
    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
                                    /*UseEpilogRemainder*/ true,
                                    UnrollRemainder, LI, SE, DT, AC, true)) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                           "generated when assuming runtime trip count\n");
      return LoopUnrollResult::Unmodified;
    }
  }

  // Notify ScalarEvolution that the loop will be substantially changed,
  // if not outright eliminated.
  if (SE) {
    SE->forgetLoop(L);
    SE->forgetLoop(SubLoop);
  }

  using namespace ore;
  // Report the unrolling decision.
  if (CompletelyUnroll) {
    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
                      << Header->getName() << " with trip count " << TripCount
                      << "!\n");
    ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
                                 L->getHeader())
              << "completely unroll and jammed loop with "
              << NV("UnrollCount", TripCount) << " iterations");
  } else {
    auto DiagBuilder = [&]() {
      OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
                              L->getHeader());
      return Diag << "unroll and jammed loop by a factor of "
                  << NV("UnrollCount", Count);
    };

    LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
                      << " by " << Count);
    if (TripMultiple != 1) {
      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
      ORE->emit([&]() {
        return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
                             << " trips per branch";
      });
    } else {
      LLVM_DEBUG(dbgs() << " with run-time trip count");
      ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
    }
    LLVM_DEBUG(dbgs() << "!\n");
  }

  // Collect the key blocks and the latch branch of the outer loop, plus the
  // branch direction ("continue" successor index) of both latches.
  BasicBlock *Preheader = L->getLoopPreheader();
  BasicBlock *LatchBlock = L->getLoopLatch();
  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
  assert(Preheader && LatchBlock && Header);
  assert(BI && !BI->isUnconditional());
  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
  bool SubLoopContinueOnTrue = SubLoop->contains(
      SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));

  // Partition blocks in an outer/inner loop pair into blocks before and after
  // the loop
  std::vector<BasicBlock *> SubLoopBlocks;
  std::vector<BasicBlock *> ForeBlocks;
  std::vector<BasicBlock *> AftBlocks;
  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
                           DT);

  // We keep track of the entering/first and exiting/last block of each of
  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
  // blocks easier. Index 0 is the original iteration; index It is the It'th
  // unrolled copy.
  std::vector<BasicBlock *> ForeBlocksFirst;
  std::vector<BasicBlock *> ForeBlocksLast;
  std::vector<BasicBlock *> SubLoopBlocksFirst;
  std::vector<BasicBlock *> SubLoopBlocksLast;
  std::vector<BasicBlock *> AftBlocksFirst;
  std::vector<BasicBlock *> AftBlocksLast;
  ForeBlocksFirst.push_back(Header);
  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
  AftBlocksFirst.push_back(SubLoop->getExitBlock());
  AftBlocksLast.push_back(L->getExitingBlock());
  // Maps Blocks[0] -> Blocks[It], i.e. each original value to its clone in
  // the most recently created iteration.
  ValueToValueMapTy LastValueMap;

  // Move any instructions from fore phi operands from AftBlocks into Fore.
  // This is required so the cloned Fore blocks can be placed before all the
  // SubLoop/Aft copies (see isSafeToUnrollAndJam, which checked legality).
  moveHeaderPhiOperandsToForeBlocks(
      Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
      AftBlocks);

  // The current on-the-fly SSA update requires blocks to be processed in
  // reverse postorder so that LastValueMap contains the correct value at each
  // exit.
  LoopBlocksDFS DFS(L);
  DFS.perform(LI);
  // Stash the DFS iterators before adding blocks to the loop.
  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();

  // Annotate debug locations with the duplication factor so profiling stays
  // meaningful after unrolling.
  if (Header->getParent()->isDebugInfoForProfiling())
    for (BasicBlock *BB : L->getBlocks())
      for (Instruction &I : *BB)
        if (!isa<DbgInfoIntrinsic>(&I))
          if (const DILocation *DIL = I.getDebugLoc())
            I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));

  // Copy all blocks. Each iteration It clones every block of the loop,
  // registers the clone in LoopInfo and the DomTree, and records the
  // First/Last block of each Fore/SubLoop/Aft section for stapling later.
  for (unsigned It = 1; It != Count; ++It) {
    std::vector<BasicBlock *> NewBlocks;
    // Maps Blocks[It] -> Blocks[It-1]
    DenseMap<Value *, Value *> PrevItValueMap;

    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
      ValueToValueMapTy VMap;
      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
      Header->getParent()->getBasicBlockList().push_back(New);

      if (containsBB(ForeBlocks, *BB)) {
        L->addBasicBlockToLoop(New, *LI);

        if (*BB == ForeBlocksFirst[0])
          ForeBlocksFirst.push_back(New);
        if (*BB == ForeBlocksLast[0])
          ForeBlocksLast.push_back(New);
      } else if (containsBB(SubLoopBlocks, *BB)) {
        SubLoop->addBasicBlockToLoop(New, *LI);

        if (*BB == SubLoopBlocksFirst[0])
          SubLoopBlocksFirst.push_back(New);
        if (*BB == SubLoopBlocksLast[0])
          SubLoopBlocksLast.push_back(New);
      } else if (containsBB(AftBlocks, *BB)) {
        L->addBasicBlockToLoop(New, *LI);

        if (*BB == AftBlocksFirst[0])
          AftBlocksFirst.push_back(New);
        if (*BB == AftBlocksLast[0])
          AftBlocksLast.push_back(New);
      } else {
        llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
      }

      // Update our running maps of newest clones
      PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
      LastValueMap[*BB] = New;
      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
           VI != VE; ++VI) {
        PrevItValueMap[VI->second] =
            const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
        LastValueMap[VI->first] = VI->second;
      }

      NewBlocks.push_back(New);

      // Update DomTree: the section entry blocks are dominated by the exit of
      // the same section in the previous iteration.
      if (*BB == ForeBlocksFirst[0])
        DT->addNewBlock(New, ForeBlocksLast[It - 1]);
      else if (*BB == SubLoopBlocksFirst[0])
        DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
      else if (*BB == AftBlocksFirst[0])
        DT->addNewBlock(New, AftBlocksLast[It - 1]);
      else {
        // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
        // structure.
        auto BBDomNode = DT->getNode(*BB);
        auto BBIDom = BBDomNode->getIDom();
        BasicBlock *OriginalBBIDom = BBIDom->getBlock();
        assert(OriginalBBIDom);
        assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
        DT->addNewBlock(
            New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
      }
    }

    // Remap all instructions in the most recent iteration
    for (BasicBlock *NewBlock : NewBlocks) {
      for (Instruction &I : *NewBlock) {
        ::remapInstruction(&I, LastValueMap);
        if (auto *II = dyn_cast<IntrinsicInst>(&I))
          if (II->getIntrinsicID() == Intrinsic::assume)
            AC->registerAssumption(II);
      }
    }

    // Alter the ForeBlocks phi's, pointing them at the latest version of the
    // value from the previous iteration's phis
    for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
      Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
      assert(OldValue && "should have incoming edge from Aft[It]");
      Value *NewValue = OldValue;
      if (Value *PrevValue = PrevItValueMap[OldValue])
        NewValue = PrevValue;

      assert(Phi.getNumOperands() == 2);
      Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
      Phi.setIncomingValue(0, NewValue);
      Phi.removeIncomingValue(1);
    }
  }

  // Now that all the basic blocks for the unrolled iterations are in place,
  // finish up connecting the blocks and phi nodes. At this point LastValueMap
  // is the last unrolled iterations values.

  // Update Phis in BB from OldBB to point to NewBB
  auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
                            BasicBlock *NewBB) {
    for (PHINode &Phi : BB->phis()) {
      int I = Phi.getBasicBlockIndex(OldBB);
      Phi.setIncomingBlock(I, NewBB);
    }
  };
  // Update Phis in BB from OldBB to point to NewBB and use the latest value
  // from LastValueMap
  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
                                     BasicBlock *NewBB,
                                     ValueToValueMapTy &LastValueMap) {
    for (PHINode &Phi : BB->phis()) {
      for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
        if (Phi.getIncomingBlock(b) == OldBB) {
          Value *OldValue = Phi.getIncomingValue(b);
          if (Value *LastValue = LastValueMap[OldValue])
            Phi.setIncomingValue(b, LastValue);
          Phi.setIncomingBlock(b, NewBB);
          break;
        }
      }
    }
  };
  // Move all the phis from Src into Dest
  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
    Instruction *insertPoint = Dest->getFirstNonPHI();
    while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
      Phi->moveBefore(insertPoint);
  };

  // Update the PHI values outside the loop to point to the last block
  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
                           LastValueMap);

  // Update ForeBlocks successors and phi nodes
  BranchInst *ForeTerm =
      cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
  BasicBlock *Dest = SubLoopBlocksFirst[0];
  ForeTerm->setSuccessor(0, Dest);

  if (CompletelyUnroll) {
    while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
      Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
      Phi->getParent()->getInstList().erase(Phi);
    }
  } else {
    // Update the PHI values to point to the last aft block
    updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
                             AftBlocksLast.back(), LastValueMap);
  }

  for (unsigned It = 1; It != Count; It++) {
    // Remap ForeBlock successors from previous iteration to this
    BranchInst *ForeTerm =
        cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
    BasicBlock *Dest = ForeBlocksFirst[It];
    ForeTerm->setSuccessor(0, Dest);
  }

  // Subloop successors and phis
  BranchInst *SubTerm =
      cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
  updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
                  ForeBlocksLast.back());
  updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
                  SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration subloop with an
    // unconditional one to this one
    BranchInst *SubTerm =
        cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
    SubTerm->eraseFromParent();

    updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
                    ForeBlocksLast.back());
    updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
                    SubLoopBlocksLast.back());
    movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
  }

  // Aft blocks successors and phis
  BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
  if (CompletelyUnroll) {
    BranchInst::Create(LoopExit, Term);
    Term->eraseFromParent();
  } else {
    Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
  }
  updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
                  SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration subloop with an
    // unconditional one to this one
    BranchInst *AftTerm =
        cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(AftBlocksFirst[It], AftTerm);
    AftTerm->eraseFromParent();

    updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
                    SubLoopBlocksLast.back());
    movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
  }

  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
  // new ones required.
  if (Count != 1) {
    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
                           SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
                           SubLoopBlocksLast[0], AftBlocksFirst[0]);

    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           SubLoopBlocksLast.back(), AftBlocksFirst[0]);
    DT->applyUpdates(DTUpdates);
  }

  // Merge adjacent basic blocks, if possible. The section-exit blocks often
  // end in an unconditional branch to the next section after the rewiring
  // above; fold those edges away to tidy up the CFG.
  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
  while (!MergeBlocks.empty()) {
    BasicBlock *BB = *MergeBlocks.begin();
    BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
    if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
      BasicBlock *Dest = Term->getSuccessor(0);
      if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
        // Don't remove BB and add Fold as they are the same BB
        assert(Fold == BB);
        (void)Fold;
        MergeBlocks.erase(Dest);
      } else
        MergeBlocks.erase(BB);
    } else
      MergeBlocks.erase(BB);
  }

  // At this point, the code is well formed.  We now do a quick sweep over the
  // inserted code, doing constant propagation and dead code elimination as we
  // go.
  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);

  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
  ++NumUnrolledAndJammed;

#ifndef NDEBUG
  // We shouldn't have done anything to break loop simplify form or LCSSA.
  Loop *OuterL = L->getParentLoop();
  Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
  if (!CompletelyUnroll)
    assert(L->isLoopSimplifyForm());
  assert(SubLoop->isLoopSimplifyForm());
  assert(DT->verify());
#endif

  // Update LoopInfo if the loop is completely removed.
  if (CompletelyUnroll)
    LI->erase(L);

  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
                          : LoopUnrollResult::PartiallyUnrolled;
}
556 
557 static bool getLoadsAndStores(std::vector<BasicBlock *> &Blocks,
558                               SmallVector<Value *, 4> &MemInstr) {
559   // Scan the BBs and collect legal loads and stores.
560   // Returns false if non-simple loads/stores are found.
561   for (BasicBlock *BB : Blocks) {
562     for (Instruction &I : *BB) {
563       if (auto *Ld = dyn_cast<LoadInst>(&I)) {
564         if (!Ld->isSimple())
565           return false;
566         MemInstr.push_back(&I);
567       } else if (auto *St = dyn_cast<StoreInst>(&I)) {
568         if (!St->isSimple())
569           return false;
570         MemInstr.push_back(&I);
571       } else if (I.mayReadOrWriteMemory()) {
572         return false;
573       }
574     }
575   }
576   return true;
577 }
578 
579 static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
580                               SmallVector<Value *, 4> &Later,
581                               unsigned LoopDepth, bool InnerLoop,
582                               DependenceInfo &DI) {
583   // Use DA to check for dependencies between loads and stores that make unroll
584   // and jam invalid
585   for (Value *I : Earlier) {
586     for (Value *J : Later) {
587       Instruction *Src = cast<Instruction>(I);
588       Instruction *Dst = cast<Instruction>(J);
589       if (Src == Dst)
590         continue;
591       // Ignore Input dependencies.
592       if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
593         continue;
594 
595       // Track dependencies, and if we find them take a conservative approach
596       // by allowing only = or < (not >), altough some > would be safe
597       // (depending upon unroll width).
598       // For the inner loop, we need to disallow any (> <) dependencies
599       // FIXME: Allow > so long as distance is less than unroll width
600       if (auto D = DI.depends(Src, Dst, true)) {
601         assert(D->isOrdered() && "Expected an output, flow or anti dep.");
602 
603         if (D->isConfused())
604           return false;
605         if (!InnerLoop) {
606           if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT)
607             return false;
608         } else {
609           assert(LoopDepth + 1 <= D->getLevels());
610           if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
611               D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT)
612             return false;
613         }
614       }
615     }
616   }
617   return true;
618 }
619 
620 static bool checkDependencies(Loop *L, std::vector<BasicBlock *> &ForeBlocks,
621                               std::vector<BasicBlock *> &SubLoopBlocks,
622                               std::vector<BasicBlock *> &AftBlocks,
623                               DependenceInfo &DI) {
624   // Get all loads/store pairs for each blocks
625   SmallVector<Value *, 4> ForeMemInstr;
626   SmallVector<Value *, 4> SubLoopMemInstr;
627   SmallVector<Value *, 4> AftMemInstr;
628   if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
629       !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
630       !getLoadsAndStores(AftBlocks, AftMemInstr))
631     return false;
632 
633   // Check for dependencies between any blocks that may change order
634   unsigned LoopDepth = L->getLoopDepth();
635   return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
636                            DI) &&
637          checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
638          checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
639                            DI) &&
640          checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
641                            DI);
642 }
643 
bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
                                DependenceInfo &DI) {
  /* We currently handle outer loops like this:
        |
    ForeFirst    <----\    }
     Blocks           |    } ForeBlocks
    ForeLast          |    }
        |             |
    SubLoopFirst  <\  |    }
     Blocks        |  |    } SubLoopBlocks
    SubLoopLast   -/  |    }
        |             |
    AftFirst          |    }
     Blocks           |    } AftBlocks
    AftLast     ------/    }
        |

    There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
    and AftBlocks, providing that there is one edge from Fores to SubLoops,
    one edge from SubLoops to Afts and a single outer loop exit (from Afts).
    In practice we currently limit Aft blocks to a single block, and limit
    things further in the profitability checks of the unroll and jam pass.

    Because of the way we rearrange basic blocks, we also require that
    the Fore blocks on all unrolled iterations are safe to move before the
    SubLoop blocks of all iterations. So we require that the phi node looping
    operands of ForeHeader can be moved to at least the end of ForeEnd, so that
    we can arrange cloned Fore Blocks before the subloop and match up Phi's
    correctly.

    i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
    It needs to be safe to transform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.

    There are then a number of checks along the lines of no calls, no
    exceptions, inner loop IV is consistent, etc. Note that for loops requiring
    runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
    UnrollAndJamLoop if the trip count cannot be easily calculated.
  */

  // Both the outer loop and the single inner loop must be in simplified form.
  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
    return false;
  Loop *SubLoop = L->getSubLoops()[0];
  if (!SubLoop->isLoopSimplifyForm())
    return false;

  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  BasicBlock *Exit = L->getExitingBlock();
  BasicBlock *SubLoopHeader = SubLoop->getHeader();
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();

  // Both loops must exit from their latch block.
  if (Latch != Exit)
    return false;
  if (SubLoopLatch != SubLoopExit)
    return false;

  // Blocks with their address taken may gain unknown predecessors, which
  // would break the block rearrangement.
  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken())
    return false;

  // Split blocks into Fore/SubLoop/Aft based on dominators
  std::vector<BasicBlock *> SubLoopBlocks;
  std::vector<BasicBlock *> ForeBlocks;
  std::vector<BasicBlock *> AftBlocks;
  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
                                AftBlocks, &DT))
    return false;

  // Aft blocks may need to move instructions to fore blocks, which becomes more
  // difficult if there are multiple (potentially conditionally executed)
  // blocks. For now we just exclude loops with multiple aft blocks.
  if (AftBlocks.size() != 1)
    return false;

  // Check inner loop IV is consistent between all iterations: the inner
  // backedge count must be computable and invariant in the outer loop, or
  // the jammed inner loops would run different trip counts per copy.
  const SCEV *SubLoopBECountSC = SE.getExitCount(SubLoop, SubLoopLatch);
  if (isa<SCEVCouldNotCompute>(SubLoopBECountSC) ||
      !SubLoopBECountSC->getType()->isIntegerTy())
    return false;
  ScalarEvolution::LoopDisposition LD =
      SE.getLoopDisposition(SubLoopBECountSC, L);
  if (LD != ScalarEvolution::LoopInvariant)
    return false;

  // Check the loop safety info for exceptions.
  LoopSafetyInfo LSI;
  computeLoopSafetyInfo(&LSI, L);
  if (LSI.MayThrow)
    return false;

  // We've ruled out the easy stuff and now need to check that there are no
  // interdependencies which may prevent us from moving the:
  //  ForeBlocks before Subloop and AftBlocks.
  //  Subloop before AftBlocks.
  //  ForeBlock phi operands before the subloop

  // Make sure we can move all instructions we need to before the subloop.
  // Walk the operand chains of the header phis' latch values; anything that
  // lives in the Aft block must be side-effect free so
  // moveHeaderPhiOperandsToForeBlocks can hoist it later.
  SmallVector<Instruction *, 8> Worklist;
  SmallPtrSet<Instruction *, 8> Visited;
  for (auto &Phi : Header->phis()) {
    Value *V = Phi.getIncomingValueForBlock(Latch);
    if (Instruction *I = dyn_cast<Instruction>(V))
      Worklist.push_back(I);
  }
  while (!Worklist.empty()) {
    Instruction *I = Worklist.back();
    Worklist.pop_back();
    if (Visited.insert(I).second) {
      if (SubLoop->contains(I->getParent()))
        return false;
      if (containsBB(AftBlocks, I->getParent())) {
        // If we hit a phi node in afts we know we are done (probably LCSSA)
        if (isa<PHINode>(I))
          return false;
        if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
          return false;
        for (auto &U : I->operands())
          if (Instruction *II = dyn_cast<Instruction>(U))
            Worklist.push_back(II);
      }
    }
  }

  // Check for memory dependencies which prohibit the unrolling we are doing.
  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI))
    return false;

  return true;
}
775