1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements loop unroll and jam as a routine, much like
11 // LoopUnroll.cpp implements loop unroll.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/ADT/SmallPtrSet.h"
16 #include "llvm/ADT/Statistic.h"
17 #include "llvm/Analysis/AssumptionCache.h"
18 #include "llvm/Analysis/DependenceAnalysis.h"
19 #include "llvm/Analysis/InstructionSimplify.h"
20 #include "llvm/Analysis/LoopAnalysisManager.h"
21 #include "llvm/Analysis/LoopIterator.h"
22 #include "llvm/Analysis/LoopPass.h"
23 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
24 #include "llvm/Analysis/ScalarEvolution.h"
25 #include "llvm/Analysis/ScalarEvolutionExpander.h"
26 #include "llvm/Analysis/Utils/Local.h"
27 #include "llvm/IR/BasicBlock.h"
28 #include "llvm/IR/DataLayout.h"
29 #include "llvm/IR/DebugInfoMetadata.h"
30 #include "llvm/IR/Dominators.h"
31 #include "llvm/IR/IntrinsicInst.h"
32 #include "llvm/IR/LLVMContext.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Support/raw_ostream.h"
35 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
36 #include "llvm/Transforms/Utils/Cloning.h"
37 #include "llvm/Transforms/Utils/LoopSimplify.h"
38 #include "llvm/Transforms/Utils/LoopUtils.h"
39 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
40 #include "llvm/Transforms/Utils/UnrollLoop.h"
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "loop-unroll-and-jam"
44 
45 STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
46 STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
47 
48 typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
49 
50 // Partition blocks in an outer/inner loop pair into blocks before and after
51 // the loop
partitionOuterLoopBlocks(Loop * L,Loop * SubLoop,BasicBlockSet & ForeBlocks,BasicBlockSet & SubLoopBlocks,BasicBlockSet & AftBlocks,DominatorTree * DT)52 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
53                                      BasicBlockSet &ForeBlocks,
54                                      BasicBlockSet &SubLoopBlocks,
55                                      BasicBlockSet &AftBlocks,
56                                      DominatorTree *DT) {
57   BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
58   SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
59 
60   for (BasicBlock *BB : L->blocks()) {
61     if (!SubLoop->contains(BB)) {
62       if (DT->dominates(SubLoopLatch, BB))
63         AftBlocks.insert(BB);
64       else
65         ForeBlocks.insert(BB);
66     }
67   }
68 
69   // Check that all blocks in ForeBlocks together dominate the subloop
70   // TODO: This might ideally be done better with a dominator/postdominators.
71   BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
72   for (BasicBlock *BB : ForeBlocks) {
73     if (BB == SubLoopPreHeader)
74       continue;
75     Instruction *TI = BB->getTerminator();
76     for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
77       if (!ForeBlocks.count(TI->getSuccessor(i)))
78         return false;
79   }
80 
81   return true;
82 }
83 
84 // Looks at the phi nodes in Header for values coming from Latch. For these
85 // instructions and all their operands calls Visit on them, keeping going for
86 // all the operands in AftBlocks. Returns false if Visit returns false,
87 // otherwise returns true. This is used to process the instructions in the
88 // Aft blocks that need to be moved before the subloop. It is used in two
89 // places. One to check that the required set of instructions can be moved
90 // before the loop. Then to collect the instructions to actually move in
91 // moveHeaderPhiOperandsToForeBlocks.
92 template <typename T>
processHeaderPhiOperands(BasicBlock * Header,BasicBlock * Latch,BasicBlockSet & AftBlocks,T Visit)93 static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
94                                      BasicBlockSet &AftBlocks, T Visit) {
95   SmallVector<Instruction *, 8> Worklist;
96   for (auto &Phi : Header->phis()) {
97     Value *V = Phi.getIncomingValueForBlock(Latch);
98     if (Instruction *I = dyn_cast<Instruction>(V))
99       Worklist.push_back(I);
100   }
101 
102   while (!Worklist.empty()) {
103     Instruction *I = Worklist.back();
104     Worklist.pop_back();
105     if (!Visit(I))
106       return false;
107 
108     if (AftBlocks.count(I->getParent()))
109       for (auto &U : I->operands())
110         if (Instruction *II = dyn_cast<Instruction>(U))
111           Worklist.push_back(II);
112   }
113 
114   return true;
115 }
116 
117 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
moveHeaderPhiOperandsToForeBlocks(BasicBlock * Header,BasicBlock * Latch,Instruction * InsertLoc,BasicBlockSet & AftBlocks)118 static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
119                                               BasicBlock *Latch,
120                                               Instruction *InsertLoc,
121                                               BasicBlockSet &AftBlocks) {
122   // We need to ensure we move the instructions in the correct order,
123   // starting with the earliest required instruction and moving forward.
124   std::vector<Instruction *> Visited;
125   processHeaderPhiOperands(Header, Latch, AftBlocks,
126                            [&Visited, &AftBlocks](Instruction *I) {
127                              if (AftBlocks.count(I->getParent()))
128                                Visited.push_back(I);
129                              return true;
130                            });
131 
132   // Move all instructions in program order to before the InsertLoc
133   BasicBlock *InsertLocBB = InsertLoc->getParent();
134   for (Instruction *I : reverse(Visited)) {
135     if (I->getParent() != InsertLocBB)
136       I->moveBefore(InsertLoc);
137   }
138 }
139 
140 /*
141   This method performs Unroll and Jam. For a simple loop like:
142   for (i = ..)
143     Fore(i)
144     for (j = ..)
145       SubLoop(i, j)
146     Aft(i)
147 
148   Instead of doing normal inner or outer unrolling, we do:
149   for (i = .., i+=2)
150     Fore(i)
151     Fore(i+1)
152     for (j = ..)
153       SubLoop(i, j)
154       SubLoop(i+1, j)
155     Aft(i)
156     Aft(i+1)
157 
158   So the outer loop is essetially unrolled and then the inner loops are fused
159   ("jammed") together into a single loop. This can increase speed when there
160   are loads in SubLoop that are invariant to i, as they become shared between
161   the now jammed inner loops.
162 
163   We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
164   Fore blocks are those before the inner loop, Aft are those after. Normal
165   Unroll code is used to copy each of these sets of blocks and the results are
166   combined together into the final form above.
167 
168   isSafeToUnrollAndJam should be used prior to calling this to make sure the
169   unrolling will be valid. Checking profitablility is also advisable.
170 
171   If EpilogueLoop is non-null, it receives the epilogue loop (if it was
172   necessary to create one and not fully unrolled).
173 */
UnrollAndJamLoop(Loop * L,unsigned Count,unsigned TripCount,unsigned TripMultiple,bool UnrollRemainder,LoopInfo * LI,ScalarEvolution * SE,DominatorTree * DT,AssumptionCache * AC,OptimizationRemarkEmitter * ORE,Loop ** EpilogueLoop)174 LoopUnrollResult llvm::UnrollAndJamLoop(
175     Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
176     bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
177     AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
178 
179   // When we enter here we should have already checked that it is safe
180   BasicBlock *Header = L->getHeader();
181   assert(L->getSubLoops().size() == 1);
182   Loop *SubLoop = *L->begin();
183 
184   // Don't enter the unroll code if there is nothing to do.
185   if (TripCount == 0 && Count < 2) {
186     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
187     return LoopUnrollResult::Unmodified;
188   }
189 
190   assert(Count > 0);
191   assert(TripMultiple > 0);
192   assert(TripCount == 0 || TripCount % TripMultiple == 0);
193 
194   // Are we eliminating the loop control altogether?
195   bool CompletelyUnroll = (Count == TripCount);
196 
197   // We use the runtime remainder in cases where we don't know trip multiple
198   if (TripMultiple == 1 || TripMultiple % Count != 0) {
199     if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
200                                     /*UseEpilogRemainder*/ true,
201                                     UnrollRemainder, LI, SE, DT, AC, true,
202                                     EpilogueLoop)) {
203       LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
204                            "generated when assuming runtime trip count\n");
205       return LoopUnrollResult::Unmodified;
206     }
207   }
208 
209   // Notify ScalarEvolution that the loop will be substantially changed,
210   // if not outright eliminated.
211   if (SE) {
212     SE->forgetLoop(L);
213     SE->forgetLoop(SubLoop);
214   }
215 
216   using namespace ore;
217   // Report the unrolling decision.
218   if (CompletelyUnroll) {
219     LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
220                       << Header->getName() << " with trip count " << TripCount
221                       << "!\n");
222     ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
223                                  L->getHeader())
224               << "completely unroll and jammed loop with "
225               << NV("UnrollCount", TripCount) << " iterations");
226   } else {
227     auto DiagBuilder = [&]() {
228       OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
229                               L->getHeader());
230       return Diag << "unroll and jammed loop by a factor of "
231                   << NV("UnrollCount", Count);
232     };
233 
234     LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
235                       << " by " << Count);
236     if (TripMultiple != 1) {
237       LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
238       ORE->emit([&]() {
239         return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
240                              << " trips per branch";
241       });
242     } else {
243       LLVM_DEBUG(dbgs() << " with run-time trip count");
244       ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
245     }
246     LLVM_DEBUG(dbgs() << "!\n");
247   }
248 
249   BasicBlock *Preheader = L->getLoopPreheader();
250   BasicBlock *LatchBlock = L->getLoopLatch();
251   BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
252   assert(Preheader && LatchBlock && Header);
253   assert(BI && !BI->isUnconditional());
254   bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
255   BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
256   bool SubLoopContinueOnTrue = SubLoop->contains(
257       SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
258 
259   // Partition blocks in an outer/inner loop pair into blocks before and after
260   // the loop
261   BasicBlockSet SubLoopBlocks;
262   BasicBlockSet ForeBlocks;
263   BasicBlockSet AftBlocks;
264   partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
265                            DT);
266 
267   // We keep track of the entering/first and exiting/last block of each of
268   // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
269   // blocks easier.
270   std::vector<BasicBlock *> ForeBlocksFirst;
271   std::vector<BasicBlock *> ForeBlocksLast;
272   std::vector<BasicBlock *> SubLoopBlocksFirst;
273   std::vector<BasicBlock *> SubLoopBlocksLast;
274   std::vector<BasicBlock *> AftBlocksFirst;
275   std::vector<BasicBlock *> AftBlocksLast;
276   ForeBlocksFirst.push_back(Header);
277   ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
278   SubLoopBlocksFirst.push_back(SubLoop->getHeader());
279   SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
280   AftBlocksFirst.push_back(SubLoop->getExitBlock());
281   AftBlocksLast.push_back(L->getExitingBlock());
282   // Maps Blocks[0] -> Blocks[It]
283   ValueToValueMapTy LastValueMap;
284 
285   // Move any instructions from fore phi operands from AftBlocks into Fore.
286   moveHeaderPhiOperandsToForeBlocks(
287       Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
288       AftBlocks);
289 
290   // The current on-the-fly SSA update requires blocks to be processed in
291   // reverse postorder so that LastValueMap contains the correct value at each
292   // exit.
293   LoopBlocksDFS DFS(L);
294   DFS.perform(LI);
295   // Stash the DFS iterators before adding blocks to the loop.
296   LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
297   LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
298 
299   if (Header->getParent()->isDebugInfoForProfiling())
300     for (BasicBlock *BB : L->getBlocks())
301       for (Instruction &I : *BB)
302         if (!isa<DbgInfoIntrinsic>(&I))
303           if (const DILocation *DIL = I.getDebugLoc()) {
304             auto NewDIL = DIL->cloneWithDuplicationFactor(Count);
305             if (NewDIL)
306               I.setDebugLoc(NewDIL.getValue());
307             else
308               LLVM_DEBUG(dbgs()
309                          << "Failed to create new discriminator: "
310                          << DIL->getFilename() << " Line: " << DIL->getLine());
311           }
312 
313   // Copy all blocks
314   for (unsigned It = 1; It != Count; ++It) {
315     std::vector<BasicBlock *> NewBlocks;
316     // Maps Blocks[It] -> Blocks[It-1]
317     DenseMap<Value *, Value *> PrevItValueMap;
318 
319     for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
320       ValueToValueMapTy VMap;
321       BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
322       Header->getParent()->getBasicBlockList().push_back(New);
323 
324       if (ForeBlocks.count(*BB)) {
325         L->addBasicBlockToLoop(New, *LI);
326 
327         if (*BB == ForeBlocksFirst[0])
328           ForeBlocksFirst.push_back(New);
329         if (*BB == ForeBlocksLast[0])
330           ForeBlocksLast.push_back(New);
331       } else if (SubLoopBlocks.count(*BB)) {
332         SubLoop->addBasicBlockToLoop(New, *LI);
333 
334         if (*BB == SubLoopBlocksFirst[0])
335           SubLoopBlocksFirst.push_back(New);
336         if (*BB == SubLoopBlocksLast[0])
337           SubLoopBlocksLast.push_back(New);
338       } else if (AftBlocks.count(*BB)) {
339         L->addBasicBlockToLoop(New, *LI);
340 
341         if (*BB == AftBlocksFirst[0])
342           AftBlocksFirst.push_back(New);
343         if (*BB == AftBlocksLast[0])
344           AftBlocksLast.push_back(New);
345       } else {
346         llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
347       }
348 
349       // Update our running maps of newest clones
350       PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
351       LastValueMap[*BB] = New;
352       for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
353            VI != VE; ++VI) {
354         PrevItValueMap[VI->second] =
355             const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
356         LastValueMap[VI->first] = VI->second;
357       }
358 
359       NewBlocks.push_back(New);
360 
361       // Update DomTree:
362       if (*BB == ForeBlocksFirst[0])
363         DT->addNewBlock(New, ForeBlocksLast[It - 1]);
364       else if (*BB == SubLoopBlocksFirst[0])
365         DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
366       else if (*BB == AftBlocksFirst[0])
367         DT->addNewBlock(New, AftBlocksLast[It - 1]);
368       else {
369         // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
370         // structure.
371         auto BBDomNode = DT->getNode(*BB);
372         auto BBIDom = BBDomNode->getIDom();
373         BasicBlock *OriginalBBIDom = BBIDom->getBlock();
374         assert(OriginalBBIDom);
375         assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
376         DT->addNewBlock(
377             New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
378       }
379     }
380 
381     // Remap all instructions in the most recent iteration
382     for (BasicBlock *NewBlock : NewBlocks) {
383       for (Instruction &I : *NewBlock) {
384         ::remapInstruction(&I, LastValueMap);
385         if (auto *II = dyn_cast<IntrinsicInst>(&I))
386           if (II->getIntrinsicID() == Intrinsic::assume)
387             AC->registerAssumption(II);
388       }
389     }
390 
391     // Alter the ForeBlocks phi's, pointing them at the latest version of the
392     // value from the previous iteration's phis
393     for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
394       Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
395       assert(OldValue && "should have incoming edge from Aft[It]");
396       Value *NewValue = OldValue;
397       if (Value *PrevValue = PrevItValueMap[OldValue])
398         NewValue = PrevValue;
399 
400       assert(Phi.getNumOperands() == 2);
401       Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
402       Phi.setIncomingValue(0, NewValue);
403       Phi.removeIncomingValue(1);
404     }
405   }
406 
407   // Now that all the basic blocks for the unrolled iterations are in place,
408   // finish up connecting the blocks and phi nodes. At this point LastValueMap
409   // is the last unrolled iterations values.
410 
411   // Update Phis in BB from OldBB to point to NewBB
412   auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
413                             BasicBlock *NewBB) {
414     for (PHINode &Phi : BB->phis()) {
415       int I = Phi.getBasicBlockIndex(OldBB);
416       Phi.setIncomingBlock(I, NewBB);
417     }
418   };
419   // Update Phis in BB from OldBB to point to NewBB and use the latest value
420   // from LastValueMap
421   auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
422                                      BasicBlock *NewBB,
423                                      ValueToValueMapTy &LastValueMap) {
424     for (PHINode &Phi : BB->phis()) {
425       for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
426         if (Phi.getIncomingBlock(b) == OldBB) {
427           Value *OldValue = Phi.getIncomingValue(b);
428           if (Value *LastValue = LastValueMap[OldValue])
429             Phi.setIncomingValue(b, LastValue);
430           Phi.setIncomingBlock(b, NewBB);
431           break;
432         }
433       }
434     }
435   };
436   // Move all the phis from Src into Dest
437   auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
438     Instruction *insertPoint = Dest->getFirstNonPHI();
439     while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
440       Phi->moveBefore(insertPoint);
441   };
442 
443   // Update the PHI values outside the loop to point to the last block
444   updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
445                            LastValueMap);
446 
447   // Update ForeBlocks successors and phi nodes
448   BranchInst *ForeTerm =
449       cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
450   BasicBlock *Dest = SubLoopBlocksFirst[0];
451   ForeTerm->setSuccessor(0, Dest);
452 
453   if (CompletelyUnroll) {
454     while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
455       Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
456       Phi->getParent()->getInstList().erase(Phi);
457     }
458   } else {
459     // Update the PHI values to point to the last aft block
460     updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
461                              AftBlocksLast.back(), LastValueMap);
462   }
463 
464   for (unsigned It = 1; It != Count; It++) {
465     // Remap ForeBlock successors from previous iteration to this
466     BranchInst *ForeTerm =
467         cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
468     BasicBlock *Dest = ForeBlocksFirst[It];
469     ForeTerm->setSuccessor(0, Dest);
470   }
471 
472   // Subloop successors and phis
473   BranchInst *SubTerm =
474       cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
475   SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
476   SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
477   updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
478                   ForeBlocksLast.back());
479   updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
480                   SubLoopBlocksLast.back());
481 
482   for (unsigned It = 1; It != Count; It++) {
483     // Replace the conditional branch of the previous iteration subloop with an
484     // unconditional one to this one
485     BranchInst *SubTerm =
486         cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
487     BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
488     SubTerm->eraseFromParent();
489 
490     updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
491                     ForeBlocksLast.back());
492     updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
493                     SubLoopBlocksLast.back());
494     movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
495   }
496 
497   // Aft blocks successors and phis
498   BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
499   if (CompletelyUnroll) {
500     BranchInst::Create(LoopExit, Term);
501     Term->eraseFromParent();
502   } else {
503     Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
504   }
505   updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
506                   SubLoopBlocksLast.back());
507 
508   for (unsigned It = 1; It != Count; It++) {
509     // Replace the conditional branch of the previous iteration subloop with an
510     // unconditional one to this one
511     BranchInst *AftTerm =
512         cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
513     BranchInst::Create(AftBlocksFirst[It], AftTerm);
514     AftTerm->eraseFromParent();
515 
516     updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
517                     SubLoopBlocksLast.back());
518     movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
519   }
520 
521   // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
522   // new ones required.
523   if (Count != 1) {
524     SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
525     DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
526                            SubLoopBlocksFirst[0]);
527     DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
528                            SubLoopBlocksLast[0], AftBlocksFirst[0]);
529 
530     DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
531                            ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
532     DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
533                            SubLoopBlocksLast.back(), AftBlocksFirst[0]);
534     DT->applyUpdates(DTUpdates);
535   }
536 
537   // Merge adjacent basic blocks, if possible.
538   SmallPtrSet<BasicBlock *, 16> MergeBlocks;
539   MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
540   MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
541   MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
542   while (!MergeBlocks.empty()) {
543     BasicBlock *BB = *MergeBlocks.begin();
544     BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
545     if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
546       BasicBlock *Dest = Term->getSuccessor(0);
547       if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
548         // Don't remove BB and add Fold as they are the same BB
549         assert(Fold == BB);
550         (void)Fold;
551         MergeBlocks.erase(Dest);
552       } else
553         MergeBlocks.erase(BB);
554     } else
555       MergeBlocks.erase(BB);
556   }
557 
558   // At this point, the code is well formed.  We now do a quick sweep over the
559   // inserted code, doing constant propagation and dead code elimination as we
560   // go.
561   simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
562   simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
563 
564   NumCompletelyUnrolledAndJammed += CompletelyUnroll;
565   ++NumUnrolledAndJammed;
566 
567 #ifndef NDEBUG
568   // We shouldn't have done anything to break loop simplify form or LCSSA.
569   Loop *OuterL = L->getParentLoop();
570   Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
571   assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
572   if (!CompletelyUnroll)
573     assert(L->isLoopSimplifyForm());
574   assert(SubLoop->isLoopSimplifyForm());
575   assert(DT->verify());
576 #endif
577 
578   // Update LoopInfo if the loop is completely removed.
579   if (CompletelyUnroll)
580     LI->erase(L);
581 
582   return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
583                           : LoopUnrollResult::PartiallyUnrolled;
584 }
585 
getLoadsAndStores(BasicBlockSet & Blocks,SmallVector<Value *,4> & MemInstr)586 static bool getLoadsAndStores(BasicBlockSet &Blocks,
587                               SmallVector<Value *, 4> &MemInstr) {
588   // Scan the BBs and collect legal loads and stores.
589   // Returns false if non-simple loads/stores are found.
590   for (BasicBlock *BB : Blocks) {
591     for (Instruction &I : *BB) {
592       if (auto *Ld = dyn_cast<LoadInst>(&I)) {
593         if (!Ld->isSimple())
594           return false;
595         MemInstr.push_back(&I);
596       } else if (auto *St = dyn_cast<StoreInst>(&I)) {
597         if (!St->isSimple())
598           return false;
599         MemInstr.push_back(&I);
600       } else if (I.mayReadOrWriteMemory()) {
601         return false;
602       }
603     }
604   }
605   return true;
606 }
607 
checkDependencies(SmallVector<Value *,4> & Earlier,SmallVector<Value *,4> & Later,unsigned LoopDepth,bool InnerLoop,DependenceInfo & DI)608 static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
609                               SmallVector<Value *, 4> &Later,
610                               unsigned LoopDepth, bool InnerLoop,
611                               DependenceInfo &DI) {
612   // Use DA to check for dependencies between loads and stores that make unroll
613   // and jam invalid
614   for (Value *I : Earlier) {
615     for (Value *J : Later) {
616       Instruction *Src = cast<Instruction>(I);
617       Instruction *Dst = cast<Instruction>(J);
618       if (Src == Dst)
619         continue;
620       // Ignore Input dependencies.
621       if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
622         continue;
623 
624       // Track dependencies, and if we find them take a conservative approach
625       // by allowing only = or < (not >), altough some > would be safe
626       // (depending upon unroll width).
627       // For the inner loop, we need to disallow any (> <) dependencies
628       // FIXME: Allow > so long as distance is less than unroll width
629       if (auto D = DI.depends(Src, Dst, true)) {
630         assert(D->isOrdered() && "Expected an output, flow or anti dep.");
631 
632         if (D->isConfused()) {
633           LLVM_DEBUG(dbgs() << "  Confused dependency between:\n"
634                             << "  " << *Src << "\n"
635                             << "  " << *Dst << "\n");
636           return false;
637         }
638         if (!InnerLoop) {
639           if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
640             LLVM_DEBUG(dbgs() << "  > dependency between:\n"
641                               << "  " << *Src << "\n"
642                               << "  " << *Dst << "\n");
643             return false;
644           }
645         } else {
646           assert(LoopDepth + 1 <= D->getLevels());
647           if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
648               D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) {
649             LLVM_DEBUG(dbgs() << "  < > dependency between:\n"
650                               << "  " << *Src << "\n"
651                               << "  " << *Dst << "\n");
652             return false;
653           }
654         }
655       }
656     }
657   }
658   return true;
659 }
660 
checkDependencies(Loop * L,BasicBlockSet & ForeBlocks,BasicBlockSet & SubLoopBlocks,BasicBlockSet & AftBlocks,DependenceInfo & DI)661 static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
662                               BasicBlockSet &SubLoopBlocks,
663                               BasicBlockSet &AftBlocks, DependenceInfo &DI) {
664   // Get all loads/store pairs for each blocks
665   SmallVector<Value *, 4> ForeMemInstr;
666   SmallVector<Value *, 4> SubLoopMemInstr;
667   SmallVector<Value *, 4> AftMemInstr;
668   if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
669       !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
670       !getLoadsAndStores(AftBlocks, AftMemInstr))
671     return false;
672 
673   // Check for dependencies between any blocks that may change order
674   unsigned LoopDepth = L->getLoopDepth();
675   return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
676                            DI) &&
677          checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
678          checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
679                            DI) &&
680          checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
681                            DI);
682 }
683 
isSafeToUnrollAndJam(Loop * L,ScalarEvolution & SE,DominatorTree & DT,DependenceInfo & DI)684 bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
685                                 DependenceInfo &DI) {
686   /* We currently handle outer loops like this:
687         |
688     ForeFirst    <----\    }
689      Blocks           |    } ForeBlocks
690     ForeLast          |    }
691         |             |
692     SubLoopFirst  <\  |    }
693      Blocks        |  |    } SubLoopBlocks
694     SubLoopLast   -/  |    }
695         |             |
696     AftFirst          |    }
697      Blocks           |    } AftBlocks
698     AftLast     ------/    }
699         |
700 
701     There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
702     and AftBlocks, providing that there is one edge from Fores to SubLoops,
703     one edge from SubLoops to Afts and a single outer loop exit (from Afts).
704     In practice we currently limit Aft blocks to a single block, and limit
705     things further in the profitablility checks of the unroll and jam pass.
706 
707     Because of the way we rearrange basic blocks, we also require that
708     the Fore blocks on all unrolled iterations are safe to move before the
709     SubLoop blocks of all iterations. So we require that the phi node looping
710     operands of ForeHeader can be moved to at least the end of ForeEnd, so that
711     we can arrange cloned Fore Blocks before the subloop and match up Phi's
712     correctly.
713 
714     i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
715     It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
716 
717     There are then a number of checks along the lines of no calls, no
718     exceptions, inner loop IV is consistent, etc. Note that for loops requiring
719     runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
720     UnrollAndJamLoop if the trip count cannot be easily calculated.
721   */
722 
723   if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
724     return false;
725   Loop *SubLoop = L->getSubLoops()[0];
726   if (!SubLoop->isLoopSimplifyForm())
727     return false;
728 
729   BasicBlock *Header = L->getHeader();
730   BasicBlock *Latch = L->getLoopLatch();
731   BasicBlock *Exit = L->getExitingBlock();
732   BasicBlock *SubLoopHeader = SubLoop->getHeader();
733   BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
734   BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
735 
736   if (Latch != Exit)
737     return false;
738   if (SubLoopLatch != SubLoopExit)
739     return false;
740 
741   if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
742     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
743     return false;
744   }
745 
746   // Split blocks into Fore/SubLoop/Aft based on dominators
747   BasicBlockSet SubLoopBlocks;
748   BasicBlockSet ForeBlocks;
749   BasicBlockSet AftBlocks;
750   if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
751                                 AftBlocks, &DT)) {
752     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
753     return false;
754   }
755 
756   // Aft blocks may need to move instructions to fore blocks, which becomes more
757   // difficult if there are multiple (potentially conditionally executed)
758   // blocks. For now we just exclude loops with multiple aft blocks.
759   if (AftBlocks.size() != 1) {
760     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
761                          "multiple blocks after the loop\n");
762     return false;
763   }
764 
765   // Check inner loop backedge count is consistent on all iterations of the
766   // outer loop
767   if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
768     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
769                          "not consistent on each iteration\n");
770     return false;
771   }
772 
773   // Check the loop safety info for exceptions.
774   SimpleLoopSafetyInfo LSI;
775   LSI.computeLoopSafetyInfo(L);
776   if (LSI.anyBlockMayThrow()) {
777     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
778     return false;
779   }
780 
781   // We've ruled out the easy stuff and now need to check that there are no
782   // interdependencies which may prevent us from moving the:
783   //  ForeBlocks before Subloop and AftBlocks.
784   //  Subloop before AftBlocks.
785   //  ForeBlock phi operands before the subloop
786 
787   // Make sure we can move all instructions we need to before the subloop
788   if (!processHeaderPhiOperands(
789           Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
790             if (SubLoop->contains(I->getParent()))
791               return false;
792             if (AftBlocks.count(I->getParent())) {
793               // If we hit a phi node in afts we know we are done (probably
794               // LCSSA)
795               if (isa<PHINode>(I))
796                 return false;
797               // Can't move instructions with side effects or memory
798               // reads/writes
799               if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
800                 return false;
801             }
802             // Keep going
803             return true;
804           })) {
805     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
806                          "instructions after subloop to before it\n");
807     return false;
808   }
809 
810   // Check for memory dependencies which prohibit the unrolling we are doing.
811   // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
812   // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
813   if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
814     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
815     return false;
816   }
817 
818   return true;
819 }
820