1 //===- LoopPeel.cpp -------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Loop Peeling Utilities.
10 //===----------------------------------------------------------------------===//
11 
12 #include "llvm/Transforms/Utils/LoopPeel.h"
13 #include "llvm/ADT/DenseMap.h"
14 #include "llvm/ADT/DenseSet.h"
15 #include "llvm/ADT/Optional.h"
16 #include "llvm/ADT/SmallVector.h"
17 #include "llvm/ADT/Statistic.h"
18 #include "llvm/Analysis/DomTreeUpdater.h"
19 #include "llvm/Analysis/Loads.h"
20 #include "llvm/Analysis/LoopInfo.h"
21 #include "llvm/Analysis/LoopIterator.h"
22 #include "llvm/Analysis/ScalarEvolution.h"
23 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
24 #include "llvm/Analysis/TargetTransformInfo.h"
25 #include "llvm/IR/BasicBlock.h"
26 #include "llvm/IR/Dominators.h"
27 #include "llvm/IR/Function.h"
28 #include "llvm/IR/InstrTypes.h"
29 #include "llvm/IR/Instruction.h"
30 #include "llvm/IR/Instructions.h"
31 #include "llvm/IR/LLVMContext.h"
32 #include "llvm/IR/MDBuilder.h"
33 #include "llvm/IR/Metadata.h"
34 #include "llvm/IR/PatternMatch.h"
35 #include "llvm/Support/Casting.h"
36 #include "llvm/Support/CommandLine.h"
37 #include "llvm/Support/Debug.h"
38 #include "llvm/Support/raw_ostream.h"
39 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
40 #include "llvm/Transforms/Utils/Cloning.h"
41 #include "llvm/Transforms/Utils/LoopSimplify.h"
42 #include "llvm/Transforms/Utils/LoopUtils.h"
43 #include "llvm/Transforms/Utils/UnrollLoop.h"
44 #include "llvm/Transforms/Utils/ValueMapper.h"
45 #include <algorithm>
46 #include <cassert>
47 #include <cstdint>
48 #include <limits>
49 
50 using namespace llvm;
51 using namespace llvm::PatternMatch;
52 
53 #define DEBUG_TYPE "loop-peel"
54 
STATISTIC(NumPeeled, "Number of loops peeled");

// Command-line overrides for the peeling heuristics. They share the "unroll-"
// prefix because peeling is driven from the loop unroller.
static cl::opt<unsigned> UnrollPeelCount(
    "unroll-peel-count", cl::Hidden,
    cl::desc("Set the unroll peeling count, for testing purposes"));

// Master switch for heuristic (profile- or invariance-based) peeling; a
// forced peel count below bypasses it.
static cl::opt<bool>
    UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
                       cl::desc("Allows loops to be peeled when the dynamic "
                                "trip count is known to be low."));

// By default only innermost loops are considered; this also permits peeling
// non-innermost loops of a nest.
static cl::opt<bool>
    UnrollAllowLoopNestsPeeling("unroll-allow-loop-nests-peeling",
                                cl::init(false), cl::Hidden,
                                cl::desc("Allows loop nests to be peeled."));

// Upper bound on the cumulative number of iterations peeled off a loop,
// including iterations recorded by previous runs (see PeeledCountMetaData).
static cl::opt<unsigned> UnrollPeelMaxCount(
    "unroll-peel-max-count", cl::init(7), cl::Hidden,
    cl::desc("Max average trip count which will cause loop peeling."));

// A non-zero value forces exactly this peel count and skips all heuristics.
static cl::opt<unsigned> UnrollForcePeelCount(
    "unroll-force-peel-count", cl::init(0), cl::Hidden,
    cl::desc("Force a peel count regardless of profiling information."));

// Loop metadata key recording how many iterations have already been peeled
// off this loop, so repeated passes respect UnrollPeelMaxCount cumulatively.
static const char *PeeledCountMetaData = "llvm.loop.peeled.count";

// Designates that a Phi is estimated to become invariant after an "infinite"
// number of loop iterations (i.e. only may become an invariant if the loop is
// fully unrolled).
static const unsigned InfiniteIterationsToInvariance =
    std::numeric_limits<unsigned>::max();
86 
87 // Check whether we are capable of peeling this loop.
88 bool llvm::canPeel(Loop *L) {
89   // Make sure the loop is in simplified form
90   if (!L->isLoopSimplifyForm())
91     return false;
92 
93   // Don't try to peel loops where the latch is not the exiting block.
94   // This can be an indication of two different things:
95   // 1) The loop is not rotated.
96   // 2) The loop contains irreducible control flow that involves the latch.
97   const BasicBlock *Latch = L->getLoopLatch();
98   if (!L->isLoopExiting(Latch))
99     return false;
100 
101   // Peeling is only supported if the latch is a branch.
102   if (!isa<BranchInst>(Latch->getTerminator()))
103     return false;
104 
105   SmallVector<BasicBlock *, 4> Exits;
106   L->getUniqueNonLatchExitBlocks(Exits);
107   // The latch must either be the only exiting block or all non-latch exit
108   // blocks have either a deopt or unreachable terminator. Both deopt and
109   // unreachable terminators are a strong indication they are not taken. Note
110   // that this is a profitability check, not a legality check. Also note that
111   // LoopPeeling currently can only update the branch weights of latch blocks
112   // and branch weights to blocks with deopt or unreachable do not need
113   // updating.
114   return all_of(Exits, [](const BasicBlock *BB) {
115     return BB->getTerminatingDeoptimizeCall() ||
116            isa<UnreachableInst>(BB->getTerminator());
117   });
118 }
119 
120 // This function calculates the number of iterations after which the given Phi
121 // becomes an invariant. The pre-calculated values are memorized in the map. The
122 // function (shortcut is I) is calculated according to the following definition:
123 // Given %x = phi <Inputs from above the loop>, ..., [%y, %back.edge].
124 //   If %y is a loop invariant, then I(%x) = 1.
125 //   If %y is a Phi from the loop header, I(%x) = I(%y) + 1.
126 //   Otherwise, I(%x) is infinite.
127 // TODO: Actually if %y is an expression that depends only on Phi %z and some
128 //       loop invariants, we can estimate I(%x) = I(%z) + 1. The example
129 //       looks like:
130 //         %x = phi(0, %a),  <-- becomes invariant starting from 3rd iteration.
131 //         %y = phi(0, 5),
132 //         %a = %y + 1.
133 static unsigned calculateIterationsToInvariance(
134     PHINode *Phi, Loop *L, BasicBlock *BackEdge,
135     SmallDenseMap<PHINode *, unsigned> &IterationsToInvariance) {
136   assert(Phi->getParent() == L->getHeader() &&
137          "Non-loop Phi should not be checked for turning into invariant.");
138   assert(BackEdge == L->getLoopLatch() && "Wrong latch?");
139   // If we already know the answer, take it from the map.
140   auto I = IterationsToInvariance.find(Phi);
141   if (I != IterationsToInvariance.end())
142     return I->second;
143 
144   // Otherwise we need to analyze the input from the back edge.
145   Value *Input = Phi->getIncomingValueForBlock(BackEdge);
146   // Place infinity to map to avoid infinite recursion for cycled Phis. Such
147   // cycles can never stop on an invariant.
148   IterationsToInvariance[Phi] = InfiniteIterationsToInvariance;
149   unsigned ToInvariance = InfiniteIterationsToInvariance;
150 
151   if (L->isLoopInvariant(Input))
152     ToInvariance = 1u;
153   else if (PHINode *IncPhi = dyn_cast<PHINode>(Input)) {
154     // Only consider Phis in header block.
155     if (IncPhi->getParent() != L->getHeader())
156       return InfiniteIterationsToInvariance;
157     // If the input becomes an invariant after X iterations, then our Phi
158     // becomes an invariant after X + 1 iterations.
159     unsigned InputToInvariance = calculateIterationsToInvariance(
160         IncPhi, L, BackEdge, IterationsToInvariance);
161     if (InputToInvariance != InfiniteIterationsToInvariance)
162       ToInvariance = InputToInvariance + 1u;
163   }
164 
165   // If we found that this Phi lies in an invariant chain, update the map.
166   if (ToInvariance != InfiniteIterationsToInvariance)
167     IterationsToInvariance[Phi] = ToInvariance;
168   return ToInvariance;
169 }
170 
// Try to find any invariant memory reads that will become dereferenceable in
// the remainder loop after peeling. The load must also be used (transitively)
// by an exit condition. Returns the number of iterations to peel off (at the
// moment either 0 or 1).
static unsigned peelToTurnInvariantLoadsDerefencebale(Loop &L,
                                                      DominatorTree &DT) {
  // Skip loops with a single exiting block, because there should be no benefit
  // for the heuristic below.
  if (L.getExitingBlock())
    return 0;

  // All non-latch exit blocks must have an UnreachableInst terminator.
  // Otherwise the heuristic below may not be profitable.
  SmallVector<BasicBlock *, 4> Exits;
  L.getUniqueNonLatchExitBlocks(Exits);
  if (any_of(Exits, [](const BasicBlock *BB) {
        return !isa<UnreachableInst>(BB->getTerminator());
      }))
    return 0;

  // Now look for invariant loads that dominate the latch and are not known to
  // be dereferenceable. If there are such loads and no writes, they will become
  // dereferenceable in the loop if the first iteration is peeled off. Also
  // collect the set of instructions controlled by such loads. Only peel if an
  // exit condition uses (transitively) such a load.
  BasicBlock *Header = L.getHeader();
  BasicBlock *Latch = L.getLoopLatch();
  SmallPtrSet<Value *, 8> LoadUsers;
  const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
  for (BasicBlock *BB : L.blocks()) {
    for (Instruction &I : *BB) {
      // Any write could clobber the loaded location, invalidating the
      // "becomes dereferenceable after one iteration" reasoning; give up.
      if (I.mayWriteToMemory())
        return 0;

      // Propagate the "controlled by an eligible load" property from an
      // instruction to its users.
      // NOTE(review): this transitive propagation only reaches users visited
      // after the defining instruction in this block-iteration order —
      // presumably sufficient because defs dominate uses, but confirm that
      // L.blocks() ordering guarantees it here.
      auto Iter = LoadUsers.find(&I);
      if (Iter != LoadUsers.end()) {
        for (Value *U : I.users())
          LoadUsers.insert(U);
      }
      // Do not look for reads in the header; they can already be hoisted
      // without peeling.
      if (BB == Header)
        continue;
      if (auto *LI = dyn_cast<LoadInst>(&I)) {
        // Candidate: a load of a loop-invariant pointer that executes on
        // every iteration (dominates the latch) but whose dereferenceability
        // is not already provable. Seed its users into the tracked set.
        Value *Ptr = LI->getPointerOperand();
        if (DT.dominates(BB, Latch) && L.isLoopInvariant(Ptr) &&
            !isDereferenceablePointer(Ptr, LI->getType(), DL, LI, &DT))
          for (Value *U : I.users())
            LoadUsers.insert(U);
      }
    }
  }
  // Only worth peeling if some exit condition actually depends (transitively)
  // on one of the candidate loads.
  SmallVector<BasicBlock *> ExitingBlocks;
  L.getExitingBlocks(ExitingBlocks);
  if (any_of(ExitingBlocks, [&LoadUsers](BasicBlock *Exiting) {
        return LoadUsers.contains(Exiting->getTerminator());
      }))
    return 1;
  return 0;
}
231 
// Return the number of iterations to peel off that make conditions in the
// body true/false. For example, if we peel 2 iterations off the loop below,
// the condition i < 2 can be evaluated at compile time.
//  for (i = 0; i < n; i++)
//    if (i < 2)
//      ..
//    else
//      ..
//   }
static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
                                         ScalarEvolution &SE) {
  assert(L.isLoopSimplifyForm() && "Loop needs to be in loop simplify form");
  unsigned DesiredPeelCount = 0;

  for (auto *BB : L.blocks()) {
    // Only conditional branches in the body are candidates.
    auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
    if (!BI || BI->isUnconditional())
      continue;

    // Ignore loop exit condition.
    if (L.getLoopLatch() == BB)
      continue;

    // Only handle integer compares as the branch condition.
    Value *Condition = BI->getCondition();
    Value *LeftVal, *RightVal;
    CmpInst::Predicate Pred;
    if (!match(Condition, m_ICmp(Pred, m_Value(LeftVal), m_Value(RightVal))))
      continue;

    const SCEV *LeftSCEV = SE.getSCEV(LeftVal);
    const SCEV *RightSCEV = SE.getSCEV(RightVal);

    // Do not consider predicates that are known to be true or false
    // independently of the loop iteration.
    if (SE.evaluatePredicate(Pred, LeftSCEV, RightSCEV))
      continue;

    // Check if we have a condition with one AddRec and one non AddRec
    // expression. Normalize LeftSCEV to be the AddRec.
    if (!isa<SCEVAddRecExpr>(LeftSCEV)) {
      if (isa<SCEVAddRecExpr>(RightSCEV)) {
        std::swap(LeftSCEV, RightSCEV);
        Pred = ICmpInst::getSwappedPredicate(Pred);
      } else
        continue;
    }

    const SCEVAddRecExpr *LeftAR = cast<SCEVAddRecExpr>(LeftSCEV);

    // Avoid huge SCEV computations in the loop below, make sure we only
    // consider AddRecs of the loop we are trying to peel.
    if (!LeftAR->isAffine() || LeftAR->getLoop() != &L)
      continue;
    // The predicate must either be monotonic in the AddRec, or an
    // equality/inequality on a no-self-wrap AddRec; otherwise peeling a fixed
    // prefix cannot make it statically known.
    if (!(ICmpInst::isEquality(Pred) && LeftAR->hasNoSelfWrap()) &&
        !SE.getMonotonicPredicateType(LeftAR, Pred))
      continue;

    // Check if extending the current DesiredPeelCount lets us evaluate Pred
    // or !Pred in the loop body statically.
    unsigned NewPeelCount = DesiredPeelCount;

    // Value of the AddRec on the first iteration of the remainder loop,
    // assuming NewPeelCount iterations are peeled off.
    const SCEV *IterVal = LeftAR->evaluateAtIteration(
        SE.getConstant(LeftSCEV->getType(), NewPeelCount), SE);

    // If the original condition is not known, get the negated predicate
    // (which holds on the else branch) and check if it is known. This allows
    // us to peel of iterations that make the original condition false.
    if (!SE.isKnownPredicate(Pred, IterVal, RightSCEV))
      Pred = ICmpInst::getInversePredicate(Pred);

    const SCEV *Step = LeftAR->getStepRecurrence(SE);
    const SCEV *NextIterVal = SE.getAddExpr(IterVal, Step);
    // Advance the simulated peel prefix by one iteration.
    auto PeelOneMoreIteration = [&IterVal, &NextIterVal, &SE, Step,
                                 &NewPeelCount]() {
      IterVal = NextIterVal;
      NextIterVal = SE.getAddExpr(IterVal, Step);
      NewPeelCount++;
    };

    auto CanPeelOneMoreIteration = [&NewPeelCount, &MaxPeelCount]() {
      return NewPeelCount < MaxPeelCount;
    };

    // Consume iterations for as long as Pred is known to hold on them (these
    // are the iterations being peeled off).
    while (CanPeelOneMoreIteration() &&
           SE.isKnownPredicate(Pred, IterVal, RightSCEV))
      PeelOneMoreIteration();

    // With *that* peel count, does the predicate !Pred become known in the
    // first iteration of the loop body after peeling?
    if (!SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), IterVal,
                             RightSCEV))
      continue; // If not, give up.

    // However, for equality comparisons, that isn't always sufficient to
    // eliminate the comparsion in loop body, we may need to peel one more
    // iteration. See if that makes !Pred become unknown again.
    if (ICmpInst::isEquality(Pred) &&
        !SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), NextIterVal,
                             RightSCEV) &&
        !SE.isKnownPredicate(Pred, IterVal, RightSCEV) &&
        SE.isKnownPredicate(Pred, NextIterVal, RightSCEV)) {
      if (!CanPeelOneMoreIteration())
        continue; // Need to peel one more iteration, but can't. Give up.
      PeelOneMoreIteration(); // Great!
    }

    // Keep the largest peel count demanded by any condition in the body.
    DesiredPeelCount = std::max(DesiredPeelCount, NewPeelCount);
  }

  return DesiredPeelCount;
}
343 
// Compute the number of iterations we want to peel off and record it in
// PP.PeelCount (0 means "do not peel"). Decision order: user-forced count,
// then invariance/compare/load heuristics, then profile-based estimation.
void llvm::computePeelCount(Loop *L, unsigned LoopSize,
                            TargetTransformInfo::PeelingPreferences &PP,
                            unsigned &TripCount, DominatorTree &DT,
                            ScalarEvolution &SE, unsigned Threshold) {
  assert(LoopSize > 0 && "Zero loop size is not allowed!");
  // Save the PP.PeelCount value set by the target in
  // TTI.getPeelingPreferences or by the flag -unroll-peel-count.
  unsigned TargetPeelCount = PP.PeelCount;
  PP.PeelCount = 0;
  if (!canPeel(L))
    return;

  // Only try to peel innermost loops by default.
  // The constraint can be relaxed by the target in TTI.getUnrollingPreferences
  // or by the flag -unroll-allow-loop-nests-peeling.
  if (!PP.AllowLoopNestsPeeling && !L->isInnermost())
    return;

  // If the user provided a peel count, use that.
  bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0;
  if (UserPeelCount) {
    LLVM_DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
                      << " iterations.\n");
    PP.PeelCount = UnrollForcePeelCount;
    PP.PeelProfiledIterations = true;
    return;
  }

  // Skip peeling if it's disabled.
  if (!PP.AllowPeeling)
    return;

  // Account for iterations peeled off by earlier runs of this transform,
  // recorded in the llvm.loop.peeled.count metadata.
  unsigned AlreadyPeeled = 0;
  if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))
    AlreadyPeeled = *Peeled;
  // Stop if we already peeled off the maximum number of iterations.
  if (AlreadyPeeled >= UnrollPeelMaxCount)
    return;

  // Here we try to get rid of Phis which become invariants after 1, 2, ..., N
  // iterations of the loop. For this we compute the number of iterations after
  // which every Phi is guaranteed to become an invariant, and try to peel the
  // maximum number of iterations among these values, thus turning all those
  // Phis into invariants.
  // First, check that we can peel at least one iteration (the size guard also
  // ensures Threshold / LoopSize >= 2 below, so the subtraction cannot wrap).
  if (2 * LoopSize <= Threshold && UnrollPeelMaxCount > 0) {
    // Store the pre-calculated values here.
    SmallDenseMap<PHINode *, unsigned> IterationsToInvariance;
    // Now go through all Phis to calculate the number of iterations they
    // need to become invariants.
    // Start the max computation with the UP.PeelCount value set by the target
    // in TTI.getUnrollingPreferences or by the flag -unroll-peel-count.
    unsigned DesiredPeelCount = TargetPeelCount;
    BasicBlock *BackEdge = L->getLoopLatch();
    assert(BackEdge && "Loop is not in simplified form?");
    for (auto BI = L->getHeader()->begin(); isa<PHINode>(&*BI); ++BI) {
      PHINode *Phi = cast<PHINode>(&*BI);
      unsigned ToInvariance = calculateIterationsToInvariance(
          Phi, L, BackEdge, IterationsToInvariance);
      if (ToInvariance != InfiniteIterationsToInvariance)
        DesiredPeelCount = std::max(DesiredPeelCount, ToInvariance);
    }

    // Pay respect to limitations implied by loop size and the max peel count.
    unsigned MaxPeelCount = UnrollPeelMaxCount;
    MaxPeelCount = std::min(MaxPeelCount, Threshold / LoopSize - 1);

    // Also consider peeling to make loop-body compares statically known.
    DesiredPeelCount = std::max(DesiredPeelCount,
                                countToEliminateCompares(*L, MaxPeelCount, SE));

    // As a last resort, peel one iteration to make invariant loads provably
    // dereferenceable in the remainder loop.
    if (DesiredPeelCount == 0)
      DesiredPeelCount = peelToTurnInvariantLoadsDerefencebale(*L, DT);

    if (DesiredPeelCount > 0) {
      DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount);
      // Consider max peel count limitation.
      assert(DesiredPeelCount > 0 && "Wrong loop size estimation?");
      if (DesiredPeelCount + AlreadyPeeled <= UnrollPeelMaxCount) {
        LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
                          << " iteration(s) to turn"
                          << " some Phis into invariants.\n");
        PP.PeelCount = DesiredPeelCount;
        PP.PeelProfiledIterations = false;
        return;
      }
    }
  }

  // Bail if we know the statically calculated trip count.
  // In this case we rather prefer partial unrolling.
  if (TripCount)
    return;

  // Do not apply profile base peeling if it is disabled.
  if (!PP.PeelProfiledIterations)
    return;
  // If we don't know the trip count, but have reason to believe the average
  // trip count is low, peeling should be beneficial, since we will usually
  // hit the peeled section.
  // We only do this in the presence of profile information, since otherwise
  // our estimates of the trip count are not reliable enough.
  if (L->getHeader()->getParent()->hasProfileData()) {
    Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L);
    if (!PeelCount)
      return;

    LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is " << *PeelCount
                      << "\n");

    if (*PeelCount) {
      // Peel only if the total (cumulative) peel count and the resulting code
      // size both stay within their respective limits.
      if ((*PeelCount + AlreadyPeeled <= UnrollPeelMaxCount) &&
          (LoopSize * (*PeelCount + 1) <= Threshold)) {
        LLVM_DEBUG(dbgs() << "Peeling first " << *PeelCount
                          << " iterations.\n");
        PP.PeelCount = *PeelCount;
        return;
      }
      LLVM_DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n");
      LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n");
      LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
      LLVM_DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1)
                        << "\n");
      LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n");
    }
  }
}
471 
472 /// Update the branch weights of the latch of a peeled-off loop
473 /// iteration.
474 /// This sets the branch weights for the latch of the recently peeled off loop
475 /// iteration correctly.
476 /// Let F is a weight of the edge from latch to header.
477 /// Let E is a weight of the edge from latch to exit.
478 /// F/(F+E) is a probability to go to loop and E/(F+E) is a probability to
479 /// go to exit.
480 /// Then, Estimated TripCount = F / E.
481 /// For I-th (counting from 0) peeled off iteration we set the the weights for
482 /// the peeled latch as (TC - I, 1). It gives us reasonable distribution,
483 /// The probability to go to exit 1/(TC-I) increases. At the same time
484 /// the estimated trip count of remaining loop reduces by I.
485 /// To avoid dealing with division rounding we can just multiple both part
486 /// of weights to E and use weight as (F - I * E, E).
487 ///
488 /// \param Header The copy of the header block that belongs to next iteration.
489 /// \param LatchBR The copy of the latch branch that belongs to this iteration.
490 /// \param[in,out] FallThroughWeight The weight of the edge from latch to
491 /// header before peeling (in) and after peeled off one iteration (out).
492 static void updateBranchWeights(BasicBlock *Header, BranchInst *LatchBR,
493                                 uint64_t ExitWeight,
494                                 uint64_t &FallThroughWeight) {
495   // FallThroughWeight is 0 means that there is no branch weights on original
496   // latch block or estimated trip count is zero.
497   if (!FallThroughWeight)
498     return;
499 
500   unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
501   MDBuilder MDB(LatchBR->getContext());
502   MDNode *WeightNode =
503       HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThroughWeight)
504                 : MDB.createBranchWeights(FallThroughWeight, ExitWeight);
505   LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode);
506   FallThroughWeight =
507       FallThroughWeight > ExitWeight ? FallThroughWeight - ExitWeight : 1;
508 }
509 
510 /// Initialize the weights.
511 ///
512 /// \param Header The header block.
513 /// \param LatchBR The latch branch.
514 /// \param[out] ExitWeight The weight of the edge from Latch to Exit.
515 /// \param[out] FallThroughWeight The weight of the edge from Latch to Header.
516 static void initBranchWeights(BasicBlock *Header, BranchInst *LatchBR,
517                               uint64_t &ExitWeight,
518                               uint64_t &FallThroughWeight) {
519   uint64_t TrueWeight, FalseWeight;
520   if (!LatchBR->extractProfMetadata(TrueWeight, FalseWeight))
521     return;
522   unsigned HeaderIdx = LatchBR->getSuccessor(0) == Header ? 0 : 1;
523   ExitWeight = HeaderIdx ? TrueWeight : FalseWeight;
524   FallThroughWeight = HeaderIdx ? FalseWeight : TrueWeight;
525 }
526 
527 /// Update the weights of original Latch block after peeling off all iterations.
528 ///
529 /// \param Header The header block.
530 /// \param LatchBR The latch branch.
531 /// \param ExitWeight The weight of the edge from Latch to Exit.
532 /// \param FallThroughWeight The weight of the edge from Latch to Header.
533 static void fixupBranchWeights(BasicBlock *Header, BranchInst *LatchBR,
534                                uint64_t ExitWeight,
535                                uint64_t FallThroughWeight) {
536   // FallThroughWeight is 0 means that there is no branch weights on original
537   // latch block or estimated trip count is zero.
538   if (!FallThroughWeight)
539     return;
540 
541   // Sets the branch weights on the loop exit.
542   MDBuilder MDB(LatchBR->getContext());
543   unsigned HeaderIdx = LatchBR->getSuccessor(0) == Header ? 0 : 1;
544   MDNode *WeightNode =
545       HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThroughWeight)
546                 : MDB.createBranchWeights(FallThroughWeight, ExitWeight);
547   LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode);
548 }
549 
/// Clones the body of the loop L, putting it between \p InsertTop and \p
/// InsertBot.
/// \param IterNumber The serial number of the iteration currently being
/// peeled off.
/// \param ExitEdges The exit edges of the original loop.
/// \param[out] NewBlocks A list of the blocks in the newly created clone
/// \param[out] VMap The value map between the loop and the new clone.
/// \param LoopBlocks A helper for DFS-traversal of the loop.
/// \param LVMap A value-map that maps instructions from the original loop to
/// instructions in the last peeled-off iteration.
/// \param DTU Updater used to keep the dominator tree in sync with the clone.
/// \param LoopBlocksIDoms Immediate dominators of the original loop blocks.
/// \param LoopLocalNoAliasDeclScopes Noalias scopes declared inside the loop,
/// which must be duplicated for each peeled iteration.
static void cloneLoopBlocks(
    Loop *L, unsigned IterNumber, BasicBlock *InsertTop, BasicBlock *InsertBot,
    SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *>> &ExitEdges,
    SmallVectorImpl<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
    ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DomTreeUpdater &DTU,
    const SmallDenseMap<BasicBlock *, BasicBlock *> &LoopBlocksIDoms,
    LoopInfo *LI, ArrayRef<MDNode *> LoopLocalNoAliasDeclScopes) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  BasicBlock *PreHeader = L->getLoopPreheader();

  Function *F = Header->getParent();
  LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
  Loop *ParentLoop = L->getParentLoop();

  // For each block in the original loop, create a new copy,
  // and update the value map with the newly created values.
  for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
    BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".peel", F);
    NewBlocks.push_back(NewBB);

    // If an original block is an immediate child of the loop L, its copy
    // is a child of a ParentLoop after peeling. If a block is a child of
    // a nested loop, it is handled in the cloneLoop() call below.
    if (ParentLoop && LI->getLoopFor(*BB) == L)
      ParentLoop->addBasicBlockToLoop(NewBB, *LI);

    VMap[*BB] = NewBB;

    // If dominator tree is available, insert nodes to represent cloned blocks.
    // The cloned header is dominated by InsertTop; every other cloned block is
    // dominated by the clone of its original immediate dominator.
    if (Header == *BB)
      DTU.applyUpdates({{DominatorTree::Insert, InsertTop, NewBB}});
    else {
      BasicBlock *IDom = LoopBlocksIDoms.lookup(*BB);
      // VMap must contain entry for IDom, as the iteration order is RPO.
      DTU.applyUpdates(
          {{DominatorTree::Insert, cast<BasicBlock>(VMap[IDom]), NewBB}});
    }
  }

  {
    // Identify what other metadata depends on the cloned version. After
    // cloning, replace the metadata with the corrected version for both
    // memory instructions and noalias intrinsics.
    std::string Ext = (Twine("Peel") + Twine(IterNumber)).str();
    cloneAndAdaptNoAliasScopes(LoopLocalNoAliasDeclScopes, NewBlocks,
                               Header->getContext(), Ext);
  }

  // Recursively create the new Loop objects for nested loops, if any,
  // to preserve LoopInfo.
  for (Loop *ChildLoop : *L) {
    cloneLoop(ChildLoop, ParentLoop, VMap, LI, nullptr);
  }

  // Hook-up the control flow for the newly inserted blocks.
  // The new header is hooked up directly to the "top", which is either
  // the original loop preheader (for the first iteration) or the previous
  // iteration's exiting block (for every other iteration)
  InsertTop->getTerminator()->setSuccessor(0, cast<BasicBlock>(VMap[Header]));

  // Similarly, for the latch:
  // The original exiting edge is still hooked up to the loop exit.
  // The backedge now goes to the "bottom", which is either the loop's real
  // header (for the last peeled iteration) or the copied header of the next
  // iteration (for every other iteration)
  BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
  BranchInst *LatchBR = cast<BranchInst>(NewLatch->getTerminator());
  for (unsigned idx = 0, e = LatchBR->getNumSuccessors(); idx < e; ++idx)
    if (LatchBR->getSuccessor(idx) == Header) {
      LatchBR->setSuccessor(idx, InsertBot);
      break;
    }
  // The retargeted backedge creates the NewLatch -> InsertBot edge, and
  // InsertTop no longer falls through to InsertBot.
  DTU.applyUpdates({{DominatorTree::Insert, NewLatch, InsertBot},
                    {DominatorTree::Delete, InsertTop, InsertBot}});

  // The new copy of the loop body starts with a bunch of PHI nodes
  // that pick an incoming value from either the preheader, or the previous
  // loop iteration. Since this copy is no longer part of the loop, we
  // resolve this statically:
  // For the first iteration, we use the value from the preheader directly.
  // For any other iteration, we replace the phi with the value generated by
  // the immediately preceding clone of the loop body (which represents
  // the previous iteration).
  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
    PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
    if (IterNumber == 0) {
      VMap[&*I] = NewPHI->getIncomingValueForBlock(PreHeader);
    } else {
      Value *LatchVal = NewPHI->getIncomingValueForBlock(Latch);
      Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);
      // A latch value defined inside the loop must be remapped to the copy
      // produced by the previous peeled iteration; anything else (invariant
      // or defined outside) can be used as-is.
      if (LatchInst && L->contains(LatchInst))
        VMap[&*I] = LVMap[LatchInst];
      else
        VMap[&*I] = LatchVal;
    }
    // The PHI itself is now dead in the cloned (straight-line) header.
    cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
  }

  // Fix up the outgoing values - we need to add a value for the iteration
  // we've just created. Note that this must happen *after* the incoming
  // values are adjusted, since the value going out of the latch may also be
  // a value coming into the header.
  for (auto Edge : ExitEdges)
    for (PHINode &PHI : Edge.second->phis()) {
      Value *LatchVal = PHI.getIncomingValueForBlock(Edge.first);
      Instruction *LatchInst = dyn_cast<Instruction>(LatchVal);
      if (LatchInst && L->contains(LatchInst))
        LatchVal = VMap[LatchVal];
      PHI.addIncoming(LatchVal, cast<BasicBlock>(VMap[Edge.first]));
    }

  // LastValueMap is updated with the values for the current loop
  // which are used the next time this function is called.
  for (auto KV : VMap)
    LVMap[KV.first] = KV.second;
}
679 
680 TargetTransformInfo::PeelingPreferences llvm::gatherPeelingPreferences(
681     Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
682     Optional<bool> UserAllowPeeling,
683     Optional<bool> UserAllowProfileBasedPeeling, bool UnrollingSpecficValues) {
684   TargetTransformInfo::PeelingPreferences PP;
685 
686   // Set the default values.
687   PP.PeelCount = 0;
688   PP.AllowPeeling = true;
689   PP.AllowLoopNestsPeeling = false;
690   PP.PeelProfiledIterations = true;
691 
692   // Get the target specifc values.
693   TTI.getPeelingPreferences(L, SE, PP);
694 
695   // User specified values using cl::opt.
696   if (UnrollingSpecficValues) {
697     if (UnrollPeelCount.getNumOccurrences() > 0)
698       PP.PeelCount = UnrollPeelCount;
699     if (UnrollAllowPeeling.getNumOccurrences() > 0)
700       PP.AllowPeeling = UnrollAllowPeeling;
701     if (UnrollAllowLoopNestsPeeling.getNumOccurrences() > 0)
702       PP.AllowLoopNestsPeeling = UnrollAllowLoopNestsPeeling;
703   }
704 
705   // User specifed values provided by argument.
706   if (UserAllowPeeling.hasValue())
707     PP.AllowPeeling = *UserAllowPeeling;
708   if (UserAllowProfileBasedPeeling.hasValue())
709     PP.PeelProfiledIterations = *UserAllowProfileBasedPeeling;
710 
711   return PP;
712 }
713 
/// Peel off the first \p PeelCount iterations of loop \p L.
///
/// Note that this does not peel them off as a single straight-line block.
/// Rather, each iteration is peeled off separately, and needs to check the
/// exit condition.
/// For loops that dynamically execute \p PeelCount iterations or less
/// this provides a benefit, since the peeled off iterations, which account
/// for the bulk of dynamic execution, can be further simplified by scalar
/// optimizations.
///
/// \param L loop to peel; must satisfy canPeel(L).
/// \param PeelCount number of iterations to peel off; must be non-zero.
/// \returns true unconditionally (peeling is always performed).
bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
                    ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
                    bool PreserveLCSSA) {
  assert(PeelCount > 0 && "Attempt to peel out zero iterations?");
  assert(canPeel(L) && "Attempt to peel a loop which is not peelable?");

  // RPO traversal of the loop body, used when cloning each peeled iteration.
  LoopBlocksDFS LoopBlocks(L);
  LoopBlocks.perform(LI);

  BasicBlock *Header = L->getHeader();
  BasicBlock *PreHeader = L->getLoopPreheader();
  BasicBlock *Latch = L->getLoopLatch();
  SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitEdges;
  L->getExitEdges(ExitEdges);

  // Set form of the exit edges, used below to emit one DT update per
  // (cloned exiting block, exit block) pair.
  SmallDenseSet<std::pair<BasicBlock *, BasicBlock *>, 4> ExitEdgesSet(
      ExitEdges.begin(), ExitEdges.end());

  Function *F = Header->getParent();

  // Set up all the necessary basic blocks. It is convenient to split the
  // preheader into 3 parts - two blocks to anchor the peeled copy of the loop
  // body, and a new preheader for the "real" loop.

  // Peeling the first iteration transforms.
  //
  // PreHeader:
  // ...
  // Header:
  //   LoopBody
  //   If (cond) goto Header
  // Exit:
  //
  // into
  //
  // InsertTop:
  //   LoopBody
  //   If (!cond) goto Exit
  // InsertBot:
  // NewPreHeader:
  // ...
  // Header:
  //  LoopBody
  //  If (cond) goto Header
  // Exit:
  //
  // Each following iteration will split the current bottom anchor in two,
  // and put the new copy of the loop body between these two blocks. That is,
  // after peeling another iteration from the example above, we'll split
  // InsertBot, and get:
  //
  // InsertTop:
  //   LoopBody
  //   If (!cond) goto Exit
  // InsertBot:
  //   LoopBody
  //   If (!cond) goto Exit
  // InsertBot.next:
  // NewPreHeader:
  // ...
  // Header:
  //  LoopBody
  //  If (cond) goto Header
  // Exit:

  BasicBlock *InsertTop = SplitEdge(PreHeader, Header, DT, LI);
  BasicBlock *InsertBot =
      SplitBlock(InsertTop, InsertTop->getTerminator(), DT, LI);
  BasicBlock *NewPreHeader =
      SplitBlock(InsertBot, InsertBot->getTerminator(), DT, LI);

  InsertTop->setName(Header->getName() + ".peel.begin");
  InsertBot->setName(Header->getName() + ".peel.next");
  NewPreHeader->setName(PreHeader->getName() + ".peel.newph");

  // Maps each original loop value to its clone in the most recently peeled
  // iteration; refreshed after every iteration of the peeling loop below.
  ValueToValueMapTy LVMap;

  // If we have branch weight information, we'll want to update it for the
  // newly created branches.
  BranchInst *LatchBR =
      cast<BranchInst>(cast<BasicBlock>(Latch)->getTerminator());
  uint64_t ExitWeight = 0, FallThroughWeight = 0;
  initBranchWeights(Header, LatchBR, ExitWeight, FallThroughWeight);

  // Identify what noalias metadata is inside the loop: if it is inside the
  // loop, the associated metadata must be cloned for each iteration.
  SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;
  identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes);

  // Lazy strategy: DT updates are queued and flushed once at the end.
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);

  // Fill map with the loop blocks IDoms to later update the DT when cloning the
  // loop blocks.
  SmallDenseMap<BasicBlock *, BasicBlock *> LoopBlocksIDoms;
  for (auto *BB : L->blocks())
    LoopBlocksIDoms[BB] = DT->getNode(BB)->getIDom()->getBlock();

  // For each peeled-off iteration, make a copy of the loop.
  for (unsigned Iter = 0; Iter < PeelCount; ++Iter) {
    SmallVector<BasicBlock *, 8> NewBlocks;
    ValueToValueMapTy VMap;

    cloneLoopBlocks(L, Iter, InsertTop, InsertBot, ExitEdges, NewBlocks,
                    LoopBlocks, VMap, LVMap, DTU, LoopBlocksIDoms, LI,
                    LoopLocalNoAliasDeclScopes);

    // Remap to use values from the current iteration instead of the
    // previous one.
    remapInstructionsInBlocks(NewBlocks, VMap);

    // If DT is available, insert edges from cloned exiting blocks to the exits
    for (auto Exit : ExitEdgesSet)
      DTU.applyUpdates({{DominatorTree::Insert,
                         cast<BasicBlock>(LVMap[Exit.first]), Exit.second}});

    auto *LatchBRCopy = cast<BranchInst>(VMap[LatchBR]);
    updateBranchWeights(InsertBot, LatchBRCopy, ExitWeight, FallThroughWeight);
    // Remove Loop metadata from the latch branch instruction
    // because it is not the Loop's latch branch anymore.
    LatchBRCopy->setMetadata(LLVMContext::MD_loop, nullptr);

    // The bottom anchor becomes the top anchor for the next iteration; split
    // a fresh bottom anchor off of it.
    InsertTop = InsertBot;
    InsertBot = SplitBlock(InsertBot, InsertBot->getTerminator(), &DTU, LI);
    InsertBot->setName(Header->getName() + ".peel.next");

    // Relocate the cloned blocks (which sit at the end of the function after
    // cloning) to directly before the new InsertTop.
    F->getBasicBlockList().splice(InsertTop->getIterator(),
                                  F->getBasicBlockList(),
                                  NewBlocks[0]->getIterator(), F->end());
  }

  // Now adjust the phi nodes in the loop header to get their initial values
  // from the last peeled-off iteration instead of the preheader.
  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
    PHINode *PHI = cast<PHINode>(I);
    Value *NewVal = PHI->getIncomingValueForBlock(Latch);
    Instruction *LatchInst = dyn_cast<Instruction>(NewVal);
    if (LatchInst && L->contains(LatchInst))
      NewVal = LVMap[LatchInst];

    PHI->setIncomingValueForBlock(NewPreHeader, NewVal);
  }

  fixupBranchWeights(Header, LatchBR, ExitWeight, FallThroughWeight);

  // Update Metadata for count of peeled off iterations.
  unsigned AlreadyPeeled = 0;
  if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))
    AlreadyPeeled = *Peeled;
  addStringMetadataToLoop(L, PeeledCountMetaData, AlreadyPeeled + PeelCount);

  // NOTE(review): for a nested loop the parent is invalidated instead; the
  // peeled blocks live in the parent, so its SCEV info is stale too.
  if (Loop *ParentLoop = L->getParentLoop())
    L = ParentLoop;

  // We modified the loop, update SE.
  SE->forgetTopmostLoop(L);

  // Finally the DomTree must be correct; flush the queued lazy updates.
  if (DTU.hasDomTree()) {
    DTU.flush();
    assert(DT->verify(DominatorTree::VerificationLevel::Fast));
  }

  // FIXME: Incrementally update loop-simplify
  simplifyLoop(L, DT, LI, SE, AC, nullptr, PreserveLCSSA);

  NumPeeled++;

  return true;
}
892