1 //===-- UnrollLoopPeel.cpp - Loop peeling utilities -----------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements some loop unrolling utilities for peeling loops 11 // with dynamically inferred (from PGO) trip counts. See LoopUnroll.cpp for 12 // unrolling loops with compile-time constant trip counts. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "llvm/ADT/Statistic.h" 17 #include "llvm/Analysis/LoopIterator.h" 18 #include "llvm/Analysis/LoopPass.h" 19 #include "llvm/Analysis/ScalarEvolution.h" 20 #include "llvm/Analysis/TargetTransformInfo.h" 21 #include "llvm/IR/BasicBlock.h" 22 #include "llvm/IR/Dominators.h" 23 #include "llvm/IR/MDBuilder.h" 24 #include "llvm/IR/Metadata.h" 25 #include "llvm/IR/Module.h" 26 #include "llvm/Support/Debug.h" 27 #include "llvm/Support/raw_ostream.h" 28 #include "llvm/Transforms/Scalar.h" 29 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 30 #include "llvm/Transforms/Utils/Cloning.h" 31 #include "llvm/Transforms/Utils/LoopUtils.h" 32 #include "llvm/Transforms/Utils/UnrollLoop.h" 33 #include <algorithm> 34 35 using namespace llvm; 36 37 #define DEBUG_TYPE "loop-unroll" 38 STATISTIC(NumPeeled, "Number of loops peeled"); 39 40 static cl::opt<unsigned> UnrollPeelMaxCount( 41 "unroll-peel-max-count", cl::init(7), cl::Hidden, 42 cl::desc("Max average trip count which will cause loop peeling.")); 43 44 static cl::opt<unsigned> UnrollForcePeelCount( 45 "unroll-force-peel-count", cl::init(0), cl::Hidden, 46 cl::desc("Force a peel count regardless of profiling information.")); 47 48 // Check whether we are capable of peeling this loop. 49 static bool canPeel(Loop *L) { 50 // Make sure the loop is in simplified form 51 if (!L->isLoopSimplifyForm()) 52 return false; 53 54 // Only peel loops that contain a single exit 55 if (!L->getExitingBlock() || !L->getUniqueExitBlock()) 56 return false; 57 58 return true; 59 } 60 61 // Return the number of iterations we want to peel off. 62 void llvm::computePeelCount(Loop *L, unsigned LoopSize, 63 TargetTransformInfo::UnrollingPreferences &UP) { 64 UP.PeelCount = 0; 65 if (!canPeel(L)) 66 return; 67 68 // Only try to peel innermost loops. 69 if (!L->empty()) 70 return; 71 72 // If the user provided a peel count, use that. 73 bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0; 74 if (UserPeelCount) { 75 DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount 76 << " iterations.\n"); 77 UP.PeelCount = UnrollForcePeelCount; 78 return; 79 } 80 81 // If we don't know the trip count, but have reason to believe the average 82 // trip count is low, peeling should be beneficial, since we will usually 83 // hit the peeled section. 84 // We only do this in the presence of profile information, since otherwise 85 // our estimates of the trip count are not reliable enough. 86 if (UP.AllowPeeling && L->getHeader()->getParent()->getEntryCount()) { 87 Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L); 88 if (!PeelCount) 89 return; 90 91 DEBUG(dbgs() << "Profile-based estimated trip count is " << *PeelCount 92 << "\n"); 93 94 if (*PeelCount) { 95 if ((*PeelCount <= UnrollPeelMaxCount) && 96 (LoopSize * (*PeelCount + 1) <= UP.Threshold)) { 97 DEBUG(dbgs() << "Peeling first " << *PeelCount << " iterations.\n"); 98 UP.PeelCount = *PeelCount; 99 return; 100 } 101 DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n"); 102 DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n"); 103 DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1) << "\n"); 104 DEBUG(dbgs() << "Max peel cost: " << UP.Threshold << "\n"); 105 } 106 } 107 108 return; 109 } 110 111 /// \brief Update the branch weights of the latch of a peeled-off loop 112 /// iteration. 113 /// This sets the branch weights for the latch of the recently peeled off loop 114 /// iteration correctly. 115 /// Our goal is to make sure that: 116 /// a) The total weight of all the copies of the loop body is preserved. 117 /// b) The total weight of the loop exit is preserved. 118 /// c) The body weight is reasonably distributed between the peeled iterations. 119 /// 120 /// \param Header The copy of the header block that belongs to next iteration. 121 /// \param LatchBR The copy of the latch branch that belongs to this iteration. 122 /// \param IterNumber The serial number of the iteration that was just 123 /// peeled off. 124 /// \param AvgIters The average number of iterations we expect the loop to have. 125 /// \param[in,out] PeeledHeaderWeight The total number of dynamic loop 126 /// iterations that are unaccounted for. As an input, it represents the number 127 /// of times we expect to enter the header of the iteration currently being 128 /// peeled off. The output is the number of times we expect to enter the 129 /// header of the next iteration. 130 static void updateBranchWeights(BasicBlock *Header, BranchInst *LatchBR, 131 unsigned IterNumber, unsigned AvgIters, 132 uint64_t &PeeledHeaderWeight) { 133 134 // FIXME: Pick a more realistic distribution. 135 // Currently the proportion of weight we assign to the fall-through 136 // side of the branch drops linearly with the iteration number, and we use 137 // a 0.9 fudge factor to make the drop-off less sharp... 138 if (PeeledHeaderWeight) { 139 uint64_t FallThruWeight = 140 PeeledHeaderWeight * ((float)(AvgIters - IterNumber) / AvgIters * 0.9); 141 uint64_t ExitWeight = PeeledHeaderWeight - FallThruWeight; 142 PeeledHeaderWeight -= ExitWeight; 143 144 unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1); 145 MDBuilder MDB(LatchBR->getContext()); 146 MDNode *WeightNode = 147 HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThruWeight) 148 : MDB.createBranchWeights(FallThruWeight, ExitWeight); 149 LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode); 150 } 151 } 152 153 /// \brief Clones the body of the loop L, putting it between \p InsertTop and \p 154 /// InsertBot. 155 /// \param IterNumber The serial number of the iteration currently being 156 /// peeled off. 157 /// \param Exit The exit block of the original loop. 158 /// \param[out] NewBlocks A list of the the blocks in the newly created clone 159 /// \param[out] VMap The value map between the loop and the new clone. 160 /// \param LoopBlocks A helper for DFS-traversal of the loop. 161 /// \param LVMap A value-map that maps instructions from the original loop to 162 /// instructions in the last peeled-off iteration. 163 static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop, 164 BasicBlock *InsertBot, BasicBlock *Exit, 165 SmallVectorImpl<BasicBlock *> &NewBlocks, 166 LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap, 167 ValueToValueMapTy &LVMap, LoopInfo *LI) { 168 169 BasicBlock *Header = L->getHeader(); 170 BasicBlock *Latch = L->getLoopLatch(); 171 BasicBlock *PreHeader = L->getLoopPreheader(); 172 173 Function *F = Header->getParent(); 174 LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO(); 175 LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO(); 176 Loop *ParentLoop = L->getParentLoop(); 177 178 // For each block in the original loop, create a new copy, 179 // and update the value map with the newly created values. 180 for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) { 181 BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".peel", F); 182 NewBlocks.push_back(NewBB); 183 184 if (ParentLoop) 185 ParentLoop->addBasicBlockToLoop(NewBB, *LI); 186 187 VMap[*BB] = NewBB; 188 } 189 190 // Hook-up the control flow for the newly inserted blocks. 191 // The new header is hooked up directly to the "top", which is either 192 // the original loop preheader (for the first iteration) or the previous 193 // iteration's exiting block (for every other iteration) 194 InsertTop->getTerminator()->setSuccessor(0, cast<BasicBlock>(VMap[Header])); 195 196 // Similarly, for the latch: 197 // The original exiting edge is still hooked up to the loop exit. 198 // The backedge now goes to the "bottom", which is either the loop's real 199 // header (for the last peeled iteration) or the copied header of the next 200 // iteration (for every other iteration) 201 BranchInst *LatchBR = 202 cast<BranchInst>(cast<BasicBlock>(VMap[Latch])->getTerminator()); 203 unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1); 204 LatchBR->setSuccessor(HeaderIdx, InsertBot); 205 LatchBR->setSuccessor(1 - HeaderIdx, Exit); 206 207 // The new copy of the loop body starts with a bunch of PHI nodes 208 // that pick an incoming value from either the preheader, or the previous 209 // loop iteration. Since this copy is no longer part of the loop, we 210 // resolve this statically: 211 // For the first iteration, we use the value from the preheader directly. 212 // For any other iteration, we replace the phi with the value generated by 213 // the immediately preceding clone of the loop body (which represents 214 // the previous iteration). 215 for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { 216 PHINode *NewPHI = cast<PHINode>(VMap[&*I]); 217 if (IterNumber == 0) { 218 VMap[&*I] = NewPHI->getIncomingValueForBlock(PreHeader); 219 } else { 220 Value *LatchVal = NewPHI->getIncomingValueForBlock(Latch); 221 Instruction *LatchInst = dyn_cast<Instruction>(LatchVal); 222 if (LatchInst && L->contains(LatchInst)) 223 VMap[&*I] = LVMap[LatchInst]; 224 else 225 VMap[&*I] = LatchVal; 226 } 227 cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI); 228 } 229 230 // Fix up the outgoing values - we need to add a value for the iteration 231 // we've just created. Note that this must happen *after* the incoming 232 // values are adjusted, since the value going out of the latch may also be 233 // a value coming into the header. 234 for (BasicBlock::iterator I = Exit->begin(); isa<PHINode>(I); ++I) { 235 PHINode *PHI = cast<PHINode>(I); 236 Value *LatchVal = PHI->getIncomingValueForBlock(Latch); 237 Instruction *LatchInst = dyn_cast<Instruction>(LatchVal); 238 if (LatchInst && L->contains(LatchInst)) 239 LatchVal = VMap[LatchVal]; 240 PHI->addIncoming(LatchVal, cast<BasicBlock>(VMap[Latch])); 241 } 242 243 // LastValueMap is updated with the values for the current loop 244 // which are used the next time this function is called. 245 for (const auto &KV : VMap) 246 LVMap[KV.first] = KV.second; 247 } 248 249 /// \brief Peel off the first \p PeelCount iterations of loop \p L. 250 /// 251 /// Note that this does not peel them off as a single straight-line block. 252 /// Rather, each iteration is peeled off separately, and needs to check the 253 /// exit condition. 254 /// For loops that dynamically execute \p PeelCount iterations or less 255 /// this provides a benefit, since the peeled off iterations, which account 256 /// for the bulk of dynamic execution, can be further simplified by scalar 257 /// optimizations. 258 bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, 259 ScalarEvolution *SE, DominatorTree *DT, 260 bool PreserveLCSSA) { 261 if (!canPeel(L)) 262 return false; 263 264 LoopBlocksDFS LoopBlocks(L); 265 LoopBlocks.perform(LI); 266 267 BasicBlock *Header = L->getHeader(); 268 BasicBlock *PreHeader = L->getLoopPreheader(); 269 BasicBlock *Latch = L->getLoopLatch(); 270 BasicBlock *Exit = L->getUniqueExitBlock(); 271 272 Function *F = Header->getParent(); 273 274 // Set up all the necessary basic blocks. It is convenient to split the 275 // preheader into 3 parts - two blocks to anchor the peeled copy of the loop 276 // body, and a new preheader for the "real" loop. 277 278 // Peeling the first iteration transforms. 279 // 280 // PreHeader: 281 // ... 282 // Header: 283 // LoopBody 284 // If (cond) goto Header 285 // Exit: 286 // 287 // into 288 // 289 // InsertTop: 290 // LoopBody 291 // If (!cond) goto Exit 292 // InsertBot: 293 // NewPreHeader: 294 // ... 295 // Header: 296 // LoopBody 297 // If (cond) goto Header 298 // Exit: 299 // 300 // Each following iteration will split the current bottom anchor in two, 301 // and put the new copy of the loop body between these two blocks. That is, 302 // after peeling another iteration from the example above, we'll split 303 // InsertBot, and get: 304 // 305 // InsertTop: 306 // LoopBody 307 // If (!cond) goto Exit 308 // InsertBot: 309 // LoopBody 310 // If (!cond) goto Exit 311 // InsertBot.next: 312 // NewPreHeader: 313 // ... 314 // Header: 315 // LoopBody 316 // If (cond) goto Header 317 // Exit: 318 319 BasicBlock *InsertTop = SplitEdge(PreHeader, Header, DT, LI); 320 BasicBlock *InsertBot = 321 SplitBlock(InsertTop, InsertTop->getTerminator(), DT, LI); 322 BasicBlock *NewPreHeader = 323 SplitBlock(InsertBot, InsertBot->getTerminator(), DT, LI); 324 325 InsertTop->setName(Header->getName() + ".peel.begin"); 326 InsertBot->setName(Header->getName() + ".peel.next"); 327 NewPreHeader->setName(PreHeader->getName() + ".peel.newph"); 328 329 ValueToValueMapTy LVMap; 330 331 // If we have branch weight information, we'll want to update it for the 332 // newly created branches. 333 BranchInst *LatchBR = 334 cast<BranchInst>(cast<BasicBlock>(Latch)->getTerminator()); 335 unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1); 336 337 uint64_t TrueWeight, FalseWeight; 338 uint64_t ExitWeight = 0, CurHeaderWeight = 0; 339 if (LatchBR->extractProfMetadata(TrueWeight, FalseWeight)) { 340 ExitWeight = HeaderIdx ? TrueWeight : FalseWeight; 341 // The # of times the loop body executes is the sum of the exit block 342 // weight and the # of times the backedges are taken. 343 CurHeaderWeight = TrueWeight + FalseWeight; 344 } 345 346 // For each peeled-off iteration, make a copy of the loop. 347 for (unsigned Iter = 0; Iter < PeelCount; ++Iter) { 348 SmallVector<BasicBlock *, 8> NewBlocks; 349 ValueToValueMapTy VMap; 350 351 // Subtract the exit weight from the current header weight -- the exit 352 // weight is exactly the weight of the previous iteration's header. 353 // FIXME: due to the way the distribution is constructed, we need a 354 // guard here to make sure we don't end up with non-positive weights. 355 if (ExitWeight < CurHeaderWeight) 356 CurHeaderWeight -= ExitWeight; 357 else 358 CurHeaderWeight = 1; 359 360 cloneLoopBlocks(L, Iter, InsertTop, InsertBot, Exit, 361 NewBlocks, LoopBlocks, VMap, LVMap, LI); 362 updateBranchWeights(InsertBot, cast<BranchInst>(VMap[LatchBR]), Iter, 363 PeelCount, ExitWeight); 364 365 InsertTop = InsertBot; 366 InsertBot = SplitBlock(InsertBot, InsertBot->getTerminator(), DT, LI); 367 InsertBot->setName(Header->getName() + ".peel.next"); 368 369 F->getBasicBlockList().splice(InsertTop->getIterator(), 370 F->getBasicBlockList(), 371 NewBlocks[0]->getIterator(), F->end()); 372 373 // Remap to use values from the current iteration instead of the 374 // previous one. 375 remapInstructionsInBlocks(NewBlocks, VMap); 376 } 377 378 // Now adjust the phi nodes in the loop header to get their initial values 379 // from the last peeled-off iteration instead of the preheader. 380 for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) { 381 PHINode *PHI = cast<PHINode>(I); 382 Value *NewVal = PHI->getIncomingValueForBlock(Latch); 383 Instruction *LatchInst = dyn_cast<Instruction>(NewVal); 384 if (LatchInst && L->contains(LatchInst)) 385 NewVal = LVMap[LatchInst]; 386 387 PHI->setIncomingValue(PHI->getBasicBlockIndex(NewPreHeader), NewVal); 388 } 389 390 // Adjust the branch weights on the loop exit. 391 if (ExitWeight) { 392 // The backedge count is the difference of current header weight and 393 // current loop exit weight. If the current header weight is smaller than 394 // the current loop exit weight, we mark the loop backedge weight as 1. 395 uint64_t BackEdgeWeight = 0; 396 if (ExitWeight < CurHeaderWeight) 397 BackEdgeWeight = CurHeaderWeight - ExitWeight; 398 else 399 BackEdgeWeight = 1; 400 MDBuilder MDB(LatchBR->getContext()); 401 MDNode *WeightNode = 402 HeaderIdx ? MDB.createBranchWeights(ExitWeight, BackEdgeWeight) 403 : MDB.createBranchWeights(BackEdgeWeight, ExitWeight); 404 LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode); 405 } 406 407 // If the loop is nested, we changed the parent loop, update SE. 408 if (Loop *ParentLoop = L->getParentLoop()) 409 SE->forgetLoop(ParentLoop); 410 411 NumPeeled++; 412 413 return true; 414 } 415