1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements loop unroll and jam as a routine, much like 11 // LoopUnroll.cpp implements loop unroll. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/ADT/SmallPtrSet.h" 16 #include "llvm/ADT/Statistic.h" 17 #include "llvm/Analysis/AssumptionCache.h" 18 #include "llvm/Analysis/DependenceAnalysis.h" 19 #include "llvm/Analysis/InstructionSimplify.h" 20 #include "llvm/Analysis/LoopAnalysisManager.h" 21 #include "llvm/Analysis/LoopIterator.h" 22 #include "llvm/Analysis/LoopPass.h" 23 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 24 #include "llvm/Analysis/ScalarEvolution.h" 25 #include "llvm/Analysis/ScalarEvolutionExpander.h" 26 #include "llvm/Analysis/Utils/Local.h" 27 #include "llvm/IR/BasicBlock.h" 28 #include "llvm/IR/DataLayout.h" 29 #include "llvm/IR/DebugInfoMetadata.h" 30 #include "llvm/IR/Dominators.h" 31 #include "llvm/IR/IntrinsicInst.h" 32 #include "llvm/IR/LLVMContext.h" 33 #include "llvm/Support/Debug.h" 34 #include "llvm/Support/raw_ostream.h" 35 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 36 #include "llvm/Transforms/Utils/Cloning.h" 37 #include "llvm/Transforms/Utils/LoopSimplify.h" 38 #include "llvm/Transforms/Utils/LoopUtils.h" 39 #include "llvm/Transforms/Utils/SimplifyIndVar.h" 40 #include "llvm/Transforms/Utils/UnrollLoop.h" 41 using namespace llvm; 42 43 #define DEBUG_TYPE "loop-unroll-and-jam" 44 45 STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed"); 46 STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed"); 47 48 static bool containsBB(std::vector<BasicBlock *> &V, BasicBlock 
*BB) { 49 return std::find(V.begin(), V.end(), BB) != V.end(); 50 } 51 52 // Partition blocks in an outer/inner loop pair into blocks before and after 53 // the loop 54 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, 55 std::vector<BasicBlock *> &ForeBlocks, 56 std::vector<BasicBlock *> &SubLoopBlocks, 57 std::vector<BasicBlock *> &AftBlocks, 58 DominatorTree *DT) { 59 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch(); 60 SubLoopBlocks = SubLoop->getBlocks(); 61 62 for (BasicBlock *BB : L->blocks()) { 63 if (!SubLoop->contains(BB)) { 64 if (DT->dominates(SubLoopLatch, BB)) 65 AftBlocks.push_back(BB); 66 else 67 ForeBlocks.push_back(BB); 68 } 69 } 70 71 // Check that all blocks in ForeBlocks together dominate the subloop 72 // TODO: This might ideally be done better with a dominator/postdominators. 73 BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader(); 74 for (BasicBlock *BB : ForeBlocks) { 75 if (BB == SubLoopPreHeader) 76 continue; 77 TerminatorInst *TI = BB->getTerminator(); 78 for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) 79 if (!containsBB(ForeBlocks, TI->getSuccessor(i))) 80 return false; 81 } 82 83 return true; 84 } 85 86 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc. 87 static void 88 moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, BasicBlock *Latch, 89 Instruction *InsertLoc, 90 std::vector<BasicBlock *> &AftBlocks) { 91 // We need to ensure we move the instructions in the correct order, 92 // starting with the earliest required instruction and moving forward. 
93 std::vector<Instruction *> Worklist; 94 std::vector<Instruction *> Visited; 95 for (auto &Phi : Header->phis()) { 96 Value *V = Phi.getIncomingValueForBlock(Latch); 97 if (Instruction *I = dyn_cast<Instruction>(V)) 98 Worklist.push_back(I); 99 } 100 101 while (!Worklist.empty()) { 102 Instruction *I = Worklist.back(); 103 Worklist.pop_back(); 104 if (!containsBB(AftBlocks, I->getParent())) 105 continue; 106 107 Visited.push_back(I); 108 for (auto &U : I->operands()) 109 if (Instruction *II = dyn_cast<Instruction>(U)) 110 Worklist.push_back(II); 111 } 112 113 // Move all instructions in program order to before the InsertLoc 114 BasicBlock *InsertLocBB = InsertLoc->getParent(); 115 for (Instruction *I : reverse(Visited)) { 116 if (I->getParent() != InsertLocBB) 117 I->moveBefore(InsertLoc); 118 } 119 } 120 121 /* 122 This method performs Unroll and Jam. For a simple loop like: 123 for (i = ..) 124 Fore(i) 125 for (j = ..) 126 SubLoop(i, j) 127 Aft(i) 128 129 Instead of doing normal inner or outer unrolling, we do: 130 for (i = .., i+=2) 131 Fore(i) 132 Fore(i+1) 133 for (j = ..) 134 SubLoop(i, j) 135 SubLoop(i+1, j) 136 Aft(i) 137 Aft(i+1) 138 139 So the outer loop is essetially unrolled and then the inner loops are fused 140 ("jammed") together into a single loop. This can increase speed when there 141 are loads in SubLoop that are invariant to i, as they become shared between 142 the now jammed inner loops. 143 144 We do this by spliting the blocks in the loop into Fore, Subloop and Aft. 145 Fore blocks are those before the inner loop, Aft are those after. Normal 146 Unroll code is used to copy each of these sets of blocks and the results are 147 combined together into the final form above. 148 149 isSafeToUnrollAndJam should be used prior to calling this to make sure the 150 unrolling will be valid. Checking profitablility is also advisable. 
*/
LoopUnrollResult
llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
                       unsigned TripMultiple, bool UnrollRemainder,
                       LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
                       AssumptionCache *AC, OptimizationRemarkEmitter *ORE) {

  // When we enter here we should have already checked that it is safe
  BasicBlock *Header = L->getHeader();
  assert(L->getSubLoops().size() == 1);
  Loop *SubLoop = *L->begin();

  // Don't enter the unroll code if there is nothing to do.
  if (TripCount == 0 && Count < 2) {
    LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
    return LoopUnrollResult::Unmodified;
  }

  assert(Count > 0);
  assert(TripMultiple > 0);
  assert(TripCount == 0 || TripCount % TripMultiple == 0);

  // Are we eliminating the loop control altogether?
  bool CompletelyUnroll = (Count == TripCount);

  // We use the runtime remainder in cases where we don't know trip multiple
  if (TripMultiple == 1 || TripMultiple % Count != 0) {
    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
                                    /*UseEpilogRemainder*/ true,
                                    UnrollRemainder, LI, SE, DT, AC, true)) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                           "generated when assuming runtime trip count\n");
      return LoopUnrollResult::Unmodified;
    }
  }

  // Notify ScalarEvolution that the loop will be substantially changed,
  // if not outright eliminated.
  if (SE) {
    SE->forgetLoop(L);
    SE->forgetLoop(SubLoop);
  }

  using namespace ore;
  // Report the unrolling decision.
  if (CompletelyUnroll) {
    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
                      << Header->getName() << " with trip count " << TripCount
                      << "!\n");
    ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
                                 L->getHeader())
              << "completely unroll and jammed loop with "
              << NV("UnrollCount", TripCount) << " iterations");
  } else {
    auto DiagBuilder = [&]() {
      OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
                              L->getHeader());
      return Diag << "unroll and jammed loop by a factor of "
                  << NV("UnrollCount", Count);
    };

    LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
                      << " by " << Count);
    if (TripMultiple != 1) {
      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
      ORE->emit([&]() {
        return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
                             << " trips per branch";
      });
    } else {
      LLVM_DEBUG(dbgs() << " with run-time trip count");
      ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
    }
    LLVM_DEBUG(dbgs() << "!\n");
  }

  BasicBlock *Preheader = L->getLoopPreheader();
  BasicBlock *LatchBlock = L->getLoopLatch();
  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
  assert(Preheader && LatchBlock && Header);
  assert(BI && !BI->isUnconditional());
  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
  bool SubLoopContinueOnTrue = SubLoop->contains(
      SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));

  // Partition blocks in an outer/inner loop pair into blocks before and after
  // the loop
  std::vector<BasicBlock *> SubLoopBlocks;
  std::vector<BasicBlock *> ForeBlocks;
  std::vector<BasicBlock *> AftBlocks;
  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
                           DT);

  // We keep track of the entering/first and exiting/last block of each of
  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
  // blocks easier.
  std::vector<BasicBlock *> ForeBlocksFirst;
  std::vector<BasicBlock *> ForeBlocksLast;
  std::vector<BasicBlock *> SubLoopBlocksFirst;
  std::vector<BasicBlock *> SubLoopBlocksLast;
  std::vector<BasicBlock *> AftBlocksFirst;
  std::vector<BasicBlock *> AftBlocksLast;
  ForeBlocksFirst.push_back(Header);
  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
  AftBlocksFirst.push_back(SubLoop->getExitBlock());
  AftBlocksLast.push_back(L->getExitingBlock());
  // Maps Blocks[0] -> Blocks[It]
  ValueToValueMapTy LastValueMap;

  // Move any instructions from fore phi operands from AftBlocks into Fore.
  moveHeaderPhiOperandsToForeBlocks(
      Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
      AftBlocks);

  // The current on-the-fly SSA update requires blocks to be processed in
  // reverse postorder so that LastValueMap contains the correct value at each
  // exit.
  LoopBlocksDFS DFS(L);
  DFS.perform(LI);
  // Stash the DFS iterators before adding blocks to the loop.
  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();

  // Scale debug-location duplication factors so sample profiles remain
  // meaningful after each source location is cloned Count times.
  if (Header->getParent()->isDebugInfoForProfiling())
    for (BasicBlock *BB : L->getBlocks())
      for (Instruction &I : *BB)
        if (!isa<DbgInfoIntrinsic>(&I))
          if (const DILocation *DIL = I.getDebugLoc())
            I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));

  // Copy all blocks
  for (unsigned It = 1; It != Count; ++It) {
    std::vector<BasicBlock *> NewBlocks;
    // Maps Blocks[It] -> Blocks[It-1]
    DenseMap<Value *, Value *> PrevItValueMap;

    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
      ValueToValueMapTy VMap;
      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
      Header->getParent()->getBasicBlockList().push_back(New);

      // Clones of Fore/Aft blocks belong to the outer loop, clones of the
      // subloop body to the (single, jammed) inner loop.
      if (containsBB(ForeBlocks, *BB)) {
        L->addBasicBlockToLoop(New, *LI);

        if (*BB == ForeBlocksFirst[0])
          ForeBlocksFirst.push_back(New);
        if (*BB == ForeBlocksLast[0])
          ForeBlocksLast.push_back(New);
      } else if (containsBB(SubLoopBlocks, *BB)) {
        SubLoop->addBasicBlockToLoop(New, *LI);

        if (*BB == SubLoopBlocksFirst[0])
          SubLoopBlocksFirst.push_back(New);
        if (*BB == SubLoopBlocksLast[0])
          SubLoopBlocksLast.push_back(New);
      } else if (containsBB(AftBlocks, *BB)) {
        L->addBasicBlockToLoop(New, *LI);

        if (*BB == AftBlocksFirst[0])
          AftBlocksFirst.push_back(New);
        if (*BB == AftBlocksLast[0])
          AftBlocksLast.push_back(New);
      } else {
        llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
      }

      // Update our running maps of newest clones
      PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
      LastValueMap[*BB] = New;
      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
           VI != VE; ++VI) {
        PrevItValueMap[VI->second] =
            const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
        LastValueMap[VI->first] = VI->second;
      }

      NewBlocks.push_back(New);

      // Update DomTree:
      if (*BB == ForeBlocksFirst[0])
        DT->addNewBlock(New, ForeBlocksLast[It - 1]);
      else if (*BB == SubLoopBlocksFirst[0])
        DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
      else if (*BB == AftBlocksFirst[0])
        DT->addNewBlock(New, AftBlocksLast[It - 1]);
      else {
        // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
        // structure.
        auto BBDomNode = DT->getNode(*BB);
        auto BBIDom = BBDomNode->getIDom();
        BasicBlock *OriginalBBIDom = BBIDom->getBlock();
        assert(OriginalBBIDom);
        assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
        DT->addNewBlock(
            New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
      }
    }

    // Remap all instructions in the most recent iteration
    for (BasicBlock *NewBlock : NewBlocks) {
      for (Instruction &I : *NewBlock) {
        ::remapInstruction(&I, LastValueMap);
        if (auto *II = dyn_cast<IntrinsicInst>(&I))
          if (II->getIntrinsicID() == Intrinsic::assume)
            AC->registerAssumption(II);
      }
    }

    // Alter the ForeBlocks phi's, pointing them at the latest version of the
    // value from the previous iteration's phis
    for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
      Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
      assert(OldValue && "should have incoming edge from Aft[It]");
      Value *NewValue = OldValue;
      if (Value *PrevValue = PrevItValueMap[OldValue])
        NewValue = PrevValue;

      // After this, the phi has a single incoming edge: from the previous
      // iteration's last Fore block.
      assert(Phi.getNumOperands() == 2);
      Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
      Phi.setIncomingValue(0, NewValue);
      Phi.removeIncomingValue(1);
    }
  }

  // Now that all the basic blocks for the unrolled iterations are in place,
  // finish up connecting the blocks and phi nodes. At this point LastValueMap
  // is the last unrolled iterations values.

  // Update Phis in BB from OldBB to point to NewBB
  auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
                            BasicBlock *NewBB) {
    for (PHINode &Phi : BB->phis()) {
      int I = Phi.getBasicBlockIndex(OldBB);
      Phi.setIncomingBlock(I, NewBB);
    }
  };
  // Update Phis in BB from OldBB to point to NewBB and use the latest value
  // from LastValueMap
  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
                                     BasicBlock *NewBB,
                                     ValueToValueMapTy &LastValueMap) {
    for (PHINode &Phi : BB->phis()) {
      for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
        if (Phi.getIncomingBlock(b) == OldBB) {
          Value *OldValue = Phi.getIncomingValue(b);
          if (Value *LastValue = LastValueMap[OldValue])
            Phi.setIncomingValue(b, LastValue);
          Phi.setIncomingBlock(b, NewBB);
          break;
        }
      }
    }
  };
  // Move all the phis from Src into Dest
  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
    Instruction *insertPoint = Dest->getFirstNonPHI();
    while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
      Phi->moveBefore(insertPoint);
  };

  // Update the PHI values outside the loop to point to the last block
  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
                           LastValueMap);

  // Update ForeBlocks successors and phi nodes
  BranchInst *ForeTerm =
      cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
  BasicBlock *Dest = SubLoopBlocksFirst[0];
  ForeTerm->setSuccessor(0, Dest);

  if (CompletelyUnroll) {
    // There is no outer-loop backedge any more; header phis take their
    // preheader value unconditionally.
    while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
      Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
      Phi->getParent()->getInstList().erase(Phi);
    }
  } else {
    // Update the PHI values to point to the last aft block
    updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
                             AftBlocksLast.back(), LastValueMap);
  }

  for (unsigned It = 1; It != Count; It++) {
    // Remap ForeBlock successors from previous iteration to this
    BranchInst *ForeTerm =
        cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
    BasicBlock *Dest = ForeBlocksFirst[It];
    ForeTerm->setSuccessor(0, Dest);
  }

  // Subloop successors and phis
  BranchInst *SubTerm =
      cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
  updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
                  ForeBlocksLast.back());
  updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
                  SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration subloop with an
    // unconditional one to this one
    BranchInst *SubTerm =
        cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
    SubTerm->eraseFromParent();

    updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
                    ForeBlocksLast.back());
    updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
                    SubLoopBlocksLast.back());
    movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
  }

  // Aft blocks successors and phis
  BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
  if (CompletelyUnroll) {
    BranchInst::Create(LoopExit, Term);
    Term->eraseFromParent();
  } else {
    Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
  }
  updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
                  SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration's aft block
    // with an unconditional one to this one
    BranchInst *AftTerm =
        cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(AftBlocksFirst[It], AftTerm);
    AftTerm->eraseFromParent();

    updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
                    SubLoopBlocksLast.back());
    movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
  }

  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
  // new ones required.
  if (Count != 1) {
    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
                           SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
                           SubLoopBlocksLast[0], AftBlocksFirst[0]);

    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           SubLoopBlocksLast.back(), AftBlocksFirst[0]);
    DT->applyUpdates(DTUpdates);
  }

  // Merge adjacent basic blocks, if possible.
  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
  while (!MergeBlocks.empty()) {
    BasicBlock *BB = *MergeBlocks.begin();
    BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
    if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
      BasicBlock *Dest = Term->getSuccessor(0);
      if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
        // Don't remove BB and add Fold as they are the same BB
        assert(Fold == BB);
        (void)Fold;
        MergeBlocks.erase(Dest);
      } else
        MergeBlocks.erase(BB);
    } else
      MergeBlocks.erase(BB);
  }

  // At this point, the code is well formed. We now do a quick sweep over the
  // inserted code, doing constant propagation and dead code elimination as we
  // go.
  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);

  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
  ++NumUnrolledAndJammed;

#ifndef NDEBUG
  // We shouldn't have done anything to break loop simplify form or LCSSA.
  Loop *OuterL = L->getParentLoop();
  Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
  if (!CompletelyUnroll)
    assert(L->isLoopSimplifyForm());
  assert(SubLoop->isLoopSimplifyForm());
  assert(DT->verify());
#endif

  // Update LoopInfo if the loop is completely removed.
  if (CompletelyUnroll)
    LI->erase(L);

  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
                          : LoopUnrollResult::PartiallyUnrolled;
}

static bool getLoadsAndStores(std::vector<BasicBlock *> &Blocks,
                              SmallVector<Value *, 4> &MemInstr) {
  // Scan the BBs and collect legal loads and stores.
  // Returns false if non-simple loads/stores are found.
  for (BasicBlock *BB : Blocks) {
    for (Instruction &I : *BB) {
      if (auto *Ld = dyn_cast<LoadInst>(&I)) {
        if (!Ld->isSimple())
          return false;
        MemInstr.push_back(&I);
      } else if (auto *St = dyn_cast<StoreInst>(&I)) {
        if (!St->isSimple())
          return false;
        MemInstr.push_back(&I);
      } else if (I.mayReadOrWriteMemory()) {
        // Anything else that touches memory (calls, fences, atomics, ...)
        // makes the dependency checking below unsound; bail out.
        return false;
      }
    }
  }
  return true;
}

// Returns false if any dependence between an instruction in Earlier and one
// in Later would make reordering Earlier's unrolled copies before Later's
// invalid. LoopDepth is the depth of the outer loop being unroll-and-jammed;
// InnerLoop selects the extra inner-loop direction check.
static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
                              SmallVector<Value *, 4> &Later,
                              unsigned LoopDepth, bool InnerLoop,
                              DependenceInfo &DI) {
  // Use DA to check for dependencies between loads and stores that make unroll
  // and jam invalid
  for (Value *I : Earlier) {
    for (Value *J : Later) {
      Instruction *Src = cast<Instruction>(I);
      Instruction *Dst = cast<Instruction>(J);
      if (Src == Dst)
        continue;
      // Ignore Input dependencies.
      if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
        continue;

      // Track dependencies, and if we find them take a conservative approach
      // by allowing only = or < (not >), although some > would be safe
      // (depending upon unroll width).
      // For the inner loop, we need to disallow any (> <) dependencies
      // FIXME: Allow > so long as distance is less than unroll width
      if (auto D = DI.depends(Src, Dst, true)) {
        assert(D->isOrdered() && "Expected an output, flow or anti dep.");

        // A confused dependence has no usable direction information; be safe.
        if (D->isConfused())
          return false;
        if (!InnerLoop) {
          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT)
            return false;
        } else {
          assert(LoopDepth + 1 <= D->getLevels());
          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
              D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT)
            return false;
        }
      }
    }
  }
  return true;
}

// Collects the loads/stores of each region and checks all the pairings whose
// relative order unroll-and-jam changes: Fore-Sub, Fore-Aft, Sub-Aft and
// Sub-Sub. Returns false if any region contains an illegal memory access or
// any pairing has a disallowed dependence.
static bool checkDependencies(Loop *L, std::vector<BasicBlock *> &ForeBlocks,
                              std::vector<BasicBlock *> &SubLoopBlocks,
                              std::vector<BasicBlock *> &AftBlocks,
                              DependenceInfo &DI) {
  // Get all loads/store pairs for each blocks
  SmallVector<Value *, 4> ForeMemInstr;
  SmallVector<Value *, 4> SubLoopMemInstr;
  SmallVector<Value *, 4> AftMemInstr;
  if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
      !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
      !getLoadsAndStores(AftBlocks, AftMemInstr))
    return false;

  // Check for dependencies between any blocks that may change order
  unsigned LoopDepth = L->getLoopDepth();
  return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
                           DI) &&
         checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
         checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
                           DI) &&
         checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
                           DI);
}

bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
                                DependenceInfo &DI) {
  /* We currently handle outer loops like this:
                |
        ForeFirst    <----\    }
         Blocks           |    } ForeBlocks
        ForeLast          |    }
                |         |
        SubLoopFirst  <\  |    }
         Blocks        |  |    } SubLoopBlocks
        SubLoopLast   -/  |    }
                |         |
        AftFirst          |    }
         Blocks           |    } AftBlocks
        AftLast     ------/    }
                |

    There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
    and AftBlocks, providing that there is one edge from Fores to SubLoops,
    one edge from SubLoops to Afts and a single outer loop exit (from Afts).
    In practice we currently limit Aft blocks to a single block, and limit
    things further in the profitability checks of the unroll and jam pass.

    Because of the way we rearrange basic blocks, we also require that
    the Fore blocks on all unrolled iterations are safe to move before the
    SubLoop blocks of all iterations. So we require that the phi node looping
    operands of ForeHeader can be moved to at least the end of ForeEnd, so that
    we can arrange cloned Fore Blocks before the subloop and match up Phi's
    correctly.

    i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
    It needs to be safe to transform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.

    There are then a number of checks along the lines of no calls, no
    exceptions, inner loop IV is consistent, etc. Note that for loops requiring
    runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
    UnrollAndJamLoop if the trip count cannot be easily calculated.
  */

  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
    return false;
  Loop *SubLoop = L->getSubLoops()[0];
  if (!SubLoop->isLoopSimplifyForm())
    return false;

  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  BasicBlock *Exit = L->getExitingBlock();
  BasicBlock *SubLoopHeader = SubLoop->getHeader();
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();

  // Both loops must exit from their latch block.
  if (Latch != Exit)
    return false;
  if (SubLoopLatch != SubLoopExit)
    return false;

  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken())
    return false;

  // Split blocks into Fore/SubLoop/Aft based on dominators
  std::vector<BasicBlock *> SubLoopBlocks;
  std::vector<BasicBlock *> ForeBlocks;
  std::vector<BasicBlock *> AftBlocks;
  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
                                AftBlocks, &DT))
    return false;

  // Aft blocks may need to move instructions to fore blocks, which becomes more
  // difficult if there are multiple (potentially conditionally executed)
  // blocks. For now we just exclude loops with multiple aft blocks.
  if (AftBlocks.size() != 1)
    return false;

  // Check inner loop IV is consistent between all iterations
  const SCEV *SubLoopBECountSC = SE.getExitCount(SubLoop, SubLoopLatch);
  if (isa<SCEVCouldNotCompute>(SubLoopBECountSC) ||
      !SubLoopBECountSC->getType()->isIntegerTy())
    return false;
  ScalarEvolution::LoopDisposition LD =
      SE.getLoopDisposition(SubLoopBECountSC, L);
  if (LD != ScalarEvolution::LoopInvariant)
    return false;

  // Check the loop safety info for exceptions.
  LoopSafetyInfo LSI;
  computeLoopSafetyInfo(&LSI, L);
  if (LSI.MayThrow)
    return false;

  // We've ruled out the easy stuff and now need to check that there are no
  // interdependencies which may prevent us from moving the:
  //   ForeBlocks before Subloop and AftBlocks.
  //   Subloop before AftBlocks.
  //   ForeBlock phi operands before the subloop

  // Make sure we can move all instructions we need to before the subloop
  SmallVector<Instruction *, 8> Worklist;
  SmallPtrSet<Instruction *, 8> Visited;
  for (auto &Phi : Header->phis()) {
    Value *V = Phi.getIncomingValueForBlock(Latch);
    if (Instruction *I = dyn_cast<Instruction>(V))
      Worklist.push_back(I);
  }
  while (!Worklist.empty()) {
    Instruction *I = Worklist.back();
    Worklist.pop_back();
    if (Visited.insert(I).second) {
      if (SubLoop->contains(I->getParent()))
        return false;
      if (containsBB(AftBlocks, I->getParent())) {
        // If we hit a phi node in afts we know we are done (probably LCSSA)
        if (isa<PHINode>(I))
          return false;
        // Moving a memory-touching or side-effecting instruction earlier is
        // not safe without further analysis.
        if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
          return false;
        for (auto &U : I->operands())
          if (Instruction *II = dyn_cast<Instruction>(U))
            Worklist.push_back(II);
      }
    }
  }

  // Check for memory dependencies which prohibit the unrolling we are doing.
  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI))
    return false;

  return true;
}