//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements loop unroll and jam as a routine, much like
// LoopUnroll.cpp implements loop unroll.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GenericDomTree.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <assert.h>
#include <memory>
#include <type_traits>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "loop-unroll-and-jam"

STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
STATISTIC(NumCompletelyUnrolledAndJammed,
          "Number of loops completely unroll and jammed");

typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;

// Partition blocks in an outer/inner loop pair into blocks before and after
// the inner loop
static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
                                BasicBlockSet &AftBlocks, DominatorTree &DT) {
  Loop *SubLoop = L.getSubLoops()[0];
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();

  for (BasicBlock *BB : L.blocks()) {
    if (!SubLoop->contains(BB)) {
      if (DT.dominates(SubLoopLatch, BB))
        AftBlocks.insert(BB);
      else
        ForeBlocks.insert(BB);
    }
  }

  // Check that all blocks in ForeBlocks together dominate the subloop
  // TODO: This might ideally be done better with a dominator/postdominator
  // analysis.
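  // Concretely, every Fore block other than the subloop preheader may only
  // branch to other Fore blocks; otherwise some path could bypass the inner
  // loop and the Fore blocks would not together dominate it.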
  BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
  for (BasicBlock *BB : ForeBlocks) {
    if (BB == SubLoopPreHeader)
      continue;
    Instruction *TI = BB->getTerminator();
    for (BasicBlock *Succ : successors(TI))
      if (!ForeBlocks.count(Succ))
        return false;
  }

  return true;
}

/// Partition blocks in a loop nest into blocks before and after each inner
/// loop.
static bool partitionOuterLoopBlocks(
    Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks,
    DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
    DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) {
  JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end());

  for (Loop *L : Root.getLoopsInPreorder()) {
    if (L == &JamLoop)
      break;

    if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT))
      return false;
  }

  return true;
}

// TODO: Remove when UnrollAndJamLoop is changed to support unroll and jamming
// more than 2 loop levels.
static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
                                     BasicBlockSet &ForeBlocks,
                                     BasicBlockSet &SubLoopBlocks,
                                     BasicBlockSet &AftBlocks,
                                     DominatorTree *DT) {
  SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
  return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT);
}

// Looks at the phi nodes in Header for values coming from Latch. For each of
// these instructions and all of their operands, calls Visit on them,
// continuing through any operands that are in AftBlocks. Returns false if
// Visit returns false, otherwise returns true. This is used to process the
// instructions in the Aft blocks that need to be moved before the subloop.
// It is used in two places: first to check that the required set of
// instructions can be moved before the loop, then to collect the instructions
// to actually move in moveHeaderPhiOperandsToForeBlocks.
template <typename T>
static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
                                     BasicBlockSet &AftBlocks, T Visit) {
  SmallVector<Instruction *, 8> Worklist;
  SmallPtrSet<Instruction *, 8> VisitedInstr;
  for (auto &Phi : Header->phis()) {
    Value *V = Phi.getIncomingValueForBlock(Latch);
    if (Instruction *I = dyn_cast<Instruction>(V))
      Worklist.push_back(I);
  }

  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    if (!Visit(I))
      return false;
    VisitedInstr.insert(I);

    if (AftBlocks.count(I->getParent()))
      for (auto &U : I->operands())
        if (Instruction *II = dyn_cast<Instruction>(U))
          if (!VisitedInstr.count(II))
            Worklist.push_back(II);
  }

  return true;
}

// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
                                              BasicBlock *Latch,
                                              Instruction *InsertLoc,
                                              BasicBlockSet &AftBlocks) {
  // We need to ensure we move the instructions in the correct order,
  // starting with the earliest required instruction and moving forward.
  std::vector<Instruction *> Visited;
  processHeaderPhiOperands(Header, Latch, AftBlocks,
                           [&Visited, &AftBlocks](Instruction *I) {
                             if (AftBlocks.count(I->getParent()))
                               Visited.push_back(I);
                             return true;
                           });

  // Move all instructions in program order to before the InsertLoc
  BasicBlock *InsertLocBB = InsertLoc->getParent();
  for (Instruction *I : reverse(Visited)) {
    if (I->getParent() != InsertLocBB)
      I->moveBefore(InsertLoc);
  }
}

/*
  This method performs Unroll and Jam. For a simple loop like:
  for (i = ..)
    Fore(i)
    for (j = ..)
      SubLoop(i, j)
    Aft(i)

  Instead of doing normal inner or outer unrolling, we do:
  for (i = .., i+=2)
    Fore(i)
    Fore(i+1)
    for (j = ..)
      SubLoop(i, j)
      SubLoop(i+1, j)
    Aft(i)
    Aft(i+1)

  So the outer loop is essentially unrolled and then the inner loops are fused
  ("jammed") together into a single loop. This can increase speed when there
  are loads in SubLoop that are invariant to i, as they become shared between
  the now jammed inner loops.

  We do this by splitting the blocks in the loop into Fore, Subloop and Aft.
  Fore blocks are those before the inner loop, Aft are those after. Normal
  Unroll code is used to copy each of these sets of blocks and the results are
  combined together into the final form above.

  isSafeToUnrollAndJam should be used prior to calling this to make sure the
  unrolling will be valid. Checking profitability is also advisable.

  If EpilogueLoop is non-null, it receives the epilogue loop (if it was
  necessary to create one and not fully unrolled).
*/
LoopUnrollResult
llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
                       unsigned TripMultiple, bool UnrollRemainder,
                       LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
                       AssumptionCache *AC, const TargetTransformInfo *TTI,
                       OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {

  // When we enter here we should have already checked that it is safe
  BasicBlock *Header = L->getHeader();
  assert(Header && "No header.");
  assert(L->getSubLoops().size() == 1);
  Loop *SubLoop = *L->begin();

  // Don't enter the unroll code if there is nothing to do.
  if (TripCount == 0 && Count < 2) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
    return LoopUnrollResult::Unmodified;
  }

  assert(Count > 0);
  assert(TripMultiple > 0);
  assert(TripCount == 0 || TripCount % TripMultiple == 0);

  // Are we eliminating the loop control altogether?
  bool CompletelyUnroll = (Count == TripCount);

  // We use the runtime remainder in cases where we don't know the trip
  // multiple.
  if (TripMultiple % Count != 0) {
    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
                                    /*UseEpilogRemainder*/ true,
                                    UnrollRemainder, /*ForgetAllSCEV*/ false,
                                    LI, SE, DT, AC, TTI, true, EpilogueLoop)) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                           "generated when assuming runtime trip count\n");
      return LoopUnrollResult::Unmodified;
    }
  }

  // Notify ScalarEvolution that the loop will be substantially changed,
  // if not outright eliminated.
  if (SE) {
    SE->forgetLoop(L);
    SE->forgetLoop(SubLoop);
  }

  using namespace ore;
  // Report the unrolling decision.
  if (CompletelyUnroll) {
    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
                      << Header->getName() << " with trip count " << TripCount
                      << "!\n");
    ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
                                 L->getHeader())
              << "completely unroll and jammed loop with "
              << NV("UnrollCount", TripCount) << " iterations");
  } else {
    auto DiagBuilder = [&]() {
      OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
                              L->getHeader());
      return Diag << "unroll and jammed loop by a factor of "
                  << NV("UnrollCount", Count);
    };

    LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
                      << " by " << Count);
    if (TripMultiple != 1) {
      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
      ORE->emit([&]() {
        return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
                             << " trips per branch";
      });
    } else {
      LLVM_DEBUG(dbgs() << " with run-time trip count");
      ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
    }
    LLVM_DEBUG(dbgs() << "!\n");
  }

  BasicBlock *Preheader = L->getLoopPreheader();
  BasicBlock *LatchBlock = L->getLoopLatch();
  assert(Preheader && "No preheader");
  assert(LatchBlock && "No latch block");
  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
  assert(BI && !BI->isUnconditional());
  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
  bool SubLoopContinueOnTrue = SubLoop->contains(
      SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));

  // Partition blocks in an outer/inner loop pair into blocks before and after
  // the inner loop
  BasicBlockSet SubLoopBlocks;
  BasicBlockSet ForeBlocks;
  BasicBlockSet AftBlocks;
  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
                           DT);

  // We keep track of the entering/first and exiting/last block of each of
  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
  // blocks easier.
  std::vector<BasicBlock *> ForeBlocksFirst;
  std::vector<BasicBlock *> ForeBlocksLast;
  std::vector<BasicBlock *> SubLoopBlocksFirst;
  std::vector<BasicBlock *> SubLoopBlocksLast;
  std::vector<BasicBlock *> AftBlocksFirst;
  std::vector<BasicBlock *> AftBlocksLast;
  ForeBlocksFirst.push_back(Header);
  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
  AftBlocksFirst.push_back(SubLoop->getExitBlock());
  AftBlocksLast.push_back(L->getExitingBlock());
  // Maps Blocks[0] -> Blocks[It]
  ValueToValueMapTy LastValueMap;

  // Move any instructions used by the fore header phis out of AftBlocks and
  // into Fore.
  moveHeaderPhiOperandsToForeBlocks(
      Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks);

  // The current on-the-fly SSA update requires blocks to be processed in
  // reverse postorder so that LastValueMap contains the correct value at each
  // exit.
  LoopBlocksDFS DFS(L);
  DFS.perform(LI);
  // Stash the DFS iterators before adding blocks to the loop.
  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();

  // When an FSDiscriminator is enabled, we don't need to add the multiply
  // factors to the discriminators.
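  // cloneByMultiplyingDuplicationFactor scales a location's discriminator
  // duplication factor by Count, so that sample-based profiles can still
  // attribute counts to the right source line once it has Count copies in the
  // unrolled code.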
  if (Header->getParent()->isDebugInfoForProfiling() && !EnableFSDiscriminator)
    for (BasicBlock *BB : L->getBlocks())
      for (Instruction &I : *BB)
        if (!isa<DbgInfoIntrinsic>(&I))
          if (const DILocation *DIL = I.getDebugLoc()) {
            auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
            if (NewDIL)
              I.setDebugLoc(NewDIL.getValue());
            else
              LLVM_DEBUG(dbgs()
                         << "Failed to create new discriminator: "
                         << DIL->getFilename() << " Line: " << DIL->getLine());
          }

  // Copy all blocks
  for (unsigned It = 1; It != Count; ++It) {
    SmallVector<BasicBlock *, 8> NewBlocks;
    // Maps Blocks[It] -> Blocks[It-1]
    DenseMap<Value *, Value *> PrevItValueMap;
    SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
    NewLoops[L] = L;
    NewLoops[SubLoop] = SubLoop;

    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
      ValueToValueMapTy VMap;
      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
      Header->getParent()->getBasicBlockList().push_back(New);

      // Tell LI about New.
      addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);

      if (ForeBlocks.count(*BB)) {
        if (*BB == ForeBlocksFirst[0])
          ForeBlocksFirst.push_back(New);
        if (*BB == ForeBlocksLast[0])
          ForeBlocksLast.push_back(New);
      } else if (SubLoopBlocks.count(*BB)) {
        if (*BB == SubLoopBlocksFirst[0])
          SubLoopBlocksFirst.push_back(New);
        if (*BB == SubLoopBlocksLast[0])
          SubLoopBlocksLast.push_back(New);
      } else if (AftBlocks.count(*BB)) {
        if (*BB == AftBlocksFirst[0])
          AftBlocksFirst.push_back(New);
        if (*BB == AftBlocksLast[0])
          AftBlocksLast.push_back(New);
      } else {
        llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
      }

      // Update our running maps of newest clones
      PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
      LastValueMap[*BB] = New;
      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
           VI != VE; ++VI) {
        PrevItValueMap[VI->second] =
            const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
        LastValueMap[VI->first] = VI->second;
      }

      NewBlocks.push_back(New);

      // Update DomTree:
      if (*BB == ForeBlocksFirst[0])
        DT->addNewBlock(New, ForeBlocksLast[It - 1]);
      else if (*BB == SubLoopBlocksFirst[0])
        DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
      else if (*BB == AftBlocksFirst[0])
        DT->addNewBlock(New, AftBlocksLast[It - 1]);
      else {
        // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
        // structure.
        auto BBDomNode = DT->getNode(*BB);
        auto BBIDom = BBDomNode->getIDom();
        BasicBlock *OriginalBBIDom = BBIDom->getBlock();
        assert(OriginalBBIDom);
        assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
        DT->addNewBlock(
            New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
      }
    }

    // Remap all instructions in the most recent iteration
    remapInstructionsInBlocks(NewBlocks, LastValueMap);
    for (BasicBlock *NewBlock : NewBlocks) {
      for (Instruction &I : *NewBlock) {
        if (auto *II = dyn_cast<AssumeInst>(&I))
          AC->registerAssumption(II);
      }
    }

    // Alter the ForeBlocks phis, pointing them at the latest version of the
    // value from the previous iteration's phis
    for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
      Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
      assert(OldValue && "should have incoming edge from Aft[It]");
      Value *NewValue = OldValue;
      if (Value *PrevValue = PrevItValueMap[OldValue])
        NewValue = PrevValue;

      assert(Phi.getNumOperands() == 2);
      Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
      Phi.setIncomingValue(0, NewValue);
      Phi.removeIncomingValue(1);
    }
  }

  // Now that all the basic blocks for the unrolled iterations are in place,
  // finish up connecting the blocks and phi nodes. At this point LastValueMap
  // contains the values of the last unrolled iteration.

  // Update Phis in BB from OldBB to point to NewBB and use the latest value
  // from LastValueMap
  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
                                     BasicBlock *NewBB,
                                     ValueToValueMapTy &LastValueMap) {
    for (PHINode &Phi : BB->phis()) {
      for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
        if (Phi.getIncomingBlock(b) == OldBB) {
          Value *OldValue = Phi.getIncomingValue(b);
          if (Value *LastValue = LastValueMap[OldValue])
            Phi.setIncomingValue(b, LastValue);
          Phi.setIncomingBlock(b, NewBB);
          break;
        }
      }
    }
  };
  // Move all the phis from Src into Dest
  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
    Instruction *insertPoint = Dest->getFirstNonPHI();
    while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
      Phi->moveBefore(insertPoint);
  };

  // Update the PHI values outside the loop to point to the last block
  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
                           LastValueMap);

  // Update ForeBlocks successors and phi nodes
  BranchInst *ForeTerm =
      cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
  assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
  ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]);

  if (CompletelyUnroll) {
    while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
      Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
      Phi->getParent()->getInstList().erase(Phi);
    }
  } else {
    // Update the PHI values to point to the last aft block
    updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
                             AftBlocksLast.back(), LastValueMap);
  }

  for (unsigned It = 1; It != Count; It++) {
    // Remap ForeBlock successors from previous iteration to this
    BranchInst *ForeTerm =
        cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
    assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
    ForeTerm->setSuccessor(0, ForeBlocksFirst[It]);
  }

  // Subloop successors and phis
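  // The rewiring below fuses the cloned subloop bodies into a single inner
  // loop: each copy's exiting block falls through unconditionally to the next
  // copy, the last copy carries the backedge to the first copy's header, and
  // only the last copy exits to the Aft blocks.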
  BranchInst *SubTerm =
      cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
  SubLoopBlocksFirst[0]->replacePhiUsesWith(ForeBlocksLast[0],
                                            ForeBlocksLast.back());
  SubLoopBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],
                                            SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration's subloop with
    // an unconditional one to this one
    BranchInst *SubTerm =
        cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
    SubTerm->eraseFromParent();

    SubLoopBlocksFirst[It]->replacePhiUsesWith(ForeBlocksLast[It],
                                               ForeBlocksLast.back());
    SubLoopBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
                                               SubLoopBlocksLast.back());
    movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
  }

  // Aft blocks successors and phis
  BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
  if (CompletelyUnroll) {
    BranchInst::Create(LoopExit, AftTerm);
    AftTerm->eraseFromParent();
  } else {
    AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
    assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit &&
           "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit");
  }
  AftBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],
                                        SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration's Aft block
    // with an unconditional one to this one
    BranchInst *AftTerm =
        cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(AftBlocksFirst[It], AftTerm);
    AftTerm->eraseFromParent();

    AftBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
                                           SubLoopBlocksLast.back());
    movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
  }

  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
  // new ones required.
  if (Count != 1) {
    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
                           SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
                           SubLoopBlocksLast[0], AftBlocksFirst[0]);

    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           SubLoopBlocksLast.back(), AftBlocksFirst[0]);
    DTU.applyUpdatesPermissive(DTUpdates);
  }

  // Merge adjacent basic blocks, if possible.
  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());

  MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI);

  // Apply updates to the DomTree.
  DT = &DTU.getDomTree();

  // At this point, the code is well formed. We now do a quick sweep over the
  // inserted code, doing constant propagation and dead code elimination as we
  // go.
  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI);
  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC,
                          TTI);

  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
  ++NumUnrolledAndJammed;

  // Update LoopInfo if the loop is completely removed.
  if (CompletelyUnroll)
    LI->erase(L);

#ifndef NDEBUG
  // We shouldn't have done anything to break loop simplify form or LCSSA.
  Loop *OutestLoop = SubLoop->getParentLoop()
                         ? SubLoop->getParentLoop()->getParentLoop()
                               ? SubLoop->getParentLoop()->getParentLoop()
                               : SubLoop->getParentLoop()
                         : SubLoop;
  assert(DT->verify());
  LI->verify(*DT);
  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
  if (!CompletelyUnroll)
    assert(L->isLoopSimplifyForm());
  assert(SubLoop->isLoopSimplifyForm());
  SE->verify();
#endif

  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
                          : LoopUnrollResult::PartiallyUnrolled;
}

static bool getLoadsAndStores(BasicBlockSet &Blocks,
                              SmallVector<Instruction *, 4> &MemInstr) {
  // Scan the BBs and collect legal loads and stores.
  // Returns false if non-simple loads/stores are found.
  for (BasicBlock *BB : Blocks) {
    for (Instruction &I : *BB) {
      if (auto *Ld = dyn_cast<LoadInst>(&I)) {
        if (!Ld->isSimple())
          return false;
        MemInstr.push_back(&I);
      } else if (auto *St = dyn_cast<StoreInst>(&I)) {
        if (!St->isSimple())
          return false;
        MemInstr.push_back(&I);
      } else if (I.mayReadOrWriteMemory()) {
        return false;
      }
    }
  }
  return true;
}

static bool preservesForwardDependence(Instruction *Src, Instruction *Dst,
                                       unsigned UnrollLevel, unsigned JamLevel,
                                       bool Sequentialized, Dependence *D) {
  // UnrollLevel might carry the dependency Src --> Dst.
  // Does a different loop carry it after unrolling?
  for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
       ++CurLoopDepth) {
    auto JammedDir = D->getDirection(CurLoopDepth);
    if (JammedDir == Dependence::DVEntry::LT)
      return true;

    if (JammedDir & Dependence::DVEntry::GT)
      return false;
  }

  return true;
}

static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,
                                        unsigned UnrollLevel, unsigned JamLevel,
                                        bool Sequentialized, Dependence *D) {
  // UnrollLevel might carry the dependency Dst --> Src
  for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
       ++CurLoopDepth) {
    auto JammedDir = D->getDirection(CurLoopDepth);
    if (JammedDir == Dependence::DVEntry::GT)
      return true;

    if (JammedDir & Dependence::DVEntry::LT)
      return false;
  }

  // Backward dependencies are only preserved if not interleaved.
  return Sequentialized;
}

// Check whether unroll and jam is semantically safe for Src and Dst,
// considering any potential dependency between them.
//
// @param UnrollLevel The level of the loop being unrolled
// @param JamLevel    The level of the loop being jammed; if Src and Dst are on
//                    different levels, the outermost common loop counts as the
//                    jammed level
//
// @return true if it is safe and false if there is a dependency violation.
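//
// For example (illustrative only), in
//   for (i) for (j) A[i][j] = A[i-1][j+1];
// the flow dependence has direction (<,>). Unroll-and-jamming the i-loop
// interleaves the j iterations of copies i and i+1, so copy i+1 would read
// A[i][j+1] at jammed iteration j, before copy i writes it at jammed iteration
// j+1; such a dependence must be rejected here.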
static bool checkDependency(Instruction *Src, Instruction *Dst,
                            unsigned UnrollLevel, unsigned JamLevel,
                            bool Sequentialized, DependenceInfo &DI) {
  assert(UnrollLevel <= JamLevel &&
         "Expecting JamLevel to be at least UnrollLevel");

  if (Src == Dst)
    return true;
  // Ignore Input dependencies.
  if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
    return true;

  // Check whether unroll-and-jam may violate a dependency.
  // By construction, every dependency will be lexicographically non-negative
  // (if it were not, it would violate the current execution order), such as
  //   (0,0,>,*,*)
  // Unroll-and-jam can move two executions that were in different (GT-ordered)
  // iterations of the chosen unroll level into the same iteration of it. That
  // is, a GT dependence becomes a GE dependence (or EQ, if we fully unrolled
  // the loop) at the loop's position:
  //   (0,0,>=,*,*)
  // Now, the dependency is not necessarily non-negative anymore, i.e.
  // unroll-and-jam may violate correctness.
  std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true);
  if (!D)
    return true;
  assert(D->isOrdered() && "Expected an output, flow or anti dep.");

  if (D->isConfused()) {
    LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
                      << " " << *Src << "\n"
                      << " " << *Dst << "\n");
    return false;
  }

  // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
  // non-equal direction, then the locations accessed in the inner levels
  // cannot overlap in memory. We assume the indices never overlap into
  // neighboring dimensions.
  for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)
    if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))
      return true;

  auto UnrollDirection = D->getDirection(UnrollLevel);

  // If the distance carried by the unrolled loop is 0, then after unrolling
  // that distance will become non-zero resulting in non-overlapping accesses
  // in the inner loops.
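  // For instance (illustrative only), with A[i][j] = A[i][j-1] the dependence
  // distance is (0,1): each unrolled copy keeps its own value of i, so the
  // j-ordering the dependence relies on is untouched by the jam.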
  if (UnrollDirection == Dependence::DVEntry::EQ)
    return true;

  if (UnrollDirection & Dependence::DVEntry::LT &&
      !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,
                                  Sequentialized, D.get()))
    return false;

  if (UnrollDirection & Dependence::DVEntry::GT &&
      !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,
                                   Sequentialized, D.get()))
    return false;

  return true;
}

static bool
checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
                  const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
                  const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
                  DependenceInfo &DI, LoopInfo &LI) {
  SmallVector<BasicBlockSet, 8> AllBlocks;
  for (Loop *L : Root.getLoopsInPreorder())
    if (ForeBlocksMap.find(L) != ForeBlocksMap.end())
      AllBlocks.push_back(ForeBlocksMap.lookup(L));
  AllBlocks.push_back(SubLoopBlocks);
  for (Loop *L : Root.getLoopsInPreorder())
    if (AftBlocksMap.find(L) != AftBlocksMap.end())
      AllBlocks.push_back(AftBlocksMap.lookup(L));

  unsigned LoopDepth = Root.getLoopDepth();
  SmallVector<Instruction *, 4> EarlierLoadsAndStores;
  SmallVector<Instruction *, 4> CurrentLoadsAndStores;
  for (BasicBlockSet &Blocks : AllBlocks) {
    CurrentLoadsAndStores.clear();
    if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))
      return false;

    Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());
    unsigned CurLoopDepth = CurLoop->getLoopDepth();

    for (auto *Earlier : EarlierLoadsAndStores) {
      Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());
      unsigned EarlierDepth = EarlierLoop->getLoopDepth();
      unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth);
      for (auto *Later : CurrentLoadsAndStores) {
        if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false,
                             DI))
          return false;
      }
    }

    size_t NumInsts = CurrentLoadsAndStores.size();
    for (size_t I = 0; I < NumInsts; ++I) {
      for (size_t J = I; J < NumInsts; ++J) {
        if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J],
                             LoopDepth, CurLoopDepth, true, DI))
          return false;
      }
    }

    EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(),
                                 CurrentLoadsAndStores.end());
  }
  return true;
}

static bool isEligibleLoopForm(const Loop &Root) {
  // Root must have a child.
  if (Root.getSubLoops().size() != 1)
    return false;

  const Loop *L = &Root;
  do {
    // All loops in Root need to be in simplify and rotated form.
    if (!L->isLoopSimplifyForm())
      return false;

    if (!L->isRotatedForm())
      return false;

    if (L->getHeader()->hasAddressTaken()) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
      return false;
    }

    unsigned SubLoopsSize = L->getSubLoops().size();
    if (SubLoopsSize == 0)
      return true;

    // Only one child is allowed.
    if (SubLoopsSize != 1)
      return false;

    // Only loops with a single exit block can be unrolled and jammed.
    // The function getExitBlock() is used for this check, rather than
    // getUniqueExitBlock(), to ensure loops with multiple exit edges are
    // disallowed.
    if (!L->getExitBlock()) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single exit "
                           "blocks can be unrolled and jammed.\n");
      return false;
    }

    // Only loops with a single exiting block can be unrolled and jammed.
    if (!L->getExitingBlock()) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; only loops with single "
                           "exiting blocks can be unrolled and jammed.\n");
      return false;
    }

    L = L->getSubLoops()[0];
  } while (L);

  return true;
}

static Loop *getInnerMostLoop(Loop *L) {
  while (!L->getSubLoops().empty())
    L = L->getSubLoops()[0];
  return L;
}

bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
                                DependenceInfo &DI, LoopInfo &LI) {
  if (!isEligibleLoopForm(*L)) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
    return false;
  }

  /* We currently handle outer loops like this:
        |
    ForeFirst    <------\   }
     Blocks             |   } ForeBlocks of L
    ForeLast            |   }
        |               |
       ...              |
        |               |
    ForeFirst  <----\   |   }
     Blocks         |   |   } ForeBlocks of an inner loop of L
    ForeLast        |   |   }
        |           |   |
    JamLoopFirst <\ |   |   }
     Blocks       | |   |   } JamLoopBlocks of the innermost loop
    JamLoopLast  -/ |   |   }
        |           |   |
    AftFirst        |   |   }
     Blocks         |   |   } AftBlocks of an inner loop of L
    AftLast   ------/   |   }
        |               |
       ...              |
        |               |
    AftFirst            |   }
     Blocks             |   } AftBlocks of L
    AftLast   --------/     }
        |

   There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
   and AftBlocks, provided that there is one edge from Fores to SubLoops, one
   edge from SubLoops to Afts and a single outer loop exit (from Afts). In
   practice we currently limit Aft blocks to a single block, and limit things
   further in the profitability checks of the unroll and jam pass.

   Because of the way we rearrange basic blocks, we also require that
   the Fore blocks of L on all unrolled iterations are safe to move before the
   blocks of the direct child of L of all iterations. So we require that the
   phi node looping operands of ForeHeader can be moved to at least the end of
   ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and
   match up the phis correctly.

   i.e. The old order of blocks used to be
       (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.
   It needs to be safe to transform this to
       (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.

   There are then a number of checks along the lines of no calls, no
   exceptions, inner loop IV is consistent, etc. Note that for loops requiring
   runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
   UnrollAndJamLoop if the trip count cannot be easily calculated.
  */

  // Split blocks into Fore/SubLoop/Aft based on dominators
  Loop *JamLoop = getInnerMostLoop(L);
  BasicBlockSet SubLoopBlocks;
  DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;
  DenseMap<Loop *, BasicBlockSet> AftBlocksMap;
  if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap,
                                AftBlocksMap, DT)) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
    return false;
  }

  // Aft blocks may need to move instructions to fore blocks, which becomes
  // more difficult if there are multiple (potentially conditionally executed)
  // blocks. For now we just exclude loops with multiple aft blocks.
  if (AftBlocksMap[L].size() != 1) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
                         "multiple blocks after the loop\n");
    return false;
  }

  // Check inner loop backedge count is consistent on all iterations of the
  // outer loop
  if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {
        return !hasIterationCountInvariantInParent(SubLoop, SE);
      })) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
                         "not consistent on each iteration\n");
    return false;
  }

  // Check the loop safety info for exceptions.
  SimpleLoopSafetyInfo LSI;
  LSI.computeLoopSafetyInfo(L);
  if (LSI.anyBlockMayThrow()) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
    return false;
  }

  // We've ruled out the easy stuff and now need to check that there are no
  // interdependencies which may prevent us from moving the:
  //  ForeBlocks before Subloop and AftBlocks.
  //  Subloop before AftBlocks.
  //  ForeBlock phi operands before the subloop

  // Make sure we can move all instructions we need to before the subloop
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  BasicBlockSet AftBlocks = AftBlocksMap[L];
  Loop *SubLoop = L->getSubLoops()[0];
  if (!processHeaderPhiOperands(
          Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
            if (SubLoop->contains(I->getParent()))
              return false;
            if (AftBlocks.count(I->getParent())) {
              // If we hit a phi node in afts we know we are done (probably
              // LCSSA)
              if (isa<PHINode>(I))
                return false;
              // Can't move instructions with side effects or memory
              // reads/writes
              if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
                return false;
            }
            // Keep going
            return true;
          })) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
                         "instructions after subloop to before it\n");
    return false;
  }

  // Check for memory dependencies which prohibit the unrolling we are doing.
  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
  if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,
                         LI)) {
    LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
    return false;
  }

  return true;
}