1 //===- LoopInterchange.cpp - Loop interchange pass-------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This Pass handles loop interchange transform. 11 // This pass interchanges loops to provide a more cache-friendly memory access 12 // patterns. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "llvm/ADT/STLExtras.h" 17 #include "llvm/ADT/SmallVector.h" 18 #include "llvm/ADT/StringRef.h" 19 #include "llvm/Analysis/AliasAnalysis.h" 20 #include "llvm/Analysis/DependenceAnalysis.h" 21 #include "llvm/Analysis/LoopInfo.h" 22 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 23 #include "llvm/Analysis/ScalarEvolution.h" 24 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 25 #include "llvm/IR/BasicBlock.h" 26 #include "llvm/IR/Constants.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/Dominators.h" 29 #include "llvm/IR/Function.h" 30 #include "llvm/IR/InstrTypes.h" 31 #include "llvm/IR/Instruction.h" 32 #include "llvm/IR/Instructions.h" 33 #include "llvm/IR/Type.h" 34 #include "llvm/IR/User.h" 35 #include "llvm/IR/Value.h" 36 #include "llvm/Pass.h" 37 #include "llvm/Support/Casting.h" 38 #include "llvm/Support/CommandLine.h" 39 #include "llvm/Support/Debug.h" 40 #include "llvm/Support/ErrorHandling.h" 41 #include "llvm/Support/raw_ostream.h" 42 #include "llvm/Transforms/Scalar.h" 43 #include "llvm/Transforms/Utils.h" 44 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 45 #include "llvm/Transforms/Utils/LoopUtils.h" 46 #include <cassert> 47 #include <utility> 48 #include <vector> 49 50 using namespace llvm; 51 52 #define DEBUG_TYPE "loop-interchange" 53 54 static cl::opt<int> LoopInterchangeCostThreshold( 55 "loop-interchange-threshold", cl::init(0), cl::Hidden, 56 cl::desc("Interchange if you gain more than this number")); 57 58 namespace { 59 60 using LoopVector = SmallVector<Loop *, 8>; 61 62 // TODO: Check if we can use a sparse matrix here. 63 using CharMatrix = std::vector<std::vector<char>>; 64 65 } // end anonymous namespace 66 67 // Maximum number of dependencies that can be handled in the dependency matrix. 68 static const unsigned MaxMemInstrCount = 100; 69 70 // Maximum loop depth supported. 71 static const unsigned MaxLoopNestDepth = 10; 72 73 #ifdef DUMP_DEP_MATRICIES 74 static void printDepMatrix(CharMatrix &DepMatrix) { 75 for (auto &Row : DepMatrix) { 76 for (auto D : Row) 77 DEBUG(dbgs() << D << " "); 78 DEBUG(dbgs() << "\n"); 79 } 80 } 81 #endif 82 83 static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, 84 Loop *L, DependenceInfo *DI) { 85 using ValueVector = SmallVector<Value *, 16>; 86 87 ValueVector MemInstr; 88 89 // For each block. 90 for (BasicBlock *BB : L->blocks()) { 91 // Scan the BB and collect legal loads and stores. 92 for (Instruction &I : *BB) { 93 if (!isa<Instruction>(I)) 94 return false; 95 if (auto *Ld = dyn_cast<LoadInst>(&I)) { 96 if (!Ld->isSimple()) 97 return false; 98 MemInstr.push_back(&I); 99 } else if (auto *St = dyn_cast<StoreInst>(&I)) { 100 if (!St->isSimple()) 101 return false; 102 MemInstr.push_back(&I); 103 } 104 } 105 } 106 107 DEBUG(dbgs() << "Found " << MemInstr.size() 108 << " Loads and Stores to analyze\n"); 109 110 ValueVector::iterator I, IE, J, JE; 111 112 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { 113 for (J = I, JE = MemInstr.end(); J != JE; ++J) { 114 std::vector<char> Dep; 115 Instruction *Src = cast<Instruction>(*I); 116 Instruction *Dst = cast<Instruction>(*J); 117 if (Src == Dst) 118 continue; 119 // Ignore Input dependencies. 120 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst)) 121 continue; 122 // Track Output, Flow, and Anti dependencies. 123 if (auto D = DI->depends(Src, Dst, true)) { 124 assert(D->isOrdered() && "Expected an output, flow or anti dep."); 125 DEBUG(StringRef DepType = 126 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output"; 127 dbgs() << "Found " << DepType 128 << " dependency between Src and Dst\n" 129 << " Src:" << *Src << "\n Dst:" << *Dst << '\n'); 130 unsigned Levels = D->getLevels(); 131 char Direction; 132 for (unsigned II = 1; II <= Levels; ++II) { 133 const SCEV *Distance = D->getDistance(II); 134 const SCEVConstant *SCEVConst = 135 dyn_cast_or_null<SCEVConstant>(Distance); 136 if (SCEVConst) { 137 const ConstantInt *CI = SCEVConst->getValue(); 138 if (CI->isNegative()) 139 Direction = '<'; 140 else if (CI->isZero()) 141 Direction = '='; 142 else 143 Direction = '>'; 144 Dep.push_back(Direction); 145 } else if (D->isScalar(II)) { 146 Direction = 'S'; 147 Dep.push_back(Direction); 148 } else { 149 unsigned Dir = D->getDirection(II); 150 if (Dir == Dependence::DVEntry::LT || 151 Dir == Dependence::DVEntry::LE) 152 Direction = '<'; 153 else if (Dir == Dependence::DVEntry::GT || 154 Dir == Dependence::DVEntry::GE) 155 Direction = '>'; 156 else if (Dir == Dependence::DVEntry::EQ) 157 Direction = '='; 158 else 159 Direction = '*'; 160 Dep.push_back(Direction); 161 } 162 } 163 while (Dep.size() != Level) { 164 Dep.push_back('I'); 165 } 166 167 DepMatrix.push_back(Dep); 168 if (DepMatrix.size() > MaxMemInstrCount) { 169 DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount 170 << " dependencies inside loop\n"); 171 return false; 172 } 173 } 174 } 175 } 176 177 return true; 178 } 179 180 // A loop is moved from index 'from' to an index 'to'. Update the Dependence 181 // matrix by exchanging the two columns. 182 static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, 183 unsigned ToIndx) { 184 unsigned numRows = DepMatrix.size(); 185 for (unsigned i = 0; i < numRows; ++i) { 186 char TmpVal = DepMatrix[i][ToIndx]; 187 DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx]; 188 DepMatrix[i][FromIndx] = TmpVal; 189 } 190 } 191 192 // Checks if outermost non '=','S'or'I' dependence in the dependence matrix is 193 // '>' 194 static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row, 195 unsigned Column) { 196 for (unsigned i = 0; i <= Column; ++i) { 197 if (DepMatrix[Row][i] == '<') 198 return false; 199 if (DepMatrix[Row][i] == '>') 200 return true; 201 } 202 // All dependencies were '=','S' or 'I' 203 return false; 204 } 205 206 // Checks if no dependence exist in the dependency matrix in Row before Column. 207 static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row, 208 unsigned Column) { 209 for (unsigned i = 0; i < Column; ++i) { 210 if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' && 211 DepMatrix[Row][i] != 'I') 212 return false; 213 } 214 return true; 215 } 216 217 static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row, 218 unsigned OuterLoopId, char InnerDep, 219 char OuterDep) { 220 if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId)) 221 return false; 222 223 if (InnerDep == OuterDep) 224 return true; 225 226 // It is legal to interchange if and only if after interchange no row has a 227 // '>' direction as the leftmost non-'='. 228 229 if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I') 230 return true; 231 232 if (InnerDep == '<') 233 return true; 234 235 if (InnerDep == '>') { 236 // If OuterLoopId represents outermost loop then interchanging will make the 237 // 1st dependency as '>' 238 if (OuterLoopId == 0) 239 return false; 240 241 // If all dependencies before OuterloopId are '=','S'or 'I'. Then 242 // interchanging will result in this row having an outermost non '=' 243 // dependency of '>' 244 if (!containsNoDependence(DepMatrix, Row, OuterLoopId)) 245 return true; 246 } 247 248 return false; 249 } 250 251 // Checks if it is legal to interchange 2 loops. 252 // [Theorem] A permutation of the loops in a perfect nest is legal if and only 253 // if the direction matrix, after the same permutation is applied to its 254 // columns, has no ">" direction as the leftmost non-"=" direction in any row. 255 static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, 256 unsigned InnerLoopId, 257 unsigned OuterLoopId) { 258 unsigned NumRows = DepMatrix.size(); 259 // For each row check if it is valid to interchange. 260 for (unsigned Row = 0; Row < NumRows; ++Row) { 261 char InnerDep = DepMatrix[Row][InnerLoopId]; 262 char OuterDep = DepMatrix[Row][OuterLoopId]; 263 if (InnerDep == '*' || OuterDep == '*') 264 return false; 265 if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep)) 266 return false; 267 } 268 return true; 269 } 270 271 static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) { 272 DEBUG(dbgs() << "Calling populateWorklist on Func: " 273 << L.getHeader()->getParent()->getName() << " Loop: %" 274 << L.getHeader()->getName() << '\n'); 275 LoopVector LoopList; 276 Loop *CurrentLoop = &L; 277 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops(); 278 while (!Vec->empty()) { 279 // The current loop has multiple subloops in it hence it is not tightly 280 // nested. 281 // Discard all loops above it added into Worklist. 282 if (Vec->size() != 1) { 283 LoopList.clear(); 284 return; 285 } 286 LoopList.push_back(CurrentLoop); 287 CurrentLoop = Vec->front(); 288 Vec = &CurrentLoop->getSubLoops(); 289 } 290 LoopList.push_back(CurrentLoop); 291 V.push_back(std::move(LoopList)); 292 } 293 294 static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { 295 PHINode *InnerIndexVar = L->getCanonicalInductionVariable(); 296 if (InnerIndexVar) 297 return InnerIndexVar; 298 if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr) 299 return nullptr; 300 for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { 301 PHINode *PhiVar = cast<PHINode>(I); 302 Type *PhiTy = PhiVar->getType(); 303 if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && 304 !PhiTy->isPointerTy()) 305 return nullptr; 306 const SCEVAddRecExpr *AddRec = 307 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar)); 308 if (!AddRec || !AddRec->isAffine()) 309 continue; 310 const SCEV *Step = AddRec->getStepRecurrence(*SE); 311 if (!isa<SCEVConstant>(Step)) 312 continue; 313 // Found the induction variable. 314 // FIXME: Handle loops with more than one induction variable. Note that, 315 // currently, legality makes sure we have only one induction variable. 316 return PhiVar; 317 } 318 return nullptr; 319 } 320 321 namespace { 322 323 /// LoopInterchangeLegality checks if it is legal to interchange the loop. 324 class LoopInterchangeLegality { 325 public: 326 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 327 LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA, 328 OptimizationRemarkEmitter *ORE) 329 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), 330 PreserveLCSSA(PreserveLCSSA), ORE(ORE) {} 331 332 /// Check if the loops can be interchanged. 333 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, 334 CharMatrix &DepMatrix); 335 336 /// Check if the loop structure is understood. We do not handle triangular 337 /// loops for now. 338 bool isLoopStructureUnderstood(PHINode *InnerInductionVar); 339 340 bool currentLimitations(); 341 342 bool hasInnerLoopReduction() { return InnerLoopHasReduction; } 343 344 private: 345 bool tightlyNested(Loop *Outer, Loop *Inner); 346 bool containsUnsafeInstructionsInHeader(BasicBlock *BB); 347 bool areAllUsesReductions(Instruction *Ins, Loop *L); 348 bool containsUnsafeInstructionsInLatch(BasicBlock *BB); 349 bool findInductionAndReductions(Loop *L, 350 SmallVector<PHINode *, 8> &Inductions, 351 SmallVector<PHINode *, 8> &Reductions); 352 353 Loop *OuterLoop; 354 Loop *InnerLoop; 355 356 ScalarEvolution *SE; 357 LoopInfo *LI; 358 DominatorTree *DT; 359 bool PreserveLCSSA; 360 361 /// Interface to emit optimization remarks. 362 OptimizationRemarkEmitter *ORE; 363 364 bool InnerLoopHasReduction = false; 365 }; 366 367 /// LoopInterchangeProfitability checks if it is profitable to interchange the 368 /// loop. 369 class LoopInterchangeProfitability { 370 public: 371 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 372 OptimizationRemarkEmitter *ORE) 373 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} 374 375 /// Check if the loop interchange is profitable. 376 bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, 377 CharMatrix &DepMatrix); 378 379 private: 380 int getInstrOrderCost(); 381 382 Loop *OuterLoop; 383 Loop *InnerLoop; 384 385 /// Scev analysis. 386 ScalarEvolution *SE; 387 388 /// Interface to emit optimization remarks. 389 OptimizationRemarkEmitter *ORE; 390 }; 391 392 /// LoopInterchangeTransform interchanges the loop. 393 class LoopInterchangeTransform { 394 public: 395 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 396 LoopInfo *LI, DominatorTree *DT, 397 BasicBlock *LoopNestExit, 398 bool InnerLoopContainsReductions) 399 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), 400 LoopExit(LoopNestExit), 401 InnerLoopHasReduction(InnerLoopContainsReductions) {} 402 403 /// Interchange OuterLoop and InnerLoop. 404 bool transform(); 405 void restructureLoops(Loop *InnerLoop, Loop *OuterLoop); 406 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); 407 408 private: 409 void splitInnerLoopLatch(Instruction *); 410 void splitInnerLoopHeader(); 411 bool adjustLoopLinks(); 412 void adjustLoopPreheaders(); 413 bool adjustLoopBranches(); 414 void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred, 415 BasicBlock *NewPred); 416 417 Loop *OuterLoop; 418 Loop *InnerLoop; 419 420 /// Scev analysis. 421 ScalarEvolution *SE; 422 423 LoopInfo *LI; 424 DominatorTree *DT; 425 BasicBlock *LoopExit; 426 bool InnerLoopHasReduction; 427 }; 428 429 // Main LoopInterchange Pass. 430 struct LoopInterchange : public FunctionPass { 431 static char ID; 432 ScalarEvolution *SE = nullptr; 433 LoopInfo *LI = nullptr; 434 DependenceInfo *DI = nullptr; 435 DominatorTree *DT = nullptr; 436 bool PreserveLCSSA; 437 438 /// Interface to emit optimization remarks. 439 OptimizationRemarkEmitter *ORE; 440 441 LoopInterchange() : FunctionPass(ID) { 442 initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); 443 } 444 445 void getAnalysisUsage(AnalysisUsage &AU) const override { 446 AU.addRequired<ScalarEvolutionWrapperPass>(); 447 AU.addRequired<AAResultsWrapperPass>(); 448 AU.addRequired<DominatorTreeWrapperPass>(); 449 AU.addRequired<LoopInfoWrapperPass>(); 450 AU.addRequired<DependenceAnalysisWrapperPass>(); 451 AU.addRequiredID(LoopSimplifyID); 452 AU.addRequiredID(LCSSAID); 453 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 454 455 AU.addPreserved<DominatorTreeWrapperPass>(); 456 } 457 458 bool runOnFunction(Function &F) override { 459 if (skipFunction(F)) 460 return false; 461 462 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 463 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 464 DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI(); 465 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 466 ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 467 PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); 468 469 // Build up a worklist of loop pairs to analyze. 470 SmallVector<LoopVector, 8> Worklist; 471 472 for (Loop *L : *LI) 473 populateWorklist(*L, Worklist); 474 475 DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n"); 476 bool Changed = true; 477 while (!Worklist.empty()) { 478 LoopVector LoopList = Worklist.pop_back_val(); 479 Changed = processLoopList(LoopList, F); 480 } 481 return Changed; 482 } 483 484 bool isComputableLoopNest(LoopVector LoopList) { 485 for (Loop *L : LoopList) { 486 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L); 487 if (ExitCountOuter == SE->getCouldNotCompute()) { 488 DEBUG(dbgs() << "Couldn't compute backedge count\n"); 489 return false; 490 } 491 if (L->getNumBackEdges() != 1) { 492 DEBUG(dbgs() << "NumBackEdges is not equal to 1\n"); 493 return false; 494 } 495 if (!L->getExitingBlock()) { 496 DEBUG(dbgs() << "Loop doesn't have unique exit block\n"); 497 return false; 498 } 499 } 500 return true; 501 } 502 503 unsigned selectLoopForInterchange(const LoopVector &LoopList) { 504 // TODO: Add a better heuristic to select the loop to be interchanged based 505 // on the dependence matrix. Currently we select the innermost loop. 506 return LoopList.size() - 1; 507 } 508 509 bool processLoopList(LoopVector LoopList, Function &F) { 510 bool Changed = false; 511 unsigned LoopNestDepth = LoopList.size(); 512 if (LoopNestDepth < 2) { 513 DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n"); 514 return false; 515 } 516 if (LoopNestDepth > MaxLoopNestDepth) { 517 DEBUG(dbgs() << "Cannot handle loops of depth greater than " 518 << MaxLoopNestDepth << "\n"); 519 return false; 520 } 521 if (!isComputableLoopNest(LoopList)) { 522 DEBUG(dbgs() << "Not valid loop candidate for interchange\n"); 523 return false; 524 } 525 526 DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth << "\n"); 527 528 CharMatrix DependencyMatrix; 529 Loop *OuterMostLoop = *(LoopList.begin()); 530 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth, 531 OuterMostLoop, DI)) { 532 DEBUG(dbgs() << "Populating dependency matrix failed\n"); 533 return false; 534 } 535 #ifdef DUMP_DEP_MATRICIES 536 DEBUG(dbgs() << "Dependence before interchange\n"); 537 printDepMatrix(DependencyMatrix); 538 #endif 539 540 BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch(); 541 BranchInst *OuterMostLoopLatchBI = 542 dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator()); 543 if (!OuterMostLoopLatchBI || OuterMostLoopLatchBI->getNumSuccessors() != 2) 544 return false; 545 546 // Since we currently do not handle LCSSA PHI's any failure in loop 547 // condition will now branch to LoopNestExit. 548 // TODO: This should be removed once we handle LCSSA PHI nodes. 549 550 // Get the Outermost loop exit. 551 BasicBlock *LoopNestExit; 552 if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader()) 553 LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1); 554 else 555 LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0); 556 557 if (isa<PHINode>(LoopNestExit->begin())) { 558 DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now " 559 "since on failure all loops branch to loop nest exit.\n"); 560 return false; 561 } 562 563 unsigned SelecLoopId = selectLoopForInterchange(LoopList); 564 // Move the selected loop outwards to the best possible position. 565 for (unsigned i = SelecLoopId; i > 0; i--) { 566 bool Interchanged = 567 processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix); 568 if (!Interchanged) 569 return Changed; 570 // Loops interchanged reflect the same in LoopList 571 std::swap(LoopList[i - 1], LoopList[i]); 572 573 // Update the DependencyMatrix 574 interChangeDependencies(DependencyMatrix, i, i - 1); 575 #ifdef DUMP_DEP_MATRICIES 576 DEBUG(dbgs() << "Dependence after interchange\n"); 577 printDepMatrix(DependencyMatrix); 578 #endif 579 Changed |= Interchanged; 580 } 581 return Changed; 582 } 583 584 bool processLoop(LoopVector LoopList, unsigned InnerLoopId, 585 unsigned OuterLoopId, BasicBlock *LoopNestExit, 586 std::vector<std::vector<char>> &DependencyMatrix) { 587 DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId 588 << " and OuterLoopId = " << OuterLoopId << "\n"); 589 Loop *InnerLoop = LoopList[InnerLoopId]; 590 Loop *OuterLoop = LoopList[OuterLoopId]; 591 592 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT, 593 PreserveLCSSA, ORE); 594 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { 595 DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n"); 596 return false; 597 } 598 DEBUG(dbgs() << "Loops are legal to interchange\n"); 599 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); 600 if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) { 601 DEBUG(dbgs() << "Interchanging loops not profitable.\n"); 602 return false; 603 } 604 605 ORE->emit([&]() { 606 return OptimizationRemark(DEBUG_TYPE, "Interchanged", 607 InnerLoop->getStartLoc(), 608 InnerLoop->getHeader()) 609 << "Loop interchanged with enclosing loop."; 610 }); 611 612 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, 613 LoopNestExit, LIL.hasInnerLoopReduction()); 614 LIT.transform(); 615 DEBUG(dbgs() << "Loops interchanged.\n"); 616 return true; 617 } 618 }; 619 620 } // end anonymous namespace 621 622 bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) { 623 return llvm::none_of(Ins->users(), [=](User *U) -> bool { 624 auto *UserIns = dyn_cast<PHINode>(U); 625 RecurrenceDescriptor RD; 626 return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD); 627 }); 628 } 629 630 bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader( 631 BasicBlock *BB) { 632 for (auto I = BB->begin(), E = BB->end(); I != E; ++I) { 633 // Load corresponding to reduction PHI's are safe while concluding if 634 // tightly nested. 635 if (LoadInst *L = dyn_cast<LoadInst>(I)) { 636 if (!areAllUsesReductions(L, InnerLoop)) 637 return true; 638 } else if (I->mayHaveSideEffects() || I->mayReadFromMemory()) 639 return true; 640 } 641 return false; 642 } 643 644 bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch( 645 BasicBlock *BB) { 646 for (auto I = BB->begin(), E = BB->end(); I != E; ++I) { 647 // Stores corresponding to reductions are safe while concluding if tightly 648 // nested. 649 if (StoreInst *L = dyn_cast<StoreInst>(I)) { 650 if (!isa<PHINode>(L->getOperand(0))) 651 return true; 652 } else if (I->mayHaveSideEffects() || I->mayReadFromMemory()) 653 return true; 654 } 655 return false; 656 } 657 658 bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { 659 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 660 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 661 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); 662 663 DEBUG(dbgs() << "Checking if loops are tightly nested\n"); 664 665 // A perfectly nested loop will not have any branch in between the outer and 666 // inner block i.e. outer header will branch to either inner preheader and 667 // outerloop latch. 668 BranchInst *OuterLoopHeaderBI = 669 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); 670 if (!OuterLoopHeaderBI) 671 return false; 672 673 for (BasicBlock *Succ : OuterLoopHeaderBI->successors()) 674 if (Succ != InnerLoopPreHeader && Succ != OuterLoopLatch) 675 return false; 676 677 DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n"); 678 // We do not have any basic block in between now make sure the outer header 679 // and outer loop latch doesn't contain any unsafe instructions. 680 if (containsUnsafeInstructionsInHeader(OuterLoopHeader) || 681 containsUnsafeInstructionsInLatch(OuterLoopLatch)) 682 return false; 683 684 DEBUG(dbgs() << "Loops are perfectly nested\n"); 685 // We have a perfect loop nest. 686 return true; 687 } 688 689 bool LoopInterchangeLegality::isLoopStructureUnderstood( 690 PHINode *InnerInduction) { 691 unsigned Num = InnerInduction->getNumOperands(); 692 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader(); 693 for (unsigned i = 0; i < Num; ++i) { 694 Value *Val = InnerInduction->getOperand(i); 695 if (isa<Constant>(Val)) 696 continue; 697 Instruction *I = dyn_cast<Instruction>(Val); 698 if (!I) 699 return false; 700 // TODO: Handle triangular loops. 701 // e.g. for(int i=0;i<N;i++) 702 // for(int j=i;j<N;j++) 703 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i); 704 if (InnerInduction->getIncomingBlock(IncomBlockIndx) == 705 InnerLoopPreheader && 706 !OuterLoop->isLoopInvariant(I)) { 707 return false; 708 } 709 } 710 return true; 711 } 712 713 bool LoopInterchangeLegality::findInductionAndReductions( 714 Loop *L, SmallVector<PHINode *, 8> &Inductions, 715 SmallVector<PHINode *, 8> &Reductions) { 716 if (!L->getLoopLatch() || !L->getLoopPredecessor()) 717 return false; 718 for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { 719 RecurrenceDescriptor RD; 720 InductionDescriptor ID; 721 PHINode *PHI = cast<PHINode>(I); 722 if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID)) 723 Inductions.push_back(PHI); 724 else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) 725 Reductions.push_back(PHI); 726 else { 727 DEBUG( 728 dbgs() << "Failed to recognize PHI as an induction or reduction.\n"); 729 return false; 730 } 731 } 732 return true; 733 } 734 735 static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) { 736 for (auto I = Block->begin(); isa<PHINode>(I); ++I) { 737 PHINode *PHI = cast<PHINode>(I); 738 // Reduction lcssa phi will have only 1 incoming block that from loop latch. 739 if (PHI->getNumIncomingValues() > 1) 740 return false; 741 Instruction *Ins = dyn_cast<Instruction>(PHI->getIncomingValue(0)); 742 if (!Ins) 743 return false; 744 // Incoming value for lcssa phi's in outer loop exit can only be inner loop 745 // exits lcssa phi else it would not be tightly nested. 746 if (!isa<PHINode>(Ins) && isOuterLoopExitBlock) 747 return false; 748 } 749 return true; 750 } 751 752 static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock, 753 BasicBlock *LoopHeader) { 754 if (BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator())) { 755 assert(BI->getNumSuccessors() == 2 && 756 "Branch leaving loop latch must have 2 successors"); 757 for (BasicBlock *Succ : BI->successors()) { 758 if (Succ == LoopHeader) 759 continue; 760 return Succ; 761 } 762 } 763 return nullptr; 764 } 765 766 // This function indicates the current limitations in the transform as a result 767 // of which we do not proceed. 768 bool LoopInterchangeLegality::currentLimitations() { 769 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 770 BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); 771 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 772 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); 773 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 774 775 PHINode *InnerInductionVar; 776 SmallVector<PHINode *, 8> Inductions; 777 SmallVector<PHINode *, 8> Reductions; 778 if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) { 779 DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes " 780 << "are supported currently.\n"); 781 ORE->emit([&]() { 782 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner", 783 InnerLoop->getStartLoc(), 784 InnerLoop->getHeader()) 785 << "Only inner loops with induction or reduction PHI nodes can be" 786 " interchange currently."; 787 }); 788 return true; 789 } 790 791 // TODO: Currently we handle only loops with 1 induction variable. 792 if (Inductions.size() != 1) { 793 DEBUG(dbgs() << "We currently only support loops with 1 induction variable." 794 << "Failed to interchange due to current limitation\n"); 795 ORE->emit([&]() { 796 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner", 797 InnerLoop->getStartLoc(), 798 InnerLoop->getHeader()) 799 << "Only inner loops with 1 induction variable can be " 800 "interchanged currently."; 801 }); 802 return true; 803 } 804 if (Reductions.size() > 0) 805 InnerLoopHasReduction = true; 806 807 InnerInductionVar = Inductions.pop_back_val(); 808 Reductions.clear(); 809 if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) { 810 DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes " 811 << "are supported currently.\n"); 812 ORE->emit([&]() { 813 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter", 814 OuterLoop->getStartLoc(), 815 OuterLoop->getHeader()) 816 << "Only outer loops with induction or reduction PHI nodes can be" 817 " interchanged currently."; 818 }); 819 return true; 820 } 821 822 // Outer loop cannot have reduction because then loops will not be tightly 823 // nested. 824 if (!Reductions.empty()) { 825 DEBUG(dbgs() << "Outer loops with reductions are not supported " 826 << "currently.\n"); 827 ORE->emit([&]() { 828 return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter", 829 OuterLoop->getStartLoc(), 830 OuterLoop->getHeader()) 831 << "Outer loops with reductions cannot be interchangeed " 832 "currently."; 833 }); 834 return true; 835 } 836 // TODO: Currently we handle only loops with 1 induction variable. 837 if (Inductions.size() != 1) { 838 DEBUG(dbgs() << "Loops with more than 1 induction variables are not " 839 << "supported currently.\n"); 840 ORE->emit([&]() { 841 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter", 842 OuterLoop->getStartLoc(), 843 OuterLoop->getHeader()) 844 << "Only outer loops with 1 induction variable can be " 845 "interchanged currently."; 846 }); 847 return true; 848 } 849 850 // TODO: Triangular loops are not handled for now. 851 if (!isLoopStructureUnderstood(InnerInductionVar)) { 852 DEBUG(dbgs() << "Loop structure not understood by pass\n"); 853 ORE->emit([&]() { 854 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner", 855 InnerLoop->getStartLoc(), 856 InnerLoop->getHeader()) 857 << "Inner loop structure not understood currently."; 858 }); 859 return true; 860 } 861 862 // TODO: We only handle LCSSA PHI's corresponding to reduction for now. 863 BasicBlock *LoopExitBlock = 864 getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader); 865 if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) { 866 DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n"); 867 ORE->emit([&]() { 868 return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuter", 869 OuterLoop->getStartLoc(), 870 OuterLoop->getHeader()) 871 << "Only outer loops with LCSSA PHIs can be interchange " 872 "currently."; 873 }); 874 return true; 875 } 876 877 LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader); 878 if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) { 879 DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n"); 880 ORE->emit([&]() { 881 return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner", 882 InnerLoop->getStartLoc(), 883 InnerLoop->getHeader()) 884 << "Only inner loops with LCSSA PHIs can be interchange " 885 "currently."; 886 }); 887 return true; 888 } 889 890 // TODO: Current limitation: Since we split the inner loop latch at the point 891 // were induction variable is incremented (induction.next); We cannot have 892 // more than 1 user of induction.next since it would result in broken code 893 // after split. 894 // e.g. 895 // for(i=0;i<N;i++) { 896 // for(j = 0;j<M;j++) { 897 // A[j+1][i+2] = A[j][i]+k; 898 // } 899 // } 900 Instruction *InnerIndexVarInc = nullptr; 901 if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader) 902 InnerIndexVarInc = 903 dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1)); 904 else 905 InnerIndexVarInc = 906 dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0)); 907 908 if (!InnerIndexVarInc) { 909 DEBUG(dbgs() << "Did not find an instruction to increment the induction " 910 << "variable.\n"); 911 ORE->emit([&]() { 912 return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner", 913 InnerLoop->getStartLoc(), 914 InnerLoop->getHeader()) 915 << "The inner loop does not increment the induction variable."; 916 }); 917 return true; 918 } 919 920 // Since we split the inner loop latch on this induction variable. Make sure 921 // we do not have any instruction between the induction variable and branch 922 // instruction. 923 924 bool FoundInduction = false; 925 for (const Instruction &I : llvm::reverse(*InnerLoopLatch)) { 926 if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) || 927 isa<ZExtInst>(I)) 928 continue; 929 930 // We found an instruction. If this is not induction variable then it is not 931 // safe to split this loop latch. 932 if (!I.isIdenticalTo(InnerIndexVarInc)) { 933 DEBUG(dbgs() << "Found unsupported instructions between induction " 934 << "variable increment and branch.\n"); 935 ORE->emit([&]() { 936 return OptimizationRemarkMissed( 937 DEBUG_TYPE, "UnsupportedInsBetweenInduction", 938 InnerLoop->getStartLoc(), InnerLoop->getHeader()) 939 << "Found unsupported instruction between induction variable " 940 "increment and branch."; 941 }); 942 return true; 943 } 944 945 FoundInduction = true; 946 break; 947 } 948 // The loop latch ended and we didn't find the induction variable return as 949 // current limitation. 950 if (!FoundInduction) { 951 DEBUG(dbgs() << "Did not find the induction variable.\n"); 952 ORE->emit([&]() { 953 return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable", 954 InnerLoop->getStartLoc(), 955 InnerLoop->getHeader()) 956 << "Did not find the induction variable."; 957 }); 958 return true; 959 } 960 return false; 961 } 962 963 bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, 964 unsigned OuterLoopId, 965 CharMatrix &DepMatrix) { 966 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) { 967 DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId 968 << " and OuterLoopId = " << OuterLoopId 969 << " due to dependence\n"); 970 ORE->emit([&]() { 971 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence", 972 InnerLoop->getStartLoc(), 973 InnerLoop->getHeader()) 974 << "Cannot interchange loops due to dependences."; 975 }); 976 return false; 977 } 978 979 // Check if outer and inner loop contain legal instructions only. 980 for (auto *BB : OuterLoop->blocks()) 981 for (Instruction &I : *BB) 982 if (CallInst *CI = dyn_cast<CallInst>(&I)) { 983 // readnone functions do not prevent interchanging. 984 if (CI->doesNotReadMemory()) 985 continue; 986 DEBUG(dbgs() << "Loops with call instructions cannot be interchanged " 987 << "safely."); 988 ORE->emit([&]() { 989 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst", 990 CI->getDebugLoc(), 991 CI->getParent()) 992 << "Cannot interchange loops due to call instruction."; 993 }); 994 995 return false; 996 } 997 998 // Create unique Preheaders if we already do not have one. 999 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); 1000 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1001 1002 // Create a unique outer preheader - 1003 // 1) If OuterLoop preheader is not present. 1004 // 2) If OuterLoop Preheader is same as OuterLoop Header 1005 // 3) If OuterLoop Preheader is same as Header of the previous loop. 1006 // 4) If OuterLoop Preheader is Entry node. 1007 if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() || 1008 isa<PHINode>(OuterLoopPreHeader->begin()) || 1009 !OuterLoopPreHeader->getUniquePredecessor()) { 1010 OuterLoopPreHeader = 1011 InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA); 1012 } 1013 1014 if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() || 1015 InnerLoopPreHeader == OuterLoop->getHeader()) { 1016 InnerLoopPreHeader = 1017 InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA); 1018 } 1019 1020 // TODO: The loops could not be interchanged due to current limitations in the 1021 // transform module. 1022 if (currentLimitations()) { 1023 DEBUG(dbgs() << "Not legal because of current transform limitation\n"); 1024 return false; 1025 } 1026 1027 // Check if the loops are tightly nested. 1028 if (!tightlyNested(OuterLoop, InnerLoop)) { 1029 DEBUG(dbgs() << "Loops not tightly nested\n"); 1030 ORE->emit([&]() { 1031 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested", 1032 InnerLoop->getStartLoc(), 1033 InnerLoop->getHeader()) 1034 << "Cannot interchange loops because they are not tightly " 1035 "nested."; 1036 }); 1037 return false; 1038 } 1039 1040 return true; 1041 } 1042 1043 int LoopInterchangeProfitability::getInstrOrderCost() { 1044 unsigned GoodOrder, BadOrder; 1045 BadOrder = GoodOrder = 0; 1046 for (BasicBlock *BB : InnerLoop->blocks()) { 1047 for (Instruction &Ins : *BB) { 1048 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) { 1049 unsigned NumOp = GEP->getNumOperands(); 1050 bool FoundInnerInduction = false; 1051 bool FoundOuterInduction = false; 1052 for (unsigned i = 0; i < NumOp; ++i) { 1053 const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i)); 1054 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal); 1055 if (!AR) 1056 continue; 1057 1058 // If we find the inner induction after an outer induction e.g. 1059 // for(int i=0;i<N;i++) 1060 // for(int j=0;j<N;j++) 1061 // A[i][j] = A[i-1][j-1]+k; 1062 // then it is a good order. 1063 if (AR->getLoop() == InnerLoop) { 1064 // We found an InnerLoop induction after OuterLoop induction. It is 1065 // a good order. 1066 FoundInnerInduction = true; 1067 if (FoundOuterInduction) { 1068 GoodOrder++; 1069 break; 1070 } 1071 } 1072 // If we find the outer induction after an inner induction e.g. 1073 // for(int i=0;i<N;i++) 1074 // for(int j=0;j<N;j++) 1075 // A[j][i] = A[j-1][i-1]+k; 1076 // then it is a bad order. 1077 if (AR->getLoop() == OuterLoop) { 1078 // We found an OuterLoop induction after InnerLoop induction. It is 1079 // a bad order. 1080 FoundOuterInduction = true; 1081 if (FoundInnerInduction) { 1082 BadOrder++; 1083 break; 1084 } 1085 } 1086 } 1087 } 1088 } 1089 } 1090 return GoodOrder - BadOrder; 1091 } 1092 1093 static bool isProfitableForVectorization(unsigned InnerLoopId, 1094 unsigned OuterLoopId, 1095 CharMatrix &DepMatrix) { 1096 // TODO: Improve this heuristic to catch more cases. 1097 // If the inner loop is loop independent or doesn't carry any dependency it is 1098 // profitable to move this to outer position. 1099 for (auto &Row : DepMatrix) { 1100 if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I') 1101 return false; 1102 // TODO: We need to improve this heuristic. 1103 if (Row[OuterLoopId] != '=') 1104 return false; 1105 } 1106 // If outer loop has dependence and inner loop is loop independent then it is 1107 // profitable to interchange to enable parallelism. 1108 return true; 1109 } 1110 1111 bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, 1112 unsigned OuterLoopId, 1113 CharMatrix &DepMatrix) { 1114 // TODO: Add better profitability checks. 1115 // e.g 1116 // 1) Construct dependency matrix and move the one with no loop carried dep 1117 // inside to enable vectorization. 1118 1119 // This is rough cost estimation algorithm. It counts the good and bad order 1120 // of induction variables in the instruction and allows reordering if number 1121 // of bad orders is more than good. 1122 int Cost = getInstrOrderCost(); 1123 DEBUG(dbgs() << "Cost = " << Cost << "\n"); 1124 if (Cost < -LoopInterchangeCostThreshold) 1125 return true; 1126 1127 // It is not profitable as per current cache profitability model. But check if 1128 // we can move this loop outside to improve parallelism. 1129 if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix)) 1130 return true; 1131 1132 ORE->emit([&]() { 1133 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", 1134 InnerLoop->getStartLoc(), 1135 InnerLoop->getHeader()) 1136 << "Interchanging loops is too costly (cost=" 1137 << ore::NV("Cost", Cost) << ", threshold=" 1138 << ore::NV("Threshold", LoopInterchangeCostThreshold) 1139 << ") and it does not improve parallelism."; 1140 }); 1141 return false; 1142 } 1143 1144 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, 1145 Loop *InnerLoop) { 1146 for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E; 1147 ++I) { 1148 if (*I == InnerLoop) { 1149 OuterLoop->removeChildLoop(I); 1150 return; 1151 } 1152 } 1153 llvm_unreachable("Couldn't find loop"); 1154 } 1155 1156 void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop, 1157 Loop *OuterLoop) { 1158 Loop *OuterLoopParent = OuterLoop->getParentLoop(); 1159 if (OuterLoopParent) { 1160 // Remove the loop from its parent loop. 1161 removeChildLoop(OuterLoopParent, OuterLoop); 1162 removeChildLoop(OuterLoop, InnerLoop); 1163 OuterLoopParent->addChildLoop(InnerLoop); 1164 } else { 1165 removeChildLoop(OuterLoop, InnerLoop); 1166 LI->changeTopLevelLoop(OuterLoop, InnerLoop); 1167 } 1168 1169 while (!InnerLoop->empty()) 1170 OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin())); 1171 1172 InnerLoop->addChildLoop(OuterLoop); 1173 } 1174 1175 bool LoopInterchangeTransform::transform() { 1176 bool Transformed = false; 1177 Instruction *InnerIndexVar; 1178 1179 if (InnerLoop->getSubLoops().empty()) { 1180 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1181 DEBUG(dbgs() << "Calling Split Inner Loop\n"); 1182 PHINode *InductionPHI = getInductionVariable(InnerLoop, SE); 1183 if (!InductionPHI) { 1184 DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); 1185 return false; 1186 } 1187 1188 if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader) 1189 InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1)); 1190 else 1191 InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0)); 1192 1193 // Ensure that InductionPHI is the first Phi node as required by 1194 // splitInnerLoopHeader 1195 if (&InductionPHI->getParent()->front() != InductionPHI) 1196 InductionPHI->moveBefore(&InductionPHI->getParent()->front()); 1197 1198 // Split at the place were the induction variable is 1199 // incremented/decremented. 1200 // TODO: This splitting logic may not work always. Fix this. 1201 splitInnerLoopLatch(InnerIndexVar); 1202 DEBUG(dbgs() << "splitInnerLoopLatch done\n"); 1203 1204 // Splits the inner loops phi nodes out into a separate basic block. 1205 splitInnerLoopHeader(); 1206 DEBUG(dbgs() << "splitInnerLoopHeader done\n"); 1207 } 1208 1209 Transformed |= adjustLoopLinks(); 1210 if (!Transformed) { 1211 DEBUG(dbgs() << "adjustLoopLinks failed\n"); 1212 return false; 1213 } 1214 1215 restructureLoops(InnerLoop, OuterLoop); 1216 return true; 1217 } 1218 1219 void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) { 1220 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 1221 BasicBlock *InnerLoopLatchPred = InnerLoopLatch; 1222 InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI); 1223 } 1224 1225 void LoopInterchangeTransform::splitInnerLoopHeader() { 1226 // Split the inner loop header out. Here make sure that the reduction PHI's 1227 // stay in the innerloop body. 1228 BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); 1229 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1230 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI); 1231 if (InnerLoopHasReduction) { 1232 // Adjust Reduction PHI's in the block. The induction PHI must be the first 1233 // PHI in InnerLoopHeader for this to work. 1234 SmallVector<PHINode *, 8> PHIVec; 1235 for (auto I = std::next(InnerLoopHeader->begin()); isa<PHINode>(I); ++I) { 1236 PHINode *PHI = dyn_cast<PHINode>(I); 1237 Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader); 1238 PHI->replaceAllUsesWith(V); 1239 PHIVec.push_back((PHI)); 1240 } 1241 for (PHINode *P : PHIVec) { 1242 P->eraseFromParent(); 1243 } 1244 } 1245 1246 DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & " 1247 "InnerLoopHeader\n"); 1248 } 1249 1250 /// \brief Move all instructions except the terminator from FromBB right before 1251 /// InsertBefore 1252 static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { 1253 auto &ToList = InsertBefore->getParent()->getInstList(); 1254 auto &FromList = FromBB->getInstList(); 1255 1256 ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(), 1257 FromBB->getTerminator()->getIterator()); 1258 } 1259 1260 void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock, 1261 BasicBlock *OldPred, 1262 BasicBlock *NewPred) { 1263 for (auto I = CurrBlock->begin(); isa<PHINode>(I); ++I) { 1264 PHINode *PHI = cast<PHINode>(I); 1265 unsigned Num = PHI->getNumIncomingValues(); 1266 for (unsigned i = 0; i < Num; ++i) { 1267 if (PHI->getIncomingBlock(i) == OldPred) 1268 PHI->setIncomingBlock(i, NewPred); 1269 } 1270 } 1271 } 1272 1273 /// \brief Update BI to jump to NewBB instead of OldBB. Records updates to 1274 /// the dominator tree in DTUpdates, if DT should be preserved. 1275 static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, 1276 BasicBlock *NewBB, 1277 std::vector<DominatorTree::UpdateType> &DTUpdates) { 1278 assert(llvm::count_if(BI->successors(), 1279 [OldBB](BasicBlock *BB) { return BB == OldBB; }) < 2 && 1280 "BI must jump to OldBB at most once."); 1281 for (unsigned i = 0, e = BI->getNumSuccessors(); i < e; ++i) { 1282 if (BI->getSuccessor(i) == OldBB) { 1283 BI->setSuccessor(i, NewBB); 1284 1285 DTUpdates.push_back( 1286 {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); 1287 DTUpdates.push_back( 1288 {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); 1289 break; 1290 } 1291 } 1292 } 1293 1294 bool LoopInterchangeTransform::adjustLoopBranches() { 1295 DEBUG(dbgs() << "adjustLoopBranches called\n"); 1296 std::vector<DominatorTree::UpdateType> DTUpdates; 1297 1298 // Adjust the loop preheader 1299 BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); 1300 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 1301 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 1302 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); 1303 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); 1304 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1305 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor(); 1306 BasicBlock *InnerLoopLatchPredecessor = 1307 InnerLoopLatch->getUniquePredecessor(); 1308 BasicBlock *InnerLoopLatchSuccessor; 1309 BasicBlock *OuterLoopLatchSuccessor; 1310 1311 BranchInst *OuterLoopLatchBI = 1312 dyn_cast<BranchInst>(OuterLoopLatch->getTerminator()); 1313 BranchInst *InnerLoopLatchBI = 1314 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator()); 1315 BranchInst *OuterLoopHeaderBI = 1316 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); 1317 BranchInst *InnerLoopHeaderBI = 1318 dyn_cast<BranchInst>(InnerLoopHeader->getTerminator()); 1319 1320 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor || 1321 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI || 1322 !InnerLoopHeaderBI) 1323 return false; 1324 1325 BranchInst *InnerLoopLatchPredecessorBI = 1326 dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator()); 1327 BranchInst *OuterLoopPredecessorBI = 1328 dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator()); 1329 1330 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI) 1331 return false; 1332 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor(); 1333 if (!InnerLoopHeaderSuccessor) 1334 return false; 1335 1336 // Adjust Loop Preheader and headers 1337 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader, 1338 InnerLoopPreHeader, DTUpdates); 1339 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates); 1340 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader, 1341 InnerLoopHeaderSuccessor, DTUpdates); 1342 1343 // Adjust reduction PHI's now that the incoming block has changed. 1344 updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader, 1345 OuterLoopHeader); 1346 1347 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor, 1348 OuterLoopPreHeader, DTUpdates); 1349 1350 // -------------Adjust loop latches----------- 1351 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader) 1352 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1); 1353 else 1354 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0); 1355 1356 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch, 1357 InnerLoopLatchSuccessor, DTUpdates); 1358 1359 // Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with 1360 // the value and remove this PHI node from inner loop. 1361 SmallVector<PHINode *, 8> LcssaVec; 1362 for (auto I = InnerLoopLatchSuccessor->begin(); isa<PHINode>(I); ++I) { 1363 PHINode *LcssaPhi = cast<PHINode>(I); 1364 LcssaVec.push_back(LcssaPhi); 1365 } 1366 for (PHINode *P : LcssaVec) { 1367 Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch); 1368 P->replaceAllUsesWith(Incoming); 1369 P->eraseFromParent(); 1370 } 1371 1372 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader) 1373 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1); 1374 else 1375 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0); 1376 1377 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor, 1378 OuterLoopLatchSuccessor, DTUpdates); 1379 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch, 1380 DTUpdates); 1381 1382 updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch); 1383 1384 DT->applyUpdates(DTUpdates); 1385 return true; 1386 } 1387 1388 void LoopInterchangeTransform::adjustLoopPreheaders() { 1389 // We have interchanged the preheaders so we need to interchange the data in 1390 // the preheader as well. 1391 // This is because the content of inner preheader was previously executed 1392 // inside the outer loop. 1393 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); 1394 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1395 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 1396 BranchInst *InnerTermBI = 1397 cast<BranchInst>(InnerLoopPreHeader->getTerminator()); 1398 1399 // These instructions should now be executed inside the loop. 1400 // Move instruction into a new block after outer header. 1401 moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator()); 1402 // These instructions were not executed previously in the loop so move them to 1403 // the older inner loop preheader. 1404 moveBBContents(OuterLoopPreHeader, InnerTermBI); 1405 } 1406 1407 bool LoopInterchangeTransform::adjustLoopLinks() { 1408 // Adjust all branches in the inner and outer loop. 1409 bool Changed = adjustLoopBranches(); 1410 if (Changed) 1411 adjustLoopPreheaders(); 1412 return Changed; 1413 } 1414 1415 char LoopInterchange::ID = 0; 1416 1417 INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange", 1418 "Interchanges loops for cache reuse", false, false) 1419 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 1420 INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) 1421 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 1422 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 1423 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 1424 INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) 1425 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 1426 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 1427 1428 INITIALIZE_PASS_END(LoopInterchange, "loop-interchange", 1429 "Interchanges loops for cache reuse", false, false) 1430 1431 Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); } 1432