1 //===- LoopInterchange.cpp - Loop interchange pass-------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This Pass handles loop interchange transform. 11 // This pass interchanges loops to provide a more cache-friendly memory access 12 // patterns. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "llvm/ADT/STLExtras.h" 17 #include "llvm/ADT/SmallVector.h" 18 #include "llvm/ADT/StringRef.h" 19 #include "llvm/Analysis/AliasAnalysis.h" 20 #include "llvm/Analysis/DependenceAnalysis.h" 21 #include "llvm/Analysis/LoopInfo.h" 22 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 23 #include "llvm/Analysis/ScalarEvolution.h" 24 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 25 #include "llvm/IR/BasicBlock.h" 26 #include "llvm/IR/Constants.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/Dominators.h" 29 #include "llvm/IR/Function.h" 30 #include "llvm/IR/InstrTypes.h" 31 #include "llvm/IR/Instruction.h" 32 #include "llvm/IR/Instructions.h" 33 #include "llvm/IR/Type.h" 34 #include "llvm/IR/User.h" 35 #include "llvm/IR/Value.h" 36 #include "llvm/Pass.h" 37 #include "llvm/Support/Casting.h" 38 #include "llvm/Support/CommandLine.h" 39 #include "llvm/Support/Debug.h" 40 #include "llvm/Support/ErrorHandling.h" 41 #include "llvm/Support/raw_ostream.h" 42 #include "llvm/Transforms/Scalar.h" 43 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 44 #include "llvm/Transforms/Utils/LoopUtils.h" 45 #include <cassert> 46 #include <utility> 47 #include <vector> 48 49 using namespace llvm; 50 51 #define DEBUG_TYPE "loop-interchange" 52 53 static cl::opt<int> LoopInterchangeCostThreshold( 54 "loop-interchange-threshold", cl::init(0), cl::Hidden, 55 cl::desc("Interchange if you gain more than this number")); 56 57 namespace { 58 59 using LoopVector = SmallVector<Loop *, 8>; 60 61 // TODO: Check if we can use a sparse matrix here. 62 using CharMatrix = std::vector<std::vector<char>>; 63 64 } // end anonymous namespace 65 66 // Maximum number of dependencies that can be handled in the dependency matrix. 67 static const unsigned MaxMemInstrCount = 100; 68 69 // Maximum loop depth supported. 70 static const unsigned MaxLoopNestDepth = 10; 71 72 #ifdef DUMP_DEP_MATRICIES 73 static void printDepMatrix(CharMatrix &DepMatrix) { 74 for (auto &Row : DepMatrix) { 75 for (auto D : Row) 76 DEBUG(dbgs() << D << " "); 77 DEBUG(dbgs() << "\n"); 78 } 79 } 80 #endif 81 82 static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, 83 Loop *L, DependenceInfo *DI) { 84 using ValueVector = SmallVector<Value *, 16>; 85 86 ValueVector MemInstr; 87 88 // For each block. 89 for (BasicBlock *BB : L->blocks()) { 90 // Scan the BB and collect legal loads and stores. 91 for (Instruction &I : *BB) { 92 if (!isa<Instruction>(I)) 93 return false; 94 if (auto *Ld = dyn_cast<LoadInst>(&I)) { 95 if (!Ld->isSimple()) 96 return false; 97 MemInstr.push_back(&I); 98 } else if (auto *St = dyn_cast<StoreInst>(&I)) { 99 if (!St->isSimple()) 100 return false; 101 MemInstr.push_back(&I); 102 } 103 } 104 } 105 106 DEBUG(dbgs() << "Found " << MemInstr.size() 107 << " Loads and Stores to analyze\n"); 108 109 ValueVector::iterator I, IE, J, JE; 110 111 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { 112 for (J = I, JE = MemInstr.end(); J != JE; ++J) { 113 std::vector<char> Dep; 114 Instruction *Src = cast<Instruction>(*I); 115 Instruction *Dst = cast<Instruction>(*J); 116 if (Src == Dst) 117 continue; 118 // Ignore Input dependencies. 119 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst)) 120 continue; 121 // Track Output, Flow, and Anti dependencies. 122 if (auto D = DI->depends(Src, Dst, true)) { 123 assert(D->isOrdered() && "Expected an output, flow or anti dep."); 124 DEBUG(StringRef DepType = 125 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output"; 126 dbgs() << "Found " << DepType 127 << " dependency between Src and Dst\n" 128 << " Src:" << *Src << "\n Dst:" << *Dst << '\n'); 129 unsigned Levels = D->getLevels(); 130 char Direction; 131 for (unsigned II = 1; II <= Levels; ++II) { 132 const SCEV *Distance = D->getDistance(II); 133 const SCEVConstant *SCEVConst = 134 dyn_cast_or_null<SCEVConstant>(Distance); 135 if (SCEVConst) { 136 const ConstantInt *CI = SCEVConst->getValue(); 137 if (CI->isNegative()) 138 Direction = '<'; 139 else if (CI->isZero()) 140 Direction = '='; 141 else 142 Direction = '>'; 143 Dep.push_back(Direction); 144 } else if (D->isScalar(II)) { 145 Direction = 'S'; 146 Dep.push_back(Direction); 147 } else { 148 unsigned Dir = D->getDirection(II); 149 if (Dir == Dependence::DVEntry::LT || 150 Dir == Dependence::DVEntry::LE) 151 Direction = '<'; 152 else if (Dir == Dependence::DVEntry::GT || 153 Dir == Dependence::DVEntry::GE) 154 Direction = '>'; 155 else if (Dir == Dependence::DVEntry::EQ) 156 Direction = '='; 157 else 158 Direction = '*'; 159 Dep.push_back(Direction); 160 } 161 } 162 while (Dep.size() != Level) { 163 Dep.push_back('I'); 164 } 165 166 DepMatrix.push_back(Dep); 167 if (DepMatrix.size() > MaxMemInstrCount) { 168 DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount 169 << " dependencies inside loop\n"); 170 return false; 171 } 172 } 173 } 174 } 175 176 return true; 177 } 178 179 // A loop is moved from index 'from' to an index 'to'. Update the Dependence 180 // matrix by exchanging the two columns. 181 static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, 182 unsigned ToIndx) { 183 unsigned numRows = DepMatrix.size(); 184 for (unsigned i = 0; i < numRows; ++i) { 185 char TmpVal = DepMatrix[i][ToIndx]; 186 DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx]; 187 DepMatrix[i][FromIndx] = TmpVal; 188 } 189 } 190 191 // Checks if outermost non '=','S'or'I' dependence in the dependence matrix is 192 // '>' 193 static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row, 194 unsigned Column) { 195 for (unsigned i = 0; i <= Column; ++i) { 196 if (DepMatrix[Row][i] == '<') 197 return false; 198 if (DepMatrix[Row][i] == '>') 199 return true; 200 } 201 // All dependencies were '=','S' or 'I' 202 return false; 203 } 204 205 // Checks if no dependence exist in the dependency matrix in Row before Column. 206 static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row, 207 unsigned Column) { 208 for (unsigned i = 0; i < Column; ++i) { 209 if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' && 210 DepMatrix[Row][i] != 'I') 211 return false; 212 } 213 return true; 214 } 215 216 static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row, 217 unsigned OuterLoopId, char InnerDep, 218 char OuterDep) { 219 if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId)) 220 return false; 221 222 if (InnerDep == OuterDep) 223 return true; 224 225 // It is legal to interchange if and only if after interchange no row has a 226 // '>' direction as the leftmost non-'='. 227 228 if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I') 229 return true; 230 231 if (InnerDep == '<') 232 return true; 233 234 if (InnerDep == '>') { 235 // If OuterLoopId represents outermost loop then interchanging will make the 236 // 1st dependency as '>' 237 if (OuterLoopId == 0) 238 return false; 239 240 // If all dependencies before OuterloopId are '=','S'or 'I'. Then 241 // interchanging will result in this row having an outermost non '=' 242 // dependency of '>' 243 if (!containsNoDependence(DepMatrix, Row, OuterLoopId)) 244 return true; 245 } 246 247 return false; 248 } 249 250 // Checks if it is legal to interchange 2 loops. 251 // [Theorem] A permutation of the loops in a perfect nest is legal if and only 252 // if the direction matrix, after the same permutation is applied to its 253 // columns, has no ">" direction as the leftmost non-"=" direction in any row. 254 static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, 255 unsigned InnerLoopId, 256 unsigned OuterLoopId) { 257 unsigned NumRows = DepMatrix.size(); 258 // For each row check if it is valid to interchange. 259 for (unsigned Row = 0; Row < NumRows; ++Row) { 260 char InnerDep = DepMatrix[Row][InnerLoopId]; 261 char OuterDep = DepMatrix[Row][OuterLoopId]; 262 if (InnerDep == '*' || OuterDep == '*') 263 return false; 264 if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep)) 265 return false; 266 } 267 return true; 268 } 269 270 static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) { 271 DEBUG(dbgs() << "Calling populateWorklist on Func: " 272 << L.getHeader()->getParent()->getName() << " Loop: %" 273 << L.getHeader()->getName() << '\n'); 274 LoopVector LoopList; 275 Loop *CurrentLoop = &L; 276 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops(); 277 while (!Vec->empty()) { 278 // The current loop has multiple subloops in it hence it is not tightly 279 // nested. 280 // Discard all loops above it added into Worklist. 281 if (Vec->size() != 1) { 282 LoopList.clear(); 283 return; 284 } 285 LoopList.push_back(CurrentLoop); 286 CurrentLoop = Vec->front(); 287 Vec = &CurrentLoop->getSubLoops(); 288 } 289 LoopList.push_back(CurrentLoop); 290 V.push_back(std::move(LoopList)); 291 } 292 293 static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { 294 PHINode *InnerIndexVar = L->getCanonicalInductionVariable(); 295 if (InnerIndexVar) 296 return InnerIndexVar; 297 if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr) 298 return nullptr; 299 for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { 300 PHINode *PhiVar = cast<PHINode>(I); 301 Type *PhiTy = PhiVar->getType(); 302 if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && 303 !PhiTy->isPointerTy()) 304 return nullptr; 305 const SCEVAddRecExpr *AddRec = 306 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar)); 307 if (!AddRec || !AddRec->isAffine()) 308 continue; 309 const SCEV *Step = AddRec->getStepRecurrence(*SE); 310 if (!isa<SCEVConstant>(Step)) 311 continue; 312 // Found the induction variable. 313 // FIXME: Handle loops with more than one induction variable. Note that, 314 // currently, legality makes sure we have only one induction variable. 315 return PhiVar; 316 } 317 return nullptr; 318 } 319 320 namespace { 321 322 /// LoopInterchangeLegality checks if it is legal to interchange the loop. 323 class LoopInterchangeLegality { 324 public: 325 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 326 LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA, 327 OptimizationRemarkEmitter *ORE) 328 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), 329 PreserveLCSSA(PreserveLCSSA), ORE(ORE) {} 330 331 /// Check if the loops can be interchanged. 332 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, 333 CharMatrix &DepMatrix); 334 335 /// Check if the loop structure is understood. We do not handle triangular 336 /// loops for now. 337 bool isLoopStructureUnderstood(PHINode *InnerInductionVar); 338 339 bool currentLimitations(); 340 341 bool hasInnerLoopReduction() { return InnerLoopHasReduction; } 342 343 private: 344 bool tightlyNested(Loop *Outer, Loop *Inner); 345 bool containsUnsafeInstructionsInHeader(BasicBlock *BB); 346 bool areAllUsesReductions(Instruction *Ins, Loop *L); 347 bool containsUnsafeInstructionsInLatch(BasicBlock *BB); 348 bool findInductionAndReductions(Loop *L, 349 SmallVector<PHINode *, 8> &Inductions, 350 SmallVector<PHINode *, 8> &Reductions); 351 352 Loop *OuterLoop; 353 Loop *InnerLoop; 354 355 ScalarEvolution *SE; 356 LoopInfo *LI; 357 DominatorTree *DT; 358 bool PreserveLCSSA; 359 360 /// Interface to emit optimization remarks. 361 OptimizationRemarkEmitter *ORE; 362 363 bool InnerLoopHasReduction = false; 364 }; 365 366 /// LoopInterchangeProfitability checks if it is profitable to interchange the 367 /// loop. 368 class LoopInterchangeProfitability { 369 public: 370 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 371 OptimizationRemarkEmitter *ORE) 372 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} 373 374 /// Check if the loop interchange is profitable. 375 bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, 376 CharMatrix &DepMatrix); 377 378 private: 379 int getInstrOrderCost(); 380 381 Loop *OuterLoop; 382 Loop *InnerLoop; 383 384 /// Scev analysis. 385 ScalarEvolution *SE; 386 387 /// Interface to emit optimization remarks. 388 OptimizationRemarkEmitter *ORE; 389 }; 390 391 /// LoopInterchangeTransform interchanges the loop. 392 class LoopInterchangeTransform { 393 public: 394 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 395 LoopInfo *LI, DominatorTree *DT, 396 BasicBlock *LoopNestExit, 397 bool InnerLoopContainsReductions) 398 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), 399 LoopExit(LoopNestExit), 400 InnerLoopHasReduction(InnerLoopContainsReductions) {} 401 402 /// Interchange OuterLoop and InnerLoop. 403 bool transform(); 404 void restructureLoops(Loop *InnerLoop, Loop *OuterLoop); 405 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); 406 407 private: 408 void splitInnerLoopLatch(Instruction *); 409 void splitInnerLoopHeader(); 410 bool adjustLoopLinks(); 411 void adjustLoopPreheaders(); 412 bool adjustLoopBranches(); 413 void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred, 414 BasicBlock *NewPred); 415 416 Loop *OuterLoop; 417 Loop *InnerLoop; 418 419 /// Scev analysis. 420 ScalarEvolution *SE; 421 422 LoopInfo *LI; 423 DominatorTree *DT; 424 BasicBlock *LoopExit; 425 bool InnerLoopHasReduction; 426 }; 427 428 // Main LoopInterchange Pass. 429 struct LoopInterchange : public FunctionPass { 430 static char ID; 431 ScalarEvolution *SE = nullptr; 432 LoopInfo *LI = nullptr; 433 DependenceInfo *DI = nullptr; 434 DominatorTree *DT = nullptr; 435 bool PreserveLCSSA; 436 437 /// Interface to emit optimization remarks. 438 OptimizationRemarkEmitter *ORE; 439 440 LoopInterchange() : FunctionPass(ID) { 441 initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); 442 } 443 444 void getAnalysisUsage(AnalysisUsage &AU) const override { 445 AU.addRequired<ScalarEvolutionWrapperPass>(); 446 AU.addRequired<AAResultsWrapperPass>(); 447 AU.addRequired<DominatorTreeWrapperPass>(); 448 AU.addRequired<LoopInfoWrapperPass>(); 449 AU.addRequired<DependenceAnalysisWrapperPass>(); 450 AU.addRequiredID(LoopSimplifyID); 451 AU.addRequiredID(LCSSAID); 452 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 453 454 AU.addPreserved<DominatorTreeWrapperPass>(); 455 } 456 457 bool runOnFunction(Function &F) override { 458 if (skipFunction(F)) 459 return false; 460 461 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 462 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 463 DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI(); 464 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 465 ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 466 PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); 467 468 // Build up a worklist of loop pairs to analyze. 469 SmallVector<LoopVector, 8> Worklist; 470 471 for (Loop *L : *LI) 472 populateWorklist(*L, Worklist); 473 474 DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n"); 475 bool Changed = true; 476 while (!Worklist.empty()) { 477 LoopVector LoopList = Worklist.pop_back_val(); 478 Changed = processLoopList(LoopList, F); 479 } 480 return Changed; 481 } 482 483 bool isComputableLoopNest(LoopVector LoopList) { 484 for (Loop *L : LoopList) { 485 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L); 486 if (ExitCountOuter == SE->getCouldNotCompute()) { 487 DEBUG(dbgs() << "Couldn't compute backedge count\n"); 488 return false; 489 } 490 if (L->getNumBackEdges() != 1) { 491 DEBUG(dbgs() << "NumBackEdges is not equal to 1\n"); 492 return false; 493 } 494 if (!L->getExitingBlock()) { 495 DEBUG(dbgs() << "Loop doesn't have unique exit block\n"); 496 return false; 497 } 498 } 499 return true; 500 } 501 502 unsigned selectLoopForInterchange(const LoopVector &LoopList) { 503 // TODO: Add a better heuristic to select the loop to be interchanged based 504 // on the dependence matrix. Currently we select the innermost loop. 505 return LoopList.size() - 1; 506 } 507 508 bool processLoopList(LoopVector LoopList, Function &F) { 509 bool Changed = false; 510 unsigned LoopNestDepth = LoopList.size(); 511 if (LoopNestDepth < 2) { 512 DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n"); 513 return false; 514 } 515 if (LoopNestDepth > MaxLoopNestDepth) { 516 DEBUG(dbgs() << "Cannot handle loops of depth greater than " 517 << MaxLoopNestDepth << "\n"); 518 return false; 519 } 520 if (!isComputableLoopNest(LoopList)) { 521 DEBUG(dbgs() << "Not valid loop candidate for interchange\n"); 522 return false; 523 } 524 525 DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth << "\n"); 526 527 CharMatrix DependencyMatrix; 528 Loop *OuterMostLoop = *(LoopList.begin()); 529 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth, 530 OuterMostLoop, DI)) { 531 DEBUG(dbgs() << "Populating dependency matrix failed\n"); 532 return false; 533 } 534 #ifdef DUMP_DEP_MATRICIES 535 DEBUG(dbgs() << "Dependence before interchange\n"); 536 printDepMatrix(DependencyMatrix); 537 #endif 538 539 BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch(); 540 BranchInst *OuterMostLoopLatchBI = 541 dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator()); 542 if (!OuterMostLoopLatchBI || OuterMostLoopLatchBI->getNumSuccessors() != 2) 543 return false; 544 545 // Since we currently do not handle LCSSA PHI's any failure in loop 546 // condition will now branch to LoopNestExit. 547 // TODO: This should be removed once we handle LCSSA PHI nodes. 548 549 // Get the Outermost loop exit. 550 BasicBlock *LoopNestExit; 551 if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader()) 552 LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1); 553 else 554 LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0); 555 556 if (isa<PHINode>(LoopNestExit->begin())) { 557 DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now " 558 "since on failure all loops branch to loop nest exit.\n"); 559 return false; 560 } 561 562 unsigned SelecLoopId = selectLoopForInterchange(LoopList); 563 // Move the selected loop outwards to the best possible position. 564 for (unsigned i = SelecLoopId; i > 0; i--) { 565 bool Interchanged = 566 processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix); 567 if (!Interchanged) 568 return Changed; 569 // Loops interchanged reflect the same in LoopList 570 std::swap(LoopList[i - 1], LoopList[i]); 571 572 // Update the DependencyMatrix 573 interChangeDependencies(DependencyMatrix, i, i - 1); 574 #ifdef DUMP_DEP_MATRICIES 575 DEBUG(dbgs() << "Dependence after interchange\n"); 576 printDepMatrix(DependencyMatrix); 577 #endif 578 Changed |= Interchanged; 579 } 580 return Changed; 581 } 582 583 bool processLoop(LoopVector LoopList, unsigned InnerLoopId, 584 unsigned OuterLoopId, BasicBlock *LoopNestExit, 585 std::vector<std::vector<char>> &DependencyMatrix) { 586 DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId 587 << " and OuterLoopId = " << OuterLoopId << "\n"); 588 Loop *InnerLoop = LoopList[InnerLoopId]; 589 Loop *OuterLoop = LoopList[OuterLoopId]; 590 591 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT, 592 PreserveLCSSA, ORE); 593 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { 594 DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n"); 595 return false; 596 } 597 DEBUG(dbgs() << "Loops are legal to interchange\n"); 598 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); 599 if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) { 600 DEBUG(dbgs() << "Interchanging loops not profitable.\n"); 601 return false; 602 } 603 604 ORE->emit([&]() { 605 return OptimizationRemark(DEBUG_TYPE, "Interchanged", 606 InnerLoop->getStartLoc(), 607 InnerLoop->getHeader()) 608 << "Loop interchanged with enclosing loop."; 609 }); 610 611 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, 612 LoopNestExit, LIL.hasInnerLoopReduction()); 613 LIT.transform(); 614 DEBUG(dbgs() << "Loops interchanged.\n"); 615 return true; 616 } 617 }; 618 619 } // end anonymous namespace 620 621 bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) { 622 return llvm::none_of(Ins->users(), [=](User *U) -> bool { 623 auto *UserIns = dyn_cast<PHINode>(U); 624 RecurrenceDescriptor RD; 625 return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD); 626 }); 627 } 628 629 bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader( 630 BasicBlock *BB) { 631 for (auto I = BB->begin(), E = BB->end(); I != E; ++I) { 632 // Load corresponding to reduction PHI's are safe while concluding if 633 // tightly nested. 634 if (LoadInst *L = dyn_cast<LoadInst>(I)) { 635 if (!areAllUsesReductions(L, InnerLoop)) 636 return true; 637 } else if (I->mayHaveSideEffects() || I->mayReadFromMemory()) 638 return true; 639 } 640 return false; 641 } 642 643 bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch( 644 BasicBlock *BB) { 645 for (auto I = BB->begin(), E = BB->end(); I != E; ++I) { 646 // Stores corresponding to reductions are safe while concluding if tightly 647 // nested. 648 if (StoreInst *L = dyn_cast<StoreInst>(I)) { 649 if (!isa<PHINode>(L->getOperand(0))) 650 return true; 651 } else if (I->mayHaveSideEffects() || I->mayReadFromMemory()) 652 return true; 653 } 654 return false; 655 } 656 657 bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { 658 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 659 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 660 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); 661 662 DEBUG(dbgs() << "Checking if loops are tightly nested\n"); 663 664 // A perfectly nested loop will not have any branch in between the outer and 665 // inner block i.e. outer header will branch to either inner preheader and 666 // outerloop latch. 667 BranchInst *OuterLoopHeaderBI = 668 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); 669 if (!OuterLoopHeaderBI) 670 return false; 671 672 for (BasicBlock *Succ : OuterLoopHeaderBI->successors()) 673 if (Succ != InnerLoopPreHeader && Succ != OuterLoopLatch) 674 return false; 675 676 DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n"); 677 // We do not have any basic block in between now make sure the outer header 678 // and outer loop latch doesn't contain any unsafe instructions. 679 if (containsUnsafeInstructionsInHeader(OuterLoopHeader) || 680 containsUnsafeInstructionsInLatch(OuterLoopLatch)) 681 return false; 682 683 DEBUG(dbgs() << "Loops are perfectly nested\n"); 684 // We have a perfect loop nest. 685 return true; 686 } 687 688 bool LoopInterchangeLegality::isLoopStructureUnderstood( 689 PHINode *InnerInduction) { 690 unsigned Num = InnerInduction->getNumOperands(); 691 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader(); 692 for (unsigned i = 0; i < Num; ++i) { 693 Value *Val = InnerInduction->getOperand(i); 694 if (isa<Constant>(Val)) 695 continue; 696 Instruction *I = dyn_cast<Instruction>(Val); 697 if (!I) 698 return false; 699 // TODO: Handle triangular loops. 700 // e.g. for(int i=0;i<N;i++) 701 // for(int j=i;j<N;j++) 702 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i); 703 if (InnerInduction->getIncomingBlock(IncomBlockIndx) == 704 InnerLoopPreheader && 705 !OuterLoop->isLoopInvariant(I)) { 706 return false; 707 } 708 } 709 return true; 710 } 711 712 bool LoopInterchangeLegality::findInductionAndReductions( 713 Loop *L, SmallVector<PHINode *, 8> &Inductions, 714 SmallVector<PHINode *, 8> &Reductions) { 715 if (!L->getLoopLatch() || !L->getLoopPredecessor()) 716 return false; 717 for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { 718 RecurrenceDescriptor RD; 719 InductionDescriptor ID; 720 PHINode *PHI = cast<PHINode>(I); 721 if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID)) 722 Inductions.push_back(PHI); 723 else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) 724 Reductions.push_back(PHI); 725 else { 726 DEBUG( 727 dbgs() << "Failed to recognize PHI as an induction or reduction.\n"); 728 return false; 729 } 730 } 731 return true; 732 } 733 734 static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) { 735 for (auto I = Block->begin(); isa<PHINode>(I); ++I) { 736 PHINode *PHI = cast<PHINode>(I); 737 // Reduction lcssa phi will have only 1 incoming block that from loop latch. 738 if (PHI->getNumIncomingValues() > 1) 739 return false; 740 Instruction *Ins = dyn_cast<Instruction>(PHI->getIncomingValue(0)); 741 if (!Ins) 742 return false; 743 // Incoming value for lcssa phi's in outer loop exit can only be inner loop 744 // exits lcssa phi else it would not be tightly nested. 745 if (!isa<PHINode>(Ins) && isOuterLoopExitBlock) 746 return false; 747 } 748 return true; 749 } 750 751 static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock, 752 BasicBlock *LoopHeader) { 753 if (BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator())) { 754 assert(BI->getNumSuccessors() == 2 && 755 "Branch leaving loop latch must have 2 successors"); 756 for (BasicBlock *Succ : BI->successors()) { 757 if (Succ == LoopHeader) 758 continue; 759 return Succ; 760 } 761 } 762 return nullptr; 763 } 764 765 // This function indicates the current limitations in the transform as a result 766 // of which we do not proceed. 767 bool LoopInterchangeLegality::currentLimitations() { 768 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 769 BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); 770 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 771 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); 772 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 773 774 PHINode *InnerInductionVar; 775 SmallVector<PHINode *, 8> Inductions; 776 SmallVector<PHINode *, 8> Reductions; 777 if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) { 778 DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes " 779 << "are supported currently.\n"); 780 ORE->emit([&]() { 781 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner", 782 InnerLoop->getStartLoc(), 783 InnerLoop->getHeader()) 784 << "Only inner loops with induction or reduction PHI nodes can be" 785 " interchange currently."; 786 }); 787 return true; 788 } 789 790 // TODO: Currently we handle only loops with 1 induction variable. 791 if (Inductions.size() != 1) { 792 DEBUG(dbgs() << "We currently only support loops with 1 induction variable." 793 << "Failed to interchange due to current limitation\n"); 794 ORE->emit([&]() { 795 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner", 796 InnerLoop->getStartLoc(), 797 InnerLoop->getHeader()) 798 << "Only inner loops with 1 induction variable can be " 799 "interchanged currently."; 800 }); 801 return true; 802 } 803 if (Reductions.size() > 0) 804 InnerLoopHasReduction = true; 805 806 InnerInductionVar = Inductions.pop_back_val(); 807 Reductions.clear(); 808 if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) { 809 DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes " 810 << "are supported currently.\n"); 811 ORE->emit([&]() { 812 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter", 813 OuterLoop->getStartLoc(), 814 OuterLoop->getHeader()) 815 << "Only outer loops with induction or reduction PHI nodes can be" 816 " interchanged currently."; 817 }); 818 return true; 819 } 820 821 // Outer loop cannot have reduction because then loops will not be tightly 822 // nested. 823 if (!Reductions.empty()) { 824 DEBUG(dbgs() << "Outer loops with reductions are not supported " 825 << "currently.\n"); 826 ORE->emit([&]() { 827 return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter", 828 OuterLoop->getStartLoc(), 829 OuterLoop->getHeader()) 830 << "Outer loops with reductions cannot be interchangeed " 831 "currently."; 832 }); 833 return true; 834 } 835 // TODO: Currently we handle only loops with 1 induction variable. 836 if (Inductions.size() != 1) { 837 DEBUG(dbgs() << "Loops with more than 1 induction variables are not " 838 << "supported currently.\n"); 839 ORE->emit([&]() { 840 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter", 841 OuterLoop->getStartLoc(), 842 OuterLoop->getHeader()) 843 << "Only outer loops with 1 induction variable can be " 844 "interchanged currently."; 845 }); 846 return true; 847 } 848 849 // TODO: Triangular loops are not handled for now. 850 if (!isLoopStructureUnderstood(InnerInductionVar)) { 851 DEBUG(dbgs() << "Loop structure not understood by pass\n"); 852 ORE->emit([&]() { 853 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner", 854 InnerLoop->getStartLoc(), 855 InnerLoop->getHeader()) 856 << "Inner loop structure not understood currently."; 857 }); 858 return true; 859 } 860 861 // TODO: We only handle LCSSA PHI's corresponding to reduction for now. 862 BasicBlock *LoopExitBlock = 863 getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader); 864 if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) { 865 DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n"); 866 ORE->emit([&]() { 867 return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuter", 868 OuterLoop->getStartLoc(), 869 OuterLoop->getHeader()) 870 << "Only outer loops with LCSSA PHIs can be interchange " 871 "currently."; 872 }); 873 return true; 874 } 875 876 LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader); 877 if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) { 878 DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n"); 879 ORE->emit([&]() { 880 return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner", 881 InnerLoop->getStartLoc(), 882 InnerLoop->getHeader()) 883 << "Only inner loops with LCSSA PHIs can be interchange " 884 "currently."; 885 }); 886 return true; 887 } 888 889 // TODO: Current limitation: Since we split the inner loop latch at the point 890 // were induction variable is incremented (induction.next); We cannot have 891 // more than 1 user of induction.next since it would result in broken code 892 // after split. 893 // e.g. 894 // for(i=0;i<N;i++) { 895 // for(j = 0;j<M;j++) { 896 // A[j+1][i+2] = A[j][i]+k; 897 // } 898 // } 899 Instruction *InnerIndexVarInc = nullptr; 900 if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader) 901 InnerIndexVarInc = 902 dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1)); 903 else 904 InnerIndexVarInc = 905 dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0)); 906 907 if (!InnerIndexVarInc) { 908 DEBUG(dbgs() << "Did not find an instruction to increment the induction " 909 << "variable.\n"); 910 ORE->emit([&]() { 911 return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner", 912 InnerLoop->getStartLoc(), 913 InnerLoop->getHeader()) 914 << "The inner loop does not increment the induction variable."; 915 }); 916 return true; 917 } 918 919 // Since we split the inner loop latch on this induction variable. Make sure 920 // we do not have any instruction between the induction variable and branch 921 // instruction. 922 923 bool FoundInduction = false; 924 for (const Instruction &I : llvm::reverse(*InnerLoopLatch)) { 925 if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) || 926 isa<ZExtInst>(I)) 927 continue; 928 929 // We found an instruction. If this is not induction variable then it is not 930 // safe to split this loop latch. 931 if (!I.isIdenticalTo(InnerIndexVarInc)) { 932 DEBUG(dbgs() << "Found unsupported instructions between induction " 933 << "variable increment and branch.\n"); 934 ORE->emit([&]() { 935 return OptimizationRemarkMissed( 936 DEBUG_TYPE, "UnsupportedInsBetweenInduction", 937 InnerLoop->getStartLoc(), InnerLoop->getHeader()) 938 << "Found unsupported instruction between induction variable " 939 "increment and branch."; 940 }); 941 return true; 942 } 943 944 FoundInduction = true; 945 break; 946 } 947 // The loop latch ended and we didn't find the induction variable return as 948 // current limitation. 949 if (!FoundInduction) { 950 DEBUG(dbgs() << "Did not find the induction variable.\n"); 951 ORE->emit([&]() { 952 return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable", 953 InnerLoop->getStartLoc(), 954 InnerLoop->getHeader()) 955 << "Did not find the induction variable."; 956 }); 957 return true; 958 } 959 return false; 960 } 961 962 bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, 963 unsigned OuterLoopId, 964 CharMatrix &DepMatrix) { 965 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) { 966 DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId 967 << " and OuterLoopId = " << OuterLoopId 968 << " due to dependence\n"); 969 ORE->emit([&]() { 970 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence", 971 InnerLoop->getStartLoc(), 972 InnerLoop->getHeader()) 973 << "Cannot interchange loops due to dependences."; 974 }); 975 return false; 976 } 977 978 // Check if outer and inner loop contain legal instructions only. 979 for (auto *BB : OuterLoop->blocks()) 980 for (Instruction &I : *BB) 981 if (CallInst *CI = dyn_cast<CallInst>(&I)) { 982 // readnone functions do not prevent interchanging. 983 if (CI->doesNotReadMemory()) 984 continue; 985 DEBUG(dbgs() << "Loops with call instructions cannot be interchanged " 986 << "safely."); 987 return false; 988 } 989 990 // Create unique Preheaders if we already do not have one. 991 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); 992 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 993 994 // Create a unique outer preheader - 995 // 1) If OuterLoop preheader is not present. 996 // 2) If OuterLoop Preheader is same as OuterLoop Header 997 // 3) If OuterLoop Preheader is same as Header of the previous loop. 998 // 4) If OuterLoop Preheader is Entry node. 999 if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() || 1000 isa<PHINode>(OuterLoopPreHeader->begin()) || 1001 !OuterLoopPreHeader->getUniquePredecessor()) { 1002 OuterLoopPreHeader = 1003 InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA); 1004 } 1005 1006 if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() || 1007 InnerLoopPreHeader == OuterLoop->getHeader()) { 1008 InnerLoopPreHeader = 1009 InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA); 1010 } 1011 1012 // TODO: The loops could not be interchanged due to current limitations in the 1013 // transform module. 1014 if (currentLimitations()) { 1015 DEBUG(dbgs() << "Not legal because of current transform limitation\n"); 1016 return false; 1017 } 1018 1019 // Check if the loops are tightly nested. 1020 if (!tightlyNested(OuterLoop, InnerLoop)) { 1021 DEBUG(dbgs() << "Loops not tightly nested\n"); 1022 ORE->emit([&]() { 1023 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested", 1024 InnerLoop->getStartLoc(), 1025 InnerLoop->getHeader()) 1026 << "Cannot interchange loops because they are not tightly " 1027 "nested."; 1028 }); 1029 return false; 1030 } 1031 1032 return true; 1033 } 1034 1035 int LoopInterchangeProfitability::getInstrOrderCost() { 1036 unsigned GoodOrder, BadOrder; 1037 BadOrder = GoodOrder = 0; 1038 for (BasicBlock *BB : InnerLoop->blocks()) { 1039 for (Instruction &Ins : *BB) { 1040 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) { 1041 unsigned NumOp = GEP->getNumOperands(); 1042 bool FoundInnerInduction = false; 1043 bool FoundOuterInduction = false; 1044 for (unsigned i = 0; i < NumOp; ++i) { 1045 const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i)); 1046 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal); 1047 if (!AR) 1048 continue; 1049 1050 // If we find the inner induction after an outer induction e.g. 1051 // for(int i=0;i<N;i++) 1052 // for(int j=0;j<N;j++) 1053 // A[i][j] = A[i-1][j-1]+k; 1054 // then it is a good order. 1055 if (AR->getLoop() == InnerLoop) { 1056 // We found an InnerLoop induction after OuterLoop induction. It is 1057 // a good order. 1058 FoundInnerInduction = true; 1059 if (FoundOuterInduction) { 1060 GoodOrder++; 1061 break; 1062 } 1063 } 1064 // If we find the outer induction after an inner induction e.g. 1065 // for(int i=0;i<N;i++) 1066 // for(int j=0;j<N;j++) 1067 // A[j][i] = A[j-1][i-1]+k; 1068 // then it is a bad order. 1069 if (AR->getLoop() == OuterLoop) { 1070 // We found an OuterLoop induction after InnerLoop induction. It is 1071 // a bad order. 1072 FoundOuterInduction = true; 1073 if (FoundInnerInduction) { 1074 BadOrder++; 1075 break; 1076 } 1077 } 1078 } 1079 } 1080 } 1081 } 1082 return GoodOrder - BadOrder; 1083 } 1084 1085 static bool isProfitableForVectorization(unsigned InnerLoopId, 1086 unsigned OuterLoopId, 1087 CharMatrix &DepMatrix) { 1088 // TODO: Improve this heuristic to catch more cases. 1089 // If the inner loop is loop independent or doesn't carry any dependency it is 1090 // profitable to move this to outer position. 1091 for (auto &Row : DepMatrix) { 1092 if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I') 1093 return false; 1094 // TODO: We need to improve this heuristic. 1095 if (Row[OuterLoopId] != '=') 1096 return false; 1097 } 1098 // If outer loop has dependence and inner loop is loop independent then it is 1099 // profitable to interchange to enable parallelism. 1100 return true; 1101 } 1102 1103 bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, 1104 unsigned OuterLoopId, 1105 CharMatrix &DepMatrix) { 1106 // TODO: Add better profitability checks. 1107 // e.g 1108 // 1) Construct dependency matrix and move the one with no loop carried dep 1109 // inside to enable vectorization. 1110 1111 // This is rough cost estimation algorithm. It counts the good and bad order 1112 // of induction variables in the instruction and allows reordering if number 1113 // of bad orders is more than good. 1114 int Cost = getInstrOrderCost(); 1115 DEBUG(dbgs() << "Cost = " << Cost << "\n"); 1116 if (Cost < -LoopInterchangeCostThreshold) 1117 return true; 1118 1119 // It is not profitable as per current cache profitability model. But check if 1120 // we can move this loop outside to improve parallelism. 1121 if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix)) 1122 return true; 1123 1124 ORE->emit([&]() { 1125 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", 1126 InnerLoop->getStartLoc(), 1127 InnerLoop->getHeader()) 1128 << "Interchanging loops is too costly (cost=" 1129 << ore::NV("Cost", Cost) << ", threshold=" 1130 << ore::NV("Threshold", LoopInterchangeCostThreshold) 1131 << ") and it does not improve parallelism."; 1132 }); 1133 return false; 1134 } 1135 1136 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, 1137 Loop *InnerLoop) { 1138 for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E; 1139 ++I) { 1140 if (*I == InnerLoop) { 1141 OuterLoop->removeChildLoop(I); 1142 return; 1143 } 1144 } 1145 llvm_unreachable("Couldn't find loop"); 1146 } 1147 1148 void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop, 1149 Loop *OuterLoop) { 1150 Loop *OuterLoopParent = OuterLoop->getParentLoop(); 1151 if (OuterLoopParent) { 1152 // Remove the loop from its parent loop. 1153 removeChildLoop(OuterLoopParent, OuterLoop); 1154 removeChildLoop(OuterLoop, InnerLoop); 1155 OuterLoopParent->addChildLoop(InnerLoop); 1156 } else { 1157 removeChildLoop(OuterLoop, InnerLoop); 1158 LI->changeTopLevelLoop(OuterLoop, InnerLoop); 1159 } 1160 1161 while (!InnerLoop->empty()) 1162 OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin())); 1163 1164 InnerLoop->addChildLoop(OuterLoop); 1165 } 1166 1167 bool LoopInterchangeTransform::transform() { 1168 bool Transformed = false; 1169 Instruction *InnerIndexVar; 1170 1171 if (InnerLoop->getSubLoops().empty()) { 1172 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1173 DEBUG(dbgs() << "Calling Split Inner Loop\n"); 1174 PHINode *InductionPHI = getInductionVariable(InnerLoop, SE); 1175 if (!InductionPHI) { 1176 DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); 1177 return false; 1178 } 1179 1180 if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader) 1181 InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1)); 1182 else 1183 InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0)); 1184 1185 // Ensure that InductionPHI is the first Phi node as required by 1186 // splitInnerLoopHeader 1187 if (&InductionPHI->getParent()->front() != InductionPHI) 1188 InductionPHI->moveBefore(&InductionPHI->getParent()->front()); 1189 1190 // Split at the place were the induction variable is 1191 // incremented/decremented. 1192 // TODO: This splitting logic may not work always. Fix this. 1193 splitInnerLoopLatch(InnerIndexVar); 1194 DEBUG(dbgs() << "splitInnerLoopLatch done\n"); 1195 1196 // Splits the inner loops phi nodes out into a separate basic block. 1197 splitInnerLoopHeader(); 1198 DEBUG(dbgs() << "splitInnerLoopHeader done\n"); 1199 } 1200 1201 Transformed |= adjustLoopLinks(); 1202 if (!Transformed) { 1203 DEBUG(dbgs() << "adjustLoopLinks failed\n"); 1204 return false; 1205 } 1206 1207 restructureLoops(InnerLoop, OuterLoop); 1208 return true; 1209 } 1210 1211 void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) { 1212 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 1213 BasicBlock *InnerLoopLatchPred = InnerLoopLatch; 1214 InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI); 1215 } 1216 1217 void LoopInterchangeTransform::splitInnerLoopHeader() { 1218 // Split the inner loop header out. Here make sure that the reduction PHI's 1219 // stay in the innerloop body. 1220 BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); 1221 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1222 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI); 1223 if (InnerLoopHasReduction) { 1224 // Adjust Reduction PHI's in the block. The induction PHI must be the first 1225 // PHI in InnerLoopHeader for this to work. 1226 SmallVector<PHINode *, 8> PHIVec; 1227 for (auto I = std::next(InnerLoopHeader->begin()); isa<PHINode>(I); ++I) { 1228 PHINode *PHI = dyn_cast<PHINode>(I); 1229 Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader); 1230 PHI->replaceAllUsesWith(V); 1231 PHIVec.push_back((PHI)); 1232 } 1233 for (PHINode *P : PHIVec) { 1234 P->eraseFromParent(); 1235 } 1236 } 1237 1238 DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & " 1239 "InnerLoopHeader\n"); 1240 } 1241 1242 /// \brief Move all instructions except the terminator from FromBB right before 1243 /// InsertBefore 1244 static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { 1245 auto &ToList = InsertBefore->getParent()->getInstList(); 1246 auto &FromList = FromBB->getInstList(); 1247 1248 ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(), 1249 FromBB->getTerminator()->getIterator()); 1250 } 1251 1252 void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock, 1253 BasicBlock *OldPred, 1254 BasicBlock *NewPred) { 1255 for (auto I = CurrBlock->begin(); isa<PHINode>(I); ++I) { 1256 PHINode *PHI = cast<PHINode>(I); 1257 unsigned Num = PHI->getNumIncomingValues(); 1258 for (unsigned i = 0; i < Num; ++i) { 1259 if (PHI->getIncomingBlock(i) == OldPred) 1260 PHI->setIncomingBlock(i, NewPred); 1261 } 1262 } 1263 } 1264 1265 /// \brief Update BI to jump to NewBB instead of OldBB. Records updates to 1266 /// the dominator tree in DTUpdates, if DT should be preserved. 1267 static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, 1268 BasicBlock *NewBB, 1269 std::vector<DominatorTree::UpdateType> &DTUpdates) { 1270 assert(llvm::count_if(BI->successors(), 1271 [OldBB](BasicBlock *BB) { return BB == OldBB; }) < 2 && 1272 "BI must jump to OldBB at most once."); 1273 for (unsigned i = 0, e = BI->getNumSuccessors(); i < e; ++i) { 1274 if (BI->getSuccessor(i) == OldBB) { 1275 BI->setSuccessor(i, NewBB); 1276 1277 DTUpdates.push_back( 1278 {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); 1279 DTUpdates.push_back( 1280 {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); 1281 break; 1282 } 1283 } 1284 } 1285 1286 bool LoopInterchangeTransform::adjustLoopBranches() { 1287 DEBUG(dbgs() << "adjustLoopBranches called\n"); 1288 std::vector<DominatorTree::UpdateType> DTUpdates; 1289 1290 // Adjust the loop preheader 1291 BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); 1292 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 1293 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 1294 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); 1295 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); 1296 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1297 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor(); 1298 BasicBlock *InnerLoopLatchPredecessor = 1299 InnerLoopLatch->getUniquePredecessor(); 1300 BasicBlock *InnerLoopLatchSuccessor; 1301 BasicBlock *OuterLoopLatchSuccessor; 1302 1303 BranchInst *OuterLoopLatchBI = 1304 dyn_cast<BranchInst>(OuterLoopLatch->getTerminator()); 1305 BranchInst *InnerLoopLatchBI = 1306 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator()); 1307 BranchInst *OuterLoopHeaderBI = 1308 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); 1309 BranchInst *InnerLoopHeaderBI = 1310 dyn_cast<BranchInst>(InnerLoopHeader->getTerminator()); 1311 1312 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor || 1313 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI || 1314 !InnerLoopHeaderBI) 1315 return false; 1316 1317 BranchInst *InnerLoopLatchPredecessorBI = 1318 dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator()); 1319 BranchInst *OuterLoopPredecessorBI = 1320 dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator()); 1321 1322 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI) 1323 return false; 1324 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor(); 1325 if (!InnerLoopHeaderSuccessor) 1326 return false; 1327 1328 // Adjust Loop Preheader and headers 1329 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader, 1330 InnerLoopPreHeader, DTUpdates); 1331 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates); 1332 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader, 1333 InnerLoopHeaderSuccessor, DTUpdates); 1334 1335 // Adjust reduction PHI's now that the incoming block has changed. 1336 updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader, 1337 OuterLoopHeader); 1338 1339 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor, 1340 OuterLoopPreHeader, DTUpdates); 1341 1342 // -------------Adjust loop latches----------- 1343 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader) 1344 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1); 1345 else 1346 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0); 1347 1348 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch, 1349 InnerLoopLatchSuccessor, DTUpdates); 1350 1351 // Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with 1352 // the value and remove this PHI node from inner loop. 1353 SmallVector<PHINode *, 8> LcssaVec; 1354 for (auto I = InnerLoopLatchSuccessor->begin(); isa<PHINode>(I); ++I) { 1355 PHINode *LcssaPhi = cast<PHINode>(I); 1356 LcssaVec.push_back(LcssaPhi); 1357 } 1358 for (PHINode *P : LcssaVec) { 1359 Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch); 1360 P->replaceAllUsesWith(Incoming); 1361 P->eraseFromParent(); 1362 } 1363 1364 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader) 1365 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1); 1366 else 1367 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0); 1368 1369 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor, 1370 OuterLoopLatchSuccessor, DTUpdates); 1371 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch, 1372 DTUpdates); 1373 1374 updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch); 1375 1376 DT->applyUpdates(DTUpdates); 1377 return true; 1378 } 1379 1380 void LoopInterchangeTransform::adjustLoopPreheaders() { 1381 // We have interchanged the preheaders so we need to interchange the data in 1382 // the preheader as well. 1383 // This is because the content of inner preheader was previously executed 1384 // inside the outer loop. 1385 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); 1386 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1387 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 1388 BranchInst *InnerTermBI = 1389 cast<BranchInst>(InnerLoopPreHeader->getTerminator()); 1390 1391 // These instructions should now be executed inside the loop. 1392 // Move instruction into a new block after outer header. 1393 moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator()); 1394 // These instructions were not executed previously in the loop so move them to 1395 // the older inner loop preheader. 1396 moveBBContents(OuterLoopPreHeader, InnerTermBI); 1397 } 1398 1399 bool LoopInterchangeTransform::adjustLoopLinks() { 1400 // Adjust all branches in the inner and outer loop. 1401 bool Changed = adjustLoopBranches(); 1402 if (Changed) 1403 adjustLoopPreheaders(); 1404 return Changed; 1405 } 1406 1407 char LoopInterchange::ID = 0; 1408 1409 INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange", 1410 "Interchanges loops for cache reuse", false, false) 1411 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 1412 INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) 1413 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 1414 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 1415 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 1416 INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) 1417 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 1418 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 1419 1420 INITIALIZE_PASS_END(LoopInterchange, "loop-interchange", 1421 "Interchanges loops for cache reuse", false, false) 1422 1423 Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); } 1424