1 //===- LoopInterchange.cpp - Loop interchange pass-------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This Pass handles loop interchange transform. 11 // This pass interchanges loops to provide a more cache-friendly memory access 12 // patterns. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "llvm/ADT/STLExtras.h" 17 #include "llvm/ADT/SmallVector.h" 18 #include "llvm/ADT/Statistic.h" 19 #include "llvm/ADT/StringRef.h" 20 #include "llvm/Analysis/DependenceAnalysis.h" 21 #include "llvm/Analysis/LoopInfo.h" 22 #include "llvm/Analysis/LoopPass.h" 23 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 24 #include "llvm/Analysis/ScalarEvolution.h" 25 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 26 #include "llvm/IR/BasicBlock.h" 27 #include "llvm/IR/Constants.h" 28 #include "llvm/IR/DiagnosticInfo.h" 29 #include "llvm/IR/Dominators.h" 30 #include "llvm/IR/Function.h" 31 #include "llvm/IR/InstrTypes.h" 32 #include "llvm/IR/Instruction.h" 33 #include "llvm/IR/Instructions.h" 34 #include "llvm/IR/Type.h" 35 #include "llvm/IR/User.h" 36 #include "llvm/IR/Value.h" 37 #include "llvm/Pass.h" 38 #include "llvm/Support/Casting.h" 39 #include "llvm/Support/CommandLine.h" 40 #include "llvm/Support/Debug.h" 41 #include "llvm/Support/ErrorHandling.h" 42 #include "llvm/Support/raw_ostream.h" 43 #include "llvm/Transforms/Scalar.h" 44 #include "llvm/Transforms/Utils.h" 45 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 46 #include "llvm/Transforms/Utils/LoopUtils.h" 47 #include <cassert> 48 #include <utility> 49 #include <vector> 50 51 using namespace llvm; 52 53 #define DEBUG_TYPE "loop-interchange" 54 55 STATISTIC(LoopsInterchanged, "Number of loops interchanged"); 56 57 static cl::opt<int> LoopInterchangeCostThreshold( 58 "loop-interchange-threshold", cl::init(0), cl::Hidden, 59 cl::desc("Interchange if you gain more than this number")); 60 61 namespace { 62 63 using LoopVector = SmallVector<Loop *, 8>; 64 65 // TODO: Check if we can use a sparse matrix here. 66 using CharMatrix = std::vector<std::vector<char>>; 67 68 } // end anonymous namespace 69 70 // Maximum number of dependencies that can be handled in the dependency matrix. 71 static const unsigned MaxMemInstrCount = 100; 72 73 // Maximum loop depth supported. 74 static const unsigned MaxLoopNestDepth = 10; 75 76 #ifdef DUMP_DEP_MATRICIES 77 static void printDepMatrix(CharMatrix &DepMatrix) { 78 for (auto &Row : DepMatrix) { 79 for (auto D : Row) 80 LLVM_DEBUG(dbgs() << D << " "); 81 LLVM_DEBUG(dbgs() << "\n"); 82 } 83 } 84 #endif 85 86 static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, 87 Loop *L, DependenceInfo *DI) { 88 using ValueVector = SmallVector<Value *, 16>; 89 90 ValueVector MemInstr; 91 92 // For each block. 93 for (BasicBlock *BB : L->blocks()) { 94 // Scan the BB and collect legal loads and stores. 95 for (Instruction &I : *BB) { 96 if (!isa<Instruction>(I)) 97 return false; 98 if (auto *Ld = dyn_cast<LoadInst>(&I)) { 99 if (!Ld->isSimple()) 100 return false; 101 MemInstr.push_back(&I); 102 } else if (auto *St = dyn_cast<StoreInst>(&I)) { 103 if (!St->isSimple()) 104 return false; 105 MemInstr.push_back(&I); 106 } 107 } 108 } 109 110 LLVM_DEBUG(dbgs() << "Found " << MemInstr.size() 111 << " Loads and Stores to analyze\n"); 112 113 ValueVector::iterator I, IE, J, JE; 114 115 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { 116 for (J = I, JE = MemInstr.end(); J != JE; ++J) { 117 std::vector<char> Dep; 118 Instruction *Src = cast<Instruction>(*I); 119 Instruction *Dst = cast<Instruction>(*J); 120 if (Src == Dst) 121 continue; 122 // Ignore Input dependencies. 123 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst)) 124 continue; 125 // Track Output, Flow, and Anti dependencies. 126 if (auto D = DI->depends(Src, Dst, true)) { 127 assert(D->isOrdered() && "Expected an output, flow or anti dep."); 128 LLVM_DEBUG(StringRef DepType = 129 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output"; 130 dbgs() << "Found " << DepType 131 << " dependency between Src and Dst\n" 132 << " Src:" << *Src << "\n Dst:" << *Dst << '\n'); 133 unsigned Levels = D->getLevels(); 134 char Direction; 135 for (unsigned II = 1; II <= Levels; ++II) { 136 const SCEV *Distance = D->getDistance(II); 137 const SCEVConstant *SCEVConst = 138 dyn_cast_or_null<SCEVConstant>(Distance); 139 if (SCEVConst) { 140 const ConstantInt *CI = SCEVConst->getValue(); 141 if (CI->isNegative()) 142 Direction = '<'; 143 else if (CI->isZero()) 144 Direction = '='; 145 else 146 Direction = '>'; 147 Dep.push_back(Direction); 148 } else if (D->isScalar(II)) { 149 Direction = 'S'; 150 Dep.push_back(Direction); 151 } else { 152 unsigned Dir = D->getDirection(II); 153 if (Dir == Dependence::DVEntry::LT || 154 Dir == Dependence::DVEntry::LE) 155 Direction = '<'; 156 else if (Dir == Dependence::DVEntry::GT || 157 Dir == Dependence::DVEntry::GE) 158 Direction = '>'; 159 else if (Dir == Dependence::DVEntry::EQ) 160 Direction = '='; 161 else 162 Direction = '*'; 163 Dep.push_back(Direction); 164 } 165 } 166 while (Dep.size() != Level) { 167 Dep.push_back('I'); 168 } 169 170 DepMatrix.push_back(Dep); 171 if (DepMatrix.size() > MaxMemInstrCount) { 172 LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount 173 << " dependencies inside loop\n"); 174 return false; 175 } 176 } 177 } 178 } 179 180 return true; 181 } 182 183 // A loop is moved from index 'from' to an index 'to'. Update the Dependence 184 // matrix by exchanging the two columns. 185 static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, 186 unsigned ToIndx) { 187 unsigned numRows = DepMatrix.size(); 188 for (unsigned i = 0; i < numRows; ++i) { 189 char TmpVal = DepMatrix[i][ToIndx]; 190 DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx]; 191 DepMatrix[i][FromIndx] = TmpVal; 192 } 193 } 194 195 // Checks if outermost non '=','S'or'I' dependence in the dependence matrix is 196 // '>' 197 static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row, 198 unsigned Column) { 199 for (unsigned i = 0; i <= Column; ++i) { 200 if (DepMatrix[Row][i] == '<') 201 return false; 202 if (DepMatrix[Row][i] == '>') 203 return true; 204 } 205 // All dependencies were '=','S' or 'I' 206 return false; 207 } 208 209 // Checks if no dependence exist in the dependency matrix in Row before Column. 210 static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row, 211 unsigned Column) { 212 for (unsigned i = 0; i < Column; ++i) { 213 if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' && 214 DepMatrix[Row][i] != 'I') 215 return false; 216 } 217 return true; 218 } 219 220 static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row, 221 unsigned OuterLoopId, char InnerDep, 222 char OuterDep) { 223 if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId)) 224 return false; 225 226 if (InnerDep == OuterDep) 227 return true; 228 229 // It is legal to interchange if and only if after interchange no row has a 230 // '>' direction as the leftmost non-'='. 231 232 if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I') 233 return true; 234 235 if (InnerDep == '<') 236 return true; 237 238 if (InnerDep == '>') { 239 // If OuterLoopId represents outermost loop then interchanging will make the 240 // 1st dependency as '>' 241 if (OuterLoopId == 0) 242 return false; 243 244 // If all dependencies before OuterloopId are '=','S'or 'I'. Then 245 // interchanging will result in this row having an outermost non '=' 246 // dependency of '>' 247 if (!containsNoDependence(DepMatrix, Row, OuterLoopId)) 248 return true; 249 } 250 251 return false; 252 } 253 254 // Checks if it is legal to interchange 2 loops. 255 // [Theorem] A permutation of the loops in a perfect nest is legal if and only 256 // if the direction matrix, after the same permutation is applied to its 257 // columns, has no ">" direction as the leftmost non-"=" direction in any row. 258 static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, 259 unsigned InnerLoopId, 260 unsigned OuterLoopId) { 261 unsigned NumRows = DepMatrix.size(); 262 // For each row check if it is valid to interchange. 263 for (unsigned Row = 0; Row < NumRows; ++Row) { 264 char InnerDep = DepMatrix[Row][InnerLoopId]; 265 char OuterDep = DepMatrix[Row][OuterLoopId]; 266 if (InnerDep == '*' || OuterDep == '*') 267 return false; 268 if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep)) 269 return false; 270 } 271 return true; 272 } 273 274 static LoopVector populateWorklist(Loop &L) { 275 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: " 276 << L.getHeader()->getParent()->getName() << " Loop: %" 277 << L.getHeader()->getName() << '\n'); 278 LoopVector LoopList; 279 Loop *CurrentLoop = &L; 280 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops(); 281 while (!Vec->empty()) { 282 // The current loop has multiple subloops in it hence it is not tightly 283 // nested. 284 // Discard all loops above it added into Worklist. 285 if (Vec->size() != 1) 286 return {}; 287 288 LoopList.push_back(CurrentLoop); 289 CurrentLoop = Vec->front(); 290 Vec = &CurrentLoop->getSubLoops(); 291 } 292 LoopList.push_back(CurrentLoop); 293 return LoopList; 294 } 295 296 static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { 297 PHINode *InnerIndexVar = L->getCanonicalInductionVariable(); 298 if (InnerIndexVar) 299 return InnerIndexVar; 300 if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr) 301 return nullptr; 302 for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { 303 PHINode *PhiVar = cast<PHINode>(I); 304 Type *PhiTy = PhiVar->getType(); 305 if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && 306 !PhiTy->isPointerTy()) 307 return nullptr; 308 const SCEVAddRecExpr *AddRec = 309 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar)); 310 if (!AddRec || !AddRec->isAffine()) 311 continue; 312 const SCEV *Step = AddRec->getStepRecurrence(*SE); 313 if (!isa<SCEVConstant>(Step)) 314 continue; 315 // Found the induction variable. 316 // FIXME: Handle loops with more than one induction variable. Note that, 317 // currently, legality makes sure we have only one induction variable. 318 return PhiVar; 319 } 320 return nullptr; 321 } 322 323 namespace { 324 325 /// LoopInterchangeLegality checks if it is legal to interchange the loop. 326 class LoopInterchangeLegality { 327 public: 328 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 329 OptimizationRemarkEmitter *ORE) 330 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} 331 332 /// Check if the loops can be interchanged. 333 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, 334 CharMatrix &DepMatrix); 335 336 /// Check if the loop structure is understood. We do not handle triangular 337 /// loops for now. 338 bool isLoopStructureUnderstood(PHINode *InnerInductionVar); 339 340 bool currentLimitations(); 341 342 bool hasInnerLoopReduction() { return InnerLoopHasReduction; } 343 344 private: 345 bool tightlyNested(Loop *Outer, Loop *Inner); 346 bool containsUnsafeInstructionsInHeader(BasicBlock *BB); 347 bool areAllUsesReductions(Instruction *Ins, Loop *L); 348 bool containsUnsafeInstructionsInLatch(BasicBlock *BB); 349 bool findInductionAndReductions(Loop *L, 350 SmallVector<PHINode *, 8> &Inductions, 351 SmallVector<PHINode *, 8> &Reductions); 352 353 Loop *OuterLoop; 354 Loop *InnerLoop; 355 356 ScalarEvolution *SE; 357 358 /// Interface to emit optimization remarks. 359 OptimizationRemarkEmitter *ORE; 360 361 bool InnerLoopHasReduction = false; 362 }; 363 364 /// LoopInterchangeProfitability checks if it is profitable to interchange the 365 /// loop. 366 class LoopInterchangeProfitability { 367 public: 368 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 369 OptimizationRemarkEmitter *ORE) 370 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} 371 372 /// Check if the loop interchange is profitable. 373 bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, 374 CharMatrix &DepMatrix); 375 376 private: 377 int getInstrOrderCost(); 378 379 Loop *OuterLoop; 380 Loop *InnerLoop; 381 382 /// Scev analysis. 383 ScalarEvolution *SE; 384 385 /// Interface to emit optimization remarks. 386 OptimizationRemarkEmitter *ORE; 387 }; 388 389 /// LoopInterchangeTransform interchanges the loop. 390 class LoopInterchangeTransform { 391 public: 392 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 393 LoopInfo *LI, DominatorTree *DT, 394 BasicBlock *LoopNestExit, 395 bool InnerLoopContainsReductions) 396 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), 397 LoopExit(LoopNestExit), 398 InnerLoopHasReduction(InnerLoopContainsReductions) {} 399 400 /// Interchange OuterLoop and InnerLoop. 401 bool transform(); 402 void restructureLoops(Loop *NewInner, Loop *NewOuter, 403 BasicBlock *OrigInnerPreHeader, 404 BasicBlock *OrigOuterPreHeader); 405 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); 406 407 private: 408 void splitInnerLoopLatch(Instruction *); 409 void splitInnerLoopHeader(); 410 bool adjustLoopLinks(); 411 void adjustLoopPreheaders(); 412 bool adjustLoopBranches(); 413 414 Loop *OuterLoop; 415 Loop *InnerLoop; 416 417 /// Scev analysis. 418 ScalarEvolution *SE; 419 420 LoopInfo *LI; 421 DominatorTree *DT; 422 BasicBlock *LoopExit; 423 bool InnerLoopHasReduction; 424 }; 425 426 // Main LoopInterchange Pass. 427 struct LoopInterchange : public LoopPass { 428 static char ID; 429 ScalarEvolution *SE = nullptr; 430 LoopInfo *LI = nullptr; 431 DependenceInfo *DI = nullptr; 432 DominatorTree *DT = nullptr; 433 434 /// Interface to emit optimization remarks. 435 OptimizationRemarkEmitter *ORE; 436 437 LoopInterchange() : LoopPass(ID) { 438 initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); 439 } 440 441 void getAnalysisUsage(AnalysisUsage &AU) const override { 442 AU.addRequired<DependenceAnalysisWrapperPass>(); 443 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 444 445 getLoopAnalysisUsage(AU); 446 } 447 448 bool runOnLoop(Loop *L, LPPassManager &LPM) override { 449 if (skipLoop(L) || L->getParentLoop()) 450 return false; 451 452 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 453 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 454 DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI(); 455 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 456 ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 457 458 return processLoopList(populateWorklist(*L)); 459 } 460 461 bool isComputableLoopNest(LoopVector LoopList) { 462 for (Loop *L : LoopList) { 463 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L); 464 if (ExitCountOuter == SE->getCouldNotCompute()) { 465 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n"); 466 return false; 467 } 468 if (L->getNumBackEdges() != 1) { 469 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n"); 470 return false; 471 } 472 if (!L->getExitingBlock()) { 473 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n"); 474 return false; 475 } 476 } 477 return true; 478 } 479 480 unsigned selectLoopForInterchange(const LoopVector &LoopList) { 481 // TODO: Add a better heuristic to select the loop to be interchanged based 482 // on the dependence matrix. Currently we select the innermost loop. 483 return LoopList.size() - 1; 484 } 485 486 bool processLoopList(LoopVector LoopList) { 487 bool Changed = false; 488 unsigned LoopNestDepth = LoopList.size(); 489 if (LoopNestDepth < 2) { 490 LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n"); 491 return false; 492 } 493 if (LoopNestDepth > MaxLoopNestDepth) { 494 LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than " 495 << MaxLoopNestDepth << "\n"); 496 return false; 497 } 498 if (!isComputableLoopNest(LoopList)) { 499 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n"); 500 return false; 501 } 502 503 LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth 504 << "\n"); 505 506 CharMatrix DependencyMatrix; 507 Loop *OuterMostLoop = *(LoopList.begin()); 508 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth, 509 OuterMostLoop, DI)) { 510 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n"); 511 return false; 512 } 513 #ifdef DUMP_DEP_MATRICIES 514 LLVM_DEBUG(dbgs() << "Dependence before interchange\n"); 515 printDepMatrix(DependencyMatrix); 516 #endif 517 518 // Get the Outermost loop exit. 519 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock(); 520 if (!LoopNestExit) { 521 LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block"); 522 return false; 523 } 524 525 unsigned SelecLoopId = selectLoopForInterchange(LoopList); 526 // Move the selected loop outwards to the best possible position. 527 for (unsigned i = SelecLoopId; i > 0; i--) { 528 bool Interchanged = 529 processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix); 530 if (!Interchanged) 531 return Changed; 532 // Loops interchanged reflect the same in LoopList 533 std::swap(LoopList[i - 1], LoopList[i]); 534 535 // Update the DependencyMatrix 536 interChangeDependencies(DependencyMatrix, i, i - 1); 537 #ifdef DUMP_DEP_MATRICIES 538 LLVM_DEBUG(dbgs() << "Dependence after interchange\n"); 539 printDepMatrix(DependencyMatrix); 540 #endif 541 Changed |= Interchanged; 542 } 543 return Changed; 544 } 545 546 bool processLoop(LoopVector LoopList, unsigned InnerLoopId, 547 unsigned OuterLoopId, BasicBlock *LoopNestExit, 548 std::vector<std::vector<char>> &DependencyMatrix) { 549 LLVM_DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId 550 << " and OuterLoopId = " << OuterLoopId << "\n"); 551 Loop *InnerLoop = LoopList[InnerLoopId]; 552 Loop *OuterLoop = LoopList[OuterLoopId]; 553 554 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE); 555 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { 556 LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n"); 557 return false; 558 } 559 LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n"); 560 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); 561 if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) { 562 LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n"); 563 return false; 564 } 565 566 ORE->emit([&]() { 567 return OptimizationRemark(DEBUG_TYPE, "Interchanged", 568 InnerLoop->getStartLoc(), 569 InnerLoop->getHeader()) 570 << "Loop interchanged with enclosing loop."; 571 }); 572 573 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, 574 LoopNestExit, LIL.hasInnerLoopReduction()); 575 LIT.transform(); 576 LLVM_DEBUG(dbgs() << "Loops interchanged.\n"); 577 LoopsInterchanged++; 578 return true; 579 } 580 }; 581 582 } // end anonymous namespace 583 584 bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) { 585 return llvm::none_of(Ins->users(), [=](User *U) -> bool { 586 auto *UserIns = dyn_cast<PHINode>(U); 587 RecurrenceDescriptor RD; 588 return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD); 589 }); 590 } 591 592 bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader( 593 BasicBlock *BB) { 594 for (Instruction &I : *BB) { 595 // Load corresponding to reduction PHI's are safe while concluding if 596 // tightly nested. 597 if (LoadInst *L = dyn_cast<LoadInst>(&I)) { 598 if (!areAllUsesReductions(L, InnerLoop)) 599 return true; 600 } else if (I.mayHaveSideEffects() || I.mayReadFromMemory()) 601 return true; 602 } 603 return false; 604 } 605 606 bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch( 607 BasicBlock *BB) { 608 for (Instruction &I : *BB) { 609 // Stores corresponding to reductions are safe while concluding if tightly 610 // nested. 611 if (StoreInst *L = dyn_cast<StoreInst>(&I)) { 612 if (!isa<PHINode>(L->getOperand(0))) 613 return true; 614 } else if (I.mayHaveSideEffects() || I.mayReadFromMemory()) 615 return true; 616 } 617 return false; 618 } 619 620 bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { 621 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 622 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 623 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); 624 625 LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n"); 626 627 // A perfectly nested loop will not have any branch in between the outer and 628 // inner block i.e. outer header will branch to either inner preheader and 629 // outerloop latch. 630 BranchInst *OuterLoopHeaderBI = 631 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); 632 if (!OuterLoopHeaderBI) 633 return false; 634 635 for (BasicBlock *Succ : successors(OuterLoopHeaderBI)) 636 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() && 637 Succ != OuterLoopLatch) 638 return false; 639 640 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n"); 641 // We do not have any basic block in between now make sure the outer header 642 // and outer loop latch doesn't contain any unsafe instructions. 643 if (containsUnsafeInstructionsInHeader(OuterLoopHeader) || 644 containsUnsafeInstructionsInLatch(OuterLoopLatch)) 645 return false; 646 647 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n"); 648 // We have a perfect loop nest. 649 return true; 650 } 651 652 bool LoopInterchangeLegality::isLoopStructureUnderstood( 653 PHINode *InnerInduction) { 654 unsigned Num = InnerInduction->getNumOperands(); 655 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader(); 656 for (unsigned i = 0; i < Num; ++i) { 657 Value *Val = InnerInduction->getOperand(i); 658 if (isa<Constant>(Val)) 659 continue; 660 Instruction *I = dyn_cast<Instruction>(Val); 661 if (!I) 662 return false; 663 // TODO: Handle triangular loops. 664 // e.g. for(int i=0;i<N;i++) 665 // for(int j=i;j<N;j++) 666 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i); 667 if (InnerInduction->getIncomingBlock(IncomBlockIndx) == 668 InnerLoopPreheader && 669 !OuterLoop->isLoopInvariant(I)) { 670 return false; 671 } 672 } 673 return true; 674 } 675 676 bool LoopInterchangeLegality::findInductionAndReductions( 677 Loop *L, SmallVector<PHINode *, 8> &Inductions, 678 SmallVector<PHINode *, 8> &Reductions) { 679 if (!L->getLoopLatch() || !L->getLoopPredecessor()) 680 return false; 681 for (PHINode &PHI : L->getHeader()->phis()) { 682 RecurrenceDescriptor RD; 683 InductionDescriptor ID; 684 if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID)) 685 Inductions.push_back(&PHI); 686 else if (RecurrenceDescriptor::isReductionPHI(&PHI, L, RD)) 687 Reductions.push_back(&PHI); 688 else { 689 LLVM_DEBUG( 690 dbgs() << "Failed to recognize PHI as an induction or reduction.\n"); 691 return false; 692 } 693 } 694 return true; 695 } 696 697 static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) { 698 for (PHINode &PHI : Block->phis()) { 699 // Reduction lcssa phi will have only 1 incoming block that from loop latch. 700 if (PHI.getNumIncomingValues() > 1) 701 return false; 702 Instruction *Ins = dyn_cast<Instruction>(PHI.getIncomingValue(0)); 703 if (!Ins) 704 return false; 705 // Incoming value for lcssa phi's in outer loop exit can only be inner loop 706 // exits lcssa phi else it would not be tightly nested. 707 if (!isa<PHINode>(Ins) && isOuterLoopExitBlock) 708 return false; 709 } 710 return true; 711 } 712 713 // This function indicates the current limitations in the transform as a result 714 // of which we do not proceed. 715 bool LoopInterchangeLegality::currentLimitations() { 716 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 717 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 718 719 // transform currently expects the loop latches to also be the exiting 720 // blocks. 721 if (InnerLoop->getExitingBlock() != InnerLoopLatch || 722 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() || 723 !isa<BranchInst>(InnerLoopLatch->getTerminator()) || 724 !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) { 725 LLVM_DEBUG( 726 dbgs() << "Loops where the latch is not the exiting block are not" 727 << " supported currently.\n"); 728 ORE->emit([&]() { 729 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch", 730 OuterLoop->getStartLoc(), 731 OuterLoop->getHeader()) 732 << "Loops where the latch is not the exiting block cannot be" 733 " interchange currently."; 734 }); 735 return true; 736 } 737 738 PHINode *InnerInductionVar; 739 SmallVector<PHINode *, 8> Inductions; 740 SmallVector<PHINode *, 8> Reductions; 741 if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) { 742 LLVM_DEBUG( 743 dbgs() << "Only inner loops with induction or reduction PHI nodes " 744 << "are supported currently.\n"); 745 ORE->emit([&]() { 746 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner", 747 InnerLoop->getStartLoc(), 748 InnerLoop->getHeader()) 749 << "Only inner loops with induction or reduction PHI nodes can be" 750 " interchange currently."; 751 }); 752 return true; 753 } 754 755 // TODO: Currently we handle only loops with 1 induction variable. 756 if (Inductions.size() != 1) { 757 LLVM_DEBUG( 758 dbgs() << "We currently only support loops with 1 induction variable." 759 << "Failed to interchange due to current limitation\n"); 760 ORE->emit([&]() { 761 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner", 762 InnerLoop->getStartLoc(), 763 InnerLoop->getHeader()) 764 << "Only inner loops with 1 induction variable can be " 765 "interchanged currently."; 766 }); 767 return true; 768 } 769 if (Reductions.size() > 0) 770 InnerLoopHasReduction = true; 771 772 InnerInductionVar = Inductions.pop_back_val(); 773 Reductions.clear(); 774 if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) { 775 LLVM_DEBUG( 776 dbgs() << "Only outer loops with induction or reduction PHI nodes " 777 << "are supported currently.\n"); 778 ORE->emit([&]() { 779 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter", 780 OuterLoop->getStartLoc(), 781 OuterLoop->getHeader()) 782 << "Only outer loops with induction or reduction PHI nodes can be" 783 " interchanged currently."; 784 }); 785 return true; 786 } 787 788 // Outer loop cannot have reduction because then loops will not be tightly 789 // nested. 790 if (!Reductions.empty()) { 791 LLVM_DEBUG(dbgs() << "Outer loops with reductions are not supported " 792 << "currently.\n"); 793 ORE->emit([&]() { 794 return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter", 795 OuterLoop->getStartLoc(), 796 OuterLoop->getHeader()) 797 << "Outer loops with reductions cannot be interchangeed " 798 "currently."; 799 }); 800 return true; 801 } 802 // TODO: Currently we handle only loops with 1 induction variable. 803 if (Inductions.size() != 1) { 804 LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not " 805 << "supported currently.\n"); 806 ORE->emit([&]() { 807 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter", 808 OuterLoop->getStartLoc(), 809 OuterLoop->getHeader()) 810 << "Only outer loops with 1 induction variable can be " 811 "interchanged currently."; 812 }); 813 return true; 814 } 815 816 // TODO: Triangular loops are not handled for now. 817 if (!isLoopStructureUnderstood(InnerInductionVar)) { 818 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n"); 819 ORE->emit([&]() { 820 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner", 821 InnerLoop->getStartLoc(), 822 InnerLoop->getHeader()) 823 << "Inner loop structure not understood currently."; 824 }); 825 return true; 826 } 827 828 // TODO: We only handle LCSSA PHI's corresponding to reduction for now. 829 BasicBlock *InnerExit = InnerLoop->getExitBlock(); 830 if (!containsSafePHI(InnerExit, false)) { 831 LLVM_DEBUG( 832 dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n"); 833 ORE->emit([&]() { 834 return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner", 835 InnerLoop->getStartLoc(), 836 InnerLoop->getHeader()) 837 << "Only inner loops with LCSSA PHIs can be interchange " 838 "currently."; 839 }); 840 return true; 841 } 842 843 // TODO: Current limitation: Since we split the inner loop latch at the point 844 // were induction variable is incremented (induction.next); We cannot have 845 // more than 1 user of induction.next since it would result in broken code 846 // after split. 847 // e.g. 848 // for(i=0;i<N;i++) { 849 // for(j = 0;j<M;j++) { 850 // A[j+1][i+2] = A[j][i]+k; 851 // } 852 // } 853 Instruction *InnerIndexVarInc = nullptr; 854 if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader) 855 InnerIndexVarInc = 856 dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1)); 857 else 858 InnerIndexVarInc = 859 dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0)); 860 861 if (!InnerIndexVarInc) { 862 LLVM_DEBUG( 863 dbgs() << "Did not find an instruction to increment the induction " 864 << "variable.\n"); 865 ORE->emit([&]() { 866 return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner", 867 InnerLoop->getStartLoc(), 868 InnerLoop->getHeader()) 869 << "The inner loop does not increment the induction variable."; 870 }); 871 return true; 872 } 873 874 // Since we split the inner loop latch on this induction variable. Make sure 875 // we do not have any instruction between the induction variable and branch 876 // instruction. 877 878 bool FoundInduction = false; 879 for (const Instruction &I : 880 llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) { 881 if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) || 882 isa<ZExtInst>(I)) 883 continue; 884 885 // We found an instruction. If this is not induction variable then it is not 886 // safe to split this loop latch. 887 if (!I.isIdenticalTo(InnerIndexVarInc)) { 888 LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction " 889 << "variable increment and branch.\n"); 890 ORE->emit([&]() { 891 return OptimizationRemarkMissed( 892 DEBUG_TYPE, "UnsupportedInsBetweenInduction", 893 InnerLoop->getStartLoc(), InnerLoop->getHeader()) 894 << "Found unsupported instruction between induction variable " 895 "increment and branch."; 896 }); 897 return true; 898 } 899 900 FoundInduction = true; 901 break; 902 } 903 // The loop latch ended and we didn't find the induction variable return as 904 // current limitation. 905 if (!FoundInduction) { 906 LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n"); 907 ORE->emit([&]() { 908 return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable", 909 InnerLoop->getStartLoc(), 910 InnerLoop->getHeader()) 911 << "Did not find the induction variable."; 912 }); 913 return true; 914 } 915 return false; 916 } 917 918 // We currently support LCSSA PHI nodes in the outer loop exit, if their 919 // incoming values do not come from the outer loop latch or if the 920 // outer loop latch has a single predecessor. In that case, the value will 921 // be available if both the inner and outer loop conditions are true, which 922 // will still be true after interchanging. If we have multiple predecessor, 923 // that may not be the case, e.g. because the outer loop latch may be executed 924 // if the inner loop is not executed. 925 static bool areLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { 926 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); 927 for (PHINode &PHI : LoopNestExit->phis()) { 928 // FIXME: We currently are not able to detect floating point reductions 929 // and have to use floating point PHIs as a proxy to prevent 930 // interchanging in the presence of floating point reductions. 931 if (PHI.getType()->isFloatingPointTy()) 932 return false; 933 for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) { 934 Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i)); 935 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) 936 continue; 937 938 // The incoming value is defined in the outer loop latch. Currently we 939 // only support that in case the outer loop latch has a single predecessor. 940 // This guarantees that the outer loop latch is executed if and only if 941 // the inner loop is executed (because tightlyNested() guarantees that the 942 // outer loop header only branches to the inner loop or the outer loop 943 // latch). 944 // FIXME: We could weaken this logic and allow multiple predecessors, 945 // if the values are produced outside the loop latch. We would need 946 // additional logic to update the PHI nodes in the exit block as 947 // well. 948 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) 949 return false; 950 } 951 } 952 return true; 953 } 954 955 bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, 956 unsigned OuterLoopId, 957 CharMatrix &DepMatrix) { 958 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) { 959 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId 960 << " and OuterLoopId = " << OuterLoopId 961 << " due to dependence\n"); 962 ORE->emit([&]() { 963 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence", 964 InnerLoop->getStartLoc(), 965 InnerLoop->getHeader()) 966 << "Cannot interchange loops due to dependences."; 967 }); 968 return false; 969 } 970 // Check if outer and inner loop contain legal instructions only. 971 for (auto *BB : OuterLoop->blocks()) 972 for (Instruction &I : BB->instructionsWithoutDebug()) 973 if (CallInst *CI = dyn_cast<CallInst>(&I)) { 974 // readnone functions do not prevent interchanging. 975 if (CI->doesNotReadMemory()) 976 continue; 977 LLVM_DEBUG( 978 dbgs() << "Loops with call instructions cannot be interchanged " 979 << "safely."); 980 ORE->emit([&]() { 981 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst", 982 CI->getDebugLoc(), 983 CI->getParent()) 984 << "Cannot interchange loops due to call instruction."; 985 }); 986 987 return false; 988 } 989 990 // TODO: The loops could not be interchanged due to current limitations in the 991 // transform module. 992 if (currentLimitations()) { 993 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n"); 994 return false; 995 } 996 997 // Check if the loops are tightly nested. 998 if (!tightlyNested(OuterLoop, InnerLoop)) { 999 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n"); 1000 ORE->emit([&]() { 1001 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested", 1002 InnerLoop->getStartLoc(), 1003 InnerLoop->getHeader()) 1004 << "Cannot interchange loops because they are not tightly " 1005 "nested."; 1006 }); 1007 return false; 1008 } 1009 1010 if (!areLoopExitPHIsSupported(OuterLoop, InnerLoop)) { 1011 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n"); 1012 ORE->emit([&]() { 1013 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", 1014 OuterLoop->getStartLoc(), 1015 OuterLoop->getHeader()) 1016 << "Found unsupported PHI node in loop exit."; 1017 }); 1018 return false; 1019 } 1020 1021 return true; 1022 } 1023 1024 int LoopInterchangeProfitability::getInstrOrderCost() { 1025 unsigned GoodOrder, BadOrder; 1026 BadOrder = GoodOrder = 0; 1027 for (BasicBlock *BB : InnerLoop->blocks()) { 1028 for (Instruction &Ins : *BB) { 1029 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) { 1030 unsigned NumOp = GEP->getNumOperands(); 1031 bool FoundInnerInduction = false; 1032 bool FoundOuterInduction = false; 1033 for (unsigned i = 0; i < NumOp; ++i) { 1034 const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i)); 1035 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal); 1036 if (!AR) 1037 continue; 1038 1039 // If we find the inner induction after an outer induction e.g. 1040 // for(int i=0;i<N;i++) 1041 // for(int j=0;j<N;j++) 1042 // A[i][j] = A[i-1][j-1]+k; 1043 // then it is a good order. 1044 if (AR->getLoop() == InnerLoop) { 1045 // We found an InnerLoop induction after OuterLoop induction. It is 1046 // a good order. 1047 FoundInnerInduction = true; 1048 if (FoundOuterInduction) { 1049 GoodOrder++; 1050 break; 1051 } 1052 } 1053 // If we find the outer induction after an inner induction e.g. 1054 // for(int i=0;i<N;i++) 1055 // for(int j=0;j<N;j++) 1056 // A[j][i] = A[j-1][i-1]+k; 1057 // then it is a bad order. 1058 if (AR->getLoop() == OuterLoop) { 1059 // We found an OuterLoop induction after InnerLoop induction. It is 1060 // a bad order. 1061 FoundOuterInduction = true; 1062 if (FoundInnerInduction) { 1063 BadOrder++; 1064 break; 1065 } 1066 } 1067 } 1068 } 1069 } 1070 } 1071 return GoodOrder - BadOrder; 1072 } 1073 1074 static bool isProfitableForVectorization(unsigned InnerLoopId, 1075 unsigned OuterLoopId, 1076 CharMatrix &DepMatrix) { 1077 // TODO: Improve this heuristic to catch more cases. 1078 // If the inner loop is loop independent or doesn't carry any dependency it is 1079 // profitable to move this to outer position. 1080 for (auto &Row : DepMatrix) { 1081 if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I') 1082 return false; 1083 // TODO: We need to improve this heuristic. 1084 if (Row[OuterLoopId] != '=') 1085 return false; 1086 } 1087 // If outer loop has dependence and inner loop is loop independent then it is 1088 // profitable to interchange to enable parallelism. 1089 // If there are no dependences, interchanging will not improve anything. 1090 return !DepMatrix.empty(); 1091 } 1092 1093 bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, 1094 unsigned OuterLoopId, 1095 CharMatrix &DepMatrix) { 1096 // TODO: Add better profitability checks. 1097 // e.g 1098 // 1) Construct dependency matrix and move the one with no loop carried dep 1099 // inside to enable vectorization. 1100 1101 // This is rough cost estimation algorithm. It counts the good and bad order 1102 // of induction variables in the instruction and allows reordering if number 1103 // of bad orders is more than good. 1104 int Cost = getInstrOrderCost(); 1105 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n"); 1106 if (Cost < -LoopInterchangeCostThreshold) 1107 return true; 1108 1109 // It is not profitable as per current cache profitability model. But check if 1110 // we can move this loop outside to improve parallelism. 1111 if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix)) 1112 return true; 1113 1114 ORE->emit([&]() { 1115 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable", 1116 InnerLoop->getStartLoc(), 1117 InnerLoop->getHeader()) 1118 << "Interchanging loops is too costly (cost=" 1119 << ore::NV("Cost", Cost) << ", threshold=" 1120 << ore::NV("Threshold", LoopInterchangeCostThreshold) 1121 << ") and it does not improve parallelism."; 1122 }); 1123 return false; 1124 } 1125 1126 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, 1127 Loop *InnerLoop) { 1128 for (Loop *L : *OuterLoop) 1129 if (L == InnerLoop) { 1130 OuterLoop->removeChildLoop(L); 1131 return; 1132 } 1133 llvm_unreachable("Couldn't find loop"); 1134 } 1135 1136 /// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the 1137 /// new inner and outer loop after interchanging: NewInner is the original 1138 /// outer loop and NewOuter is the original inner loop. 1139 /// 1140 /// Before interchanging, we have the following structure 1141 /// Outer preheader 1142 // Outer header 1143 // Inner preheader 1144 // Inner header 1145 // Inner body 1146 // Inner latch 1147 // outer bbs 1148 // Outer latch 1149 // 1150 // After interchanging: 1151 // Inner preheader 1152 // Inner header 1153 // Outer preheader 1154 // Outer header 1155 // Inner body 1156 // outer bbs 1157 // Outer latch 1158 // Inner latch 1159 void LoopInterchangeTransform::restructureLoops( 1160 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader, 1161 BasicBlock *OrigOuterPreHeader) { 1162 Loop *OuterLoopParent = OuterLoop->getParentLoop(); 1163 // The original inner loop preheader moves from the new inner loop to 1164 // the parent loop, if there is one. 1165 NewInner->removeBlockFromLoop(OrigInnerPreHeader); 1166 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent); 1167 1168 // Switch the loop levels. 1169 if (OuterLoopParent) { 1170 // Remove the loop from its parent loop. 1171 removeChildLoop(OuterLoopParent, NewInner); 1172 removeChildLoop(NewInner, NewOuter); 1173 OuterLoopParent->addChildLoop(NewOuter); 1174 } else { 1175 removeChildLoop(NewInner, NewOuter); 1176 LI->changeTopLevelLoop(NewInner, NewOuter); 1177 } 1178 while (!NewOuter->empty()) 1179 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin())); 1180 NewOuter->addChildLoop(NewInner); 1181 1182 // BBs from the original inner loop. 1183 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks()); 1184 1185 // Add BBs from the original outer loop to the original inner loop (excluding 1186 // BBs already in inner loop) 1187 for (BasicBlock *BB : NewInner->blocks()) 1188 if (LI->getLoopFor(BB) == NewInner) 1189 NewOuter->addBlockEntry(BB); 1190 1191 // Now remove inner loop header and latch from the new inner loop and move 1192 // other BBs (the loop body) to the new inner loop. 1193 BasicBlock *OuterHeader = NewOuter->getHeader(); 1194 BasicBlock *OuterLatch = NewOuter->getLoopLatch(); 1195 for (BasicBlock *BB : OrigInnerBBs) { 1196 // Nothing will change for BBs in child loops. 1197 if (LI->getLoopFor(BB) != NewOuter) 1198 continue; 1199 // Remove the new outer loop header and latch from the new inner loop. 1200 if (BB == OuterHeader || BB == OuterLatch) 1201 NewInner->removeBlockFromLoop(BB); 1202 else 1203 LI->changeLoopFor(BB, NewInner); 1204 } 1205 1206 // The preheader of the original outer loop becomes part of the new 1207 // outer loop. 1208 NewOuter->addBlockEntry(OrigOuterPreHeader); 1209 LI->changeLoopFor(OrigOuterPreHeader, NewOuter); 1210 1211 // Tell SE that we move the loops around. 1212 SE->forgetLoop(NewOuter); 1213 SE->forgetLoop(NewInner); 1214 } 1215 1216 bool LoopInterchangeTransform::transform() { 1217 bool Transformed = false; 1218 Instruction *InnerIndexVar; 1219 1220 if (InnerLoop->getSubLoops().empty()) { 1221 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1222 LLVM_DEBUG(dbgs() << "Calling Split Inner Loop\n"); 1223 PHINode *InductionPHI = getInductionVariable(InnerLoop, SE); 1224 if (!InductionPHI) { 1225 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); 1226 return false; 1227 } 1228 1229 if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader) 1230 InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1)); 1231 else 1232 InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0)); 1233 1234 // Ensure that InductionPHI is the first Phi node. 1235 if (&InductionPHI->getParent()->front() != InductionPHI) 1236 InductionPHI->moveBefore(&InductionPHI->getParent()->front()); 1237 1238 // Split at the place were the induction variable is 1239 // incremented/decremented. 1240 // TODO: This splitting logic may not work always. Fix this. 1241 splitInnerLoopLatch(InnerIndexVar); 1242 LLVM_DEBUG(dbgs() << "splitInnerLoopLatch done\n"); 1243 1244 // Splits the inner loops phi nodes out into a separate basic block. 1245 BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); 1246 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI); 1247 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n"); 1248 } 1249 1250 Transformed |= adjustLoopLinks(); 1251 if (!Transformed) { 1252 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n"); 1253 return false; 1254 } 1255 1256 return true; 1257 } 1258 1259 void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) { 1260 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 1261 BasicBlock *InnerLoopLatchPred = InnerLoopLatch; 1262 InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI); 1263 } 1264 1265 /// \brief Move all instructions except the terminator from FromBB right before 1266 /// InsertBefore 1267 static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { 1268 auto &ToList = InsertBefore->getParent()->getInstList(); 1269 auto &FromList = FromBB->getInstList(); 1270 1271 ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(), 1272 FromBB->getTerminator()->getIterator()); 1273 } 1274 1275 static void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred, 1276 BasicBlock *NewPred) { 1277 for (PHINode &PHI : CurrBlock->phis()) { 1278 unsigned Num = PHI.getNumIncomingValues(); 1279 for (unsigned i = 0; i < Num; ++i) { 1280 if (PHI.getIncomingBlock(i) == OldPred) 1281 PHI.setIncomingBlock(i, NewPred); 1282 } 1283 } 1284 } 1285 1286 /// Update BI to jump to NewBB instead of OldBB. Records updates to 1287 /// the dominator tree in DTUpdates, if DT should be preserved. 1288 static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, 1289 BasicBlock *NewBB, 1290 std::vector<DominatorTree::UpdateType> &DTUpdates) { 1291 assert(llvm::count_if(successors(BI), 1292 [OldBB](BasicBlock *BB) { return BB == OldBB; }) < 2 && 1293 "BI must jump to OldBB at most once."); 1294 for (unsigned i = 0, e = BI->getNumSuccessors(); i < e; ++i) { 1295 if (BI->getSuccessor(i) == OldBB) { 1296 BI->setSuccessor(i, NewBB); 1297 1298 DTUpdates.push_back( 1299 {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); 1300 DTUpdates.push_back( 1301 {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); 1302 break; 1303 } 1304 } 1305 } 1306 1307 // Move Lcssa PHIs to the right place. 1308 static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerLatch, 1309 BasicBlock *OuterLatch) { 1310 SmallVector<PHINode *, 8> LcssaInnerExit; 1311 for (PHINode &P : InnerExit->phis()) 1312 LcssaInnerExit.push_back(&P); 1313 1314 SmallVector<PHINode *, 8> LcssaInnerLatch; 1315 for (PHINode &P : InnerLatch->phis()) 1316 LcssaInnerLatch.push_back(&P); 1317 1318 // Lcssa PHIs for values used outside the inner loop are in InnerExit. 1319 // If a PHI node has users outside of InnerExit, it has a use outside the 1320 // interchanged loop and we have to preserve it. We move these to 1321 // InnerLatch, which will become the new exit block for the innermost 1322 // loop after interchanging. For PHIs only used in InnerExit, we can just 1323 // replace them with the incoming value. 1324 for (PHINode *P : LcssaInnerExit) { 1325 bool hasUsersOutside = false; 1326 for (auto UI = P->use_begin(), E = P->use_end(); UI != E;) { 1327 Use &U = *UI; 1328 ++UI; 1329 auto *Usr = cast<Instruction>(U.getUser()); 1330 if (Usr->getParent() != InnerExit) { 1331 hasUsersOutside = true; 1332 continue; 1333 } 1334 U.set(P->getIncomingValueForBlock(InnerLatch)); 1335 } 1336 if (hasUsersOutside) 1337 P->moveBefore(InnerLatch->getFirstNonPHI()); 1338 else 1339 P->eraseFromParent(); 1340 } 1341 1342 // If the inner loop latch contains LCSSA PHIs, those come from a child loop 1343 // and we have to move them to the new inner latch. 1344 for (PHINode *P : LcssaInnerLatch) 1345 P->moveBefore(InnerExit->getFirstNonPHI()); 1346 1347 // Now adjust the incoming blocks for the LCSSA PHIs. 1348 // For PHIs moved from Inner's exit block, we need to replace Inner's latch 1349 // with the new latch. 1350 updateIncomingBlock(InnerLatch, InnerLatch, OuterLatch); 1351 } 1352 1353 bool LoopInterchangeTransform::adjustLoopBranches() { 1354 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n"); 1355 std::vector<DominatorTree::UpdateType> DTUpdates; 1356 1357 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); 1358 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1359 1360 assert(OuterLoopPreHeader != OuterLoop->getHeader() && 1361 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader && 1362 InnerLoopPreHeader && "Guaranteed by loop-simplify form"); 1363 // Ensure that both preheaders do not contain PHI nodes and have single 1364 // predecessors. This allows us to move them easily. We use 1365 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing 1366 // preheaders do not satisfy those conditions. 1367 if (isa<PHINode>(OuterLoopPreHeader->begin()) || 1368 !OuterLoopPreHeader->getUniquePredecessor()) 1369 OuterLoopPreHeader = InsertPreheaderForLoop(OuterLoop, DT, LI, true); 1370 if (InnerLoopPreHeader == OuterLoop->getHeader()) 1371 InnerLoopPreHeader = InsertPreheaderForLoop(InnerLoop, DT, LI, true); 1372 1373 // Adjust the loop preheader 1374 BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); 1375 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 1376 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 1377 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); 1378 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor(); 1379 BasicBlock *InnerLoopLatchPredecessor = 1380 InnerLoopLatch->getUniquePredecessor(); 1381 BasicBlock *InnerLoopLatchSuccessor; 1382 BasicBlock *OuterLoopLatchSuccessor; 1383 1384 BranchInst *OuterLoopLatchBI = 1385 dyn_cast<BranchInst>(OuterLoopLatch->getTerminator()); 1386 BranchInst *InnerLoopLatchBI = 1387 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator()); 1388 BranchInst *OuterLoopHeaderBI = 1389 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); 1390 BranchInst *InnerLoopHeaderBI = 1391 dyn_cast<BranchInst>(InnerLoopHeader->getTerminator()); 1392 1393 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor || 1394 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI || 1395 !InnerLoopHeaderBI) 1396 return false; 1397 1398 BranchInst *InnerLoopLatchPredecessorBI = 1399 dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator()); 1400 BranchInst *OuterLoopPredecessorBI = 1401 dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator()); 1402 1403 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI) 1404 return false; 1405 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor(); 1406 if (!InnerLoopHeaderSuccessor) 1407 return false; 1408 1409 // Adjust Loop Preheader and headers 1410 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader, 1411 InnerLoopPreHeader, DTUpdates); 1412 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates); 1413 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader, 1414 InnerLoopHeaderSuccessor, DTUpdates); 1415 1416 // Adjust reduction PHI's now that the incoming block has changed. 1417 updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader, 1418 OuterLoopHeader); 1419 1420 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor, 1421 OuterLoopPreHeader, DTUpdates); 1422 1423 // -------------Adjust loop latches----------- 1424 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader) 1425 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1); 1426 else 1427 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0); 1428 1429 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch, 1430 InnerLoopLatchSuccessor, DTUpdates); 1431 1432 1433 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader) 1434 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1); 1435 else 1436 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0); 1437 1438 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor, 1439 OuterLoopLatchSuccessor, DTUpdates); 1440 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch, 1441 DTUpdates); 1442 1443 DT->applyUpdates(DTUpdates); 1444 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader, 1445 OuterLoopPreHeader); 1446 1447 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopLatch, OuterLoopLatch); 1448 // For PHIs in the exit block of the outer loop, outer's latch has been 1449 // replaced by Inners'. 1450 updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch); 1451 1452 // Now update the reduction PHIs in the inner and outer loop headers. 1453 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs; 1454 for (PHINode &PHI : drop_begin(InnerLoopHeader->phis(), 1)) 1455 InnerLoopPHIs.push_back(cast<PHINode>(&PHI)); 1456 for (PHINode &PHI : drop_begin(OuterLoopHeader->phis(), 1)) 1457 OuterLoopPHIs.push_back(cast<PHINode>(&PHI)); 1458 1459 for (PHINode *PHI : OuterLoopPHIs) 1460 PHI->moveBefore(InnerLoopHeader->getFirstNonPHI()); 1461 1462 // Move the PHI nodes from the inner loop header to the outer loop header. 1463 // We have to deal with one kind of PHI nodes: 1464 // 1) PHI nodes that are part of inner loop-only reductions. 1465 // We only have to move the PHI node and update the incoming blocks. 1466 for (PHINode *PHI : InnerLoopPHIs) { 1467 PHI->moveBefore(OuterLoopHeader->getFirstNonPHI()); 1468 for (BasicBlock *InBB : PHI->blocks()) { 1469 if (InnerLoop->contains(InBB)) 1470 continue; 1471 1472 assert(!isa<PHINode>(PHI->getIncomingValueForBlock(InBB)) && 1473 "Unexpected incoming PHI node, reductions in outer loop are not " 1474 "supported yet"); 1475 PHI->replaceAllUsesWith(PHI->getIncomingValueForBlock(InBB)); 1476 PHI->eraseFromParent(); 1477 break; 1478 } 1479 } 1480 1481 // Update the incoming blocks for moved PHI nodes. 1482 updateIncomingBlock(OuterLoopHeader, InnerLoopPreHeader, OuterLoopPreHeader); 1483 updateIncomingBlock(OuterLoopHeader, InnerLoopLatch, OuterLoopLatch); 1484 updateIncomingBlock(InnerLoopHeader, OuterLoopPreHeader, InnerLoopPreHeader); 1485 updateIncomingBlock(InnerLoopHeader, OuterLoopLatch, InnerLoopLatch); 1486 1487 return true; 1488 } 1489 1490 void LoopInterchangeTransform::adjustLoopPreheaders() { 1491 // We have interchanged the preheaders so we need to interchange the data in 1492 // the preheader as well. 1493 // This is because the content of inner preheader was previously executed 1494 // inside the outer loop. 1495 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); 1496 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1497 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 1498 BranchInst *InnerTermBI = 1499 cast<BranchInst>(InnerLoopPreHeader->getTerminator()); 1500 1501 // These instructions should now be executed inside the loop. 1502 // Move instruction into a new block after outer header. 1503 moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator()); 1504 // These instructions were not executed previously in the loop so move them to 1505 // the older inner loop preheader. 1506 moveBBContents(OuterLoopPreHeader, InnerTermBI); 1507 } 1508 1509 bool LoopInterchangeTransform::adjustLoopLinks() { 1510 // Adjust all branches in the inner and outer loop. 1511 bool Changed = adjustLoopBranches(); 1512 if (Changed) 1513 adjustLoopPreheaders(); 1514 return Changed; 1515 } 1516 1517 char LoopInterchange::ID = 0; 1518 1519 INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange", 1520 "Interchanges loops for cache reuse", false, false) 1521 INITIALIZE_PASS_DEPENDENCY(LoopPass) 1522 INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) 1523 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 1524 1525 INITIALIZE_PASS_END(LoopInterchange, "loop-interchange", 1526 "Interchanges loops for cache reuse", false, false) 1527 1528 Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); } 1529