1 //===- LoopInterchange.cpp - Loop interchange pass------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This Pass handles loop interchange transform. 11 // This pass interchanges loops to provide a more cache-friendly memory access 12 // patterns. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "llvm/ADT/SmallVector.h" 17 #include "llvm/Analysis/AliasAnalysis.h" 18 #include "llvm/Analysis/AssumptionCache.h" 19 #include "llvm/Analysis/BlockFrequencyInfo.h" 20 #include "llvm/Analysis/CodeMetrics.h" 21 #include "llvm/Analysis/DependenceAnalysis.h" 22 #include "llvm/Analysis/LoopInfo.h" 23 #include "llvm/Analysis/LoopIterator.h" 24 #include "llvm/Analysis/LoopPass.h" 25 #include "llvm/Analysis/OptimizationDiagnosticInfo.h" 26 #include "llvm/Analysis/ScalarEvolution.h" 27 #include "llvm/Analysis/ScalarEvolutionExpander.h" 28 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 29 #include "llvm/Analysis/TargetTransformInfo.h" 30 #include "llvm/Analysis/ValueTracking.h" 31 #include "llvm/IR/Dominators.h" 32 #include "llvm/IR/Function.h" 33 #include "llvm/IR/IRBuilder.h" 34 #include "llvm/IR/InstIterator.h" 35 #include "llvm/IR/IntrinsicInst.h" 36 #include "llvm/IR/Module.h" 37 #include "llvm/Pass.h" 38 #include "llvm/Support/Debug.h" 39 #include "llvm/Support/raw_ostream.h" 40 #include "llvm/Transforms/Scalar.h" 41 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 42 #include "llvm/Transforms/Utils/LoopUtils.h" 43 44 using namespace llvm; 45 46 #define DEBUG_TYPE "loop-interchange" 47 48 static cl::opt<int> LoopInterchangeCostThreshold( 49 "loop-interchange-threshold", cl::init(0), cl::Hidden, 50 cl::desc("Interchange if you gain more than this number")); 51 52 namespace { 53 54 typedef SmallVector<Loop *, 8> LoopVector; 55 56 // TODO: Check if we can use a sparse matrix here. 57 typedef std::vector<std::vector<char>> CharMatrix; 58 59 // Maximum number of dependencies that can be handled in the dependency matrix. 60 static const unsigned MaxMemInstrCount = 100; 61 62 // Maximum loop depth supported. 63 static const unsigned MaxLoopNestDepth = 10; 64 65 struct LoopInterchange; 66 67 #ifdef DUMP_DEP_MATRICIES 68 void printDepMatrix(CharMatrix &DepMatrix) { 69 for (auto I = DepMatrix.begin(), E = DepMatrix.end(); I != E; ++I) { 70 std::vector<char> Vec = *I; 71 for (auto II = Vec.begin(), EE = Vec.end(); II != EE; ++II) 72 DEBUG(dbgs() << *II << " "); 73 DEBUG(dbgs() << "\n"); 74 } 75 } 76 #endif 77 78 static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, 79 Loop *L, DependenceInfo *DI) { 80 typedef SmallVector<Value *, 16> ValueVector; 81 ValueVector MemInstr; 82 83 // For each block. 84 for (Loop::block_iterator BB = L->block_begin(), BE = L->block_end(); 85 BB != BE; ++BB) { 86 // Scan the BB and collect legal loads and stores. 87 for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; 88 ++I) { 89 if (!isa<Instruction>(I)) 90 return false; 91 if (LoadInst *Ld = dyn_cast<LoadInst>(I)) { 92 if (!Ld->isSimple()) 93 return false; 94 MemInstr.push_back(&*I); 95 } else if (StoreInst *St = dyn_cast<StoreInst>(I)) { 96 if (!St->isSimple()) 97 return false; 98 MemInstr.push_back(&*I); 99 } 100 } 101 } 102 103 DEBUG(dbgs() << "Found " << MemInstr.size() 104 << " Loads and Stores to analyze\n"); 105 106 ValueVector::iterator I, IE, J, JE; 107 108 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { 109 for (J = I, JE = MemInstr.end(); J != JE; ++J) { 110 std::vector<char> Dep; 111 Instruction *Src = cast<Instruction>(*I); 112 Instruction *Dst = cast<Instruction>(*J); 113 if (Src == Dst) 114 continue; 115 // Ignore Input dependencies. 116 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst)) 117 continue; 118 // Track Output, Flow, and Anti dependencies. 119 if (auto D = DI->depends(Src, Dst, true)) { 120 assert(D->isOrdered() && "Expected an output, flow or anti dep."); 121 DEBUG(StringRef DepType = 122 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output"; 123 dbgs() << "Found " << DepType 124 << " dependency between Src and Dst\n" 125 << " Src:" << *Src << "\n Dst:" << *Dst << '\n'); 126 unsigned Levels = D->getLevels(); 127 char Direction; 128 for (unsigned II = 1; II <= Levels; ++II) { 129 const SCEV *Distance = D->getDistance(II); 130 const SCEVConstant *SCEVConst = 131 dyn_cast_or_null<SCEVConstant>(Distance); 132 if (SCEVConst) { 133 const ConstantInt *CI = SCEVConst->getValue(); 134 if (CI->isNegative()) 135 Direction = '<'; 136 else if (CI->isZero()) 137 Direction = '='; 138 else 139 Direction = '>'; 140 Dep.push_back(Direction); 141 } else if (D->isScalar(II)) { 142 Direction = 'S'; 143 Dep.push_back(Direction); 144 } else { 145 unsigned Dir = D->getDirection(II); 146 if (Dir == Dependence::DVEntry::LT || 147 Dir == Dependence::DVEntry::LE) 148 Direction = '<'; 149 else if (Dir == Dependence::DVEntry::GT || 150 Dir == Dependence::DVEntry::GE) 151 Direction = '>'; 152 else if (Dir == Dependence::DVEntry::EQ) 153 Direction = '='; 154 else 155 Direction = '*'; 156 Dep.push_back(Direction); 157 } 158 } 159 while (Dep.size() != Level) { 160 Dep.push_back('I'); 161 } 162 163 DepMatrix.push_back(Dep); 164 if (DepMatrix.size() > MaxMemInstrCount) { 165 DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount 166 << " dependencies inside loop\n"); 167 return false; 168 } 169 } 170 } 171 } 172 173 // We don't have a DepMatrix to check legality return false. 174 if (DepMatrix.size() == 0) 175 return false; 176 return true; 177 } 178 179 // A loop is moved from index 'from' to an index 'to'. Update the Dependence 180 // matrix by exchanging the two columns. 181 static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, 182 unsigned ToIndx) { 183 unsigned numRows = DepMatrix.size(); 184 for (unsigned i = 0; i < numRows; ++i) { 185 char TmpVal = DepMatrix[i][ToIndx]; 186 DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx]; 187 DepMatrix[i][FromIndx] = TmpVal; 188 } 189 } 190 191 // Checks if outermost non '=','S'or'I' dependence in the dependence matrix is 192 // '>' 193 static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row, 194 unsigned Column) { 195 for (unsigned i = 0; i <= Column; ++i) { 196 if (DepMatrix[Row][i] == '<') 197 return false; 198 if (DepMatrix[Row][i] == '>') 199 return true; 200 } 201 // All dependencies were '=','S' or 'I' 202 return false; 203 } 204 205 // Checks if no dependence exist in the dependency matrix in Row before Column. 206 static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row, 207 unsigned Column) { 208 for (unsigned i = 0; i < Column; ++i) { 209 if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' && 210 DepMatrix[Row][i] != 'I') 211 return false; 212 } 213 return true; 214 } 215 216 static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row, 217 unsigned OuterLoopId, char InnerDep, 218 char OuterDep) { 219 220 if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId)) 221 return false; 222 223 if (InnerDep == OuterDep) 224 return true; 225 226 // It is legal to interchange if and only if after interchange no row has a 227 // '>' direction as the leftmost non-'='. 228 229 if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I') 230 return true; 231 232 if (InnerDep == '<') 233 return true; 234 235 if (InnerDep == '>') { 236 // If OuterLoopId represents outermost loop then interchanging will make the 237 // 1st dependency as '>' 238 if (OuterLoopId == 0) 239 return false; 240 241 // If all dependencies before OuterloopId are '=','S'or 'I'. Then 242 // interchanging will result in this row having an outermost non '=' 243 // dependency of '>' 244 if (!containsNoDependence(DepMatrix, Row, OuterLoopId)) 245 return true; 246 } 247 248 return false; 249 } 250 251 // Checks if it is legal to interchange 2 loops. 252 // [Theorem] A permutation of the loops in a perfect nest is legal if and only 253 // if the direction matrix, after the same permutation is applied to its 254 // columns, has no ">" direction as the leftmost non-"=" direction in any row. 255 static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, 256 unsigned InnerLoopId, 257 unsigned OuterLoopId) { 258 259 unsigned NumRows = DepMatrix.size(); 260 // For each row check if it is valid to interchange. 261 for (unsigned Row = 0; Row < NumRows; ++Row) { 262 char InnerDep = DepMatrix[Row][InnerLoopId]; 263 char OuterDep = DepMatrix[Row][OuterLoopId]; 264 if (InnerDep == '*' || OuterDep == '*') 265 return false; 266 if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep)) 267 return false; 268 } 269 return true; 270 } 271 272 static void populateWorklist(Loop &L, SmallVector<LoopVector, 8> &V) { 273 274 DEBUG(dbgs() << "Calling populateWorklist on Func: " 275 << L.getHeader()->getParent()->getName() << " Loop: %" 276 << L.getHeader()->getName() << '\n'); 277 LoopVector LoopList; 278 Loop *CurrentLoop = &L; 279 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops(); 280 while (!Vec->empty()) { 281 // The current loop has multiple subloops in it hence it is not tightly 282 // nested. 283 // Discard all loops above it added into Worklist. 284 if (Vec->size() != 1) { 285 LoopList.clear(); 286 return; 287 } 288 LoopList.push_back(CurrentLoop); 289 CurrentLoop = Vec->front(); 290 Vec = &CurrentLoop->getSubLoops(); 291 } 292 LoopList.push_back(CurrentLoop); 293 V.push_back(std::move(LoopList)); 294 } 295 296 static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { 297 PHINode *InnerIndexVar = L->getCanonicalInductionVariable(); 298 if (InnerIndexVar) 299 return InnerIndexVar; 300 if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr) 301 return nullptr; 302 for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { 303 PHINode *PhiVar = cast<PHINode>(I); 304 Type *PhiTy = PhiVar->getType(); 305 if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() && 306 !PhiTy->isPointerTy()) 307 return nullptr; 308 const SCEVAddRecExpr *AddRec = 309 dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar)); 310 if (!AddRec || !AddRec->isAffine()) 311 continue; 312 const SCEV *Step = AddRec->getStepRecurrence(*SE); 313 if (!isa<SCEVConstant>(Step)) 314 continue; 315 // Found the induction variable. 316 // FIXME: Handle loops with more than one induction variable. Note that, 317 // currently, legality makes sure we have only one induction variable. 318 return PhiVar; 319 } 320 return nullptr; 321 } 322 323 /// LoopInterchangeLegality checks if it is legal to interchange the loop. 324 class LoopInterchangeLegality { 325 public: 326 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 327 LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA, 328 OptimizationRemarkEmitter *ORE) 329 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), 330 PreserveLCSSA(PreserveLCSSA), ORE(ORE), InnerLoopHasReduction(false) {} 331 332 /// Check if the loops can be interchanged. 333 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, 334 CharMatrix &DepMatrix); 335 /// Check if the loop structure is understood. We do not handle triangular 336 /// loops for now. 337 bool isLoopStructureUnderstood(PHINode *InnerInductionVar); 338 339 bool currentLimitations(); 340 341 bool hasInnerLoopReduction() { return InnerLoopHasReduction; } 342 343 private: 344 bool tightlyNested(Loop *Outer, Loop *Inner); 345 bool containsUnsafeInstructionsInHeader(BasicBlock *BB); 346 bool areAllUsesReductions(Instruction *Ins, Loop *L); 347 bool containsUnsafeInstructionsInLatch(BasicBlock *BB); 348 bool findInductionAndReductions(Loop *L, 349 SmallVector<PHINode *, 8> &Inductions, 350 SmallVector<PHINode *, 8> &Reductions); 351 Loop *OuterLoop; 352 Loop *InnerLoop; 353 354 ScalarEvolution *SE; 355 LoopInfo *LI; 356 DominatorTree *DT; 357 bool PreserveLCSSA; 358 /// Interface to emit optimization remarks. 359 OptimizationRemarkEmitter *ORE; 360 361 bool InnerLoopHasReduction; 362 }; 363 364 /// LoopInterchangeProfitability checks if it is profitable to interchange the 365 /// loop. 366 class LoopInterchangeProfitability { 367 public: 368 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 369 OptimizationRemarkEmitter *ORE) 370 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} 371 372 /// Check if the loop interchange is profitable. 373 bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, 374 CharMatrix &DepMatrix); 375 376 private: 377 int getInstrOrderCost(); 378 379 Loop *OuterLoop; 380 Loop *InnerLoop; 381 382 /// Scev analysis. 383 ScalarEvolution *SE; 384 /// Interface to emit optimization remarks. 385 OptimizationRemarkEmitter *ORE; 386 }; 387 388 /// LoopInterchangeTransform interchanges the loop. 389 class LoopInterchangeTransform { 390 public: 391 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, 392 LoopInfo *LI, DominatorTree *DT, 393 BasicBlock *LoopNestExit, 394 bool InnerLoopContainsReductions) 395 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), 396 LoopExit(LoopNestExit), 397 InnerLoopHasReduction(InnerLoopContainsReductions) {} 398 399 /// Interchange OuterLoop and InnerLoop. 400 bool transform(); 401 void restructureLoops(Loop *InnerLoop, Loop *OuterLoop); 402 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); 403 404 private: 405 void splitInnerLoopLatch(Instruction *); 406 void splitInnerLoopHeader(); 407 bool adjustLoopLinks(); 408 void adjustLoopPreheaders(); 409 bool adjustLoopBranches(); 410 void updateIncomingBlock(BasicBlock *CurrBlock, BasicBlock *OldPred, 411 BasicBlock *NewPred); 412 413 Loop *OuterLoop; 414 Loop *InnerLoop; 415 416 /// Scev analysis. 417 ScalarEvolution *SE; 418 LoopInfo *LI; 419 DominatorTree *DT; 420 BasicBlock *LoopExit; 421 bool InnerLoopHasReduction; 422 }; 423 424 // Main LoopInterchange Pass. 425 struct LoopInterchange : public FunctionPass { 426 static char ID; 427 ScalarEvolution *SE; 428 LoopInfo *LI; 429 DependenceInfo *DI; 430 DominatorTree *DT; 431 bool PreserveLCSSA; 432 /// Interface to emit optimization remarks. 433 OptimizationRemarkEmitter *ORE; 434 435 LoopInterchange() 436 : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) { 437 initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); 438 } 439 440 void getAnalysisUsage(AnalysisUsage &AU) const override { 441 AU.addRequired<ScalarEvolutionWrapperPass>(); 442 AU.addRequired<AAResultsWrapperPass>(); 443 AU.addRequired<DominatorTreeWrapperPass>(); 444 AU.addRequired<LoopInfoWrapperPass>(); 445 AU.addRequired<DependenceAnalysisWrapperPass>(); 446 AU.addRequiredID(LoopSimplifyID); 447 AU.addRequiredID(LCSSAID); 448 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 449 } 450 451 bool runOnFunction(Function &F) override { 452 if (skipFunction(F)) 453 return false; 454 455 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 456 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 457 DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI(); 458 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); 459 DT = DTWP ? &DTWP->getDomTree() : nullptr; 460 ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 461 PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); 462 463 // Build up a worklist of loop pairs to analyze. 464 SmallVector<LoopVector, 8> Worklist; 465 466 for (Loop *L : *LI) 467 populateWorklist(*L, Worklist); 468 469 DEBUG(dbgs() << "Worklist size = " << Worklist.size() << "\n"); 470 bool Changed = true; 471 while (!Worklist.empty()) { 472 LoopVector LoopList = Worklist.pop_back_val(); 473 Changed = processLoopList(LoopList, F); 474 } 475 return Changed; 476 } 477 478 bool isComputableLoopNest(LoopVector LoopList) { 479 for (Loop *L : LoopList) { 480 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L); 481 if (ExitCountOuter == SE->getCouldNotCompute()) { 482 DEBUG(dbgs() << "Couldn't compute backedge count\n"); 483 return false; 484 } 485 if (L->getNumBackEdges() != 1) { 486 DEBUG(dbgs() << "NumBackEdges is not equal to 1\n"); 487 return false; 488 } 489 if (!L->getExitingBlock()) { 490 DEBUG(dbgs() << "Loop doesn't have unique exit block\n"); 491 return false; 492 } 493 } 494 return true; 495 } 496 497 unsigned selectLoopForInterchange(const LoopVector &LoopList) { 498 // TODO: Add a better heuristic to select the loop to be interchanged based 499 // on the dependence matrix. Currently we select the innermost loop. 500 return LoopList.size() - 1; 501 } 502 503 bool processLoopList(LoopVector LoopList, Function &F) { 504 505 bool Changed = false; 506 unsigned LoopNestDepth = LoopList.size(); 507 if (LoopNestDepth < 2) { 508 DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n"); 509 return false; 510 } 511 if (LoopNestDepth > MaxLoopNestDepth) { 512 DEBUG(dbgs() << "Cannot handle loops of depth greater than " 513 << MaxLoopNestDepth << "\n"); 514 return false; 515 } 516 if (!isComputableLoopNest(LoopList)) { 517 DEBUG(dbgs() << "Not valid loop candidate for interchange\n"); 518 return false; 519 } 520 521 DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth << "\n"); 522 523 CharMatrix DependencyMatrix; 524 Loop *OuterMostLoop = *(LoopList.begin()); 525 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth, 526 OuterMostLoop, DI)) { 527 DEBUG(dbgs() << "Populating dependency matrix failed\n"); 528 return false; 529 } 530 #ifdef DUMP_DEP_MATRICIES 531 DEBUG(dbgs() << "Dependence before interchange\n"); 532 printDepMatrix(DependencyMatrix); 533 #endif 534 535 BasicBlock *OuterMostLoopLatch = OuterMostLoop->getLoopLatch(); 536 BranchInst *OuterMostLoopLatchBI = 537 dyn_cast<BranchInst>(OuterMostLoopLatch->getTerminator()); 538 if (!OuterMostLoopLatchBI) 539 return false; 540 541 // Since we currently do not handle LCSSA PHI's any failure in loop 542 // condition will now branch to LoopNestExit. 543 // TODO: This should be removed once we handle LCSSA PHI nodes. 544 545 // Get the Outermost loop exit. 546 BasicBlock *LoopNestExit; 547 if (OuterMostLoopLatchBI->getSuccessor(0) == OuterMostLoop->getHeader()) 548 LoopNestExit = OuterMostLoopLatchBI->getSuccessor(1); 549 else 550 LoopNestExit = OuterMostLoopLatchBI->getSuccessor(0); 551 552 if (isa<PHINode>(LoopNestExit->begin())) { 553 DEBUG(dbgs() << "PHI Nodes in loop nest exit is not handled for now " 554 "since on failure all loops branch to loop nest exit.\n"); 555 return false; 556 } 557 558 unsigned SelecLoopId = selectLoopForInterchange(LoopList); 559 // Move the selected loop outwards to the best possible position. 560 for (unsigned i = SelecLoopId; i > 0; i--) { 561 bool Interchanged = 562 processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix); 563 if (!Interchanged) 564 return Changed; 565 // Loops interchanged reflect the same in LoopList 566 std::swap(LoopList[i - 1], LoopList[i]); 567 568 // Update the DependencyMatrix 569 interChangeDependencies(DependencyMatrix, i, i - 1); 570 DT->recalculate(F); 571 #ifdef DUMP_DEP_MATRICIES 572 DEBUG(dbgs() << "Dependence after interchange\n"); 573 printDepMatrix(DependencyMatrix); 574 #endif 575 Changed |= Interchanged; 576 } 577 return Changed; 578 } 579 580 bool processLoop(LoopVector LoopList, unsigned InnerLoopId, 581 unsigned OuterLoopId, BasicBlock *LoopNestExit, 582 std::vector<std::vector<char>> &DependencyMatrix) { 583 584 DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId 585 << " and OuterLoopId = " << OuterLoopId << "\n"); 586 Loop *InnerLoop = LoopList[InnerLoopId]; 587 Loop *OuterLoop = LoopList[OuterLoopId]; 588 589 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT, 590 PreserveLCSSA, ORE); 591 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { 592 DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n"); 593 return false; 594 } 595 DEBUG(dbgs() << "Loops are legal to interchange\n"); 596 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); 597 if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) { 598 DEBUG(dbgs() << "Interchanging loops not profitable\n"); 599 return false; 600 } 601 602 ORE->emit(OptimizationRemark(DEBUG_TYPE, "Interchanged", 603 InnerLoop->getStartLoc(), 604 InnerLoop->getHeader()) 605 << "Loop interchanged with enclosing loop."); 606 607 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, 608 LoopNestExit, LIL.hasInnerLoopReduction()); 609 LIT.transform(); 610 DEBUG(dbgs() << "Loops interchanged\n"); 611 return true; 612 } 613 }; 614 615 } // end of namespace 616 bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) { 617 return none_of(Ins->users(), [=](User *U) -> bool { 618 auto *UserIns = dyn_cast<PHINode>(U); 619 RecurrenceDescriptor RD; 620 return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD); 621 }); 622 } 623 624 bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader( 625 BasicBlock *BB) { 626 for (auto I = BB->begin(), E = BB->end(); I != E; ++I) { 627 // Load corresponding to reduction PHI's are safe while concluding if 628 // tightly nested. 629 if (LoadInst *L = dyn_cast<LoadInst>(I)) { 630 if (!areAllUsesReductions(L, InnerLoop)) 631 return true; 632 } else if (I->mayHaveSideEffects() || I->mayReadFromMemory()) 633 return true; 634 } 635 return false; 636 } 637 638 bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch( 639 BasicBlock *BB) { 640 for (auto I = BB->begin(), E = BB->end(); I != E; ++I) { 641 // Stores corresponding to reductions are safe while concluding if tightly 642 // nested. 643 if (StoreInst *L = dyn_cast<StoreInst>(I)) { 644 if (!isa<PHINode>(L->getOperand(0))) 645 return true; 646 } else if (I->mayHaveSideEffects() || I->mayReadFromMemory()) 647 return true; 648 } 649 return false; 650 } 651 652 bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { 653 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 654 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 655 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); 656 657 DEBUG(dbgs() << "Checking if loops are tightly nested\n"); 658 659 // A perfectly nested loop will not have any branch in between the outer and 660 // inner block i.e. outer header will branch to either inner preheader and 661 // outerloop latch. 662 BranchInst *OuterLoopHeaderBI = 663 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); 664 if (!OuterLoopHeaderBI) 665 return false; 666 667 for (unsigned i = 0, e = OuterLoopHeaderBI->getNumSuccessors(); i < e; ++i) { 668 if (OuterLoopHeaderBI->getSuccessor(i) != InnerLoopPreHeader && 669 OuterLoopHeaderBI->getSuccessor(i) != OuterLoopLatch) 670 return false; 671 } 672 673 DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n"); 674 // We do not have any basic block in between now make sure the outer header 675 // and outer loop latch doesn't contain any unsafe instructions. 676 if (containsUnsafeInstructionsInHeader(OuterLoopHeader) || 677 containsUnsafeInstructionsInLatch(OuterLoopLatch)) 678 return false; 679 680 DEBUG(dbgs() << "Loops are perfectly nested\n"); 681 // We have a perfect loop nest. 682 return true; 683 } 684 685 686 bool LoopInterchangeLegality::isLoopStructureUnderstood( 687 PHINode *InnerInduction) { 688 689 unsigned Num = InnerInduction->getNumOperands(); 690 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader(); 691 for (unsigned i = 0; i < Num; ++i) { 692 Value *Val = InnerInduction->getOperand(i); 693 if (isa<Constant>(Val)) 694 continue; 695 Instruction *I = dyn_cast<Instruction>(Val); 696 if (!I) 697 return false; 698 // TODO: Handle triangular loops. 699 // e.g. for(int i=0;i<N;i++) 700 // for(int j=i;j<N;j++) 701 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i); 702 if (InnerInduction->getIncomingBlock(IncomBlockIndx) == 703 InnerLoopPreheader && 704 !OuterLoop->isLoopInvariant(I)) { 705 return false; 706 } 707 } 708 return true; 709 } 710 711 bool LoopInterchangeLegality::findInductionAndReductions( 712 Loop *L, SmallVector<PHINode *, 8> &Inductions, 713 SmallVector<PHINode *, 8> &Reductions) { 714 if (!L->getLoopLatch() || !L->getLoopPredecessor()) 715 return false; 716 for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) { 717 RecurrenceDescriptor RD; 718 InductionDescriptor ID; 719 PHINode *PHI = cast<PHINode>(I); 720 if (InductionDescriptor::isInductionPHI(PHI, L, SE, ID)) 721 Inductions.push_back(PHI); 722 else if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) 723 Reductions.push_back(PHI); 724 else { 725 DEBUG( 726 dbgs() << "Failed to recognize PHI as an induction or reduction.\n"); 727 return false; 728 } 729 } 730 return true; 731 } 732 733 static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) { 734 for (auto I = Block->begin(); isa<PHINode>(I); ++I) { 735 PHINode *PHI = cast<PHINode>(I); 736 // Reduction lcssa phi will have only 1 incoming block that from loop latch. 737 if (PHI->getNumIncomingValues() > 1) 738 return false; 739 Instruction *Ins = dyn_cast<Instruction>(PHI->getIncomingValue(0)); 740 if (!Ins) 741 return false; 742 // Incoming value for lcssa phi's in outer loop exit can only be inner loop 743 // exits lcssa phi else it would not be tightly nested. 744 if (!isa<PHINode>(Ins) && isOuterLoopExitBlock) 745 return false; 746 } 747 return true; 748 } 749 750 static BasicBlock *getLoopLatchExitBlock(BasicBlock *LatchBlock, 751 BasicBlock *LoopHeader) { 752 if (BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator())) { 753 unsigned Num = BI->getNumSuccessors(); 754 assert(Num == 2); 755 for (unsigned i = 0; i < Num; ++i) { 756 if (BI->getSuccessor(i) == LoopHeader) 757 continue; 758 return BI->getSuccessor(i); 759 } 760 } 761 return nullptr; 762 } 763 764 // This function indicates the current limitations in the transform as a result 765 // of which we do not proceed. 766 bool LoopInterchangeLegality::currentLimitations() { 767 768 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 769 BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); 770 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 771 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); 772 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 773 774 PHINode *InnerInductionVar; 775 SmallVector<PHINode *, 8> Inductions; 776 SmallVector<PHINode *, 8> Reductions; 777 if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) { 778 DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes " 779 << "are supported currently.\n"); 780 ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, 781 "UnsupportedPHIInner", 782 InnerLoop->getStartLoc(), 783 InnerLoop->getHeader()) 784 << "Only inner loops with induction or reduction PHI nodes can be" 785 " interchange currently."); 786 return true; 787 } 788 789 // TODO: Currently we handle only loops with 1 induction variable. 790 if (Inductions.size() != 1) { 791 DEBUG(dbgs() << "We currently only support loops with 1 induction variable." 792 << "Failed to interchange due to current limitation\n"); 793 ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, 794 "MultiInductionInner", 795 InnerLoop->getStartLoc(), 796 InnerLoop->getHeader()) 797 << "Only inner loops with 1 induction variable can be " 798 "interchanged currently."); 799 return true; 800 } 801 if (Reductions.size() > 0) 802 InnerLoopHasReduction = true; 803 804 InnerInductionVar = Inductions.pop_back_val(); 805 Reductions.clear(); 806 if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) { 807 DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes " 808 << "are supported currently.\n"); 809 ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, 810 "UnsupportedPHIOuter", 811 OuterLoop->getStartLoc(), 812 OuterLoop->getHeader()) 813 << "Only outer loops with induction or reduction PHI nodes can be" 814 " interchanged currently."); 815 return true; 816 } 817 818 // Outer loop cannot have reduction because then loops will not be tightly 819 // nested. 820 if (!Reductions.empty()) { 821 DEBUG(dbgs() << "Outer loops with reductions are not supported " 822 << "currently.\n"); 823 ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, 824 "ReductionsOuter", 825 OuterLoop->getStartLoc(), 826 OuterLoop->getHeader()) 827 << "Outer loops with reductions cannot be interchangeed " 828 "currently."); 829 return true; 830 } 831 // TODO: Currently we handle only loops with 1 induction variable. 832 if (Inductions.size() != 1) { 833 DEBUG(dbgs() << "Loops with more than 1 induction variables are not " 834 << "supported currently.\n"); 835 ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, 836 "MultiIndutionOuter", 837 OuterLoop->getStartLoc(), 838 OuterLoop->getHeader()) 839 << "Only outer loops with 1 induction variable can be " 840 "interchanged currently."); 841 return true; 842 } 843 844 // TODO: Triangular loops are not handled for now. 845 if (!isLoopStructureUnderstood(InnerInductionVar)) { 846 DEBUG(dbgs() << "Loop structure not understood by pass\n"); 847 ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, 848 "UnsupportedStructureInner", 849 InnerLoop->getStartLoc(), 850 InnerLoop->getHeader()) 851 << "Inner loop structure not understood currently."); 852 return true; 853 } 854 855 // TODO: We only handle LCSSA PHI's corresponding to reduction for now. 856 BasicBlock *LoopExitBlock = 857 getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader); 858 if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) { 859 DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n"); 860 ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, 861 "NoLCSSAPHIOuter", 862 OuterLoop->getStartLoc(), 863 OuterLoop->getHeader()) 864 << "Only outer loops with LCSSA PHIs can be interchange " 865 "currently."); 866 return true; 867 } 868 869 LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader); 870 if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) { 871 DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n"); 872 ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, 873 "NoLCSSAPHIOuterInner", 874 InnerLoop->getStartLoc(), 875 InnerLoop->getHeader()) 876 << "Only inner loops with LCSSA PHIs can be interchange " 877 "currently."); 878 return true; 879 } 880 881 // TODO: Current limitation: Since we split the inner loop latch at the point 882 // were induction variable is incremented (induction.next); We cannot have 883 // more than 1 user of induction.next since it would result in broken code 884 // after split. 885 // e.g. 886 // for(i=0;i<N;i++) { 887 // for(j = 0;j<M;j++) { 888 // A[j+1][i+2] = A[j][i]+k; 889 // } 890 // } 891 Instruction *InnerIndexVarInc = nullptr; 892 if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader) 893 InnerIndexVarInc = 894 dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1)); 895 else 896 InnerIndexVarInc = 897 dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0)); 898 899 if (!InnerIndexVarInc) { 900 DEBUG(dbgs() << "Did not find an instruction to increment the induction " 901 << "variable.\n"); 902 ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, 903 "NoIncrementInInner", 904 InnerLoop->getStartLoc(), 905 InnerLoop->getHeader()) 906 << "The inner loop does not increment the induction variable."); 907 return true; 908 } 909 910 // Since we split the inner loop latch on this induction variable. Make sure 911 // we do not have any instruction between the induction variable and branch 912 // instruction. 913 914 bool FoundInduction = false; 915 for (const Instruction &I : reverse(*InnerLoopLatch)) { 916 if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I)) 917 continue; 918 919 // We found an instruction. If this is not induction variable then it is not 920 // safe to split this loop latch. 921 if (!I.isIdenticalTo(InnerIndexVarInc)) { 922 DEBUG(dbgs() << "Found unsupported instructions between induction " 923 << "variable increment and branch.\n"); 924 ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, 925 "UnsupportedInsBetweenInduction", 926 InnerLoop->getStartLoc(), 927 InnerLoop->getHeader()) 928 << "Found unsupported instruction between induction variable " 929 "increment and branch."); 930 return true; 931 } 932 933 FoundInduction = true; 934 break; 935 } 936 // The loop latch ended and we didn't find the induction variable return as 937 // current limitation. 938 if (!FoundInduction) { 939 DEBUG(dbgs() << "Did not find the induction variable.\n"); 940 ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, 941 "NoIndutionVariable", 942 InnerLoop->getStartLoc(), 943 InnerLoop->getHeader()) 944 << "Did not find the induction variable."); 945 return true; 946 } 947 return false; 948 } 949 950 bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, 951 unsigned OuterLoopId, 952 CharMatrix &DepMatrix) { 953 954 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) { 955 DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId 956 << " and OuterLoopId = " << OuterLoopId 957 << " due to dependence\n"); 958 ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, 959 "Dependence", 960 InnerLoop->getStartLoc(), 961 InnerLoop->getHeader()) 962 << "Cannot interchange loops due to dependences."); 963 return false; 964 } 965 966 // Create unique Preheaders if we already do not have one. 967 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); 968 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 969 970 // Create a unique outer preheader - 971 // 1) If OuterLoop preheader is not present. 972 // 2) If OuterLoop Preheader is same as OuterLoop Header 973 // 3) If OuterLoop Preheader is same as Header of the previous loop. 974 // 4) If OuterLoop Preheader is Entry node. 975 if (!OuterLoopPreHeader || OuterLoopPreHeader == OuterLoop->getHeader() || 976 isa<PHINode>(OuterLoopPreHeader->begin()) || 977 !OuterLoopPreHeader->getUniquePredecessor()) { 978 OuterLoopPreHeader = 979 InsertPreheaderForLoop(OuterLoop, DT, LI, PreserveLCSSA); 980 } 981 982 if (!InnerLoopPreHeader || InnerLoopPreHeader == InnerLoop->getHeader() || 983 InnerLoopPreHeader == OuterLoop->getHeader()) { 984 InnerLoopPreHeader = 985 InsertPreheaderForLoop(InnerLoop, DT, LI, PreserveLCSSA); 986 } 987 988 // TODO: The loops could not be interchanged due to current limitations in the 989 // transform module. 990 if (currentLimitations()) { 991 DEBUG(dbgs() << "Not legal because of current transform limitation\n"); 992 return false; 993 } 994 995 // Check if the loops are tightly nested. 996 if (!tightlyNested(OuterLoop, InnerLoop)) { 997 DEBUG(dbgs() << "Loops not tightly nested\n"); 998 ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, 999 "NotTightlyNested", 1000 InnerLoop->getStartLoc(), 1001 InnerLoop->getHeader()) 1002 << "Cannot interchange loops because they are not tightly " 1003 "nested."); 1004 return false; 1005 } 1006 1007 return true; 1008 } 1009 1010 int LoopInterchangeProfitability::getInstrOrderCost() { 1011 unsigned GoodOrder, BadOrder; 1012 BadOrder = GoodOrder = 0; 1013 for (auto BI = InnerLoop->block_begin(), BE = InnerLoop->block_end(); 1014 BI != BE; ++BI) { 1015 for (Instruction &Ins : **BI) { 1016 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) { 1017 unsigned NumOp = GEP->getNumOperands(); 1018 bool FoundInnerInduction = false; 1019 bool FoundOuterInduction = false; 1020 for (unsigned i = 0; i < NumOp; ++i) { 1021 const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i)); 1022 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal); 1023 if (!AR) 1024 continue; 1025 1026 // If we find the inner induction after an outer induction e.g. 1027 // for(int i=0;i<N;i++) 1028 // for(int j=0;j<N;j++) 1029 // A[i][j] = A[i-1][j-1]+k; 1030 // then it is a good order. 1031 if (AR->getLoop() == InnerLoop) { 1032 // We found an InnerLoop induction after OuterLoop induction. It is 1033 // a good order. 1034 FoundInnerInduction = true; 1035 if (FoundOuterInduction) { 1036 GoodOrder++; 1037 break; 1038 } 1039 } 1040 // If we find the outer induction after an inner induction e.g. 1041 // for(int i=0;i<N;i++) 1042 // for(int j=0;j<N;j++) 1043 // A[j][i] = A[j-1][i-1]+k; 1044 // then it is a bad order. 1045 if (AR->getLoop() == OuterLoop) { 1046 // We found an OuterLoop induction after InnerLoop induction. It is 1047 // a bad order. 1048 FoundOuterInduction = true; 1049 if (FoundInnerInduction) { 1050 BadOrder++; 1051 break; 1052 } 1053 } 1054 } 1055 } 1056 } 1057 } 1058 return GoodOrder - BadOrder; 1059 } 1060 1061 static bool isProfitableForVectorization(unsigned InnerLoopId, 1062 unsigned OuterLoopId, 1063 CharMatrix &DepMatrix) { 1064 // TODO: Improve this heuristic to catch more cases. 1065 // If the inner loop is loop independent or doesn't carry any dependency it is 1066 // profitable to move this to outer position. 1067 unsigned Row = DepMatrix.size(); 1068 for (unsigned i = 0; i < Row; ++i) { 1069 if (DepMatrix[i][InnerLoopId] != 'S' && DepMatrix[i][InnerLoopId] != 'I') 1070 return false; 1071 // TODO: We need to improve this heuristic. 1072 if (DepMatrix[i][OuterLoopId] != '=') 1073 return false; 1074 } 1075 // If outer loop has dependence and inner loop is loop independent then it is 1076 // profitable to interchange to enable parallelism. 1077 return true; 1078 } 1079 1080 bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, 1081 unsigned OuterLoopId, 1082 CharMatrix &DepMatrix) { 1083 1084 // TODO: Add better profitability checks. 1085 // e.g 1086 // 1) Construct dependency matrix and move the one with no loop carried dep 1087 // inside to enable vectorization. 1088 1089 // This is rough cost estimation algorithm. It counts the good and bad order 1090 // of induction variables in the instruction and allows reordering if number 1091 // of bad orders is more than good. 1092 int Cost = getInstrOrderCost(); 1093 DEBUG(dbgs() << "Cost = " << Cost << "\n"); 1094 if (Cost < -LoopInterchangeCostThreshold) 1095 return true; 1096 1097 // It is not profitable as per current cache profitability model. But check if 1098 // we can move this loop outside to improve parallelism. 1099 if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix)) 1100 return true; 1101 1102 ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, 1103 "InterchangeNotProfitable", 1104 InnerLoop->getStartLoc(), 1105 InnerLoop->getHeader()) 1106 << "Interchanging loops is too costly (cost=" 1107 << ore::NV("Cost", Cost) << ", threshold=" 1108 << ore::NV("Threshold", LoopInterchangeCostThreshold) << 1109 ") and it does not improve parallelism."); 1110 return false; 1111 } 1112 1113 void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, 1114 Loop *InnerLoop) { 1115 for (Loop::iterator I = OuterLoop->begin(), E = OuterLoop->end(); I != E; 1116 ++I) { 1117 if (*I == InnerLoop) { 1118 OuterLoop->removeChildLoop(I); 1119 return; 1120 } 1121 } 1122 llvm_unreachable("Couldn't find loop"); 1123 } 1124 1125 void LoopInterchangeTransform::restructureLoops(Loop *InnerLoop, 1126 Loop *OuterLoop) { 1127 Loop *OuterLoopParent = OuterLoop->getParentLoop(); 1128 if (OuterLoopParent) { 1129 // Remove the loop from its parent loop. 1130 removeChildLoop(OuterLoopParent, OuterLoop); 1131 removeChildLoop(OuterLoop, InnerLoop); 1132 OuterLoopParent->addChildLoop(InnerLoop); 1133 } else { 1134 removeChildLoop(OuterLoop, InnerLoop); 1135 LI->changeTopLevelLoop(OuterLoop, InnerLoop); 1136 } 1137 1138 while (!InnerLoop->empty()) 1139 OuterLoop->addChildLoop(InnerLoop->removeChildLoop(InnerLoop->begin())); 1140 1141 InnerLoop->addChildLoop(OuterLoop); 1142 } 1143 1144 bool LoopInterchangeTransform::transform() { 1145 bool Transformed = false; 1146 Instruction *InnerIndexVar; 1147 1148 if (InnerLoop->getSubLoops().size() == 0) { 1149 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1150 DEBUG(dbgs() << "Calling Split Inner Loop\n"); 1151 PHINode *InductionPHI = getInductionVariable(InnerLoop, SE); 1152 if (!InductionPHI) { 1153 DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); 1154 return false; 1155 } 1156 1157 if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader) 1158 InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1)); 1159 else 1160 InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0)); 1161 1162 // 1163 // Split at the place were the induction variable is 1164 // incremented/decremented. 1165 // TODO: This splitting logic may not work always. Fix this. 1166 splitInnerLoopLatch(InnerIndexVar); 1167 DEBUG(dbgs() << "splitInnerLoopLatch done\n"); 1168 1169 // Splits the inner loops phi nodes out into a separate basic block. 1170 splitInnerLoopHeader(); 1171 DEBUG(dbgs() << "splitInnerLoopHeader done\n"); 1172 } 1173 1174 Transformed |= adjustLoopLinks(); 1175 if (!Transformed) { 1176 DEBUG(dbgs() << "adjustLoopLinks failed\n"); 1177 return false; 1178 } 1179 1180 restructureLoops(InnerLoop, OuterLoop); 1181 return true; 1182 } 1183 1184 void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) { 1185 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 1186 BasicBlock *InnerLoopLatchPred = InnerLoopLatch; 1187 InnerLoopLatch = SplitBlock(InnerLoopLatchPred, Inc, DT, LI); 1188 } 1189 1190 void LoopInterchangeTransform::splitInnerLoopHeader() { 1191 1192 // Split the inner loop header out. Here make sure that the reduction PHI's 1193 // stay in the innerloop body. 1194 BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); 1195 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1196 if (InnerLoopHasReduction) { 1197 // FIXME: Check if the induction PHI will always be the first PHI. 1198 BasicBlock *New = InnerLoopHeader->splitBasicBlock( 1199 ++(InnerLoopHeader->begin()), InnerLoopHeader->getName() + ".split"); 1200 if (LI) 1201 if (Loop *L = LI->getLoopFor(InnerLoopHeader)) 1202 L->addBasicBlockToLoop(New, *LI); 1203 1204 // Adjust Reduction PHI's in the block. 1205 SmallVector<PHINode *, 8> PHIVec; 1206 for (auto I = New->begin(); isa<PHINode>(I); ++I) { 1207 PHINode *PHI = dyn_cast<PHINode>(I); 1208 Value *V = PHI->getIncomingValueForBlock(InnerLoopPreHeader); 1209 PHI->replaceAllUsesWith(V); 1210 PHIVec.push_back((PHI)); 1211 } 1212 for (PHINode *P : PHIVec) { 1213 P->eraseFromParent(); 1214 } 1215 } else { 1216 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI); 1217 } 1218 1219 DEBUG(dbgs() << "Output of splitInnerLoopHeader InnerLoopHeaderSucc & " 1220 "InnerLoopHeader\n"); 1221 } 1222 1223 /// \brief Move all instructions except the terminator from FromBB right before 1224 /// InsertBefore 1225 static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { 1226 auto &ToList = InsertBefore->getParent()->getInstList(); 1227 auto &FromList = FromBB->getInstList(); 1228 1229 ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(), 1230 FromBB->getTerminator()->getIterator()); 1231 } 1232 1233 void LoopInterchangeTransform::updateIncomingBlock(BasicBlock *CurrBlock, 1234 BasicBlock *OldPred, 1235 BasicBlock *NewPred) { 1236 for (auto I = CurrBlock->begin(); isa<PHINode>(I); ++I) { 1237 PHINode *PHI = cast<PHINode>(I); 1238 unsigned Num = PHI->getNumIncomingValues(); 1239 for (unsigned i = 0; i < Num; ++i) { 1240 if (PHI->getIncomingBlock(i) == OldPred) 1241 PHI->setIncomingBlock(i, NewPred); 1242 } 1243 } 1244 } 1245 1246 bool LoopInterchangeTransform::adjustLoopBranches() { 1247 1248 DEBUG(dbgs() << "adjustLoopBranches called\n"); 1249 // Adjust the loop preheader 1250 BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); 1251 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 1252 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); 1253 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); 1254 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); 1255 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1256 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor(); 1257 BasicBlock *InnerLoopLatchPredecessor = 1258 InnerLoopLatch->getUniquePredecessor(); 1259 BasicBlock *InnerLoopLatchSuccessor; 1260 BasicBlock *OuterLoopLatchSuccessor; 1261 1262 BranchInst *OuterLoopLatchBI = 1263 dyn_cast<BranchInst>(OuterLoopLatch->getTerminator()); 1264 BranchInst *InnerLoopLatchBI = 1265 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator()); 1266 BranchInst *OuterLoopHeaderBI = 1267 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator()); 1268 BranchInst *InnerLoopHeaderBI = 1269 dyn_cast<BranchInst>(InnerLoopHeader->getTerminator()); 1270 1271 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor || 1272 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI || 1273 !InnerLoopHeaderBI) 1274 return false; 1275 1276 BranchInst *InnerLoopLatchPredecessorBI = 1277 dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator()); 1278 BranchInst *OuterLoopPredecessorBI = 1279 dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator()); 1280 1281 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI) 1282 return false; 1283 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor(); 1284 if (!InnerLoopHeaderSuccessor) 1285 return false; 1286 1287 // Adjust Loop Preheader and headers 1288 1289 unsigned NumSucc = OuterLoopPredecessorBI->getNumSuccessors(); 1290 for (unsigned i = 0; i < NumSucc; ++i) { 1291 if (OuterLoopPredecessorBI->getSuccessor(i) == OuterLoopPreHeader) 1292 OuterLoopPredecessorBI->setSuccessor(i, InnerLoopPreHeader); 1293 } 1294 1295 NumSucc = OuterLoopHeaderBI->getNumSuccessors(); 1296 for (unsigned i = 0; i < NumSucc; ++i) { 1297 if (OuterLoopHeaderBI->getSuccessor(i) == OuterLoopLatch) 1298 OuterLoopHeaderBI->setSuccessor(i, LoopExit); 1299 else if (OuterLoopHeaderBI->getSuccessor(i) == InnerLoopPreHeader) 1300 OuterLoopHeaderBI->setSuccessor(i, InnerLoopHeaderSuccessor); 1301 } 1302 1303 // Adjust reduction PHI's now that the incoming block has changed. 1304 updateIncomingBlock(InnerLoopHeaderSuccessor, InnerLoopHeader, 1305 OuterLoopHeader); 1306 1307 BranchInst::Create(OuterLoopPreHeader, InnerLoopHeaderBI); 1308 InnerLoopHeaderBI->eraseFromParent(); 1309 1310 // -------------Adjust loop latches----------- 1311 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader) 1312 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1); 1313 else 1314 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0); 1315 1316 NumSucc = InnerLoopLatchPredecessorBI->getNumSuccessors(); 1317 for (unsigned i = 0; i < NumSucc; ++i) { 1318 if (InnerLoopLatchPredecessorBI->getSuccessor(i) == InnerLoopLatch) 1319 InnerLoopLatchPredecessorBI->setSuccessor(i, InnerLoopLatchSuccessor); 1320 } 1321 1322 // Adjust PHI nodes in InnerLoopLatchSuccessor. Update all uses of PHI with 1323 // the value and remove this PHI node from inner loop. 1324 SmallVector<PHINode *, 8> LcssaVec; 1325 for (auto I = InnerLoopLatchSuccessor->begin(); isa<PHINode>(I); ++I) { 1326 PHINode *LcssaPhi = cast<PHINode>(I); 1327 LcssaVec.push_back(LcssaPhi); 1328 } 1329 for (PHINode *P : LcssaVec) { 1330 Value *Incoming = P->getIncomingValueForBlock(InnerLoopLatch); 1331 P->replaceAllUsesWith(Incoming); 1332 P->eraseFromParent(); 1333 } 1334 1335 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader) 1336 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1); 1337 else 1338 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0); 1339 1340 if (InnerLoopLatchBI->getSuccessor(1) == InnerLoopLatchSuccessor) 1341 InnerLoopLatchBI->setSuccessor(1, OuterLoopLatchSuccessor); 1342 else 1343 InnerLoopLatchBI->setSuccessor(0, OuterLoopLatchSuccessor); 1344 1345 updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch); 1346 1347 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopLatchSuccessor) { 1348 OuterLoopLatchBI->setSuccessor(0, InnerLoopLatch); 1349 } else { 1350 OuterLoopLatchBI->setSuccessor(1, InnerLoopLatch); 1351 } 1352 1353 return true; 1354 } 1355 void LoopInterchangeTransform::adjustLoopPreheaders() { 1356 1357 // We have interchanged the preheaders so we need to interchange the data in 1358 // the preheader as well. 1359 // This is because the content of inner preheader was previously executed 1360 // inside the outer loop. 1361 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader(); 1362 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); 1363 BasicBlock *OuterLoopHeader = OuterLoop->getHeader(); 1364 BranchInst *InnerTermBI = 1365 cast<BranchInst>(InnerLoopPreHeader->getTerminator()); 1366 1367 // These instructions should now be executed inside the loop. 1368 // Move instruction into a new block after outer header. 1369 moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator()); 1370 // These instructions were not executed previously in the loop so move them to 1371 // the older inner loop preheader. 1372 moveBBContents(OuterLoopPreHeader, InnerTermBI); 1373 } 1374 1375 bool LoopInterchangeTransform::adjustLoopLinks() { 1376 1377 // Adjust all branches in the inner and outer loop. 1378 bool Changed = adjustLoopBranches(); 1379 if (Changed) 1380 adjustLoopPreheaders(); 1381 return Changed; 1382 } 1383 1384 char LoopInterchange::ID = 0; 1385 INITIALIZE_PASS_BEGIN(LoopInterchange, "loop-interchange", 1386 "Interchanges loops for cache reuse", false, false) 1387 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 1388 INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) 1389 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 1390 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 1391 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 1392 INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) 1393 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 1394 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 1395 1396 INITIALIZE_PASS_END(LoopInterchange, "loop-interchange", 1397 "Interchanges loops for cache reuse", false, false) 1398 1399 Pass *llvm::createLoopInterchangePass() { return new LoopInterchange(); } 1400