1 //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file implements the loop fusion pass. 11 /// The implementation is largely based on the following document: 12 /// 13 /// Code Transformations to Augment the Scope of Loop Fusion in a 14 /// Production Compiler 15 /// Christopher Mark Barton 16 /// MSc Thesis 17 /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf 18 /// 19 /// The general approach taken is to collect sets of control flow equivalent 20 /// loops and test whether they can be fused. The necessary conditions for 21 /// fusion are: 22 /// 1. The loops must be adjacent (there cannot be any statements between 23 /// the two loops). 24 /// 2. The loops must be conforming (they must execute the same number of 25 /// iterations). 26 /// 3. The loops must be control flow equivalent (if one loop executes, the 27 /// other is guaranteed to execute). 28 /// 4. There cannot be any negative distance dependencies between the loops. 29 /// If all of these conditions are satisfied, it is safe to fuse the loops. 30 /// 31 /// This implementation creates FusionCandidates that represent the loop and the 32 /// necessary information needed by fusion. It then operates on the fusion 33 /// candidates, first confirming that the candidate is eligible for fusion. The 34 /// candidates are then collected into control flow equivalent sets, sorted in 35 /// dominance order. Each set of control flow equivalent candidates is then 36 /// traversed, attempting to fuse pairs of candidates in the set. If all 37 /// requirements for fusion are met, the two candidates are fused, creating a 38 /// new (fused) candidate which is then added back into the set to consider for 39 /// additional fusion. 40 /// 41 /// This implementation currently does not make any modifications to remove 42 /// conditions for fusion. Code transformations to make loops conform to each of 43 /// the conditions for fusion are discussed in more detail in the document 44 /// above. These can be added to the current implementation in the future. 45 //===----------------------------------------------------------------------===// 46 47 #include "llvm/Transforms/Scalar/LoopFuse.h" 48 #include "llvm/ADT/Statistic.h" 49 #include "llvm/Analysis/AssumptionCache.h" 50 #include "llvm/Analysis/DependenceAnalysis.h" 51 #include "llvm/Analysis/DomTreeUpdater.h" 52 #include "llvm/Analysis/LoopInfo.h" 53 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 54 #include "llvm/Analysis/PostDominators.h" 55 #include "llvm/Analysis/ScalarEvolution.h" 56 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 57 #include "llvm/Analysis/TargetTransformInfo.h" 58 #include "llvm/IR/Function.h" 59 #include "llvm/IR/Verifier.h" 60 #include "llvm/InitializePasses.h" 61 #include "llvm/Pass.h" 62 #include "llvm/Support/CommandLine.h" 63 #include "llvm/Support/Debug.h" 64 #include "llvm/Support/raw_ostream.h" 65 #include "llvm/Transforms/Scalar.h" 66 #include "llvm/Transforms/Utils.h" 67 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 68 #include "llvm/Transforms/Utils/CodeMoverUtils.h" 69 #include "llvm/Transforms/Utils/UnrollLoop.h" 70 71 using namespace llvm; 72 73 #define DEBUG_TYPE "loop-fusion" 74 75 STATISTIC(FuseCounter, "Loops fused"); 76 STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion"); 77 STATISTIC(InvalidPreheader, "Loop has invalid preheader"); 78 STATISTIC(InvalidHeader, "Loop has invalid header"); 79 STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks"); 80 STATISTIC(InvalidExitBlock, "Loop has invalid exit block"); 81 STATISTIC(InvalidLatch, "Loop has invalid latch"); 82 STATISTIC(InvalidLoop, "Loop is invalid"); 83 STATISTIC(AddressTakenBB, "Basic block has address taken"); 84 STATISTIC(MayThrowException, "Loop may throw an exception"); 85 STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access"); 86 STATISTIC(NotSimplifiedForm, "Loop is not in simplified form"); 87 STATISTIC(InvalidDependencies, "Dependencies prevent fusion"); 88 STATISTIC(UnknownTripCount, "Loop has unknown trip count"); 89 STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop"); 90 STATISTIC(NonEqualTripCount, "Loop trip counts are not the same"); 91 STATISTIC(NonAdjacent, "Loops are not adjacent"); 92 STATISTIC( 93 NonEmptyPreheader, 94 "Loop has a non-empty preheader with instructions that cannot be moved"); 95 STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); 96 STATISTIC(NonIdenticalGuards, "Candidates have different guards"); 97 STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with " 98 "instructions that cannot be moved"); 99 STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with " 100 "instructions that cannot be moved"); 101 STATISTIC(NotRotated, "Candidate is not rotated"); 102 103 enum FusionDependenceAnalysisChoice { 104 FUSION_DEPENDENCE_ANALYSIS_SCEV, 105 FUSION_DEPENDENCE_ANALYSIS_DA, 106 FUSION_DEPENDENCE_ANALYSIS_ALL, 107 }; 108 109 static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis( 110 "loop-fusion-dependence-analysis", 111 cl::desc("Which dependence analysis should loop fusion use?"), 112 cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev", 113 "Use the scalar evolution interface"), 114 clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da", 115 "Use the dependence analysis interface"), 116 clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all", 117 "Use all available analyses")), 118 cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore); 119 120 static cl::opt<unsigned> FusionPeelMaxCount( 121 "loop-fusion-peel-max-count", cl::init(0), cl::Hidden, 122 cl::desc("Max number of iterations to be peeled from a loop, such that " 123 "fusion can take place")); 124 125 #ifndef NDEBUG 126 static cl::opt<bool> 127 VerboseFusionDebugging("loop-fusion-verbose-debug", 128 cl::desc("Enable verbose debugging for Loop Fusion"), 129 cl::Hidden, cl::init(false), cl::ZeroOrMore); 130 #endif 131 132 namespace { 133 /// This class is used to represent a candidate for loop fusion. When it is 134 /// constructed, it checks the conditions for loop fusion to ensure that it 135 /// represents a valid candidate. It caches several parts of a loop that are 136 /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead 137 /// of continually querying the underlying Loop to retrieve these values. It is 138 /// assumed these will not change throughout loop fusion. 139 /// 140 /// The invalidate method should be used to indicate that the FusionCandidate is 141 /// no longer a valid candidate for fusion. Similarly, the isValid() method can 142 /// be used to ensure that the FusionCandidate is still valid for fusion. 143 struct FusionCandidate { 144 /// Cache of parts of the loop used throughout loop fusion. These should not 145 /// need to change throughout the analysis and transformation. 146 /// These parts are cached to avoid repeatedly looking up in the Loop class. 147 148 /// Preheader of the loop this candidate represents 149 BasicBlock *Preheader; 150 /// Header of the loop this candidate represents 151 BasicBlock *Header; 152 /// Blocks in the loop that exit the loop 153 BasicBlock *ExitingBlock; 154 /// The successor block of this loop (where the exiting blocks go to) 155 BasicBlock *ExitBlock; 156 /// Latch of the loop 157 BasicBlock *Latch; 158 /// The loop that this fusion candidate represents 159 Loop *L; 160 /// Vector of instructions in this loop that read from memory 161 SmallVector<Instruction *, 16> MemReads; 162 /// Vector of instructions in this loop that write to memory 163 SmallVector<Instruction *, 16> MemWrites; 164 /// Are all of the members of this fusion candidate still valid 165 bool Valid; 166 /// Guard branch of the loop, if it exists 167 BranchInst *GuardBranch; 168 /// Peeling Paramaters of the Loop. 169 TTI::PeelingPreferences PP; 170 /// Can you Peel this Loop? 171 bool AbleToPeel; 172 /// Has this loop been Peeled 173 bool Peeled; 174 175 /// Dominator and PostDominator trees are needed for the 176 /// FusionCandidateCompare function, required by FusionCandidateSet to 177 /// determine where the FusionCandidate should be inserted into the set. These 178 /// are used to establish ordering of the FusionCandidates based on dominance. 179 const DominatorTree *DT; 180 const PostDominatorTree *PDT; 181 182 OptimizationRemarkEmitter &ORE; 183 184 FusionCandidate(Loop *L, const DominatorTree *DT, 185 const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE, 186 TTI::PeelingPreferences PP) 187 : Preheader(L->getLoopPreheader()), Header(L->getHeader()), 188 ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), 189 Latch(L->getLoopLatch()), L(L), Valid(true), 190 GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)), 191 Peeled(false), DT(DT), PDT(PDT), ORE(ORE) { 192 193 // Walk over all blocks in the loop and check for conditions that may 194 // prevent fusion. For each block, walk over all instructions and collect 195 // the memory reads and writes If any instructions that prevent fusion are 196 // found, invalidate this object and return. 197 for (BasicBlock *BB : L->blocks()) { 198 if (BB->hasAddressTaken()) { 199 invalidate(); 200 reportInvalidCandidate(AddressTakenBB); 201 return; 202 } 203 204 for (Instruction &I : *BB) { 205 if (I.mayThrow()) { 206 invalidate(); 207 reportInvalidCandidate(MayThrowException); 208 return; 209 } 210 if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { 211 if (SI->isVolatile()) { 212 invalidate(); 213 reportInvalidCandidate(ContainsVolatileAccess); 214 return; 215 } 216 } 217 if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { 218 if (LI->isVolatile()) { 219 invalidate(); 220 reportInvalidCandidate(ContainsVolatileAccess); 221 return; 222 } 223 } 224 if (I.mayWriteToMemory()) 225 MemWrites.push_back(&I); 226 if (I.mayReadFromMemory()) 227 MemReads.push_back(&I); 228 } 229 } 230 } 231 232 /// Check if all members of the class are valid. 233 bool isValid() const { 234 return Preheader && Header && ExitingBlock && ExitBlock && Latch && L && 235 !L->isInvalid() && Valid; 236 } 237 238 /// Verify that all members are in sync with the Loop object. 239 void verify() const { 240 assert(isValid() && "Candidate is not valid!!"); 241 assert(!L->isInvalid() && "Loop is invalid!"); 242 assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync"); 243 assert(Header == L->getHeader() && "Header is out of sync"); 244 assert(ExitingBlock == L->getExitingBlock() && 245 "Exiting Blocks is out of sync"); 246 assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync"); 247 assert(Latch == L->getLoopLatch() && "Latch is out of sync"); 248 } 249 250 /// Get the entry block for this fusion candidate. 251 /// 252 /// If this fusion candidate represents a guarded loop, the entry block is the 253 /// loop guard block. If it represents an unguarded loop, the entry block is 254 /// the preheader of the loop. 255 BasicBlock *getEntryBlock() const { 256 if (GuardBranch) 257 return GuardBranch->getParent(); 258 else 259 return Preheader; 260 } 261 262 /// After Peeling the loop is modified quite a bit, hence all of the Blocks 263 /// need to be updated accordingly. 264 void updateAfterPeeling() { 265 Preheader = L->getLoopPreheader(); 266 Header = L->getHeader(); 267 ExitingBlock = L->getExitingBlock(); 268 ExitBlock = L->getExitBlock(); 269 Latch = L->getLoopLatch(); 270 verify(); 271 } 272 273 /// Given a guarded loop, get the successor of the guard that is not in the 274 /// loop. 275 /// 276 /// This method returns the successor of the loop guard that is not located 277 /// within the loop (i.e., the successor of the guard that is not the 278 /// preheader). 279 /// This method is only valid for guarded loops. 280 BasicBlock *getNonLoopBlock() const { 281 assert(GuardBranch && "Only valid on guarded loops."); 282 assert(GuardBranch->isConditional() && 283 "Expecting guard to be a conditional branch."); 284 if (Peeled) 285 return GuardBranch->getSuccessor(1); 286 return (GuardBranch->getSuccessor(0) == Preheader) 287 ? GuardBranch->getSuccessor(1) 288 : GuardBranch->getSuccessor(0); 289 } 290 291 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 292 LLVM_DUMP_METHOD void dump() const { 293 dbgs() << "\tGuardBranch: "; 294 if (GuardBranch) 295 dbgs() << *GuardBranch; 296 else 297 dbgs() << "nullptr"; 298 dbgs() << "\n" 299 << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n" 300 << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") 301 << "\n" 302 << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n" 303 << "\tExitingBB: " 304 << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n" 305 << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr") 306 << "\n" 307 << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n" 308 << "\tEntryBlock: " 309 << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr") 310 << "\n"; 311 } 312 #endif 313 314 /// Determine if a fusion candidate (representing a loop) is eligible for 315 /// fusion. Note that this only checks whether a single loop can be fused - it 316 /// does not check whether it is *legal* to fuse two loops together. 317 bool isEligibleForFusion(ScalarEvolution &SE) const { 318 if (!isValid()) { 319 LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n"); 320 if (!Preheader) 321 ++InvalidPreheader; 322 if (!Header) 323 ++InvalidHeader; 324 if (!ExitingBlock) 325 ++InvalidExitingBlock; 326 if (!ExitBlock) 327 ++InvalidExitBlock; 328 if (!Latch) 329 ++InvalidLatch; 330 if (L->isInvalid()) 331 ++InvalidLoop; 332 333 return false; 334 } 335 336 // Require ScalarEvolution to be able to determine a trip count. 337 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { 338 LLVM_DEBUG(dbgs() << "Loop " << L->getName() 339 << " trip count not computable!\n"); 340 return reportInvalidCandidate(UnknownTripCount); 341 } 342 343 if (!L->isLoopSimplifyForm()) { 344 LLVM_DEBUG(dbgs() << "Loop " << L->getName() 345 << " is not in simplified form!\n"); 346 return reportInvalidCandidate(NotSimplifiedForm); 347 } 348 349 if (!L->isRotatedForm()) { 350 LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n"); 351 return reportInvalidCandidate(NotRotated); 352 } 353 354 return true; 355 } 356 357 private: 358 // This is only used internally for now, to clear the MemWrites and MemReads 359 // list and setting Valid to false. I can't envision other uses of this right 360 // now, since once FusionCandidates are put into the FusionCandidateSet they 361 // are immutable. Thus, any time we need to change/update a FusionCandidate, 362 // we must create a new one and insert it into the FusionCandidateSet to 363 // ensure the FusionCandidateSet remains ordered correctly. 364 void invalidate() { 365 MemWrites.clear(); 366 MemReads.clear(); 367 Valid = false; 368 } 369 370 bool reportInvalidCandidate(llvm::Statistic &Stat) const { 371 using namespace ore; 372 assert(L && Preheader && "Fusion candidate not initialized properly!"); 373 ++Stat; 374 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), 375 L->getStartLoc(), Preheader) 376 << "[" << Preheader->getParent()->getName() << "]: " 377 << "Loop is not a candidate for fusion: " << Stat.getDesc()); 378 return false; 379 } 380 }; 381 382 struct FusionCandidateCompare { 383 /// Comparison functor to sort two Control Flow Equivalent fusion candidates 384 /// into dominance order. 385 /// If LHS dominates RHS and RHS post-dominates LHS, return true; 386 /// IF RHS dominates LHS and LHS post-dominates RHS, return false; 387 bool operator()(const FusionCandidate &LHS, 388 const FusionCandidate &RHS) const { 389 const DominatorTree *DT = LHS.DT; 390 const PostDominatorTree *PDT = LHS.PDT; 391 392 BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); 393 BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); 394 395 // Do not save PDT to local variable as it is only used in asserts and thus 396 // will trigger an unused variable warning if building without asserts. 397 assert(DT && PDT && "Expecting valid dominator tree"); 398 399 // Do this compare first so if LHS == RHS, function returns false. 400 if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) { 401 // RHS dominates LHS 402 // Verify LHS post-dominates RHS 403 assert(PDT->dominates(LHSEntryBlock, RHSEntryBlock)); 404 return false; 405 } 406 407 if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) { 408 // Verify RHS Postdominates LHS 409 assert(PDT->dominates(RHSEntryBlock, LHSEntryBlock)); 410 return true; 411 } 412 413 // If LHS does not dominate RHS and RHS does not dominate LHS then there is 414 // no dominance relationship between the two FusionCandidates. Thus, they 415 // should not be in the same set together. 416 llvm_unreachable( 417 "No dominance relationship between these fusion candidates!"); 418 } 419 }; 420 421 using LoopVector = SmallVector<Loop *, 4>; 422 423 // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance 424 // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0 425 // dominates FC1 and FC1 post-dominates FC0. 426 // std::set was chosen because we want a sorted data structure with stable 427 // iterators. A subsequent patch to loop fusion will enable fusing non-ajdacent 428 // loops by moving intervening code around. When this intervening code contains 429 // loops, those loops will be moved also. The corresponding FusionCandidates 430 // will also need to be moved accordingly. As this is done, having stable 431 // iterators will simplify the logic. Similarly, having an efficient insert that 432 // keeps the FusionCandidateSet sorted will also simplify the implementation. 433 using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; 434 using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; 435 436 #if !defined(NDEBUG) 437 static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, 438 const FusionCandidate &FC) { 439 if (FC.isValid()) 440 OS << FC.Preheader->getName(); 441 else 442 OS << "<Invalid>"; 443 444 return OS; 445 } 446 447 static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, 448 const FusionCandidateSet &CandSet) { 449 for (const FusionCandidate &FC : CandSet) 450 OS << FC << '\n'; 451 452 return OS; 453 } 454 455 static void 456 printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { 457 dbgs() << "Fusion Candidates: \n"; 458 for (const auto &CandidateSet : FusionCandidates) { 459 dbgs() << "*** Fusion Candidate Set ***\n"; 460 dbgs() << CandidateSet; 461 dbgs() << "****************************\n"; 462 } 463 } 464 #endif 465 466 /// Collect all loops in function at the same nest level, starting at the 467 /// outermost level. 468 /// 469 /// This data structure collects all loops at the same nest level for a 470 /// given function (specified by the LoopInfo object). It starts at the 471 /// outermost level. 472 struct LoopDepthTree { 473 using LoopsOnLevelTy = SmallVector<LoopVector, 4>; 474 using iterator = LoopsOnLevelTy::iterator; 475 using const_iterator = LoopsOnLevelTy::const_iterator; 476 477 LoopDepthTree(LoopInfo &LI) : Depth(1) { 478 if (!LI.empty()) 479 LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend())); 480 } 481 482 /// Test whether a given loop has been removed from the function, and thus is 483 /// no longer valid. 484 bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); } 485 486 /// Record that a given loop has been removed from the function and is no 487 /// longer valid. 488 void removeLoop(const Loop *L) { RemovedLoops.insert(L); } 489 490 /// Descend the tree to the next (inner) nesting level 491 void descend() { 492 LoopsOnLevelTy LoopsOnNextLevel; 493 494 for (const LoopVector &LV : *this) 495 for (Loop *L : LV) 496 if (!isRemovedLoop(L) && L->begin() != L->end()) 497 LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end())); 498 499 LoopsOnLevel = LoopsOnNextLevel; 500 RemovedLoops.clear(); 501 Depth++; 502 } 503 504 bool empty() const { return size() == 0; } 505 size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); } 506 unsigned getDepth() const { return Depth; } 507 508 iterator begin() { return LoopsOnLevel.begin(); } 509 iterator end() { return LoopsOnLevel.end(); } 510 const_iterator begin() const { return LoopsOnLevel.begin(); } 511 const_iterator end() const { return LoopsOnLevel.end(); } 512 513 private: 514 /// Set of loops that have been removed from the function and are no longer 515 /// valid. 516 SmallPtrSet<const Loop *, 8> RemovedLoops; 517 518 /// Depth of the current level, starting at 1 (outermost loops). 519 unsigned Depth; 520 521 /// Vector of loops at the current depth level that have the same parent loop 522 LoopsOnLevelTy LoopsOnLevel; 523 }; 524 525 #ifndef NDEBUG 526 static void printLoopVector(const LoopVector &LV) { 527 dbgs() << "****************************\n"; 528 for (auto L : LV) 529 printLoop(*L, dbgs()); 530 dbgs() << "****************************\n"; 531 } 532 #endif 533 534 struct LoopFuser { 535 private: 536 // Sets of control flow equivalent fusion candidates for a given nest level. 537 FusionCandidateCollection FusionCandidates; 538 539 LoopDepthTree LDT; 540 DomTreeUpdater DTU; 541 542 LoopInfo &LI; 543 DominatorTree &DT; 544 DependenceInfo &DI; 545 ScalarEvolution &SE; 546 PostDominatorTree &PDT; 547 OptimizationRemarkEmitter &ORE; 548 AssumptionCache &AC; 549 550 const TargetTransformInfo &TTI; 551 552 public: 553 LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI, 554 ScalarEvolution &SE, PostDominatorTree &PDT, 555 OptimizationRemarkEmitter &ORE, const DataLayout &DL, 556 AssumptionCache &AC, const TargetTransformInfo &TTI) 557 : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI), 558 DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {} 559 560 /// This is the main entry point for loop fusion. It will traverse the 561 /// specified function and collect candidate loops to fuse, starting at the 562 /// outermost nesting level and working inwards. 563 bool fuseLoops(Function &F) { 564 #ifndef NDEBUG 565 if (VerboseFusionDebugging) { 566 LI.print(dbgs()); 567 } 568 #endif 569 570 LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName() 571 << "\n"); 572 bool Changed = false; 573 574 while (!LDT.empty()) { 575 LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth " 576 << LDT.getDepth() << "\n";); 577 578 for (const LoopVector &LV : LDT) { 579 assert(LV.size() > 0 && "Empty loop set was build!"); 580 581 // Skip singleton loop sets as they do not offer fusion opportunities on 582 // this level. 583 if (LV.size() == 1) 584 continue; 585 #ifndef NDEBUG 586 if (VerboseFusionDebugging) { 587 LLVM_DEBUG({ 588 dbgs() << " Visit loop set (#" << LV.size() << "):\n"; 589 printLoopVector(LV); 590 }); 591 } 592 #endif 593 594 collectFusionCandidates(LV); 595 Changed |= fuseCandidates(); 596 } 597 598 // Finished analyzing candidates at this level. 599 // Descend to the next level and clear all of the candidates currently 600 // collected. Note that it will not be possible to fuse any of the 601 // existing candidates with new candidates because the new candidates will 602 // be at a different nest level and thus not be control flow equivalent 603 // with all of the candidates collected so far. 604 LLVM_DEBUG(dbgs() << "Descend one level!\n"); 605 LDT.descend(); 606 FusionCandidates.clear(); 607 } 608 609 if (Changed) 610 LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump();); 611 612 #ifndef NDEBUG 613 assert(DT.verify()); 614 assert(PDT.verify()); 615 LI.verify(DT); 616 SE.verify(); 617 #endif 618 619 LLVM_DEBUG(dbgs() << "Loop Fusion complete\n"); 620 return Changed; 621 } 622 623 private: 624 /// Determine if two fusion candidates are control flow equivalent. 625 /// 626 /// Two fusion candidates are control flow equivalent if when one executes, 627 /// the other is guaranteed to execute. This is determined using dominators 628 /// and post-dominators: if A dominates B and B post-dominates A then A and B 629 /// are control-flow equivalent. 630 bool isControlFlowEquivalent(const FusionCandidate &FC0, 631 const FusionCandidate &FC1) const { 632 assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders"); 633 634 return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(), 635 DT, PDT); 636 } 637 638 /// Iterate over all loops in the given loop set and identify the loops that 639 /// are eligible for fusion. Place all eligible fusion candidates into Control 640 /// Flow Equivalent sets, sorted by dominance. 641 void collectFusionCandidates(const LoopVector &LV) { 642 for (Loop *L : LV) { 643 TTI::PeelingPreferences PP = 644 gatherPeelingPreferences(L, SE, TTI, None, None); 645 FusionCandidate CurrCand(L, &DT, &PDT, ORE, PP); 646 if (!CurrCand.isEligibleForFusion(SE)) 647 continue; 648 649 // Go through each list in FusionCandidates and determine if L is control 650 // flow equivalent with the first loop in that list. If it is, append LV. 651 // If not, go to the next list. 652 // If no suitable list is found, start another list and add it to 653 // FusionCandidates. 654 bool FoundSet = false; 655 656 for (auto &CurrCandSet : FusionCandidates) { 657 if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) { 658 CurrCandSet.insert(CurrCand); 659 FoundSet = true; 660 #ifndef NDEBUG 661 if (VerboseFusionDebugging) 662 LLVM_DEBUG(dbgs() << "Adding " << CurrCand 663 << " to existing candidate set\n"); 664 #endif 665 break; 666 } 667 } 668 if (!FoundSet) { 669 // No set was found. Create a new set and add to FusionCandidates 670 #ifndef NDEBUG 671 if (VerboseFusionDebugging) 672 LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n"); 673 #endif 674 FusionCandidateSet NewCandSet; 675 NewCandSet.insert(CurrCand); 676 FusionCandidates.push_back(NewCandSet); 677 } 678 NumFusionCandidates++; 679 } 680 } 681 682 /// Determine if it is beneficial to fuse two loops. 683 /// 684 /// For now, this method simply returns true because we want to fuse as much 685 /// as possible (primarily to test the pass). This method will evolve, over 686 /// time, to add heuristics for profitability of fusion. 687 bool isBeneficialFusion(const FusionCandidate &FC0, 688 const FusionCandidate &FC1) { 689 return true; 690 } 691 692 /// Determine if two fusion candidates have the same trip count (i.e., they 693 /// execute the same number of iterations). 694 /// 695 /// This function will return a pair of values. The first is a boolean, 696 /// stating whether or not the two candidates are known at compile time to 697 /// have the same TripCount. The second is the difference in the two 698 /// TripCounts. This information can be used later to determine whether or not 699 /// peeling can be performed on either one of the candiates. 700 std::pair<bool, Optional<unsigned>> 701 haveIdenticalTripCounts(const FusionCandidate &FC0, 702 const FusionCandidate &FC1) const { 703 704 const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L); 705 if (isa<SCEVCouldNotCompute>(TripCount0)) { 706 UncomputableTripCount++; 707 LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!"); 708 return {false, None}; 709 } 710 711 const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L); 712 if (isa<SCEVCouldNotCompute>(TripCount1)) { 713 UncomputableTripCount++; 714 LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!"); 715 return {false, None}; 716 } 717 718 LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & " 719 << *TripCount1 << " are " 720 << (TripCount0 == TripCount1 ? "identical" : "different") 721 << "\n"); 722 723 if (TripCount0 == TripCount1) 724 return {true, 0}; 725 726 LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, " 727 "determining the difference between trip counts\n"); 728 729 // Currently only considering loops with a single exit point 730 // and a non-constant trip count. 731 unsigned TC0 = SE.getSmallConstantTripCount(FC0.L); 732 unsigned TC1 = SE.getSmallConstantTripCount(FC1.L); 733 734 // If any of the tripcounts are zero that means that loop(s) do not have 735 // a single exit or a constant tripcount. 736 if (TC0 == 0 || TC1 == 0) { 737 LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not " 738 "have a constant number of iterations. Peeling " 739 "is not benefical\n"); 740 return {false, None}; 741 } 742 743 Optional<unsigned> Difference = None; 744 int Diff = TC0 - TC1; 745 746 if (Diff > 0) 747 Difference = Diff; 748 else { 749 LLVM_DEBUG( 750 dbgs() 751 << "Difference is less than 0. FC1 (second loop) has more " 752 "iterations than the first one. Currently not supported.\n"); 753 } 754 755 LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference 756 << "\n"); 757 758 return {false, Difference}; 759 } 760 761 void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1, 762 unsigned PeelCount) { 763 assert(FC0.AbleToPeel && "Should be able to peel loop"); 764 765 LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount 766 << " iterations of the first loop. \n"); 767 768 FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, &DT, &AC, true); 769 if (FC0.Peeled) { 770 LLVM_DEBUG(dbgs() << "Done Peeling\n"); 771 772 #ifndef NDEBUG 773 auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1); 774 775 assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 && 776 "Loops should have identical trip counts after peeling"); 777 #endif 778 779 FC0.PP.PeelCount = PeelCount; 780 781 // Peeling does not update the PDT 782 PDT.recalculate(*FC0.Preheader->getParent()); 783 784 FC0.updateAfterPeeling(); 785 786 // In this case the iterations of the loop are constant, so the first 787 // loop will execute completely (will not jump from one of 788 // the peeled blocks to the second loop). Here we are updating the 789 // branch conditions of each of the peeled blocks, such that it will 790 // branch to its successor which is not the Preheader of the second Loop. 791 // Doing this update will ensure that the entry block of the first loop 792 // dominates the entry block of the second loop. 793 BasicBlock *BB = 794 FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader; 795 SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 796 for (BasicBlock *Pred : predecessors(BB)) { 797 if (Pred != FC0.ExitBlock) { 798 BranchInst *Old = dyn_cast<BranchInst>(Pred->getTerminator()); 799 BasicBlock *Succ = Old->getSuccessor(0); 800 if (Succ == BB) 801 Succ = Old->getSuccessor(1); 802 BranchInst *NewBranch = BranchInst::Create(Succ); 803 ReplaceInstWithInst(Old, NewBranch); 804 TreeUpdates.emplace_back( 805 DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB)); 806 } 807 } 808 DTU.applyUpdates(TreeUpdates); 809 DTU.flush(); 810 LLVM_DEBUG( 811 dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount 812 << " iterations from the first loop.\n" 813 "Both Loops have the same number of iterations now.\n"); 814 } 815 } 816 817 /// Walk each set of control flow equivalent fusion candidates and attempt to 818 /// fuse them. This does a single linear traversal of all candidates in the 819 /// set. The conditions for legal fusion are checked at this point. If a pair 820 /// of fusion candidates passes all legality checks, they are fused together 821 /// and a new fusion candidate is created and added to the FusionCandidateSet. 822 /// The original fusion candidates are then removed, as they are no longer 823 /// valid. 824 bool fuseCandidates() { 825 bool Fused = false; 826 LLVM_DEBUG(printFusionCandidates(FusionCandidates)); 827 for (auto &CandidateSet : FusionCandidates) { 828 if (CandidateSet.size() < 2) 829 continue; 830 831 LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n" 832 << CandidateSet << "\n"); 833 834 for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) { 835 assert(!LDT.isRemovedLoop(FC0->L) && 836 "Should not have removed loops in CandidateSet!"); 837 auto FC1 = FC0; 838 for (++FC1; FC1 != CandidateSet.end(); ++FC1) { 839 assert(!LDT.isRemovedLoop(FC1->L) && 840 "Should not have removed loops in CandidateSet!"); 841 842 LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump(); 843 dbgs() << " with\n"; FC1->dump(); dbgs() << "\n"); 844 845 FC0->verify(); 846 FC1->verify(); 847 848 // Check if the candidates have identical tripcounts (first value of 849 // pair), and if not check the difference in the tripcounts between 850 // the loops (second value of pair). The difference is not equal to 851 // None iff the loops iterate a constant number of times, and have a 852 // single exit. 853 std::pair<bool, Optional<unsigned>> IdenticalTripCountRes = 854 haveIdenticalTripCounts(*FC0, *FC1); 855 bool SameTripCount = IdenticalTripCountRes.first; 856 Optional<unsigned> TCDifference = IdenticalTripCountRes.second; 857 858 // Here we are checking that FC0 (the first loop) can be peeled, and 859 // both loops have different tripcounts. 860 if (FC0->AbleToPeel && !SameTripCount && TCDifference) { 861 if (*TCDifference > FusionPeelMaxCount) { 862 LLVM_DEBUG(dbgs() 863 << "Difference in loop trip counts: " << *TCDifference 864 << " is greater than maximum peel count specificed: " 865 << FusionPeelMaxCount << "\n"); 866 } else { 867 // Dependent on peeling being performed on the first loop, and 868 // assuming all other conditions for fusion return true. 869 SameTripCount = true; 870 } 871 } 872 873 if (!SameTripCount) { 874 LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " 875 "counts. Not fusing.\n"); 876 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 877 NonEqualTripCount); 878 continue; 879 } 880 881 if (!isAdjacent(*FC0, *FC1)) { 882 LLVM_DEBUG(dbgs() 883 << "Fusion candidates are not adjacent. Not fusing.\n"); 884 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent); 885 continue; 886 } 887 888 // Ensure that FC0 and FC1 have identical guards. 889 // If one (or both) are not guarded, this check is not necessary. 890 if (FC0->GuardBranch && FC1->GuardBranch && 891 !haveIdenticalGuards(*FC0, *FC1) && !TCDifference) { 892 LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " 893 "guards. Not Fusing.\n"); 894 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 895 NonIdenticalGuards); 896 continue; 897 } 898 899 if (!isSafeToMoveBefore(*FC1->Preheader, 900 *FC0->Preheader->getTerminator(), DT, &PDT, 901 &DI)) { 902 LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " 903 "instructions in preheader. Not fusing.\n"); 904 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 905 NonEmptyPreheader); 906 continue; 907 } 908 909 if (FC0->GuardBranch) { 910 assert(FC1->GuardBranch && "Expecting valid FC1 guard branch"); 911 912 if (!isSafeToMoveBefore(*FC0->ExitBlock, 913 *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT, 914 &PDT, &DI)) { 915 LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " 916 "instructions in exit block. Not fusing.\n"); 917 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 918 NonEmptyExitBlock); 919 continue; 920 } 921 922 if (!isSafeToMoveBefore( 923 *FC1->GuardBranch->getParent(), 924 *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT, 925 &DI)) { 926 LLVM_DEBUG(dbgs() 927 << "Fusion candidate contains unsafe " 928 "instructions in guard block. Not fusing.\n"); 929 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 930 NonEmptyGuardBlock); 931 continue; 932 } 933 } 934 935 // Check the dependencies across the loops and do not fuse if it would 936 // violate them. 937 if (!dependencesAllowFusion(*FC0, *FC1)) { 938 LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n"); 939 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 940 InvalidDependencies); 941 continue; 942 } 943 944 bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1); 945 LLVM_DEBUG(dbgs() 946 << "\tFusion appears to be " 947 << (BeneficialToFuse ? "" : "un") << "profitable!\n"); 948 if (!BeneficialToFuse) { 949 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 950 FusionNotBeneficial); 951 continue; 952 } 953 // All analysis has completed and has determined that fusion is legal 954 // and profitable. At this point, start transforming the code and 955 // perform fusion. 956 957 LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and " 958 << *FC1 << "\n"); 959 960 FusionCandidate FC0Copy = *FC0; 961 // Peel the loop after determining that fusion is legal. The Loops 962 // will still be safe to fuse after the peeling is performed. 963 bool Peel = TCDifference && *TCDifference > 0; 964 if (Peel) 965 peelFusionCandidate(FC0Copy, *FC1, *TCDifference); 966 967 // Report fusion to the Optimization Remarks. 968 // Note this needs to be done *before* performFusion because 969 // performFusion will change the original loops, making it not 970 // possible to identify them after fusion is complete. 971 reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1, 972 FuseCounter); 973 974 FusionCandidate FusedCand( 975 performFusion((Peel ? FC0Copy : *FC0), *FC1), &DT, &PDT, ORE, 976 FC0Copy.PP); 977 FusedCand.verify(); 978 assert(FusedCand.isEligibleForFusion(SE) && 979 "Fused candidate should be eligible for fusion!"); 980 981 // Notify the loop-depth-tree that these loops are not valid objects 982 LDT.removeLoop(FC1->L); 983 984 CandidateSet.erase(FC0); 985 CandidateSet.erase(FC1); 986 987 auto InsertPos = CandidateSet.insert(FusedCand); 988 989 assert(InsertPos.second && 990 "Unable to insert TargetCandidate in CandidateSet!"); 991 992 // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations 993 // of the FC1 loop will attempt to fuse the new (fused) loop with the 994 // remaining candidates in the current candidate set. 995 FC0 = FC1 = InsertPos.first; 996 997 LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet 998 << "\n"); 999 1000 Fused = true; 1001 } 1002 } 1003 } 1004 return Fused; 1005 } 1006 1007 /// Rewrite all additive recurrences in a SCEV to use a new loop. 1008 class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> { 1009 public: 1010 AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL, 1011 bool UseMax = true) 1012 : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL), 1013 NewL(NewL) {} 1014 1015 const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { 1016 const Loop *ExprL = Expr->getLoop(); 1017 SmallVector<const SCEV *, 2> Operands; 1018 if (ExprL == &OldL) { 1019 Operands.append(Expr->op_begin(), Expr->op_end()); 1020 return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags()); 1021 } 1022 1023 if (OldL.contains(ExprL)) { 1024 bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE)); 1025 if (!UseMax || !Pos || !Expr->isAffine()) { 1026 Valid = false; 1027 return Expr; 1028 } 1029 return visit(Expr->getStart()); 1030 } 1031 1032 for (const SCEV *Op : Expr->operands()) 1033 Operands.push_back(visit(Op)); 1034 return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags()); 1035 } 1036 1037 bool wasValidSCEV() const { return Valid; } 1038 1039 private: 1040 bool Valid, UseMax; 1041 const Loop &OldL, &NewL; 1042 }; 1043 1044 /// Return false if the access functions of \p I0 and \p I1 could cause 1045 /// a negative dependence. 1046 bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0, 1047 Instruction &I1, bool EqualIsInvalid) { 1048 Value *Ptr0 = getLoadStorePointerOperand(&I0); 1049 Value *Ptr1 = getLoadStorePointerOperand(&I1); 1050 if (!Ptr0 || !Ptr1) 1051 return false; 1052 1053 const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0); 1054 const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1); 1055 #ifndef NDEBUG 1056 if (VerboseFusionDebugging) 1057 LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs " 1058 << *SCEVPtr1 << "\n"); 1059 #endif 1060 AddRecLoopReplacer Rewriter(SE, L0, L1); 1061 SCEVPtr0 = Rewriter.visit(SCEVPtr0); 1062 #ifndef NDEBUG 1063 if (VerboseFusionDebugging) 1064 LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0 1065 << " [Valid: " << Rewriter.wasValidSCEV() << "]\n"); 1066 #endif 1067 if (!Rewriter.wasValidSCEV()) 1068 return false; 1069 1070 // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by 1071 // L0) and the other is not. We could check if it is monotone and test 1072 // the beginning and end value instead. 1073 1074 BasicBlock *L0Header = L0.getHeader(); 1075 auto HasNonLinearDominanceRelation = [&](const SCEV *S) { 1076 const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S); 1077 if (!AddRec) 1078 return false; 1079 return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) && 1080 !DT.dominates(AddRec->getLoop()->getHeader(), L0Header); 1081 }; 1082 if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation)) 1083 return false; 1084 1085 ICmpInst::Predicate Pred = 1086 EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE; 1087 bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1); 1088 #ifndef NDEBUG 1089 if (VerboseFusionDebugging) 1090 LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0 1091 << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1 1092 << "\n"); 1093 #endif 1094 return IsAlwaysGE; 1095 } 1096 1097 /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in 1098 /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses 1099 /// specified by @p DepChoice are used to determine this. 1100 bool dependencesAllowFusion(const FusionCandidate &FC0, 1101 const FusionCandidate &FC1, Instruction &I0, 1102 Instruction &I1, bool AnyDep, 1103 FusionDependenceAnalysisChoice DepChoice) { 1104 #ifndef NDEBUG 1105 if (VerboseFusionDebugging) { 1106 LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : " 1107 << DepChoice << "\n"); 1108 } 1109 #endif 1110 switch (DepChoice) { 1111 case FUSION_DEPENDENCE_ANALYSIS_SCEV: 1112 return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep); 1113 case FUSION_DEPENDENCE_ANALYSIS_DA: { 1114 auto DepResult = DI.depends(&I0, &I1, true); 1115 if (!DepResult) 1116 return true; 1117 #ifndef NDEBUG 1118 if (VerboseFusionDebugging) { 1119 LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs()); 1120 dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: " 1121 << (DepResult->isOrdered() ? "true" : "false") 1122 << "]\n"); 1123 LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels() 1124 << "\n"); 1125 } 1126 #endif 1127 1128 if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) 1129 LLVM_DEBUG( 1130 dbgs() << "TODO: Implement pred/succ dependence handling!\n"); 1131 1132 // TODO: Can we actually use the dependence info analysis here? 1133 return false; 1134 } 1135 1136 case FUSION_DEPENDENCE_ANALYSIS_ALL: 1137 return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, 1138 FUSION_DEPENDENCE_ANALYSIS_SCEV) || 1139 dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, 1140 FUSION_DEPENDENCE_ANALYSIS_DA); 1141 } 1142 1143 llvm_unreachable("Unknown fusion dependence analysis choice!"); 1144 } 1145 1146 /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused. 1147 bool dependencesAllowFusion(const FusionCandidate &FC0, 1148 const FusionCandidate &FC1) { 1149 LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 1150 << "\n"); 1151 assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); 1152 assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); 1153 1154 for (Instruction *WriteL0 : FC0.MemWrites) { 1155 for (Instruction *WriteL1 : FC1.MemWrites) 1156 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, 1157 /* AnyDep */ false, 1158 FusionDependenceAnalysis)) { 1159 InvalidDependencies++; 1160 return false; 1161 } 1162 for (Instruction *ReadL1 : FC1.MemReads) 1163 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1, 1164 /* AnyDep */ false, 1165 FusionDependenceAnalysis)) { 1166 InvalidDependencies++; 1167 return false; 1168 } 1169 } 1170 1171 for (Instruction *WriteL1 : FC1.MemWrites) { 1172 for (Instruction *WriteL0 : FC0.MemWrites) 1173 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, 1174 /* AnyDep */ false, 1175 FusionDependenceAnalysis)) { 1176 InvalidDependencies++; 1177 return false; 1178 } 1179 for (Instruction *ReadL0 : FC0.MemReads) 1180 if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1, 1181 /* AnyDep */ false, 1182 FusionDependenceAnalysis)) { 1183 InvalidDependencies++; 1184 return false; 1185 } 1186 } 1187 1188 // Walk through all uses in FC1. For each use, find the reaching def. If the 1189 // def is located in FC0 then it is is not safe to fuse. 1190 for (BasicBlock *BB : FC1.L->blocks()) 1191 for (Instruction &I : *BB) 1192 for (auto &Op : I.operands()) 1193 if (Instruction *Def = dyn_cast<Instruction>(Op)) 1194 if (FC0.L->contains(Def->getParent())) { 1195 InvalidDependencies++; 1196 return false; 1197 } 1198 1199 return true; 1200 } 1201 1202 /// Determine if two fusion candidates are adjacent in the CFG. 1203 /// 1204 /// This method will determine if there are additional basic blocks in the CFG 1205 /// between the exit of \p FC0 and the entry of \p FC1. 1206 /// If the two candidates are guarded loops, then it checks whether the 1207 /// non-loop successor of the \p FC0 guard branch is the entry block of \p 1208 /// FC1. If not, then the loops are not adjacent. If the two candidates are 1209 /// not guarded loops, then it checks whether the exit block of \p FC0 is the 1210 /// preheader of \p FC1. 1211 bool isAdjacent(const FusionCandidate &FC0, 1212 const FusionCandidate &FC1) const { 1213 // If the successor of the guard branch is FC1, then the loops are adjacent 1214 if (FC0.GuardBranch) 1215 return FC0.getNonLoopBlock() == FC1.getEntryBlock(); 1216 else 1217 return FC0.ExitBlock == FC1.getEntryBlock(); 1218 } 1219 1220 /// Determine if two fusion candidates have identical guards 1221 /// 1222 /// This method will determine if two fusion candidates have the same guards. 1223 /// The guards are considered the same if: 1224 /// 1. The instructions to compute the condition used in the compare are 1225 /// identical. 1226 /// 2. The successors of the guard have the same flow into/around the loop. 1227 /// If the compare instructions are identical, then the first successor of the 1228 /// guard must go to the same place (either the preheader of the loop or the 1229 /// NonLoopBlock). In other words, the the first successor of both loops must 1230 /// both go into the loop (i.e., the preheader) or go around the loop (i.e., 1231 /// the NonLoopBlock). The same must be true for the second successor. 1232 bool haveIdenticalGuards(const FusionCandidate &FC0, 1233 const FusionCandidate &FC1) const { 1234 assert(FC0.GuardBranch && FC1.GuardBranch && 1235 "Expecting FC0 and FC1 to be guarded loops."); 1236 1237 if (auto FC0CmpInst = 1238 dyn_cast<Instruction>(FC0.GuardBranch->getCondition())) 1239 if (auto FC1CmpInst = 1240 dyn_cast<Instruction>(FC1.GuardBranch->getCondition())) 1241 if (!FC0CmpInst->isIdenticalTo(FC1CmpInst)) 1242 return false; 1243 1244 // The compare instructions are identical. 1245 // Now make sure the successor of the guards have the same flow into/around 1246 // the loop 1247 if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader) 1248 return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader); 1249 else 1250 return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader); 1251 } 1252 1253 /// Modify the latch branch of FC to be unconditional since successors of the 1254 /// branch are the same. 1255 void simplifyLatchBranch(const FusionCandidate &FC) const { 1256 BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator()); 1257 if (FCLatchBranch) { 1258 assert(FCLatchBranch->isConditional() && 1259 FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) && 1260 "Expecting the two successors of FCLatchBranch to be the same"); 1261 BranchInst *NewBranch = 1262 BranchInst::Create(FCLatchBranch->getSuccessor(0)); 1263 ReplaceInstWithInst(FCLatchBranch, NewBranch); 1264 } 1265 } 1266 1267 /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique 1268 /// successor, then merge FC0.Latch with its unique successor. 1269 void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) { 1270 moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI); 1271 if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) { 1272 MergeBlockIntoPredecessor(Succ, &DTU, &LI); 1273 DTU.flush(); 1274 } 1275 } 1276 1277 /// Fuse two fusion candidates, creating a new fused loop. 1278 /// 1279 /// This method contains the mechanics of fusing two loops, represented by \p 1280 /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1 1281 /// postdominates \p FC0 (making them control flow equivalent). It also 1282 /// assumes that the other conditions for fusion have been met: adjacent, 1283 /// identical trip counts, and no negative distance dependencies exist that 1284 /// would prevent fusion. Thus, there is no checking for these conditions in 1285 /// this method. 1286 /// 1287 /// Fusion is performed by rewiring the CFG to update successor blocks of the 1288 /// components of tho loop. Specifically, the following changes are done: 1289 /// 1290 /// 1. The preheader of \p FC1 is removed as it is no longer necessary 1291 /// (because it is currently only a single statement block). 1292 /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1. 1293 /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0. 1294 /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0. 1295 /// 1296 /// All of these modifications are done with dominator tree updates, thus 1297 /// keeping the dominator (and post dominator) information up-to-date. 1298 /// 1299 /// This can be improved in the future by actually merging blocks during 1300 /// fusion. For example, the preheader of \p FC1 can be merged with the 1301 /// preheader of \p FC0. This would allow loops with more than a single 1302 /// statement in the preheader to be fused. Similarly, the latch blocks of the 1303 /// two loops could also be fused into a single block. This will require 1304 /// analysis to prove it is safe to move the contents of the block past 1305 /// existing code, which currently has not been implemented. 1306 Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) { 1307 assert(FC0.isValid() && FC1.isValid() && 1308 "Expecting valid fusion candidates"); 1309 1310 LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump(); 1311 dbgs() << "Fusion Candidate 1: \n"; FC1.dump();); 1312 1313 // Move instructions from the preheader of FC1 to the end of the preheader 1314 // of FC0. 1315 moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI); 1316 1317 // Fusing guarded loops is handled slightly differently than non-guarded 1318 // loops and has been broken out into a separate method instead of trying to 1319 // intersperse the logic within a single method. 1320 if (FC0.GuardBranch) 1321 return fuseGuardedLoops(FC0, FC1); 1322 1323 assert(FC1.Preheader == 1324 (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock)); 1325 assert(FC1.Preheader->size() == 1 && 1326 FC1.Preheader->getSingleSuccessor() == FC1.Header); 1327 1328 // Remember the phi nodes originally in the header of FC0 in order to rewire 1329 // them later. However, this is only necessary if the new loop carried 1330 // values might not dominate the exiting branch. While we do not generally 1331 // test if this is the case but simply insert intermediate phi nodes, we 1332 // need to make sure these intermediate phi nodes have different 1333 // predecessors. To this end, we filter the special case where the exiting 1334 // block is the latch block of the first loop. Nothing needs to be done 1335 // anyway as all loop carried values dominate the latch and thereby also the 1336 // exiting branch. 1337 SmallVector<PHINode *, 8> OriginalFC0PHIs; 1338 if (FC0.ExitingBlock != FC0.Latch) 1339 for (PHINode &PHI : FC0.Header->phis()) 1340 OriginalFC0PHIs.push_back(&PHI); 1341 1342 // Replace incoming blocks for header PHIs first. 1343 FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); 1344 FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); 1345 1346 // Then modify the control flow and update DT and PDT. 1347 SmallVector<DominatorTree::UpdateType, 16> TreeUpdates; 1348 1349 // The old exiting block of the first loop (FC0) has to jump to the header 1350 // of the second as we need to execute the code in the second header block 1351 // regardless of the trip count. That is, if the trip count is 0, so the 1352 // back edge is never taken, we still have to execute both loop headers, 1353 // especially (but not only!) if the second is a do-while style loop. 1354 // However, doing so might invalidate the phi nodes of the first loop as 1355 // the new values do only need to dominate their latch and not the exiting 1356 // predicate. To remedy this potential problem we always introduce phi 1357 // nodes in the header of the second loop later that select the loop carried 1358 // value, if the second header was reached through an old latch of the 1359 // first, or undef otherwise. This is sound as exiting the first implies the 1360 // second will exit too, __without__ taking the back-edge. [Their 1361 // trip-counts are equal after all. 1362 // KB: Would this sequence be simpler to just just make FC0.ExitingBlock go 1363 // to FC1.Header? I think this is basically what the three sequences are 1364 // trying to accomplish; however, doing this directly in the CFG may mean 1365 // the DT/PDT becomes invalid 1366 if (!FC0.Peeled) { 1367 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader, 1368 FC1.Header); 1369 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1370 DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader)); 1371 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1372 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1373 } else { 1374 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1375 DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader)); 1376 1377 // Remove the ExitBlock of the first Loop (also not needed) 1378 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, 1379 FC1.Header); 1380 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1381 DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); 1382 FC0.ExitBlock->getTerminator()->eraseFromParent(); 1383 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1384 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1385 new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); 1386 } 1387 1388 // The pre-header of L1 is not necessary anymore. 1389 assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader)); 1390 FC1.Preheader->getTerminator()->eraseFromParent(); 1391 new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); 1392 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1393 DominatorTree::Delete, FC1.Preheader, FC1.Header)); 1394 1395 // Moves the phi nodes from the second to the first loops header block. 1396 while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { 1397 if (SE.isSCEVable(PHI->getType())) 1398 SE.forgetValue(PHI); 1399 if (PHI->hasNUsesOrMore(1)) 1400 PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); 1401 else 1402 PHI->eraseFromParent(); 1403 } 1404 1405 // Introduce new phi nodes in the second loop header to ensure 1406 // exiting the first and jumping to the header of the second does not break 1407 // the SSA property of the phis originally in the first loop. See also the 1408 // comment above. 1409 Instruction *L1HeaderIP = &FC1.Header->front(); 1410 for (PHINode *LCPHI : OriginalFC0PHIs) { 1411 int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); 1412 assert(L1LatchBBIdx >= 0 && 1413 "Expected loop carried value to be rewired at this point!"); 1414 1415 Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); 1416 1417 PHINode *L1HeaderPHI = PHINode::Create( 1418 LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); 1419 L1HeaderPHI->addIncoming(LCV, FC0.Latch); 1420 L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), 1421 FC0.ExitingBlock); 1422 1423 LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); 1424 } 1425 1426 // Replace latch terminator destinations. 1427 FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); 1428 FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); 1429 1430 // Modify the latch branch of FC0 to be unconditional as both successors of 1431 // the branch are the same. 1432 simplifyLatchBranch(FC0); 1433 1434 // If FC0.Latch and FC0.ExitingBlock are the same then we have already 1435 // performed the updates above. 1436 if (FC0.Latch != FC0.ExitingBlock) 1437 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1438 DominatorTree::Insert, FC0.Latch, FC1.Header)); 1439 1440 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1441 FC0.Latch, FC0.Header)); 1442 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, 1443 FC1.Latch, FC0.Header)); 1444 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1445 FC1.Latch, FC1.Header)); 1446 1447 // Update DT/PDT 1448 DTU.applyUpdates(TreeUpdates); 1449 1450 LI.removeBlock(FC1.Preheader); 1451 DTU.deleteBB(FC1.Preheader); 1452 if (FC0.Peeled) { 1453 LI.removeBlock(FC0.ExitBlock); 1454 DTU.deleteBB(FC0.ExitBlock); 1455 } 1456 1457 DTU.flush(); 1458 1459 // Is there a way to keep SE up-to-date so we don't need to forget the loops 1460 // and rebuild the information in subsequent passes of fusion? 1461 // Note: Need to forget the loops before merging the loop latches, as 1462 // mergeLatch may remove the only block in FC1. 1463 SE.forgetLoop(FC1.L); 1464 SE.forgetLoop(FC0.L); 1465 1466 // Move instructions from FC0.Latch to FC1.Latch. 1467 // Note: mergeLatch requires an updated DT. 1468 mergeLatch(FC0, FC1); 1469 1470 // Merge the loops. 1471 SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(), 1472 FC1.L->block_end()); 1473 for (BasicBlock *BB : Blocks) { 1474 FC0.L->addBlockEntry(BB); 1475 FC1.L->removeBlockFromLoop(BB); 1476 if (LI.getLoopFor(BB) != FC1.L) 1477 continue; 1478 LI.changeLoopFor(BB, FC0.L); 1479 } 1480 while (!FC1.L->empty()) { 1481 const auto &ChildLoopIt = FC1.L->begin(); 1482 Loop *ChildLoop = *ChildLoopIt; 1483 FC1.L->removeChildLoop(ChildLoopIt); 1484 FC0.L->addChildLoop(ChildLoop); 1485 } 1486 1487 // Delete the now empty loop L1. 1488 LI.erase(FC1.L); 1489 1490 #ifndef NDEBUG 1491 assert(!verifyFunction(*FC0.Header->getParent(), &errs())); 1492 assert(DT.verify(DominatorTree::VerificationLevel::Fast)); 1493 assert(PDT.verify()); 1494 LI.verify(DT); 1495 SE.verify(); 1496 #endif 1497 1498 LLVM_DEBUG(dbgs() << "Fusion done:\n"); 1499 1500 return FC0.L; 1501 } 1502 1503 /// Report details on loop fusion opportunities. 1504 /// 1505 /// This template function can be used to report both successful and missed 1506 /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should 1507 /// be one of: 1508 /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful 1509 /// given two valid fusion candidates. 1510 /// - OptimizationRemark to report successful fusion of two fusion 1511 /// candidates. 1512 /// The remarks will be printed using the form: 1513 /// <path/filename>:<line number>:<column number>: [<function name>]: 1514 /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> 1515 template <typename RemarkKind> 1516 void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, 1517 llvm::Statistic &Stat) { 1518 assert(FC0.Preheader && FC1.Preheader && 1519 "Expecting valid fusion candidates"); 1520 using namespace ore; 1521 ++Stat; 1522 ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), 1523 FC0.Preheader) 1524 << "[" << FC0.Preheader->getParent()->getName() 1525 << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName())) 1526 << " and " << NV("Cand2", StringRef(FC1.Preheader->getName())) 1527 << ": " << Stat.getDesc()); 1528 } 1529 1530 /// Fuse two guarded fusion candidates, creating a new fused loop. 1531 /// 1532 /// Fusing guarded loops is handled much the same way as fusing non-guarded 1533 /// loops. The rewiring of the CFG is slightly different though, because of 1534 /// the presence of the guards around the loops and the exit blocks after the 1535 /// loop body. As such, the new loop is rewired as follows: 1536 /// 1. Keep the guard branch from FC0 and use the non-loop block target 1537 /// from the FC1 guard branch. 1538 /// 2. Remove the exit block from FC0 (this exit block should be empty 1539 /// right now). 1540 /// 3. Remove the guard branch for FC1 1541 /// 4. Remove the preheader for FC1. 1542 /// The exit block successor for the latch of FC0 is updated to be the header 1543 /// of FC1 and the non-exit block successor of the latch of FC1 is updated to 1544 /// be the header of FC0, thus creating the fused loop. 1545 Loop *fuseGuardedLoops(const FusionCandidate &FC0, 1546 const FusionCandidate &FC1) { 1547 assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops"); 1548 1549 BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); 1550 BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); 1551 BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); 1552 BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); 1553 BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor(); 1554 1555 // Move instructions from the exit block of FC0 to the beginning of the exit 1556 // block of FC1, in the case that the FC0 loop has not been peeled. In the 1557 // case that FC0 loop is peeled, then move the instructions of the successor 1558 // of the FC0 Exit block to the beginning of the exit block of FC1. 1559 moveInstructionsToTheBeginning( 1560 (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock, 1561 DT, PDT, DI); 1562 1563 // Move instructions from the guard block of FC1 to the end of the guard 1564 // block of FC0. 1565 moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI); 1566 1567 assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent"); 1568 1569 SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 1570 1571 //////////////////////////////////////////////////////////////////////////// 1572 // Update the Loop Guard 1573 //////////////////////////////////////////////////////////////////////////// 1574 // The guard for FC0 is updated to guard both FC0 and FC1. This is done by 1575 // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. 1576 // Thus, one path from the guard goes to the preheader for FC0 (and thus 1577 // executes the new fused loop) and the other path goes to the NonLoopBlock 1578 // for FC1 (where FC1 guard would have gone if FC1 was not executed). 1579 FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock); 1580 FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock); 1581 1582 BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock; 1583 BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header); 1584 1585 // The guard of FC1 is not necessary anymore. 1586 FC1.GuardBranch->eraseFromParent(); 1587 new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); 1588 1589 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1590 DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); 1591 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1592 DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); 1593 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1594 DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); 1595 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1596 DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); 1597 1598 if (FC0.Peeled) { 1599 // Remove the Block after the ExitBlock of FC0 1600 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1601 DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock)); 1602 FC0ExitBlockSuccessor->getTerminator()->eraseFromParent(); 1603 new UnreachableInst(FC0ExitBlockSuccessor->getContext(), 1604 FC0ExitBlockSuccessor); 1605 } 1606 1607 assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) && 1608 "Expecting guard block to have no predecessors"); 1609 assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) && 1610 "Expecting guard block to have no successors"); 1611 1612 // Remember the phi nodes originally in the header of FC0 in order to rewire 1613 // them later. However, this is only necessary if the new loop carried 1614 // values might not dominate the exiting branch. While we do not generally 1615 // test if this is the case but simply insert intermediate phi nodes, we 1616 // need to make sure these intermediate phi nodes have different 1617 // predecessors. To this end, we filter the special case where the exiting 1618 // block is the latch block of the first loop. Nothing needs to be done 1619 // anyway as all loop carried values dominate the latch and thereby also the 1620 // exiting branch. 1621 // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch 1622 // (because the loops are rotated. Thus, nothing will ever be added to 1623 // OriginalFC0PHIs. 1624 SmallVector<PHINode *, 8> OriginalFC0PHIs; 1625 if (FC0.ExitingBlock != FC0.Latch) 1626 for (PHINode &PHI : FC0.Header->phis()) 1627 OriginalFC0PHIs.push_back(&PHI); 1628 1629 assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!"); 1630 1631 // Replace incoming blocks for header PHIs first. 1632 FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); 1633 FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); 1634 1635 // The old exiting block of the first loop (FC0) has to jump to the header 1636 // of the second as we need to execute the code in the second header block 1637 // regardless of the trip count. That is, if the trip count is 0, so the 1638 // back edge is never taken, we still have to execute both loop headers, 1639 // especially (but not only!) if the second is a do-while style loop. 1640 // However, doing so might invalidate the phi nodes of the first loop as 1641 // the new values do only need to dominate their latch and not the exiting 1642 // predicate. To remedy this potential problem we always introduce phi 1643 // nodes in the header of the second loop later that select the loop carried 1644 // value, if the second header was reached through an old latch of the 1645 // first, or undef otherwise. This is sound as exiting the first implies the 1646 // second will exit too, __without__ taking the back-edge (their 1647 // trip-counts are equal after all). 1648 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, 1649 FC1.Header); 1650 1651 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1652 DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); 1653 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1654 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1655 1656 // Remove FC0 Exit Block 1657 // The exit block for FC0 is no longer needed since control will flow 1658 // directly to the header of FC1. Since it is an empty block, it can be 1659 // removed at this point. 1660 // TODO: In the future, we can handle non-empty exit blocks my merging any 1661 // instructions from FC0 exit block into FC1 exit block prior to removing 1662 // the block. 1663 assert(pred_begin(FC0.ExitBlock) == pred_end(FC0.ExitBlock) && 1664 "Expecting exit block to be empty"); 1665 FC0.ExitBlock->getTerminator()->eraseFromParent(); 1666 new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); 1667 1668 // Remove FC1 Preheader 1669 // The pre-header of L1 is not necessary anymore. 1670 assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader)); 1671 FC1.Preheader->getTerminator()->eraseFromParent(); 1672 new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); 1673 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1674 DominatorTree::Delete, FC1.Preheader, FC1.Header)); 1675 1676 // Moves the phi nodes from the second to the first loops header block. 1677 while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { 1678 if (SE.isSCEVable(PHI->getType())) 1679 SE.forgetValue(PHI); 1680 if (PHI->hasNUsesOrMore(1)) 1681 PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); 1682 else 1683 PHI->eraseFromParent(); 1684 } 1685 1686 // Introduce new phi nodes in the second loop header to ensure 1687 // exiting the first and jumping to the header of the second does not break 1688 // the SSA property of the phis originally in the first loop. See also the 1689 // comment above. 1690 Instruction *L1HeaderIP = &FC1.Header->front(); 1691 for (PHINode *LCPHI : OriginalFC0PHIs) { 1692 int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); 1693 assert(L1LatchBBIdx >= 0 && 1694 "Expected loop carried value to be rewired at this point!"); 1695 1696 Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); 1697 1698 PHINode *L1HeaderPHI = PHINode::Create( 1699 LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); 1700 L1HeaderPHI->addIncoming(LCV, FC0.Latch); 1701 L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), 1702 FC0.ExitingBlock); 1703 1704 LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); 1705 } 1706 1707 // Update the latches 1708 1709 // Replace latch terminator destinations. 1710 FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); 1711 FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); 1712 1713 // Modify the latch branch of FC0 to be unconditional as both successors of 1714 // the branch are the same. 1715 simplifyLatchBranch(FC0); 1716 1717 // If FC0.Latch and FC0.ExitingBlock are the same then we have already 1718 // performed the updates above. 1719 if (FC0.Latch != FC0.ExitingBlock) 1720 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1721 DominatorTree::Insert, FC0.Latch, FC1.Header)); 1722 1723 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1724 FC0.Latch, FC0.Header)); 1725 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, 1726 FC1.Latch, FC0.Header)); 1727 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1728 FC1.Latch, FC1.Header)); 1729 1730 // All done 1731 // Apply the updates to the Dominator Tree and cleanup. 1732 1733 assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) && 1734 "FC1GuardBlock has successors!!"); 1735 assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) && 1736 "FC1GuardBlock has predecessors!!"); 1737 1738 // Update DT/PDT 1739 DTU.applyUpdates(TreeUpdates); 1740 1741 LI.removeBlock(FC1GuardBlock); 1742 LI.removeBlock(FC1.Preheader); 1743 LI.removeBlock(FC0.ExitBlock); 1744 if (FC0.Peeled) { 1745 LI.removeBlock(FC0ExitBlockSuccessor); 1746 DTU.deleteBB(FC0ExitBlockSuccessor); 1747 } 1748 DTU.deleteBB(FC1GuardBlock); 1749 DTU.deleteBB(FC1.Preheader); 1750 DTU.deleteBB(FC0.ExitBlock); 1751 DTU.flush(); 1752 1753 // Is there a way to keep SE up-to-date so we don't need to forget the loops 1754 // and rebuild the information in subsequent passes of fusion? 1755 // Note: Need to forget the loops before merging the loop latches, as 1756 // mergeLatch may remove the only block in FC1. 1757 SE.forgetLoop(FC1.L); 1758 SE.forgetLoop(FC0.L); 1759 1760 // Move instructions from FC0.Latch to FC1.Latch. 1761 // Note: mergeLatch requires an updated DT. 1762 mergeLatch(FC0, FC1); 1763 1764 // Merge the loops. 1765 SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(), 1766 FC1.L->block_end()); 1767 for (BasicBlock *BB : Blocks) { 1768 FC0.L->addBlockEntry(BB); 1769 FC1.L->removeBlockFromLoop(BB); 1770 if (LI.getLoopFor(BB) != FC1.L) 1771 continue; 1772 LI.changeLoopFor(BB, FC0.L); 1773 } 1774 while (!FC1.L->empty()) { 1775 const auto &ChildLoopIt = FC1.L->begin(); 1776 Loop *ChildLoop = *ChildLoopIt; 1777 FC1.L->removeChildLoop(ChildLoopIt); 1778 FC0.L->addChildLoop(ChildLoop); 1779 } 1780 1781 // Delete the now empty loop L1. 1782 LI.erase(FC1.L); 1783 1784 #ifndef NDEBUG 1785 assert(!verifyFunction(*FC0.Header->getParent(), &errs())); 1786 assert(DT.verify(DominatorTree::VerificationLevel::Fast)); 1787 assert(PDT.verify()); 1788 LI.verify(DT); 1789 SE.verify(); 1790 #endif 1791 1792 LLVM_DEBUG(dbgs() << "Fusion done:\n"); 1793 1794 return FC0.L; 1795 } 1796 }; 1797 1798 struct LoopFuseLegacy : public FunctionPass { 1799 1800 static char ID; 1801 1802 LoopFuseLegacy() : FunctionPass(ID) { 1803 initializeLoopFuseLegacyPass(*PassRegistry::getPassRegistry()); 1804 } 1805 1806 void getAnalysisUsage(AnalysisUsage &AU) const override { 1807 AU.addRequiredID(LoopSimplifyID); 1808 AU.addRequired<ScalarEvolutionWrapperPass>(); 1809 AU.addRequired<LoopInfoWrapperPass>(); 1810 AU.addRequired<DominatorTreeWrapperPass>(); 1811 AU.addRequired<PostDominatorTreeWrapperPass>(); 1812 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1813 AU.addRequired<DependenceAnalysisWrapperPass>(); 1814 AU.addRequired<AssumptionCacheTracker>(); 1815 AU.addRequired<TargetTransformInfoWrapperPass>(); 1816 1817 AU.addPreserved<ScalarEvolutionWrapperPass>(); 1818 AU.addPreserved<LoopInfoWrapperPass>(); 1819 AU.addPreserved<DominatorTreeWrapperPass>(); 1820 AU.addPreserved<PostDominatorTreeWrapperPass>(); 1821 } 1822 1823 bool runOnFunction(Function &F) override { 1824 if (skipFunction(F)) 1825 return false; 1826 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1827 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1828 auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI(); 1829 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1830 auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); 1831 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1832 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1833 const TargetTransformInfo &TTI = 1834 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1835 const DataLayout &DL = F.getParent()->getDataLayout(); 1836 1837 LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); 1838 return LF.fuseLoops(F); 1839 } 1840 }; 1841 } // namespace 1842 1843 PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { 1844 auto &LI = AM.getResult<LoopAnalysis>(F); 1845 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 1846 auto &DI = AM.getResult<DependenceAnalysis>(F); 1847 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 1848 auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); 1849 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 1850 auto &AC = AM.getResult<AssumptionAnalysis>(F); 1851 const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); 1852 const DataLayout &DL = F.getParent()->getDataLayout(); 1853 1854 LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); 1855 bool Changed = LF.fuseLoops(F); 1856 if (!Changed) 1857 return PreservedAnalyses::all(); 1858 1859 PreservedAnalyses PA; 1860 PA.preserve<DominatorTreeAnalysis>(); 1861 PA.preserve<PostDominatorTreeAnalysis>(); 1862 PA.preserve<ScalarEvolutionAnalysis>(); 1863 PA.preserve<LoopAnalysis>(); 1864 return PA; 1865 } 1866 1867 char LoopFuseLegacy::ID = 0; 1868 1869 INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, 1870 false) 1871 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) 1872 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 1873 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 1874 INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) 1875 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 1876 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 1877 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 1878 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 1879 INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false) 1880 1881 FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); } 1882