1 //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This file implements the loop fusion pass. 11 /// The implementation is largely based on the following document: 12 /// 13 /// Code Transformations to Augment the Scope of Loop Fusion in a 14 /// Production Compiler 15 /// Christopher Mark Barton 16 /// MSc Thesis 17 /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf 18 /// 19 /// The general approach taken is to collect sets of control flow equivalent 20 /// loops and test whether they can be fused. The necessary conditions for 21 /// fusion are: 22 /// 1. The loops must be adjacent (there cannot be any statements between 23 /// the two loops). 24 /// 2. The loops must be conforming (they must execute the same number of 25 /// iterations). 26 /// 3. The loops must be control flow equivalent (if one loop executes, the 27 /// other is guaranteed to execute). 28 /// 4. There cannot be any negative distance dependencies between the loops. 29 /// If all of these conditions are satisfied, it is safe to fuse the loops. 30 /// 31 /// This implementation creates FusionCandidates that represent the loop and the 32 /// necessary information needed by fusion. It then operates on the fusion 33 /// candidates, first confirming that the candidate is eligible for fusion. The 34 /// candidates are then collected into control flow equivalent sets, sorted in 35 /// dominance order. Each set of control flow equivalent candidates is then 36 /// traversed, attempting to fuse pairs of candidates in the set. If all 37 /// requirements for fusion are met, the two candidates are fused, creating a 38 /// new (fused) candidate which is then added back into the set to consider for 39 /// additional fusion. 40 /// 41 /// This implementation currently does not make any modifications to remove 42 /// conditions for fusion. Code transformations to make loops conform to each of 43 /// the conditions for fusion are discussed in more detail in the document 44 /// above. These can be added to the current implementation in the future. 45 //===----------------------------------------------------------------------===// 46 47 #include "llvm/Transforms/Scalar/LoopFuse.h" 48 #include "llvm/ADT/Statistic.h" 49 #include "llvm/Analysis/AssumptionCache.h" 50 #include "llvm/Analysis/DependenceAnalysis.h" 51 #include "llvm/Analysis/DomTreeUpdater.h" 52 #include "llvm/Analysis/LoopInfo.h" 53 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 54 #include "llvm/Analysis/PostDominators.h" 55 #include "llvm/Analysis/ScalarEvolution.h" 56 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 57 #include "llvm/Analysis/TargetTransformInfo.h" 58 #include "llvm/IR/Function.h" 59 #include "llvm/IR/Verifier.h" 60 #include "llvm/InitializePasses.h" 61 #include "llvm/Pass.h" 62 #include "llvm/Support/CommandLine.h" 63 #include "llvm/Support/Debug.h" 64 #include "llvm/Support/raw_ostream.h" 65 #include "llvm/Transforms/Scalar.h" 66 #include "llvm/Transforms/Utils.h" 67 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 68 #include "llvm/Transforms/Utils/CodeMoverUtils.h" 69 #include "llvm/Transforms/Utils/LoopPeel.h" 70 71 using namespace llvm; 72 73 #define DEBUG_TYPE "loop-fusion" 74 75 STATISTIC(FuseCounter, "Loops fused"); 76 STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion"); 77 STATISTIC(InvalidPreheader, "Loop has invalid preheader"); 78 STATISTIC(InvalidHeader, "Loop has invalid header"); 79 STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks"); 80 STATISTIC(InvalidExitBlock, "Loop has invalid exit block"); 81 STATISTIC(InvalidLatch, "Loop has invalid latch"); 82 STATISTIC(InvalidLoop, "Loop is invalid"); 83 STATISTIC(AddressTakenBB, "Basic block has address taken"); 84 STATISTIC(MayThrowException, "Loop may throw an exception"); 85 STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access"); 86 STATISTIC(NotSimplifiedForm, "Loop is not in simplified form"); 87 STATISTIC(InvalidDependencies, "Dependencies prevent fusion"); 88 STATISTIC(UnknownTripCount, "Loop has unknown trip count"); 89 STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop"); 90 STATISTIC(NonEqualTripCount, "Loop trip counts are not the same"); 91 STATISTIC(NonAdjacent, "Loops are not adjacent"); 92 STATISTIC( 93 NonEmptyPreheader, 94 "Loop has a non-empty preheader with instructions that cannot be moved"); 95 STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); 96 STATISTIC(NonIdenticalGuards, "Candidates have different guards"); 97 STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with " 98 "instructions that cannot be moved"); 99 STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with " 100 "instructions that cannot be moved"); 101 STATISTIC(NotRotated, "Candidate is not rotated"); 102 STATISTIC(OnlySecondCandidateIsGuarded, 103 "The second candidate is guarded while the first one is not"); 104 105 enum FusionDependenceAnalysisChoice { 106 FUSION_DEPENDENCE_ANALYSIS_SCEV, 107 FUSION_DEPENDENCE_ANALYSIS_DA, 108 FUSION_DEPENDENCE_ANALYSIS_ALL, 109 }; 110 111 static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis( 112 "loop-fusion-dependence-analysis", 113 cl::desc("Which dependence analysis should loop fusion use?"), 114 cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev", 115 "Use the scalar evolution interface"), 116 clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da", 117 "Use the dependence analysis interface"), 118 clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all", 119 "Use all available analyses")), 120 cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore); 121 122 static cl::opt<unsigned> FusionPeelMaxCount( 123 "loop-fusion-peel-max-count", cl::init(0), cl::Hidden, 124 cl::desc("Max number of iterations to be peeled from a loop, such that " 125 "fusion can take place")); 126 127 #ifndef NDEBUG 128 static cl::opt<bool> 129 VerboseFusionDebugging("loop-fusion-verbose-debug", 130 cl::desc("Enable verbose debugging for Loop Fusion"), 131 cl::Hidden, cl::init(false), cl::ZeroOrMore); 132 #endif 133 134 namespace { 135 /// This class is used to represent a candidate for loop fusion. When it is 136 /// constructed, it checks the conditions for loop fusion to ensure that it 137 /// represents a valid candidate. It caches several parts of a loop that are 138 /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead 139 /// of continually querying the underlying Loop to retrieve these values. It is 140 /// assumed these will not change throughout loop fusion. 141 /// 142 /// The invalidate method should be used to indicate that the FusionCandidate is 143 /// no longer a valid candidate for fusion. Similarly, the isValid() method can 144 /// be used to ensure that the FusionCandidate is still valid for fusion. 145 struct FusionCandidate { 146 /// Cache of parts of the loop used throughout loop fusion. These should not 147 /// need to change throughout the analysis and transformation. 148 /// These parts are cached to avoid repeatedly looking up in the Loop class. 149 150 /// Preheader of the loop this candidate represents 151 BasicBlock *Preheader; 152 /// Header of the loop this candidate represents 153 BasicBlock *Header; 154 /// Blocks in the loop that exit the loop 155 BasicBlock *ExitingBlock; 156 /// The successor block of this loop (where the exiting blocks go to) 157 BasicBlock *ExitBlock; 158 /// Latch of the loop 159 BasicBlock *Latch; 160 /// The loop that this fusion candidate represents 161 Loop *L; 162 /// Vector of instructions in this loop that read from memory 163 SmallVector<Instruction *, 16> MemReads; 164 /// Vector of instructions in this loop that write to memory 165 SmallVector<Instruction *, 16> MemWrites; 166 /// Are all of the members of this fusion candidate still valid 167 bool Valid; 168 /// Guard branch of the loop, if it exists 169 BranchInst *GuardBranch; 170 /// Peeling Paramaters of the Loop. 171 TTI::PeelingPreferences PP; 172 /// Can you Peel this Loop? 173 bool AbleToPeel; 174 /// Has this loop been Peeled 175 bool Peeled; 176 177 /// Dominator and PostDominator trees are needed for the 178 /// FusionCandidateCompare function, required by FusionCandidateSet to 179 /// determine where the FusionCandidate should be inserted into the set. These 180 /// are used to establish ordering of the FusionCandidates based on dominance. 181 const DominatorTree *DT; 182 const PostDominatorTree *PDT; 183 184 OptimizationRemarkEmitter &ORE; 185 186 FusionCandidate(Loop *L, const DominatorTree *DT, 187 const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE, 188 TTI::PeelingPreferences PP) 189 : Preheader(L->getLoopPreheader()), Header(L->getHeader()), 190 ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), 191 Latch(L->getLoopLatch()), L(L), Valid(true), 192 GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)), 193 Peeled(false), DT(DT), PDT(PDT), ORE(ORE) { 194 195 // Walk over all blocks in the loop and check for conditions that may 196 // prevent fusion. For each block, walk over all instructions and collect 197 // the memory reads and writes If any instructions that prevent fusion are 198 // found, invalidate this object and return. 199 for (BasicBlock *BB : L->blocks()) { 200 if (BB->hasAddressTaken()) { 201 invalidate(); 202 reportInvalidCandidate(AddressTakenBB); 203 return; 204 } 205 206 for (Instruction &I : *BB) { 207 if (I.mayThrow()) { 208 invalidate(); 209 reportInvalidCandidate(MayThrowException); 210 return; 211 } 212 if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { 213 if (SI->isVolatile()) { 214 invalidate(); 215 reportInvalidCandidate(ContainsVolatileAccess); 216 return; 217 } 218 } 219 if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { 220 if (LI->isVolatile()) { 221 invalidate(); 222 reportInvalidCandidate(ContainsVolatileAccess); 223 return; 224 } 225 } 226 if (I.mayWriteToMemory()) 227 MemWrites.push_back(&I); 228 if (I.mayReadFromMemory()) 229 MemReads.push_back(&I); 230 } 231 } 232 } 233 234 /// Check if all members of the class are valid. 235 bool isValid() const { 236 return Preheader && Header && ExitingBlock && ExitBlock && Latch && L && 237 !L->isInvalid() && Valid; 238 } 239 240 /// Verify that all members are in sync with the Loop object. 241 void verify() const { 242 assert(isValid() && "Candidate is not valid!!"); 243 assert(!L->isInvalid() && "Loop is invalid!"); 244 assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync"); 245 assert(Header == L->getHeader() && "Header is out of sync"); 246 assert(ExitingBlock == L->getExitingBlock() && 247 "Exiting Blocks is out of sync"); 248 assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync"); 249 assert(Latch == L->getLoopLatch() && "Latch is out of sync"); 250 } 251 252 /// Get the entry block for this fusion candidate. 253 /// 254 /// If this fusion candidate represents a guarded loop, the entry block is the 255 /// loop guard block. If it represents an unguarded loop, the entry block is 256 /// the preheader of the loop. 257 BasicBlock *getEntryBlock() const { 258 if (GuardBranch) 259 return GuardBranch->getParent(); 260 else 261 return Preheader; 262 } 263 264 /// After Peeling the loop is modified quite a bit, hence all of the Blocks 265 /// need to be updated accordingly. 266 void updateAfterPeeling() { 267 Preheader = L->getLoopPreheader(); 268 Header = L->getHeader(); 269 ExitingBlock = L->getExitingBlock(); 270 ExitBlock = L->getExitBlock(); 271 Latch = L->getLoopLatch(); 272 verify(); 273 } 274 275 /// Given a guarded loop, get the successor of the guard that is not in the 276 /// loop. 277 /// 278 /// This method returns the successor of the loop guard that is not located 279 /// within the loop (i.e., the successor of the guard that is not the 280 /// preheader). 281 /// This method is only valid for guarded loops. 282 BasicBlock *getNonLoopBlock() const { 283 assert(GuardBranch && "Only valid on guarded loops."); 284 assert(GuardBranch->isConditional() && 285 "Expecting guard to be a conditional branch."); 286 if (Peeled) 287 return GuardBranch->getSuccessor(1); 288 return (GuardBranch->getSuccessor(0) == Preheader) 289 ? GuardBranch->getSuccessor(1) 290 : GuardBranch->getSuccessor(0); 291 } 292 293 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 294 LLVM_DUMP_METHOD void dump() const { 295 dbgs() << "\tGuardBranch: "; 296 if (GuardBranch) 297 dbgs() << *GuardBranch; 298 else 299 dbgs() << "nullptr"; 300 dbgs() << "\n" 301 << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n" 302 << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") 303 << "\n" 304 << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n" 305 << "\tExitingBB: " 306 << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n" 307 << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr") 308 << "\n" 309 << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n" 310 << "\tEntryBlock: " 311 << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr") 312 << "\n"; 313 } 314 #endif 315 316 /// Determine if a fusion candidate (representing a loop) is eligible for 317 /// fusion. Note that this only checks whether a single loop can be fused - it 318 /// does not check whether it is *legal* to fuse two loops together. 319 bool isEligibleForFusion(ScalarEvolution &SE) const { 320 if (!isValid()) { 321 LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n"); 322 if (!Preheader) 323 ++InvalidPreheader; 324 if (!Header) 325 ++InvalidHeader; 326 if (!ExitingBlock) 327 ++InvalidExitingBlock; 328 if (!ExitBlock) 329 ++InvalidExitBlock; 330 if (!Latch) 331 ++InvalidLatch; 332 if (L->isInvalid()) 333 ++InvalidLoop; 334 335 return false; 336 } 337 338 // Require ScalarEvolution to be able to determine a trip count. 339 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { 340 LLVM_DEBUG(dbgs() << "Loop " << L->getName() 341 << " trip count not computable!\n"); 342 return reportInvalidCandidate(UnknownTripCount); 343 } 344 345 if (!L->isLoopSimplifyForm()) { 346 LLVM_DEBUG(dbgs() << "Loop " << L->getName() 347 << " is not in simplified form!\n"); 348 return reportInvalidCandidate(NotSimplifiedForm); 349 } 350 351 if (!L->isRotatedForm()) { 352 LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n"); 353 return reportInvalidCandidate(NotRotated); 354 } 355 356 return true; 357 } 358 359 private: 360 // This is only used internally for now, to clear the MemWrites and MemReads 361 // list and setting Valid to false. I can't envision other uses of this right 362 // now, since once FusionCandidates are put into the FusionCandidateSet they 363 // are immutable. Thus, any time we need to change/update a FusionCandidate, 364 // we must create a new one and insert it into the FusionCandidateSet to 365 // ensure the FusionCandidateSet remains ordered correctly. 366 void invalidate() { 367 MemWrites.clear(); 368 MemReads.clear(); 369 Valid = false; 370 } 371 372 bool reportInvalidCandidate(llvm::Statistic &Stat) const { 373 using namespace ore; 374 assert(L && Preheader && "Fusion candidate not initialized properly!"); 375 ++Stat; 376 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), 377 L->getStartLoc(), Preheader) 378 << "[" << Preheader->getParent()->getName() << "]: " 379 << "Loop is not a candidate for fusion: " << Stat.getDesc()); 380 return false; 381 } 382 }; 383 384 struct FusionCandidateCompare { 385 /// Comparison functor to sort two Control Flow Equivalent fusion candidates 386 /// into dominance order. 387 /// If LHS dominates RHS and RHS post-dominates LHS, return true; 388 /// IF RHS dominates LHS and LHS post-dominates RHS, return false; 389 bool operator()(const FusionCandidate &LHS, 390 const FusionCandidate &RHS) const { 391 const DominatorTree *DT = LHS.DT; 392 393 BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); 394 BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); 395 396 // Do not save PDT to local variable as it is only used in asserts and thus 397 // will trigger an unused variable warning if building without asserts. 398 assert(DT && LHS.PDT && "Expecting valid dominator tree"); 399 400 // Do this compare first so if LHS == RHS, function returns false. 401 if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) { 402 // RHS dominates LHS 403 // Verify LHS post-dominates RHS 404 assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock)); 405 return false; 406 } 407 408 if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) { 409 // Verify RHS Postdominates LHS 410 assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock)); 411 return true; 412 } 413 414 // If LHS does not dominate RHS and RHS does not dominate LHS then there is 415 // no dominance relationship between the two FusionCandidates. Thus, they 416 // should not be in the same set together. 417 llvm_unreachable( 418 "No dominance relationship between these fusion candidates!"); 419 } 420 }; 421 422 using LoopVector = SmallVector<Loop *, 4>; 423 424 // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance 425 // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0 426 // dominates FC1 and FC1 post-dominates FC0. 427 // std::set was chosen because we want a sorted data structure with stable 428 // iterators. A subsequent patch to loop fusion will enable fusing non-ajdacent 429 // loops by moving intervening code around. When this intervening code contains 430 // loops, those loops will be moved also. The corresponding FusionCandidates 431 // will also need to be moved accordingly. As this is done, having stable 432 // iterators will simplify the logic. Similarly, having an efficient insert that 433 // keeps the FusionCandidateSet sorted will also simplify the implementation. 434 using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; 435 using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; 436 437 #if !defined(NDEBUG) 438 static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, 439 const FusionCandidate &FC) { 440 if (FC.isValid()) 441 OS << FC.Preheader->getName(); 442 else 443 OS << "<Invalid>"; 444 445 return OS; 446 } 447 448 static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, 449 const FusionCandidateSet &CandSet) { 450 for (const FusionCandidate &FC : CandSet) 451 OS << FC << '\n'; 452 453 return OS; 454 } 455 456 static void 457 printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { 458 dbgs() << "Fusion Candidates: \n"; 459 for (const auto &CandidateSet : FusionCandidates) { 460 dbgs() << "*** Fusion Candidate Set ***\n"; 461 dbgs() << CandidateSet; 462 dbgs() << "****************************\n"; 463 } 464 } 465 #endif 466 467 /// Collect all loops in function at the same nest level, starting at the 468 /// outermost level. 469 /// 470 /// This data structure collects all loops at the same nest level for a 471 /// given function (specified by the LoopInfo object). It starts at the 472 /// outermost level. 473 struct LoopDepthTree { 474 using LoopsOnLevelTy = SmallVector<LoopVector, 4>; 475 using iterator = LoopsOnLevelTy::iterator; 476 using const_iterator = LoopsOnLevelTy::const_iterator; 477 478 LoopDepthTree(LoopInfo &LI) : Depth(1) { 479 if (!LI.empty()) 480 LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend())); 481 } 482 483 /// Test whether a given loop has been removed from the function, and thus is 484 /// no longer valid. 485 bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); } 486 487 /// Record that a given loop has been removed from the function and is no 488 /// longer valid. 489 void removeLoop(const Loop *L) { RemovedLoops.insert(L); } 490 491 /// Descend the tree to the next (inner) nesting level 492 void descend() { 493 LoopsOnLevelTy LoopsOnNextLevel; 494 495 for (const LoopVector &LV : *this) 496 for (Loop *L : LV) 497 if (!isRemovedLoop(L) && L->begin() != L->end()) 498 LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end())); 499 500 LoopsOnLevel = LoopsOnNextLevel; 501 RemovedLoops.clear(); 502 Depth++; 503 } 504 505 bool empty() const { return size() == 0; } 506 size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); } 507 unsigned getDepth() const { return Depth; } 508 509 iterator begin() { return LoopsOnLevel.begin(); } 510 iterator end() { return LoopsOnLevel.end(); } 511 const_iterator begin() const { return LoopsOnLevel.begin(); } 512 const_iterator end() const { return LoopsOnLevel.end(); } 513 514 private: 515 /// Set of loops that have been removed from the function and are no longer 516 /// valid. 517 SmallPtrSet<const Loop *, 8> RemovedLoops; 518 519 /// Depth of the current level, starting at 1 (outermost loops). 520 unsigned Depth; 521 522 /// Vector of loops at the current depth level that have the same parent loop 523 LoopsOnLevelTy LoopsOnLevel; 524 }; 525 526 #ifndef NDEBUG 527 static void printLoopVector(const LoopVector &LV) { 528 dbgs() << "****************************\n"; 529 for (auto L : LV) 530 printLoop(*L, dbgs()); 531 dbgs() << "****************************\n"; 532 } 533 #endif 534 535 struct LoopFuser { 536 private: 537 // Sets of control flow equivalent fusion candidates for a given nest level. 538 FusionCandidateCollection FusionCandidates; 539 540 LoopDepthTree LDT; 541 DomTreeUpdater DTU; 542 543 LoopInfo &LI; 544 DominatorTree &DT; 545 DependenceInfo &DI; 546 ScalarEvolution &SE; 547 PostDominatorTree &PDT; 548 OptimizationRemarkEmitter &ORE; 549 AssumptionCache &AC; 550 551 const TargetTransformInfo &TTI; 552 553 public: 554 LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI, 555 ScalarEvolution &SE, PostDominatorTree &PDT, 556 OptimizationRemarkEmitter &ORE, const DataLayout &DL, 557 AssumptionCache &AC, const TargetTransformInfo &TTI) 558 : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI), 559 DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {} 560 561 /// This is the main entry point for loop fusion. It will traverse the 562 /// specified function and collect candidate loops to fuse, starting at the 563 /// outermost nesting level and working inwards. 564 bool fuseLoops(Function &F) { 565 #ifndef NDEBUG 566 if (VerboseFusionDebugging) { 567 LI.print(dbgs()); 568 } 569 #endif 570 571 LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName() 572 << "\n"); 573 bool Changed = false; 574 575 while (!LDT.empty()) { 576 LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth " 577 << LDT.getDepth() << "\n";); 578 579 for (const LoopVector &LV : LDT) { 580 assert(LV.size() > 0 && "Empty loop set was build!"); 581 582 // Skip singleton loop sets as they do not offer fusion opportunities on 583 // this level. 584 if (LV.size() == 1) 585 continue; 586 #ifndef NDEBUG 587 if (VerboseFusionDebugging) { 588 LLVM_DEBUG({ 589 dbgs() << " Visit loop set (#" << LV.size() << "):\n"; 590 printLoopVector(LV); 591 }); 592 } 593 #endif 594 595 collectFusionCandidates(LV); 596 Changed |= fuseCandidates(); 597 } 598 599 // Finished analyzing candidates at this level. 600 // Descend to the next level and clear all of the candidates currently 601 // collected. Note that it will not be possible to fuse any of the 602 // existing candidates with new candidates because the new candidates will 603 // be at a different nest level and thus not be control flow equivalent 604 // with all of the candidates collected so far. 605 LLVM_DEBUG(dbgs() << "Descend one level!\n"); 606 LDT.descend(); 607 FusionCandidates.clear(); 608 } 609 610 if (Changed) 611 LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump();); 612 613 #ifndef NDEBUG 614 assert(DT.verify()); 615 assert(PDT.verify()); 616 LI.verify(DT); 617 SE.verify(); 618 #endif 619 620 LLVM_DEBUG(dbgs() << "Loop Fusion complete\n"); 621 return Changed; 622 } 623 624 private: 625 /// Determine if two fusion candidates are control flow equivalent. 626 /// 627 /// Two fusion candidates are control flow equivalent if when one executes, 628 /// the other is guaranteed to execute. This is determined using dominators 629 /// and post-dominators: if A dominates B and B post-dominates A then A and B 630 /// are control-flow equivalent. 631 bool isControlFlowEquivalent(const FusionCandidate &FC0, 632 const FusionCandidate &FC1) const { 633 assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders"); 634 635 return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(), 636 DT, PDT); 637 } 638 639 /// Iterate over all loops in the given loop set and identify the loops that 640 /// are eligible for fusion. Place all eligible fusion candidates into Control 641 /// Flow Equivalent sets, sorted by dominance. 642 void collectFusionCandidates(const LoopVector &LV) { 643 for (Loop *L : LV) { 644 TTI::PeelingPreferences PP = 645 gatherPeelingPreferences(L, SE, TTI, None, None); 646 FusionCandidate CurrCand(L, &DT, &PDT, ORE, PP); 647 if (!CurrCand.isEligibleForFusion(SE)) 648 continue; 649 650 // Go through each list in FusionCandidates and determine if L is control 651 // flow equivalent with the first loop in that list. If it is, append LV. 652 // If not, go to the next list. 653 // If no suitable list is found, start another list and add it to 654 // FusionCandidates. 655 bool FoundSet = false; 656 657 for (auto &CurrCandSet : FusionCandidates) { 658 if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) { 659 CurrCandSet.insert(CurrCand); 660 FoundSet = true; 661 #ifndef NDEBUG 662 if (VerboseFusionDebugging) 663 LLVM_DEBUG(dbgs() << "Adding " << CurrCand 664 << " to existing candidate set\n"); 665 #endif 666 break; 667 } 668 } 669 if (!FoundSet) { 670 // No set was found. Create a new set and add to FusionCandidates 671 #ifndef NDEBUG 672 if (VerboseFusionDebugging) 673 LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n"); 674 #endif 675 FusionCandidateSet NewCandSet; 676 NewCandSet.insert(CurrCand); 677 FusionCandidates.push_back(NewCandSet); 678 } 679 NumFusionCandidates++; 680 } 681 } 682 683 /// Determine if it is beneficial to fuse two loops. 684 /// 685 /// For now, this method simply returns true because we want to fuse as much 686 /// as possible (primarily to test the pass). This method will evolve, over 687 /// time, to add heuristics for profitability of fusion. 688 bool isBeneficialFusion(const FusionCandidate &FC0, 689 const FusionCandidate &FC1) { 690 return true; 691 } 692 693 /// Determine if two fusion candidates have the same trip count (i.e., they 694 /// execute the same number of iterations). 695 /// 696 /// This function will return a pair of values. The first is a boolean, 697 /// stating whether or not the two candidates are known at compile time to 698 /// have the same TripCount. The second is the difference in the two 699 /// TripCounts. This information can be used later to determine whether or not 700 /// peeling can be performed on either one of the candiates. 701 std::pair<bool, Optional<unsigned>> 702 haveIdenticalTripCounts(const FusionCandidate &FC0, 703 const FusionCandidate &FC1) const { 704 705 const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L); 706 if (isa<SCEVCouldNotCompute>(TripCount0)) { 707 UncomputableTripCount++; 708 LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!"); 709 return {false, None}; 710 } 711 712 const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L); 713 if (isa<SCEVCouldNotCompute>(TripCount1)) { 714 UncomputableTripCount++; 715 LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!"); 716 return {false, None}; 717 } 718 719 LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & " 720 << *TripCount1 << " are " 721 << (TripCount0 == TripCount1 ? "identical" : "different") 722 << "\n"); 723 724 if (TripCount0 == TripCount1) 725 return {true, 0}; 726 727 LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, " 728 "determining the difference between trip counts\n"); 729 730 // Currently only considering loops with a single exit point 731 // and a non-constant trip count. 732 const unsigned TC0 = SE.getSmallConstantTripCount(FC0.L); 733 const unsigned TC1 = SE.getSmallConstantTripCount(FC1.L); 734 735 // If any of the tripcounts are zero that means that loop(s) do not have 736 // a single exit or a constant tripcount. 737 if (TC0 == 0 || TC1 == 0) { 738 LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not " 739 "have a constant number of iterations. Peeling " 740 "is not benefical\n"); 741 return {false, None}; 742 } 743 744 Optional<unsigned> Difference = None; 745 int Diff = TC0 - TC1; 746 747 if (Diff > 0) 748 Difference = Diff; 749 else { 750 LLVM_DEBUG( 751 dbgs() << "Difference is less than 0. FC1 (second loop) has more " 752 "iterations than the first one. Currently not supported\n"); 753 } 754 755 LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference 756 << "\n"); 757 758 return {false, Difference}; 759 } 760 761 void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1, 762 unsigned PeelCount) { 763 assert(FC0.AbleToPeel && "Should be able to peel loop"); 764 765 LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount 766 << " iterations of the first loop. \n"); 767 768 FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, &DT, &AC, true); 769 if (FC0.Peeled) { 770 LLVM_DEBUG(dbgs() << "Done Peeling\n"); 771 772 #ifndef NDEBUG 773 auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1); 774 775 assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 && 776 "Loops should have identical trip counts after peeling"); 777 #endif 778 779 FC0.PP.PeelCount += PeelCount; 780 781 // Peeling does not update the PDT 782 PDT.recalculate(*FC0.Preheader->getParent()); 783 784 FC0.updateAfterPeeling(); 785 786 // In this case the iterations of the loop are constant, so the first 787 // loop will execute completely (will not jump from one of 788 // the peeled blocks to the second loop). Here we are updating the 789 // branch conditions of each of the peeled blocks, such that it will 790 // branch to its successor which is not the preheader of the second loop 791 // in the case of unguarded loops, or the succesors of the exit block of 792 // the first loop otherwise. Doing this update will ensure that the entry 793 // block of the first loop dominates the entry block of the second loop. 794 BasicBlock *BB = 795 FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader; 796 if (BB) { 797 SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 798 SmallVector<Instruction *, 8> WorkList; 799 for (BasicBlock *Pred : predecessors(BB)) { 800 if (Pred != FC0.ExitBlock) { 801 WorkList.emplace_back(Pred->getTerminator()); 802 TreeUpdates.emplace_back( 803 DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB)); 804 } 805 } 806 // Cannot modify the predecessors inside the above loop as it will cause 807 // the iterators to be nullptrs, causing memory errors. 808 for (Instruction *CurrentBranch: WorkList) { 809 BasicBlock *Succ = CurrentBranch->getSuccessor(0); 810 if (Succ == BB) 811 Succ = CurrentBranch->getSuccessor(1); 812 ReplaceInstWithInst(CurrentBranch, BranchInst::Create(Succ)); 813 } 814 815 DTU.applyUpdates(TreeUpdates); 816 DTU.flush(); 817 } 818 LLVM_DEBUG( 819 dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount 820 << " iterations from the first loop.\n" 821 "Both Loops have the same number of iterations now.\n"); 822 } 823 } 824 825 /// Walk each set of control flow equivalent fusion candidates and attempt to 826 /// fuse them. This does a single linear traversal of all candidates in the 827 /// set. The conditions for legal fusion are checked at this point. If a pair 828 /// of fusion candidates passes all legality checks, they are fused together 829 /// and a new fusion candidate is created and added to the FusionCandidateSet. 830 /// The original fusion candidates are then removed, as they are no longer 831 /// valid. 832 bool fuseCandidates() { 833 bool Fused = false; 834 LLVM_DEBUG(printFusionCandidates(FusionCandidates)); 835 for (auto &CandidateSet : FusionCandidates) { 836 if (CandidateSet.size() < 2) 837 continue; 838 839 LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n" 840 << CandidateSet << "\n"); 841 842 for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) { 843 assert(!LDT.isRemovedLoop(FC0->L) && 844 "Should not have removed loops in CandidateSet!"); 845 auto FC1 = FC0; 846 for (++FC1; FC1 != CandidateSet.end(); ++FC1) { 847 assert(!LDT.isRemovedLoop(FC1->L) && 848 "Should not have removed loops in CandidateSet!"); 849 850 LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump(); 851 dbgs() << " with\n"; FC1->dump(); dbgs() << "\n"); 852 853 FC0->verify(); 854 FC1->verify(); 855 856 // Check if the candidates have identical tripcounts (first value of 857 // pair), and if not check the difference in the tripcounts between 858 // the loops (second value of pair). The difference is not equal to 859 // None iff the loops iterate a constant number of times, and have a 860 // single exit. 861 std::pair<bool, Optional<unsigned>> IdenticalTripCountRes = 862 haveIdenticalTripCounts(*FC0, *FC1); 863 bool SameTripCount = IdenticalTripCountRes.first; 864 Optional<unsigned> TCDifference = IdenticalTripCountRes.second; 865 866 // Here we are checking that FC0 (the first loop) can be peeled, and 867 // both loops have different tripcounts. 868 if (FC0->AbleToPeel && !SameTripCount && TCDifference) { 869 if (*TCDifference > FusionPeelMaxCount) { 870 LLVM_DEBUG(dbgs() 871 << "Difference in loop trip counts: " << *TCDifference 872 << " is greater than maximum peel count specificed: " 873 << FusionPeelMaxCount << "\n"); 874 } else { 875 // Dependent on peeling being performed on the first loop, and 876 // assuming all other conditions for fusion return true. 877 SameTripCount = true; 878 } 879 } 880 881 if (!SameTripCount) { 882 LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " 883 "counts. Not fusing.\n"); 884 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 885 NonEqualTripCount); 886 continue; 887 } 888 889 if (!isAdjacent(*FC0, *FC1)) { 890 LLVM_DEBUG(dbgs() 891 << "Fusion candidates are not adjacent. Not fusing.\n"); 892 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent); 893 continue; 894 } 895 896 if (!FC0->GuardBranch && FC1->GuardBranch) { 897 LLVM_DEBUG(dbgs() << "The second candidate is guarded while the " 898 "first one is not. Not fusing.\n"); 899 reportLoopFusion<OptimizationRemarkMissed>( 900 *FC0, *FC1, OnlySecondCandidateIsGuarded); 901 continue; 902 } 903 904 // Ensure that FC0 and FC1 have identical guards. 905 // If one (or both) are not guarded, this check is not necessary. 906 if (FC0->GuardBranch && FC1->GuardBranch && 907 !haveIdenticalGuards(*FC0, *FC1) && !TCDifference) { 908 LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " 909 "guards. Not Fusing.\n"); 910 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 911 NonIdenticalGuards); 912 continue; 913 } 914 915 if (!isSafeToMoveBefore(*FC1->Preheader, 916 *FC0->Preheader->getTerminator(), DT, &PDT, 917 &DI)) { 918 LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " 919 "instructions in preheader. Not fusing.\n"); 920 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 921 NonEmptyPreheader); 922 continue; 923 } 924 925 if (FC0->GuardBranch) { 926 assert(FC1->GuardBranch && "Expecting valid FC1 guard branch"); 927 928 if (!isSafeToMoveBefore(*FC0->ExitBlock, 929 *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT, 930 &PDT, &DI)) { 931 LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe " 932 "instructions in exit block. Not fusing.\n"); 933 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 934 NonEmptyExitBlock); 935 continue; 936 } 937 938 if (!isSafeToMoveBefore( 939 *FC1->GuardBranch->getParent(), 940 *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT, 941 &DI)) { 942 LLVM_DEBUG(dbgs() 943 << "Fusion candidate contains unsafe " 944 "instructions in guard block. Not fusing.\n"); 945 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 946 NonEmptyGuardBlock); 947 continue; 948 } 949 } 950 951 // Check the dependencies across the loops and do not fuse if it would 952 // violate them. 953 if (!dependencesAllowFusion(*FC0, *FC1)) { 954 LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n"); 955 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 956 InvalidDependencies); 957 continue; 958 } 959 960 bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1); 961 LLVM_DEBUG(dbgs() 962 << "\tFusion appears to be " 963 << (BeneficialToFuse ? "" : "un") << "profitable!\n"); 964 if (!BeneficialToFuse) { 965 reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, 966 FusionNotBeneficial); 967 continue; 968 } 969 // All analysis has completed and has determined that fusion is legal 970 // and profitable. At this point, start transforming the code and 971 // perform fusion. 972 973 LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and " 974 << *FC1 << "\n"); 975 976 FusionCandidate FC0Copy = *FC0; 977 // Peel the loop after determining that fusion is legal. The Loops 978 // will still be safe to fuse after the peeling is performed. 979 bool Peel = TCDifference && *TCDifference > 0; 980 if (Peel) 981 peelFusionCandidate(FC0Copy, *FC1, *TCDifference); 982 983 // Report fusion to the Optimization Remarks. 984 // Note this needs to be done *before* performFusion because 985 // performFusion will change the original loops, making it not 986 // possible to identify them after fusion is complete. 987 reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1, 988 FuseCounter); 989 990 FusionCandidate FusedCand( 991 performFusion((Peel ? FC0Copy : *FC0), *FC1), &DT, &PDT, ORE, 992 FC0Copy.PP); 993 FusedCand.verify(); 994 assert(FusedCand.isEligibleForFusion(SE) && 995 "Fused candidate should be eligible for fusion!"); 996 997 // Notify the loop-depth-tree that these loops are not valid objects 998 LDT.removeLoop(FC1->L); 999 1000 CandidateSet.erase(FC0); 1001 CandidateSet.erase(FC1); 1002 1003 auto InsertPos = CandidateSet.insert(FusedCand); 1004 1005 assert(InsertPos.second && 1006 "Unable to insert TargetCandidate in CandidateSet!"); 1007 1008 // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations 1009 // of the FC1 loop will attempt to fuse the new (fused) loop with the 1010 // remaining candidates in the current candidate set. 1011 FC0 = FC1 = InsertPos.first; 1012 1013 LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet 1014 << "\n"); 1015 1016 Fused = true; 1017 } 1018 } 1019 } 1020 return Fused; 1021 } 1022 1023 /// Rewrite all additive recurrences in a SCEV to use a new loop. 1024 class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> { 1025 public: 1026 AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL, 1027 bool UseMax = true) 1028 : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL), 1029 NewL(NewL) {} 1030 1031 const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) { 1032 const Loop *ExprL = Expr->getLoop(); 1033 SmallVector<const SCEV *, 2> Operands; 1034 if (ExprL == &OldL) { 1035 Operands.append(Expr->op_begin(), Expr->op_end()); 1036 return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags()); 1037 } 1038 1039 if (OldL.contains(ExprL)) { 1040 bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE)); 1041 if (!UseMax || !Pos || !Expr->isAffine()) { 1042 Valid = false; 1043 return Expr; 1044 } 1045 return visit(Expr->getStart()); 1046 } 1047 1048 for (const SCEV *Op : Expr->operands()) 1049 Operands.push_back(visit(Op)); 1050 return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags()); 1051 } 1052 1053 bool wasValidSCEV() const { return Valid; } 1054 1055 private: 1056 bool Valid, UseMax; 1057 const Loop &OldL, &NewL; 1058 }; 1059 1060 /// Return false if the access functions of \p I0 and \p I1 could cause 1061 /// a negative dependence. 1062 bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0, 1063 Instruction &I1, bool EqualIsInvalid) { 1064 Value *Ptr0 = getLoadStorePointerOperand(&I0); 1065 Value *Ptr1 = getLoadStorePointerOperand(&I1); 1066 if (!Ptr0 || !Ptr1) 1067 return false; 1068 1069 const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0); 1070 const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1); 1071 #ifndef NDEBUG 1072 if (VerboseFusionDebugging) 1073 LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs " 1074 << *SCEVPtr1 << "\n"); 1075 #endif 1076 AddRecLoopReplacer Rewriter(SE, L0, L1); 1077 SCEVPtr0 = Rewriter.visit(SCEVPtr0); 1078 #ifndef NDEBUG 1079 if (VerboseFusionDebugging) 1080 LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0 1081 << " [Valid: " << Rewriter.wasValidSCEV() << "]\n"); 1082 #endif 1083 if (!Rewriter.wasValidSCEV()) 1084 return false; 1085 1086 // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by 1087 // L0) and the other is not. We could check if it is monotone and test 1088 // the beginning and end value instead. 1089 1090 BasicBlock *L0Header = L0.getHeader(); 1091 auto HasNonLinearDominanceRelation = [&](const SCEV *S) { 1092 const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S); 1093 if (!AddRec) 1094 return false; 1095 return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) && 1096 !DT.dominates(AddRec->getLoop()->getHeader(), L0Header); 1097 }; 1098 if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation)) 1099 return false; 1100 1101 ICmpInst::Predicate Pred = 1102 EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE; 1103 bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1); 1104 #ifndef NDEBUG 1105 if (VerboseFusionDebugging) 1106 LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0 1107 << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1 1108 << "\n"); 1109 #endif 1110 return IsAlwaysGE; 1111 } 1112 1113 /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in 1114 /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses 1115 /// specified by @p DepChoice are used to determine this. 1116 bool dependencesAllowFusion(const FusionCandidate &FC0, 1117 const FusionCandidate &FC1, Instruction &I0, 1118 Instruction &I1, bool AnyDep, 1119 FusionDependenceAnalysisChoice DepChoice) { 1120 #ifndef NDEBUG 1121 if (VerboseFusionDebugging) { 1122 LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : " 1123 << DepChoice << "\n"); 1124 } 1125 #endif 1126 switch (DepChoice) { 1127 case FUSION_DEPENDENCE_ANALYSIS_SCEV: 1128 return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep); 1129 case FUSION_DEPENDENCE_ANALYSIS_DA: { 1130 auto DepResult = DI.depends(&I0, &I1, true); 1131 if (!DepResult) 1132 return true; 1133 #ifndef NDEBUG 1134 if (VerboseFusionDebugging) { 1135 LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs()); 1136 dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: " 1137 << (DepResult->isOrdered() ? "true" : "false") 1138 << "]\n"); 1139 LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels() 1140 << "\n"); 1141 } 1142 #endif 1143 1144 if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) 1145 LLVM_DEBUG( 1146 dbgs() << "TODO: Implement pred/succ dependence handling!\n"); 1147 1148 // TODO: Can we actually use the dependence info analysis here? 1149 return false; 1150 } 1151 1152 case FUSION_DEPENDENCE_ANALYSIS_ALL: 1153 return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, 1154 FUSION_DEPENDENCE_ANALYSIS_SCEV) || 1155 dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep, 1156 FUSION_DEPENDENCE_ANALYSIS_DA); 1157 } 1158 1159 llvm_unreachable("Unknown fusion dependence analysis choice!"); 1160 } 1161 1162 /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused. 1163 bool dependencesAllowFusion(const FusionCandidate &FC0, 1164 const FusionCandidate &FC1) { 1165 LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 1166 << "\n"); 1167 assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); 1168 assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); 1169 1170 for (Instruction *WriteL0 : FC0.MemWrites) { 1171 for (Instruction *WriteL1 : FC1.MemWrites) 1172 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, 1173 /* AnyDep */ false, 1174 FusionDependenceAnalysis)) { 1175 InvalidDependencies++; 1176 return false; 1177 } 1178 for (Instruction *ReadL1 : FC1.MemReads) 1179 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1, 1180 /* AnyDep */ false, 1181 FusionDependenceAnalysis)) { 1182 InvalidDependencies++; 1183 return false; 1184 } 1185 } 1186 1187 for (Instruction *WriteL1 : FC1.MemWrites) { 1188 for (Instruction *WriteL0 : FC0.MemWrites) 1189 if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1, 1190 /* AnyDep */ false, 1191 FusionDependenceAnalysis)) { 1192 InvalidDependencies++; 1193 return false; 1194 } 1195 for (Instruction *ReadL0 : FC0.MemReads) 1196 if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1, 1197 /* AnyDep */ false, 1198 FusionDependenceAnalysis)) { 1199 InvalidDependencies++; 1200 return false; 1201 } 1202 } 1203 1204 // Walk through all uses in FC1. For each use, find the reaching def. If the 1205 // def is located in FC0 then it is is not safe to fuse. 1206 for (BasicBlock *BB : FC1.L->blocks()) 1207 for (Instruction &I : *BB) 1208 for (auto &Op : I.operands()) 1209 if (Instruction *Def = dyn_cast<Instruction>(Op)) 1210 if (FC0.L->contains(Def->getParent())) { 1211 InvalidDependencies++; 1212 return false; 1213 } 1214 1215 return true; 1216 } 1217 1218 /// Determine if two fusion candidates are adjacent in the CFG. 1219 /// 1220 /// This method will determine if there are additional basic blocks in the CFG 1221 /// between the exit of \p FC0 and the entry of \p FC1. 1222 /// If the two candidates are guarded loops, then it checks whether the 1223 /// non-loop successor of the \p FC0 guard branch is the entry block of \p 1224 /// FC1. If not, then the loops are not adjacent. If the two candidates are 1225 /// not guarded loops, then it checks whether the exit block of \p FC0 is the 1226 /// preheader of \p FC1. 1227 bool isAdjacent(const FusionCandidate &FC0, 1228 const FusionCandidate &FC1) const { 1229 // If the successor of the guard branch is FC1, then the loops are adjacent 1230 if (FC0.GuardBranch) 1231 return FC0.getNonLoopBlock() == FC1.getEntryBlock(); 1232 else 1233 return FC0.ExitBlock == FC1.getEntryBlock(); 1234 } 1235 1236 /// Determine if two fusion candidates have identical guards 1237 /// 1238 /// This method will determine if two fusion candidates have the same guards. 1239 /// The guards are considered the same if: 1240 /// 1. The instructions to compute the condition used in the compare are 1241 /// identical. 1242 /// 2. The successors of the guard have the same flow into/around the loop. 1243 /// If the compare instructions are identical, then the first successor of the 1244 /// guard must go to the same place (either the preheader of the loop or the 1245 /// NonLoopBlock). In other words, the the first successor of both loops must 1246 /// both go into the loop (i.e., the preheader) or go around the loop (i.e., 1247 /// the NonLoopBlock). The same must be true for the second successor. 1248 bool haveIdenticalGuards(const FusionCandidate &FC0, 1249 const FusionCandidate &FC1) const { 1250 assert(FC0.GuardBranch && FC1.GuardBranch && 1251 "Expecting FC0 and FC1 to be guarded loops."); 1252 1253 if (auto FC0CmpInst = 1254 dyn_cast<Instruction>(FC0.GuardBranch->getCondition())) 1255 if (auto FC1CmpInst = 1256 dyn_cast<Instruction>(FC1.GuardBranch->getCondition())) 1257 if (!FC0CmpInst->isIdenticalTo(FC1CmpInst)) 1258 return false; 1259 1260 // The compare instructions are identical. 1261 // Now make sure the successor of the guards have the same flow into/around 1262 // the loop 1263 if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader) 1264 return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader); 1265 else 1266 return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader); 1267 } 1268 1269 /// Modify the latch branch of FC to be unconditional since successors of the 1270 /// branch are the same. 1271 void simplifyLatchBranch(const FusionCandidate &FC) const { 1272 BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator()); 1273 if (FCLatchBranch) { 1274 assert(FCLatchBranch->isConditional() && 1275 FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) && 1276 "Expecting the two successors of FCLatchBranch to be the same"); 1277 BranchInst *NewBranch = 1278 BranchInst::Create(FCLatchBranch->getSuccessor(0)); 1279 ReplaceInstWithInst(FCLatchBranch, NewBranch); 1280 } 1281 } 1282 1283 /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique 1284 /// successor, then merge FC0.Latch with its unique successor. 1285 void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) { 1286 moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI); 1287 if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) { 1288 MergeBlockIntoPredecessor(Succ, &DTU, &LI); 1289 DTU.flush(); 1290 } 1291 } 1292 1293 /// Fuse two fusion candidates, creating a new fused loop. 1294 /// 1295 /// This method contains the mechanics of fusing two loops, represented by \p 1296 /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1 1297 /// postdominates \p FC0 (making them control flow equivalent). It also 1298 /// assumes that the other conditions for fusion have been met: adjacent, 1299 /// identical trip counts, and no negative distance dependencies exist that 1300 /// would prevent fusion. Thus, there is no checking for these conditions in 1301 /// this method. 1302 /// 1303 /// Fusion is performed by rewiring the CFG to update successor blocks of the 1304 /// components of tho loop. Specifically, the following changes are done: 1305 /// 1306 /// 1. The preheader of \p FC1 is removed as it is no longer necessary 1307 /// (because it is currently only a single statement block). 1308 /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1. 1309 /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0. 1310 /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0. 1311 /// 1312 /// All of these modifications are done with dominator tree updates, thus 1313 /// keeping the dominator (and post dominator) information up-to-date. 1314 /// 1315 /// This can be improved in the future by actually merging blocks during 1316 /// fusion. For example, the preheader of \p FC1 can be merged with the 1317 /// preheader of \p FC0. This would allow loops with more than a single 1318 /// statement in the preheader to be fused. Similarly, the latch blocks of the 1319 /// two loops could also be fused into a single block. This will require 1320 /// analysis to prove it is safe to move the contents of the block past 1321 /// existing code, which currently has not been implemented. 1322 Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) { 1323 assert(FC0.isValid() && FC1.isValid() && 1324 "Expecting valid fusion candidates"); 1325 1326 LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump(); 1327 dbgs() << "Fusion Candidate 1: \n"; FC1.dump();); 1328 1329 // Move instructions from the preheader of FC1 to the end of the preheader 1330 // of FC0. 1331 moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI); 1332 1333 // Fusing guarded loops is handled slightly differently than non-guarded 1334 // loops and has been broken out into a separate method instead of trying to 1335 // intersperse the logic within a single method. 1336 if (FC0.GuardBranch) 1337 return fuseGuardedLoops(FC0, FC1); 1338 1339 assert(FC1.Preheader == 1340 (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock)); 1341 assert(FC1.Preheader->size() == 1 && 1342 FC1.Preheader->getSingleSuccessor() == FC1.Header); 1343 1344 // Remember the phi nodes originally in the header of FC0 in order to rewire 1345 // them later. However, this is only necessary if the new loop carried 1346 // values might not dominate the exiting branch. While we do not generally 1347 // test if this is the case but simply insert intermediate phi nodes, we 1348 // need to make sure these intermediate phi nodes have different 1349 // predecessors. To this end, we filter the special case where the exiting 1350 // block is the latch block of the first loop. Nothing needs to be done 1351 // anyway as all loop carried values dominate the latch and thereby also the 1352 // exiting branch. 1353 SmallVector<PHINode *, 8> OriginalFC0PHIs; 1354 if (FC0.ExitingBlock != FC0.Latch) 1355 for (PHINode &PHI : FC0.Header->phis()) 1356 OriginalFC0PHIs.push_back(&PHI); 1357 1358 // Replace incoming blocks for header PHIs first. 1359 FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); 1360 FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); 1361 1362 // Then modify the control flow and update DT and PDT. 1363 SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 1364 1365 // The old exiting block of the first loop (FC0) has to jump to the header 1366 // of the second as we need to execute the code in the second header block 1367 // regardless of the trip count. That is, if the trip count is 0, so the 1368 // back edge is never taken, we still have to execute both loop headers, 1369 // especially (but not only!) if the second is a do-while style loop. 1370 // However, doing so might invalidate the phi nodes of the first loop as 1371 // the new values do only need to dominate their latch and not the exiting 1372 // predicate. To remedy this potential problem we always introduce phi 1373 // nodes in the header of the second loop later that select the loop carried 1374 // value, if the second header was reached through an old latch of the 1375 // first, or undef otherwise. This is sound as exiting the first implies the 1376 // second will exit too, __without__ taking the back-edge. [Their 1377 // trip-counts are equal after all. 1378 // KB: Would this sequence be simpler to just just make FC0.ExitingBlock go 1379 // to FC1.Header? I think this is basically what the three sequences are 1380 // trying to accomplish; however, doing this directly in the CFG may mean 1381 // the DT/PDT becomes invalid 1382 if (!FC0.Peeled) { 1383 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader, 1384 FC1.Header); 1385 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1386 DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader)); 1387 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1388 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1389 } else { 1390 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1391 DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader)); 1392 1393 // Remove the ExitBlock of the first Loop (also not needed) 1394 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, 1395 FC1.Header); 1396 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1397 DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); 1398 FC0.ExitBlock->getTerminator()->eraseFromParent(); 1399 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1400 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1401 new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); 1402 } 1403 1404 // The pre-header of L1 is not necessary anymore. 1405 assert(pred_empty(FC1.Preheader)); 1406 FC1.Preheader->getTerminator()->eraseFromParent(); 1407 new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); 1408 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1409 DominatorTree::Delete, FC1.Preheader, FC1.Header)); 1410 1411 // Moves the phi nodes from the second to the first loops header block. 1412 while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { 1413 if (SE.isSCEVable(PHI->getType())) 1414 SE.forgetValue(PHI); 1415 if (PHI->hasNUsesOrMore(1)) 1416 PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); 1417 else 1418 PHI->eraseFromParent(); 1419 } 1420 1421 // Introduce new phi nodes in the second loop header to ensure 1422 // exiting the first and jumping to the header of the second does not break 1423 // the SSA property of the phis originally in the first loop. See also the 1424 // comment above. 1425 Instruction *L1HeaderIP = &FC1.Header->front(); 1426 for (PHINode *LCPHI : OriginalFC0PHIs) { 1427 int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); 1428 assert(L1LatchBBIdx >= 0 && 1429 "Expected loop carried value to be rewired at this point!"); 1430 1431 Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); 1432 1433 PHINode *L1HeaderPHI = PHINode::Create( 1434 LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); 1435 L1HeaderPHI->addIncoming(LCV, FC0.Latch); 1436 L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), 1437 FC0.ExitingBlock); 1438 1439 LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); 1440 } 1441 1442 // Replace latch terminator destinations. 1443 FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); 1444 FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); 1445 1446 // Modify the latch branch of FC0 to be unconditional as both successors of 1447 // the branch are the same. 1448 simplifyLatchBranch(FC0); 1449 1450 // If FC0.Latch and FC0.ExitingBlock are the same then we have already 1451 // performed the updates above. 1452 if (FC0.Latch != FC0.ExitingBlock) 1453 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1454 DominatorTree::Insert, FC0.Latch, FC1.Header)); 1455 1456 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1457 FC0.Latch, FC0.Header)); 1458 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, 1459 FC1.Latch, FC0.Header)); 1460 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1461 FC1.Latch, FC1.Header)); 1462 1463 // Update DT/PDT 1464 DTU.applyUpdates(TreeUpdates); 1465 1466 LI.removeBlock(FC1.Preheader); 1467 DTU.deleteBB(FC1.Preheader); 1468 if (FC0.Peeled) { 1469 LI.removeBlock(FC0.ExitBlock); 1470 DTU.deleteBB(FC0.ExitBlock); 1471 } 1472 1473 DTU.flush(); 1474 1475 // Is there a way to keep SE up-to-date so we don't need to forget the loops 1476 // and rebuild the information in subsequent passes of fusion? 1477 // Note: Need to forget the loops before merging the loop latches, as 1478 // mergeLatch may remove the only block in FC1. 1479 SE.forgetLoop(FC1.L); 1480 SE.forgetLoop(FC0.L); 1481 1482 // Move instructions from FC0.Latch to FC1.Latch. 1483 // Note: mergeLatch requires an updated DT. 1484 mergeLatch(FC0, FC1); 1485 1486 // Merge the loops. 1487 SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); 1488 for (BasicBlock *BB : Blocks) { 1489 FC0.L->addBlockEntry(BB); 1490 FC1.L->removeBlockFromLoop(BB); 1491 if (LI.getLoopFor(BB) != FC1.L) 1492 continue; 1493 LI.changeLoopFor(BB, FC0.L); 1494 } 1495 while (!FC1.L->isInnermost()) { 1496 const auto &ChildLoopIt = FC1.L->begin(); 1497 Loop *ChildLoop = *ChildLoopIt; 1498 FC1.L->removeChildLoop(ChildLoopIt); 1499 FC0.L->addChildLoop(ChildLoop); 1500 } 1501 1502 // Delete the now empty loop L1. 1503 LI.erase(FC1.L); 1504 1505 #ifndef NDEBUG 1506 assert(!verifyFunction(*FC0.Header->getParent(), &errs())); 1507 assert(DT.verify(DominatorTree::VerificationLevel::Fast)); 1508 assert(PDT.verify()); 1509 LI.verify(DT); 1510 SE.verify(); 1511 #endif 1512 1513 LLVM_DEBUG(dbgs() << "Fusion done:\n"); 1514 1515 return FC0.L; 1516 } 1517 1518 /// Report details on loop fusion opportunities. 1519 /// 1520 /// This template function can be used to report both successful and missed 1521 /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should 1522 /// be one of: 1523 /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful 1524 /// given two valid fusion candidates. 1525 /// - OptimizationRemark to report successful fusion of two fusion 1526 /// candidates. 1527 /// The remarks will be printed using the form: 1528 /// <path/filename>:<line number>:<column number>: [<function name>]: 1529 /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> 1530 template <typename RemarkKind> 1531 void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, 1532 llvm::Statistic &Stat) { 1533 assert(FC0.Preheader && FC1.Preheader && 1534 "Expecting valid fusion candidates"); 1535 using namespace ore; 1536 ++Stat; 1537 ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), 1538 FC0.Preheader) 1539 << "[" << FC0.Preheader->getParent()->getName() 1540 << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName())) 1541 << " and " << NV("Cand2", StringRef(FC1.Preheader->getName())) 1542 << ": " << Stat.getDesc()); 1543 } 1544 1545 /// Fuse two guarded fusion candidates, creating a new fused loop. 1546 /// 1547 /// Fusing guarded loops is handled much the same way as fusing non-guarded 1548 /// loops. The rewiring of the CFG is slightly different though, because of 1549 /// the presence of the guards around the loops and the exit blocks after the 1550 /// loop body. As such, the new loop is rewired as follows: 1551 /// 1. Keep the guard branch from FC0 and use the non-loop block target 1552 /// from the FC1 guard branch. 1553 /// 2. Remove the exit block from FC0 (this exit block should be empty 1554 /// right now). 1555 /// 3. Remove the guard branch for FC1 1556 /// 4. Remove the preheader for FC1. 1557 /// The exit block successor for the latch of FC0 is updated to be the header 1558 /// of FC1 and the non-exit block successor of the latch of FC1 is updated to 1559 /// be the header of FC0, thus creating the fused loop. 1560 Loop *fuseGuardedLoops(const FusionCandidate &FC0, 1561 const FusionCandidate &FC1) { 1562 assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops"); 1563 1564 BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); 1565 BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); 1566 BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); 1567 BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); 1568 BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor(); 1569 1570 // Move instructions from the exit block of FC0 to the beginning of the exit 1571 // block of FC1, in the case that the FC0 loop has not been peeled. In the 1572 // case that FC0 loop is peeled, then move the instructions of the successor 1573 // of the FC0 Exit block to the beginning of the exit block of FC1. 1574 moveInstructionsToTheBeginning( 1575 (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock, 1576 DT, PDT, DI); 1577 1578 // Move instructions from the guard block of FC1 to the end of the guard 1579 // block of FC0. 1580 moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI); 1581 1582 assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent"); 1583 1584 SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; 1585 1586 //////////////////////////////////////////////////////////////////////////// 1587 // Update the Loop Guard 1588 //////////////////////////////////////////////////////////////////////////// 1589 // The guard for FC0 is updated to guard both FC0 and FC1. This is done by 1590 // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. 1591 // Thus, one path from the guard goes to the preheader for FC0 (and thus 1592 // executes the new fused loop) and the other path goes to the NonLoopBlock 1593 // for FC1 (where FC1 guard would have gone if FC1 was not executed). 1594 FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock); 1595 FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock); 1596 1597 BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock; 1598 BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header); 1599 1600 // The guard of FC1 is not necessary anymore. 1601 FC1.GuardBranch->eraseFromParent(); 1602 new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); 1603 1604 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1605 DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); 1606 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1607 DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); 1608 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1609 DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); 1610 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1611 DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); 1612 1613 if (FC0.Peeled) { 1614 // Remove the Block after the ExitBlock of FC0 1615 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1616 DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock)); 1617 FC0ExitBlockSuccessor->getTerminator()->eraseFromParent(); 1618 new UnreachableInst(FC0ExitBlockSuccessor->getContext(), 1619 FC0ExitBlockSuccessor); 1620 } 1621 1622 assert(pred_empty(FC1GuardBlock) && 1623 "Expecting guard block to have no predecessors"); 1624 assert(succ_empty(FC1GuardBlock) && 1625 "Expecting guard block to have no successors"); 1626 1627 // Remember the phi nodes originally in the header of FC0 in order to rewire 1628 // them later. However, this is only necessary if the new loop carried 1629 // values might not dominate the exiting branch. While we do not generally 1630 // test if this is the case but simply insert intermediate phi nodes, we 1631 // need to make sure these intermediate phi nodes have different 1632 // predecessors. To this end, we filter the special case where the exiting 1633 // block is the latch block of the first loop. Nothing needs to be done 1634 // anyway as all loop carried values dominate the latch and thereby also the 1635 // exiting branch. 1636 // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch 1637 // (because the loops are rotated. Thus, nothing will ever be added to 1638 // OriginalFC0PHIs. 1639 SmallVector<PHINode *, 8> OriginalFC0PHIs; 1640 if (FC0.ExitingBlock != FC0.Latch) 1641 for (PHINode &PHI : FC0.Header->phis()) 1642 OriginalFC0PHIs.push_back(&PHI); 1643 1644 assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!"); 1645 1646 // Replace incoming blocks for header PHIs first. 1647 FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader); 1648 FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch); 1649 1650 // The old exiting block of the first loop (FC0) has to jump to the header 1651 // of the second as we need to execute the code in the second header block 1652 // regardless of the trip count. That is, if the trip count is 0, so the 1653 // back edge is never taken, we still have to execute both loop headers, 1654 // especially (but not only!) if the second is a do-while style loop. 1655 // However, doing so might invalidate the phi nodes of the first loop as 1656 // the new values do only need to dominate their latch and not the exiting 1657 // predicate. To remedy this potential problem we always introduce phi 1658 // nodes in the header of the second loop later that select the loop carried 1659 // value, if the second header was reached through an old latch of the 1660 // first, or undef otherwise. This is sound as exiting the first implies the 1661 // second will exit too, __without__ taking the back-edge (their 1662 // trip-counts are equal after all). 1663 FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock, 1664 FC1.Header); 1665 1666 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1667 DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock)); 1668 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1669 DominatorTree::Insert, FC0.ExitingBlock, FC1.Header)); 1670 1671 // Remove FC0 Exit Block 1672 // The exit block for FC0 is no longer needed since control will flow 1673 // directly to the header of FC1. Since it is an empty block, it can be 1674 // removed at this point. 1675 // TODO: In the future, we can handle non-empty exit blocks my merging any 1676 // instructions from FC0 exit block into FC1 exit block prior to removing 1677 // the block. 1678 assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty"); 1679 FC0.ExitBlock->getTerminator()->eraseFromParent(); 1680 new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock); 1681 1682 // Remove FC1 Preheader 1683 // The pre-header of L1 is not necessary anymore. 1684 assert(pred_empty(FC1.Preheader)); 1685 FC1.Preheader->getTerminator()->eraseFromParent(); 1686 new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader); 1687 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1688 DominatorTree::Delete, FC1.Preheader, FC1.Header)); 1689 1690 // Moves the phi nodes from the second to the first loops header block. 1691 while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) { 1692 if (SE.isSCEVable(PHI->getType())) 1693 SE.forgetValue(PHI); 1694 if (PHI->hasNUsesOrMore(1)) 1695 PHI->moveBefore(&*FC0.Header->getFirstInsertionPt()); 1696 else 1697 PHI->eraseFromParent(); 1698 } 1699 1700 // Introduce new phi nodes in the second loop header to ensure 1701 // exiting the first and jumping to the header of the second does not break 1702 // the SSA property of the phis originally in the first loop. See also the 1703 // comment above. 1704 Instruction *L1HeaderIP = &FC1.Header->front(); 1705 for (PHINode *LCPHI : OriginalFC0PHIs) { 1706 int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); 1707 assert(L1LatchBBIdx >= 0 && 1708 "Expected loop carried value to be rewired at this point!"); 1709 1710 Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); 1711 1712 PHINode *L1HeaderPHI = PHINode::Create( 1713 LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); 1714 L1HeaderPHI->addIncoming(LCV, FC0.Latch); 1715 L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), 1716 FC0.ExitingBlock); 1717 1718 LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); 1719 } 1720 1721 // Update the latches 1722 1723 // Replace latch terminator destinations. 1724 FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); 1725 FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); 1726 1727 // Modify the latch branch of FC0 to be unconditional as both successors of 1728 // the branch are the same. 1729 simplifyLatchBranch(FC0); 1730 1731 // If FC0.Latch and FC0.ExitingBlock are the same then we have already 1732 // performed the updates above. 1733 if (FC0.Latch != FC0.ExitingBlock) 1734 TreeUpdates.emplace_back(DominatorTree::UpdateType( 1735 DominatorTree::Insert, FC0.Latch, FC1.Header)); 1736 1737 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1738 FC0.Latch, FC0.Header)); 1739 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, 1740 FC1.Latch, FC0.Header)); 1741 TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, 1742 FC1.Latch, FC1.Header)); 1743 1744 // All done 1745 // Apply the updates to the Dominator Tree and cleanup. 1746 1747 assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!"); 1748 assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!"); 1749 1750 // Update DT/PDT 1751 DTU.applyUpdates(TreeUpdates); 1752 1753 LI.removeBlock(FC1GuardBlock); 1754 LI.removeBlock(FC1.Preheader); 1755 LI.removeBlock(FC0.ExitBlock); 1756 if (FC0.Peeled) { 1757 LI.removeBlock(FC0ExitBlockSuccessor); 1758 DTU.deleteBB(FC0ExitBlockSuccessor); 1759 } 1760 DTU.deleteBB(FC1GuardBlock); 1761 DTU.deleteBB(FC1.Preheader); 1762 DTU.deleteBB(FC0.ExitBlock); 1763 DTU.flush(); 1764 1765 // Is there a way to keep SE up-to-date so we don't need to forget the loops 1766 // and rebuild the information in subsequent passes of fusion? 1767 // Note: Need to forget the loops before merging the loop latches, as 1768 // mergeLatch may remove the only block in FC1. 1769 SE.forgetLoop(FC1.L); 1770 SE.forgetLoop(FC0.L); 1771 1772 // Move instructions from FC0.Latch to FC1.Latch. 1773 // Note: mergeLatch requires an updated DT. 1774 mergeLatch(FC0, FC1); 1775 1776 // Merge the loops. 1777 SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); 1778 for (BasicBlock *BB : Blocks) { 1779 FC0.L->addBlockEntry(BB); 1780 FC1.L->removeBlockFromLoop(BB); 1781 if (LI.getLoopFor(BB) != FC1.L) 1782 continue; 1783 LI.changeLoopFor(BB, FC0.L); 1784 } 1785 while (!FC1.L->isInnermost()) { 1786 const auto &ChildLoopIt = FC1.L->begin(); 1787 Loop *ChildLoop = *ChildLoopIt; 1788 FC1.L->removeChildLoop(ChildLoopIt); 1789 FC0.L->addChildLoop(ChildLoop); 1790 } 1791 1792 // Delete the now empty loop L1. 1793 LI.erase(FC1.L); 1794 1795 #ifndef NDEBUG 1796 assert(!verifyFunction(*FC0.Header->getParent(), &errs())); 1797 assert(DT.verify(DominatorTree::VerificationLevel::Fast)); 1798 assert(PDT.verify()); 1799 LI.verify(DT); 1800 SE.verify(); 1801 #endif 1802 1803 LLVM_DEBUG(dbgs() << "Fusion done:\n"); 1804 1805 return FC0.L; 1806 } 1807 }; 1808 1809 struct LoopFuseLegacy : public FunctionPass { 1810 1811 static char ID; 1812 1813 LoopFuseLegacy() : FunctionPass(ID) { 1814 initializeLoopFuseLegacyPass(*PassRegistry::getPassRegistry()); 1815 } 1816 1817 void getAnalysisUsage(AnalysisUsage &AU) const override { 1818 AU.addRequiredID(LoopSimplifyID); 1819 AU.addRequired<ScalarEvolutionWrapperPass>(); 1820 AU.addRequired<LoopInfoWrapperPass>(); 1821 AU.addRequired<DominatorTreeWrapperPass>(); 1822 AU.addRequired<PostDominatorTreeWrapperPass>(); 1823 AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); 1824 AU.addRequired<DependenceAnalysisWrapperPass>(); 1825 AU.addRequired<AssumptionCacheTracker>(); 1826 AU.addRequired<TargetTransformInfoWrapperPass>(); 1827 1828 AU.addPreserved<ScalarEvolutionWrapperPass>(); 1829 AU.addPreserved<LoopInfoWrapperPass>(); 1830 AU.addPreserved<DominatorTreeWrapperPass>(); 1831 AU.addPreserved<PostDominatorTreeWrapperPass>(); 1832 } 1833 1834 bool runOnFunction(Function &F) override { 1835 if (skipFunction(F)) 1836 return false; 1837 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); 1838 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 1839 auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI(); 1840 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); 1841 auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); 1842 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); 1843 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 1844 const TargetTransformInfo &TTI = 1845 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); 1846 const DataLayout &DL = F.getParent()->getDataLayout(); 1847 1848 LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); 1849 return LF.fuseLoops(F); 1850 } 1851 }; 1852 } // namespace 1853 1854 PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { 1855 auto &LI = AM.getResult<LoopAnalysis>(F); 1856 auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 1857 auto &DI = AM.getResult<DependenceAnalysis>(F); 1858 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 1859 auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); 1860 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 1861 auto &AC = AM.getResult<AssumptionAnalysis>(F); 1862 const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); 1863 const DataLayout &DL = F.getParent()->getDataLayout(); 1864 1865 LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI); 1866 bool Changed = LF.fuseLoops(F); 1867 if (!Changed) 1868 return PreservedAnalyses::all(); 1869 1870 PreservedAnalyses PA; 1871 PA.preserve<DominatorTreeAnalysis>(); 1872 PA.preserve<PostDominatorTreeAnalysis>(); 1873 PA.preserve<ScalarEvolutionAnalysis>(); 1874 PA.preserve<LoopAnalysis>(); 1875 return PA; 1876 } 1877 1878 char LoopFuseLegacy::ID = 0; 1879 1880 INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, 1881 false) 1882 INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) 1883 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) 1884 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 1885 INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) 1886 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) 1887 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) 1888 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 1889 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 1890 INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false) 1891 1892 FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); } 1893