13cdf8794SKit Barton //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===//
23cdf8794SKit Barton //
33cdf8794SKit Barton // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
43cdf8794SKit Barton // See https://llvm.org/LICENSE.txt for license information.
53cdf8794SKit Barton // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
63cdf8794SKit Barton //
73cdf8794SKit Barton //===----------------------------------------------------------------------===//
83cdf8794SKit Barton ///
93cdf8794SKit Barton /// \file
103cdf8794SKit Barton /// This file implements the loop fusion pass.
113cdf8794SKit Barton /// The implementation is largely based on the following document:
123cdf8794SKit Barton ///
133cdf8794SKit Barton ///       Code Transformations to Augment the Scope of Loop Fusion in a
143cdf8794SKit Barton ///         Production Compiler
153cdf8794SKit Barton ///       Christopher Mark Barton
163cdf8794SKit Barton ///       MSc Thesis
173cdf8794SKit Barton ///       https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf
183cdf8794SKit Barton ///
193cdf8794SKit Barton /// The general approach taken is to collect sets of control flow equivalent
203cdf8794SKit Barton /// loops and test whether they can be fused. The necessary conditions for
213cdf8794SKit Barton /// fusion are:
223cdf8794SKit Barton ///    1. The loops must be adjacent (there cannot be any statements between
233cdf8794SKit Barton ///       the two loops).
243cdf8794SKit Barton ///    2. The loops must be conforming (they must execute the same number of
253cdf8794SKit Barton ///       iterations).
263cdf8794SKit Barton ///    3. The loops must be control flow equivalent (if one loop executes, the
273cdf8794SKit Barton ///       other is guaranteed to execute).
283cdf8794SKit Barton ///    4. There cannot be any negative distance dependencies between the loops.
293cdf8794SKit Barton /// If all of these conditions are satisfied, it is safe to fuse the loops.
303cdf8794SKit Barton ///
313cdf8794SKit Barton /// This implementation creates FusionCandidates that represent the loop and the
323cdf8794SKit Barton /// necessary information needed by fusion. It then operates on the fusion
333cdf8794SKit Barton /// candidates, first confirming that the candidate is eligible for fusion. The
343cdf8794SKit Barton /// candidates are then collected into control flow equivalent sets, sorted in
353cdf8794SKit Barton /// dominance order. Each set of control flow equivalent candidates is then
363cdf8794SKit Barton /// traversed, attempting to fuse pairs of candidates in the set. If all
373cdf8794SKit Barton /// requirements for fusion are met, the two candidates are fused, creating a
383cdf8794SKit Barton /// new (fused) candidate which is then added back into the set to consider for
393cdf8794SKit Barton /// additional fusion.
403cdf8794SKit Barton ///
413cdf8794SKit Barton /// This implementation currently does not make any modifications to remove
423cdf8794SKit Barton /// conditions for fusion. Code transformations to make loops conform to each of
433cdf8794SKit Barton /// the conditions for fusion are discussed in more detail in the document
443cdf8794SKit Barton /// above. These can be added to the current implementation in the future.
453cdf8794SKit Barton //===----------------------------------------------------------------------===//
463cdf8794SKit Barton 
473cdf8794SKit Barton #include "llvm/Transforms/Scalar/LoopFuse.h"
483cdf8794SKit Barton #include "llvm/ADT/Statistic.h"
4938a82179SSidharth Baveja #include "llvm/Analysis/AssumptionCache.h"
503cdf8794SKit Barton #include "llvm/Analysis/DependenceAnalysis.h"
513cdf8794SKit Barton #include "llvm/Analysis/DomTreeUpdater.h"
523cdf8794SKit Barton #include "llvm/Analysis/LoopInfo.h"
533cdf8794SKit Barton #include "llvm/Analysis/OptimizationRemarkEmitter.h"
543cdf8794SKit Barton #include "llvm/Analysis/PostDominators.h"
553cdf8794SKit Barton #include "llvm/Analysis/ScalarEvolution.h"
563cdf8794SKit Barton #include "llvm/Analysis/ScalarEvolutionExpressions.h"
5738a82179SSidharth Baveja #include "llvm/Analysis/TargetTransformInfo.h"
583cdf8794SKit Barton #include "llvm/IR/Function.h"
593cdf8794SKit Barton #include "llvm/IR/Verifier.h"
6005da2fe5SReid Kleckner #include "llvm/InitializePasses.h"
613cdf8794SKit Barton #include "llvm/Pass.h"
624c1a1d3cSReid Kleckner #include "llvm/Support/CommandLine.h"
633cdf8794SKit Barton #include "llvm/Support/Debug.h"
643cdf8794SKit Barton #include "llvm/Support/raw_ostream.h"
653cdf8794SKit Barton #include "llvm/Transforms/Scalar.h"
663cdf8794SKit Barton #include "llvm/Transforms/Utils.h"
673cdf8794SKit Barton #include "llvm/Transforms/Utils/BasicBlockUtils.h"
68aaf7f05aSWhitney Tsang #include "llvm/Transforms/Utils/CodeMoverUtils.h"
69b7cfa6caSSidharth Baveja #include "llvm/Transforms/Utils/LoopPeel.h"
703cdf8794SKit Barton 
713cdf8794SKit Barton using namespace llvm;
723cdf8794SKit Barton 
733cdf8794SKit Barton #define DEBUG_TYPE "loop-fusion"
743cdf8794SKit Barton 
75de0b6339SKit Barton STATISTIC(FuseCounter, "Loops fused");
763cdf8794SKit Barton STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion");
773cdf8794SKit Barton STATISTIC(InvalidPreheader, "Loop has invalid preheader");
783cdf8794SKit Barton STATISTIC(InvalidHeader, "Loop has invalid header");
793cdf8794SKit Barton STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks");
803cdf8794SKit Barton STATISTIC(InvalidExitBlock, "Loop has invalid exit block");
813cdf8794SKit Barton STATISTIC(InvalidLatch, "Loop has invalid latch");
823cdf8794SKit Barton STATISTIC(InvalidLoop, "Loop is invalid");
833cdf8794SKit Barton STATISTIC(AddressTakenBB, "Basic block has address taken");
843cdf8794SKit Barton STATISTIC(MayThrowException, "Loop may throw an exception");
853cdf8794SKit Barton STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access");
863cdf8794SKit Barton STATISTIC(NotSimplifiedForm, "Loop is not in simplified form");
873cdf8794SKit Barton STATISTIC(InvalidDependencies, "Dependencies prevent fusion");
88de0b6339SKit Barton STATISTIC(UnknownTripCount, "Loop has unknown trip count");
893cdf8794SKit Barton STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");
90de0b6339SKit Barton STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");
91de0b6339SKit Barton STATISTIC(NonAdjacent, "Loops are not adjacent");
92da58e68fSWhitney Tsang STATISTIC(
93da58e68fSWhitney Tsang     NonEmptyPreheader,
94da58e68fSWhitney Tsang     "Loop has a non-empty preheader with instructions that cannot be moved");
95de0b6339SKit Barton STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
9650bc6104SKit Barton STATISTIC(NonIdenticalGuards, "Candidates have different guards");
97e44f4a8aSWhitney Tsang STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with "
98e44f4a8aSWhitney Tsang                              "instructions that cannot be moved");
99e44f4a8aSWhitney Tsang STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with "
100e44f4a8aSWhitney Tsang                               "instructions that cannot be moved");
101ff07fc66SKit Barton STATISTIC(NotRotated, "Candidate is not rotated");
1026a82ace5STa-Wei Tu STATISTIC(OnlySecondCandidateIsGuarded,
1036a82ace5STa-Wei Tu           "The second candidate is guarded while the first one is not");
1043cdf8794SKit Barton 
1053cdf8794SKit Barton enum FusionDependenceAnalysisChoice {
1063cdf8794SKit Barton   FUSION_DEPENDENCE_ANALYSIS_SCEV,
1073cdf8794SKit Barton   FUSION_DEPENDENCE_ANALYSIS_DA,
1083cdf8794SKit Barton   FUSION_DEPENDENCE_ANALYSIS_ALL,
1093cdf8794SKit Barton };
1103cdf8794SKit Barton 
1113cdf8794SKit Barton static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis(
1123cdf8794SKit Barton     "loop-fusion-dependence-analysis",
1133cdf8794SKit Barton     cl::desc("Which dependence analysis should loop fusion use?"),
1143cdf8794SKit Barton     cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev",
1153cdf8794SKit Barton                           "Use the scalar evolution interface"),
1163cdf8794SKit Barton                clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da",
1173cdf8794SKit Barton                           "Use the dependence analysis interface"),
1183cdf8794SKit Barton                clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all",
1193cdf8794SKit Barton                           "Use all available analyses")),
120*95a13425SFangrui Song     cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL));
1213cdf8794SKit Barton 
12238a82179SSidharth Baveja static cl::opt<unsigned> FusionPeelMaxCount(
12338a82179SSidharth Baveja     "loop-fusion-peel-max-count", cl::init(0), cl::Hidden,
12438a82179SSidharth Baveja     cl::desc("Max number of iterations to be peeled from a loop, such that "
12538a82179SSidharth Baveja              "fusion can take place"));
12638a82179SSidharth Baveja 
1273cdf8794SKit Barton #ifndef NDEBUG
1283cdf8794SKit Barton static cl::opt<bool>
1293cdf8794SKit Barton     VerboseFusionDebugging("loop-fusion-verbose-debug",
1303cdf8794SKit Barton                            cl::desc("Enable verbose debugging for Loop Fusion"),
13136c7d79dSFangrui Song                            cl::Hidden, cl::init(false));
1323cdf8794SKit Barton #endif
1333cdf8794SKit Barton 
134dc5f805dSBenjamin Kramer namespace {
1353cdf8794SKit Barton /// This class is used to represent a candidate for loop fusion. When it is
1363cdf8794SKit Barton /// constructed, it checks the conditions for loop fusion to ensure that it
1373cdf8794SKit Barton /// represents a valid candidate. It caches several parts of a loop that are
1383cdf8794SKit Barton /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead
1393cdf8794SKit Barton /// of continually querying the underlying Loop to retrieve these values. It is
1403cdf8794SKit Barton /// assumed these will not change throughout loop fusion.
1413cdf8794SKit Barton ///
1423cdf8794SKit Barton /// The invalidate method should be used to indicate that the FusionCandidate is
1433cdf8794SKit Barton /// no longer a valid candidate for fusion. Similarly, the isValid() method can
1443cdf8794SKit Barton /// be used to ensure that the FusionCandidate is still valid for fusion.
1453cdf8794SKit Barton struct FusionCandidate {
1463cdf8794SKit Barton   /// Cache of parts of the loop used throughout loop fusion. These should not
1473cdf8794SKit Barton   /// need to change throughout the analysis and transformation.
1483cdf8794SKit Barton   /// These parts are cached to avoid repeatedly looking up in the Loop class.
1493cdf8794SKit Barton 
1503cdf8794SKit Barton   /// Preheader of the loop this candidate represents
1513cdf8794SKit Barton   BasicBlock *Preheader;
1523cdf8794SKit Barton   /// Header of the loop this candidate represents
1533cdf8794SKit Barton   BasicBlock *Header;
1543cdf8794SKit Barton   /// Blocks in the loop that exit the loop
1553cdf8794SKit Barton   BasicBlock *ExitingBlock;
1563cdf8794SKit Barton   /// The successor block of this loop (where the exiting blocks go to)
1573cdf8794SKit Barton   BasicBlock *ExitBlock;
1583cdf8794SKit Barton   /// Latch of the loop
1593cdf8794SKit Barton   BasicBlock *Latch;
1603cdf8794SKit Barton   /// The loop that this fusion candidate represents
1613cdf8794SKit Barton   Loop *L;
1623cdf8794SKit Barton   /// Vector of instructions in this loop that read from memory
1633cdf8794SKit Barton   SmallVector<Instruction *, 16> MemReads;
1643cdf8794SKit Barton   /// Vector of instructions in this loop that write to memory
1653cdf8794SKit Barton   SmallVector<Instruction *, 16> MemWrites;
1663cdf8794SKit Barton   /// Are all of the members of this fusion candidate still valid
1673cdf8794SKit Barton   bool Valid;
16850bc6104SKit Barton   /// Guard branch of the loop, if it exists
16950bc6104SKit Barton   BranchInst *GuardBranch;
17038a82179SSidharth Baveja   /// Peeling Paramaters of the Loop.
17138a82179SSidharth Baveja   TTI::PeelingPreferences PP;
17238a82179SSidharth Baveja   /// Can you Peel this Loop?
17338a82179SSidharth Baveja   bool AbleToPeel;
17438a82179SSidharth Baveja   /// Has this loop been Peeled
17538a82179SSidharth Baveja   bool Peeled;
1763cdf8794SKit Barton 
1773cdf8794SKit Barton   /// Dominator and PostDominator trees are needed for the
1783cdf8794SKit Barton   /// FusionCandidateCompare function, required by FusionCandidateSet to
1793cdf8794SKit Barton   /// determine where the FusionCandidate should be inserted into the set. These
1803cdf8794SKit Barton   /// are used to establish ordering of the FusionCandidates based on dominance.
181a73e4ce6SAnna Thomas   DominatorTree &DT;
1823cdf8794SKit Barton   const PostDominatorTree *PDT;
1833cdf8794SKit Barton 
184de0b6339SKit Barton   OptimizationRemarkEmitter &ORE;
185de0b6339SKit Barton 
FusionCandidate__anon08ec0c300111::FusionCandidate186a73e4ce6SAnna Thomas   FusionCandidate(Loop *L, DominatorTree &DT,
18738a82179SSidharth Baveja                   const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE,
18838a82179SSidharth Baveja                   TTI::PeelingPreferences PP)
1893cdf8794SKit Barton       : Preheader(L->getLoopPreheader()), Header(L->getHeader()),
1903cdf8794SKit Barton         ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
1913db1cf7aSKit Barton         Latch(L->getLoopLatch()), L(L), Valid(true),
19238a82179SSidharth Baveja         GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)),
19338a82179SSidharth Baveja         Peeled(false), DT(DT), PDT(PDT), ORE(ORE) {
1943cdf8794SKit Barton 
1953cdf8794SKit Barton     // Walk over all blocks in the loop and check for conditions that may
1963cdf8794SKit Barton     // prevent fusion. For each block, walk over all instructions and collect
1973cdf8794SKit Barton     // the memory reads and writes If any instructions that prevent fusion are
1983cdf8794SKit Barton     // found, invalidate this object and return.
1993cdf8794SKit Barton     for (BasicBlock *BB : L->blocks()) {
2003cdf8794SKit Barton       if (BB->hasAddressTaken()) {
2013cdf8794SKit Barton         invalidate();
202de0b6339SKit Barton         reportInvalidCandidate(AddressTakenBB);
2033cdf8794SKit Barton         return;
2043cdf8794SKit Barton       }
2053cdf8794SKit Barton 
2063cdf8794SKit Barton       for (Instruction &I : *BB) {
2073cdf8794SKit Barton         if (I.mayThrow()) {
2083cdf8794SKit Barton           invalidate();
209de0b6339SKit Barton           reportInvalidCandidate(MayThrowException);
2103cdf8794SKit Barton           return;
2113cdf8794SKit Barton         }
2123cdf8794SKit Barton         if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
2133cdf8794SKit Barton           if (SI->isVolatile()) {
2143cdf8794SKit Barton             invalidate();
215de0b6339SKit Barton             reportInvalidCandidate(ContainsVolatileAccess);
2163cdf8794SKit Barton             return;
2173cdf8794SKit Barton           }
2183cdf8794SKit Barton         }
2193cdf8794SKit Barton         if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
2203cdf8794SKit Barton           if (LI->isVolatile()) {
2213cdf8794SKit Barton             invalidate();
222de0b6339SKit Barton             reportInvalidCandidate(ContainsVolatileAccess);
2233cdf8794SKit Barton             return;
2243cdf8794SKit Barton           }
2253cdf8794SKit Barton         }
2263cdf8794SKit Barton         if (I.mayWriteToMemory())
2273cdf8794SKit Barton           MemWrites.push_back(&I);
2283cdf8794SKit Barton         if (I.mayReadFromMemory())
2293cdf8794SKit Barton           MemReads.push_back(&I);
2303cdf8794SKit Barton       }
2313cdf8794SKit Barton     }
2323cdf8794SKit Barton   }
2333cdf8794SKit Barton 
2343cdf8794SKit Barton   /// Check if all members of the class are valid.
isValid__anon08ec0c300111::FusionCandidate2353cdf8794SKit Barton   bool isValid() const {
2363cdf8794SKit Barton     return Preheader && Header && ExitingBlock && ExitBlock && Latch && L &&
2373cdf8794SKit Barton            !L->isInvalid() && Valid;
2383cdf8794SKit Barton   }
2393cdf8794SKit Barton 
2403cdf8794SKit Barton   /// Verify that all members are in sync with the Loop object.
verify__anon08ec0c300111::FusionCandidate2413cdf8794SKit Barton   void verify() const {
2423cdf8794SKit Barton     assert(isValid() && "Candidate is not valid!!");
2433cdf8794SKit Barton     assert(!L->isInvalid() && "Loop is invalid!");
2443cdf8794SKit Barton     assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync");
2453cdf8794SKit Barton     assert(Header == L->getHeader() && "Header is out of sync");
2463cdf8794SKit Barton     assert(ExitingBlock == L->getExitingBlock() &&
2473cdf8794SKit Barton            "Exiting Blocks is out of sync");
2483cdf8794SKit Barton     assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync");
2493cdf8794SKit Barton     assert(Latch == L->getLoopLatch() && "Latch is out of sync");
2503cdf8794SKit Barton   }
2513cdf8794SKit Barton 
25250bc6104SKit Barton   /// Get the entry block for this fusion candidate.
25350bc6104SKit Barton   ///
25450bc6104SKit Barton   /// If this fusion candidate represents a guarded loop, the entry block is the
25550bc6104SKit Barton   /// loop guard block. If it represents an unguarded loop, the entry block is
25650bc6104SKit Barton   /// the preheader of the loop.
getEntryBlock__anon08ec0c300111::FusionCandidate25750bc6104SKit Barton   BasicBlock *getEntryBlock() const {
25850bc6104SKit Barton     if (GuardBranch)
25950bc6104SKit Barton       return GuardBranch->getParent();
26050bc6104SKit Barton     else
26150bc6104SKit Barton       return Preheader;
26250bc6104SKit Barton   }
26350bc6104SKit Barton 
26438a82179SSidharth Baveja   /// After Peeling the loop is modified quite a bit, hence all of the Blocks
26538a82179SSidharth Baveja   /// need to be updated accordingly.
updateAfterPeeling__anon08ec0c300111::FusionCandidate26638a82179SSidharth Baveja   void updateAfterPeeling() {
26738a82179SSidharth Baveja     Preheader = L->getLoopPreheader();
26838a82179SSidharth Baveja     Header = L->getHeader();
26938a82179SSidharth Baveja     ExitingBlock = L->getExitingBlock();
27038a82179SSidharth Baveja     ExitBlock = L->getExitBlock();
27138a82179SSidharth Baveja     Latch = L->getLoopLatch();
27238a82179SSidharth Baveja     verify();
27338a82179SSidharth Baveja   }
27438a82179SSidharth Baveja 
27550bc6104SKit Barton   /// Given a guarded loop, get the successor of the guard that is not in the
27650bc6104SKit Barton   /// loop.
27750bc6104SKit Barton   ///
27850bc6104SKit Barton   /// This method returns the successor of the loop guard that is not located
27950bc6104SKit Barton   /// within the loop (i.e., the successor of the guard that is not the
28050bc6104SKit Barton   /// preheader).
28150bc6104SKit Barton   /// This method is only valid for guarded loops.
getNonLoopBlock__anon08ec0c300111::FusionCandidate28250bc6104SKit Barton   BasicBlock *getNonLoopBlock() const {
28350bc6104SKit Barton     assert(GuardBranch && "Only valid on guarded loops.");
28450bc6104SKit Barton     assert(GuardBranch->isConditional() &&
28550bc6104SKit Barton            "Expecting guard to be a conditional branch.");
28638a82179SSidharth Baveja     if (Peeled)
28738a82179SSidharth Baveja       return GuardBranch->getSuccessor(1);
28850bc6104SKit Barton     return (GuardBranch->getSuccessor(0) == Preheader)
28950bc6104SKit Barton                ? GuardBranch->getSuccessor(1)
29050bc6104SKit Barton                : GuardBranch->getSuccessor(0);
29150bc6104SKit Barton   }
29250bc6104SKit Barton 
2933cdf8794SKit Barton #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dump__anon08ec0c300111::FusionCandidate2943cdf8794SKit Barton   LLVM_DUMP_METHOD void dump() const {
295d1f41b2cSWhitney Tsang     dbgs() << "\tGuardBranch: ";
296d1f41b2cSWhitney Tsang     if (GuardBranch)
297d1f41b2cSWhitney Tsang       dbgs() << *GuardBranch;
298d1f41b2cSWhitney Tsang     else
299d1f41b2cSWhitney Tsang       dbgs() << "nullptr";
300d1f41b2cSWhitney Tsang     dbgs() << "\n"
30150bc6104SKit Barton            << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"
30250bc6104SKit Barton            << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
3033cdf8794SKit Barton            << "\n"
3043cdf8794SKit Barton            << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"
3053cdf8794SKit Barton            << "\tExitingBB: "
3063cdf8794SKit Barton            << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"
3073cdf8794SKit Barton            << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")
3083cdf8794SKit Barton            << "\n"
30950bc6104SKit Barton            << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"
31050bc6104SKit Barton            << "\tEntryBlock: "
31150bc6104SKit Barton            << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")
31250bc6104SKit Barton            << "\n";
3133cdf8794SKit Barton   }
3143cdf8794SKit Barton #endif
3153cdf8794SKit Barton 
316de0b6339SKit Barton   /// Determine if a fusion candidate (representing a loop) is eligible for
317de0b6339SKit Barton   /// fusion. Note that this only checks whether a single loop can be fused - it
318de0b6339SKit Barton   /// does not check whether it is *legal* to fuse two loops together.
isEligibleForFusion__anon08ec0c300111::FusionCandidate319de0b6339SKit Barton   bool isEligibleForFusion(ScalarEvolution &SE) const {
320de0b6339SKit Barton     if (!isValid()) {
321de0b6339SKit Barton       LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");
322de0b6339SKit Barton       if (!Preheader)
323de0b6339SKit Barton         ++InvalidPreheader;
324de0b6339SKit Barton       if (!Header)
325de0b6339SKit Barton         ++InvalidHeader;
326de0b6339SKit Barton       if (!ExitingBlock)
327de0b6339SKit Barton         ++InvalidExitingBlock;
328de0b6339SKit Barton       if (!ExitBlock)
329de0b6339SKit Barton         ++InvalidExitBlock;
330de0b6339SKit Barton       if (!Latch)
331de0b6339SKit Barton         ++InvalidLatch;
332de0b6339SKit Barton       if (L->isInvalid())
333de0b6339SKit Barton         ++InvalidLoop;
334de0b6339SKit Barton 
335de0b6339SKit Barton       return false;
336de0b6339SKit Barton     }
337de0b6339SKit Barton 
338de0b6339SKit Barton     // Require ScalarEvolution to be able to determine a trip count.
339de0b6339SKit Barton     if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
340de0b6339SKit Barton       LLVM_DEBUG(dbgs() << "Loop " << L->getName()
341de0b6339SKit Barton                         << " trip count not computable!\n");
342de0b6339SKit Barton       return reportInvalidCandidate(UnknownTripCount);
343de0b6339SKit Barton     }
344de0b6339SKit Barton 
345de0b6339SKit Barton     if (!L->isLoopSimplifyForm()) {
346de0b6339SKit Barton       LLVM_DEBUG(dbgs() << "Loop " << L->getName()
347de0b6339SKit Barton                         << " is not in simplified form!\n");
348de0b6339SKit Barton       return reportInvalidCandidate(NotSimplifiedForm);
349de0b6339SKit Barton     }
350de0b6339SKit Barton 
3513db1cf7aSKit Barton     if (!L->isRotatedForm()) {
352ff07fc66SKit Barton       LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n");
353ff07fc66SKit Barton       return reportInvalidCandidate(NotRotated);
354ff07fc66SKit Barton     }
355ff07fc66SKit Barton 
356de0b6339SKit Barton     return true;
357de0b6339SKit Barton   }
358de0b6339SKit Barton 
3593cdf8794SKit Barton private:
3603cdf8794SKit Barton   // This is only used internally for now, to clear the MemWrites and MemReads
3613cdf8794SKit Barton   // list and setting Valid to false. I can't envision other uses of this right
3623cdf8794SKit Barton   // now, since once FusionCandidates are put into the FusionCandidateSet they
3633cdf8794SKit Barton   // are immutable. Thus, any time we need to change/update a FusionCandidate,
3643cdf8794SKit Barton   // we must create a new one and insert it into the FusionCandidateSet to
3653cdf8794SKit Barton   // ensure the FusionCandidateSet remains ordered correctly.
invalidate__anon08ec0c300111::FusionCandidate3663cdf8794SKit Barton   void invalidate() {
3673cdf8794SKit Barton     MemWrites.clear();
3683cdf8794SKit Barton     MemReads.clear();
3693cdf8794SKit Barton     Valid = false;
3703cdf8794SKit Barton   }
371de0b6339SKit Barton 
reportInvalidCandidate__anon08ec0c300111::FusionCandidate372de0b6339SKit Barton   bool reportInvalidCandidate(llvm::Statistic &Stat) const {
373de0b6339SKit Barton     using namespace ore;
374de0b6339SKit Barton     assert(L && Preheader && "Fusion candidate not initialized properly!");
37518839be9SFangrui Song #if LLVM_ENABLE_STATS
376de0b6339SKit Barton     ++Stat;
377de0b6339SKit Barton     ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(),
378de0b6339SKit Barton                                         L->getStartLoc(), Preheader)
379de0b6339SKit Barton              << "[" << Preheader->getParent()->getName() << "]: "
380de0b6339SKit Barton              << "Loop is not a candidate for fusion: " << Stat.getDesc());
38118839be9SFangrui Song #endif
382de0b6339SKit Barton     return false;
383de0b6339SKit Barton   }
3843cdf8794SKit Barton };
3853cdf8794SKit Barton 
3863cdf8794SKit Barton struct FusionCandidateCompare {
3873cdf8794SKit Barton   /// Comparison functor to sort two Control Flow Equivalent fusion candidates
3883cdf8794SKit Barton   /// into dominance order.
3893cdf8794SKit Barton   /// If LHS dominates RHS and RHS post-dominates LHS, return true;
3903cdf8794SKit Barton   /// IF RHS dominates LHS and LHS post-dominates RHS, return false;
operator ()__anon08ec0c300111::FusionCandidateCompare3913cdf8794SKit Barton   bool operator()(const FusionCandidate &LHS,
3923cdf8794SKit Barton                   const FusionCandidate &RHS) const {
393a73e4ce6SAnna Thomas     const DominatorTree *DT = &(LHS.DT);
3943cdf8794SKit Barton 
39550bc6104SKit Barton     BasicBlock *LHSEntryBlock = LHS.getEntryBlock();
39650bc6104SKit Barton     BasicBlock *RHSEntryBlock = RHS.getEntryBlock();
39750bc6104SKit Barton 
3983cdf8794SKit Barton     // Do not save PDT to local variable as it is only used in asserts and thus
3993cdf8794SKit Barton     // will trigger an unused variable warning if building without asserts.
4001ee1da1eSJordan Rupprecht     assert(DT && LHS.PDT && "Expecting valid dominator tree");
4013cdf8794SKit Barton 
4027b619202SRichard Trieu     // Do this compare first so if LHS == RHS, function returns false.
40350bc6104SKit Barton     if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {
4043cdf8794SKit Barton       // RHS dominates LHS
4053cdf8794SKit Barton       // Verify LHS post-dominates RHS
4061ee1da1eSJordan Rupprecht       assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));
4073cdf8794SKit Barton       return false;
4083cdf8794SKit Barton     }
4097b619202SRichard Trieu 
41050bc6104SKit Barton     if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {
4117b619202SRichard Trieu       // Verify RHS Postdominates LHS
4121ee1da1eSJordan Rupprecht       assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));
4137b619202SRichard Trieu       return true;
4147b619202SRichard Trieu     }
4157b619202SRichard Trieu 
4163cdf8794SKit Barton     // If LHS does not dominate RHS and RHS does not dominate LHS then there is
4173cdf8794SKit Barton     // no dominance relationship between the two FusionCandidates. Thus, they
4183cdf8794SKit Barton     // should not be in the same set together.
4193cdf8794SKit Barton     llvm_unreachable(
4203cdf8794SKit Barton         "No dominance relationship between these fusion candidates!");
4213cdf8794SKit Barton   }
4223cdf8794SKit Barton };
4233cdf8794SKit Barton 
4243cdf8794SKit Barton using LoopVector = SmallVector<Loop *, 4>;
4253cdf8794SKit Barton 
4263cdf8794SKit Barton // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance
4273cdf8794SKit Barton // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0
4283cdf8794SKit Barton // dominates FC1 and FC1 post-dominates FC0.
4293cdf8794SKit Barton // std::set was chosen because we want a sorted data structure with stable
4303cdf8794SKit Barton // iterators. A subsequent patch to loop fusion will enable fusing non-ajdacent
4313cdf8794SKit Barton // loops by moving intervening code around. When this intervening code contains
4323cdf8794SKit Barton // loops, those loops will be moved also. The corresponding FusionCandidates
4333cdf8794SKit Barton // will also need to be moved accordingly. As this is done, having stable
4343cdf8794SKit Barton // iterators will simplify the logic. Similarly, having an efficient insert that
4353cdf8794SKit Barton // keeps the FusionCandidateSet sorted will also simplify the implementation.
4363cdf8794SKit Barton using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
4373cdf8794SKit Barton using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;
4383cdf8794SKit Barton 
439eb70ac02SFangrui Song #if !defined(NDEBUG)
operator <<(llvm::raw_ostream & OS,const FusionCandidate & FC)440eb70ac02SFangrui Song static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
44169640273SFangrui Song                                      const FusionCandidate &FC) {
44269640273SFangrui Song   if (FC.isValid())
44369640273SFangrui Song     OS << FC.Preheader->getName();
44469640273SFangrui Song   else
44569640273SFangrui Song     OS << "<Invalid>";
44669640273SFangrui Song 
44769640273SFangrui Song   return OS;
44869640273SFangrui Song }
44969640273SFangrui Song 
operator <<(llvm::raw_ostream & OS,const FusionCandidateSet & CandSet)45069640273SFangrui Song static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
4513cdf8794SKit Barton                                      const FusionCandidateSet &CandSet) {
452eb70ac02SFangrui Song   for (const FusionCandidate &FC : CandSet)
453eb70ac02SFangrui Song     OS << FC << '\n';
4543cdf8794SKit Barton 
4553cdf8794SKit Barton   return OS;
4563cdf8794SKit Barton }
4573cdf8794SKit Barton 
4583cdf8794SKit Barton static void
printFusionCandidates(const FusionCandidateCollection & FusionCandidates)4593cdf8794SKit Barton printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
4608e64f0a6SKit Barton   dbgs() << "Fusion Candidates: \n";
4613cdf8794SKit Barton   for (const auto &CandidateSet : FusionCandidates) {
4623cdf8794SKit Barton     dbgs() << "*** Fusion Candidate Set ***\n";
4633cdf8794SKit Barton     dbgs() << CandidateSet;
4643cdf8794SKit Barton     dbgs() << "****************************\n";
4653cdf8794SKit Barton   }
4663cdf8794SKit Barton }
4673cdf8794SKit Barton #endif
4683cdf8794SKit Barton 
4693cdf8794SKit Barton /// Collect all loops in function at the same nest level, starting at the
4703cdf8794SKit Barton /// outermost level.
4713cdf8794SKit Barton ///
4723cdf8794SKit Barton /// This data structure collects all loops at the same nest level for a
4733cdf8794SKit Barton /// given function (specified by the LoopInfo object). It starts at the
4743cdf8794SKit Barton /// outermost level.
4753cdf8794SKit Barton struct LoopDepthTree {
4763cdf8794SKit Barton   using LoopsOnLevelTy = SmallVector<LoopVector, 4>;
4773cdf8794SKit Barton   using iterator = LoopsOnLevelTy::iterator;
4783cdf8794SKit Barton   using const_iterator = LoopsOnLevelTy::const_iterator;
4793cdf8794SKit Barton 
LoopDepthTree__anon08ec0c300111::LoopDepthTree4803cdf8794SKit Barton   LoopDepthTree(LoopInfo &LI) : Depth(1) {
4813cdf8794SKit Barton     if (!LI.empty())
4823cdf8794SKit Barton       LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend()));
4833cdf8794SKit Barton   }
4843cdf8794SKit Barton 
4853cdf8794SKit Barton   /// Test whether a given loop has been removed from the function, and thus is
4863cdf8794SKit Barton   /// no longer valid.
isRemovedLoop__anon08ec0c300111::LoopDepthTree4873cdf8794SKit Barton   bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); }
4883cdf8794SKit Barton 
4893cdf8794SKit Barton   /// Record that a given loop has been removed from the function and is no
4903cdf8794SKit Barton   /// longer valid.
removeLoop__anon08ec0c300111::LoopDepthTree4913cdf8794SKit Barton   void removeLoop(const Loop *L) { RemovedLoops.insert(L); }
4923cdf8794SKit Barton 
4933cdf8794SKit Barton   /// Descend the tree to the next (inner) nesting level
descend__anon08ec0c300111::LoopDepthTree4943cdf8794SKit Barton   void descend() {
4953cdf8794SKit Barton     LoopsOnLevelTy LoopsOnNextLevel;
4963cdf8794SKit Barton 
4973cdf8794SKit Barton     for (const LoopVector &LV : *this)
4983cdf8794SKit Barton       for (Loop *L : LV)
4993cdf8794SKit Barton         if (!isRemovedLoop(L) && L->begin() != L->end())
5003cdf8794SKit Barton           LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end()));
5013cdf8794SKit Barton 
5023cdf8794SKit Barton     LoopsOnLevel = LoopsOnNextLevel;
5033cdf8794SKit Barton     RemovedLoops.clear();
5043cdf8794SKit Barton     Depth++;
5053cdf8794SKit Barton   }
5063cdf8794SKit Barton 
empty__anon08ec0c300111::LoopDepthTree5073cdf8794SKit Barton   bool empty() const { return size() == 0; }
size__anon08ec0c300111::LoopDepthTree5083cdf8794SKit Barton   size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); }
getDepth__anon08ec0c300111::LoopDepthTree5093cdf8794SKit Barton   unsigned getDepth() const { return Depth; }
5103cdf8794SKit Barton 
begin__anon08ec0c300111::LoopDepthTree5113cdf8794SKit Barton   iterator begin() { return LoopsOnLevel.begin(); }
end__anon08ec0c300111::LoopDepthTree5123cdf8794SKit Barton   iterator end() { return LoopsOnLevel.end(); }
begin__anon08ec0c300111::LoopDepthTree5133cdf8794SKit Barton   const_iterator begin() const { return LoopsOnLevel.begin(); }
end__anon08ec0c300111::LoopDepthTree5143cdf8794SKit Barton   const_iterator end() const { return LoopsOnLevel.end(); }
5153cdf8794SKit Barton 
5163cdf8794SKit Barton private:
5173cdf8794SKit Barton   /// Set of loops that have been removed from the function and are no longer
5183cdf8794SKit Barton   /// valid.
5193cdf8794SKit Barton   SmallPtrSet<const Loop *, 8> RemovedLoops;
5203cdf8794SKit Barton 
5213cdf8794SKit Barton   /// Depth of the current level, starting at 1 (outermost loops).
5223cdf8794SKit Barton   unsigned Depth;
5233cdf8794SKit Barton 
5243cdf8794SKit Barton   /// Vector of loops at the current depth level that have the same parent loop
5253cdf8794SKit Barton   LoopsOnLevelTy LoopsOnLevel;
5263cdf8794SKit Barton };
5273cdf8794SKit Barton 
5283cdf8794SKit Barton #ifndef NDEBUG
printLoopVector(const LoopVector & LV)5293cdf8794SKit Barton static void printLoopVector(const LoopVector &LV) {
5303cdf8794SKit Barton   dbgs() << "****************************\n";
5313cdf8794SKit Barton   for (auto L : LV)
5323cdf8794SKit Barton     printLoop(*L, dbgs());
5333cdf8794SKit Barton   dbgs() << "****************************\n";
5343cdf8794SKit Barton }
5353cdf8794SKit Barton #endif
5363cdf8794SKit Barton 
5373cdf8794SKit Barton struct LoopFuser {
5383cdf8794SKit Barton private:
5393cdf8794SKit Barton   // Sets of control flow equivalent fusion candidates for a given nest level.
5403cdf8794SKit Barton   FusionCandidateCollection FusionCandidates;
5413cdf8794SKit Barton 
5423cdf8794SKit Barton   LoopDepthTree LDT;
5433cdf8794SKit Barton   DomTreeUpdater DTU;
5443cdf8794SKit Barton 
5453cdf8794SKit Barton   LoopInfo &LI;
5463cdf8794SKit Barton   DominatorTree &DT;
5473cdf8794SKit Barton   DependenceInfo &DI;
5483cdf8794SKit Barton   ScalarEvolution &SE;
5493cdf8794SKit Barton   PostDominatorTree &PDT;
5503cdf8794SKit Barton   OptimizationRemarkEmitter &ORE;
55138a82179SSidharth Baveja   AssumptionCache &AC;
55238a82179SSidharth Baveja 
55338a82179SSidharth Baveja   const TargetTransformInfo &TTI;
5543cdf8794SKit Barton 
5553cdf8794SKit Barton public:
LoopFuser__anon08ec0c300111::LoopFuser5563cdf8794SKit Barton   LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
5573cdf8794SKit Barton             ScalarEvolution &SE, PostDominatorTree &PDT,
55838a82179SSidharth Baveja             OptimizationRemarkEmitter &ORE, const DataLayout &DL,
55938a82179SSidharth Baveja             AssumptionCache &AC, const TargetTransformInfo &TTI)
5603cdf8794SKit Barton       : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
56138a82179SSidharth Baveja         DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {}
5623cdf8794SKit Barton 
5633cdf8794SKit Barton   /// This is the main entry point for loop fusion. It will traverse the
5643cdf8794SKit Barton   /// specified function and collect candidate loops to fuse, starting at the
5653cdf8794SKit Barton   /// outermost nesting level and working inwards.
fuseLoops__anon08ec0c300111::LoopFuser5663cdf8794SKit Barton   bool fuseLoops(Function &F) {
5673cdf8794SKit Barton #ifndef NDEBUG
5683cdf8794SKit Barton     if (VerboseFusionDebugging) {
5693cdf8794SKit Barton       LI.print(dbgs());
5703cdf8794SKit Barton     }
5713cdf8794SKit Barton #endif
5723cdf8794SKit Barton 
5733cdf8794SKit Barton     LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName()
5743cdf8794SKit Barton                       << "\n");
5753cdf8794SKit Barton     bool Changed = false;
5763cdf8794SKit Barton 
5773cdf8794SKit Barton     while (!LDT.empty()) {
5783cdf8794SKit Barton       LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth "
5793cdf8794SKit Barton                         << LDT.getDepth() << "\n";);
5803cdf8794SKit Barton 
5813cdf8794SKit Barton       for (const LoopVector &LV : LDT) {
5823cdf8794SKit Barton         assert(LV.size() > 0 && "Empty loop set was build!");
5833cdf8794SKit Barton 
5843cdf8794SKit Barton         // Skip singleton loop sets as they do not offer fusion opportunities on
5853cdf8794SKit Barton         // this level.
5863cdf8794SKit Barton         if (LV.size() == 1)
5873cdf8794SKit Barton           continue;
5883cdf8794SKit Barton #ifndef NDEBUG
5893cdf8794SKit Barton         if (VerboseFusionDebugging) {
5903cdf8794SKit Barton           LLVM_DEBUG({
5913cdf8794SKit Barton             dbgs() << "  Visit loop set (#" << LV.size() << "):\n";
5923cdf8794SKit Barton             printLoopVector(LV);
5933cdf8794SKit Barton           });
5943cdf8794SKit Barton         }
5953cdf8794SKit Barton #endif
5963cdf8794SKit Barton 
5973cdf8794SKit Barton         collectFusionCandidates(LV);
5983cdf8794SKit Barton         Changed |= fuseCandidates();
5993cdf8794SKit Barton       }
6003cdf8794SKit Barton 
6013cdf8794SKit Barton       // Finished analyzing candidates at this level.
6023cdf8794SKit Barton       // Descend to the next level and clear all of the candidates currently
6033cdf8794SKit Barton       // collected. Note that it will not be possible to fuse any of the
6043cdf8794SKit Barton       // existing candidates with new candidates because the new candidates will
6053cdf8794SKit Barton       // be at a different nest level and thus not be control flow equivalent
6063cdf8794SKit Barton       // with all of the candidates collected so far.
6073cdf8794SKit Barton       LLVM_DEBUG(dbgs() << "Descend one level!\n");
6083cdf8794SKit Barton       LDT.descend();
6093cdf8794SKit Barton       FusionCandidates.clear();
6103cdf8794SKit Barton     }
6113cdf8794SKit Barton 
6123cdf8794SKit Barton     if (Changed)
6133cdf8794SKit Barton       LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump(););
6143cdf8794SKit Barton 
6153cdf8794SKit Barton #ifndef NDEBUG
6163cdf8794SKit Barton     assert(DT.verify());
6173cdf8794SKit Barton     assert(PDT.verify());
6183cdf8794SKit Barton     LI.verify(DT);
6193cdf8794SKit Barton     SE.verify();
6203cdf8794SKit Barton #endif
6213cdf8794SKit Barton 
6223cdf8794SKit Barton     LLVM_DEBUG(dbgs() << "Loop Fusion complete\n");
6233cdf8794SKit Barton     return Changed;
6243cdf8794SKit Barton   }
6253cdf8794SKit Barton 
6263cdf8794SKit Barton private:
6273cdf8794SKit Barton   /// Determine if two fusion candidates are control flow equivalent.
6283cdf8794SKit Barton   ///
6293cdf8794SKit Barton   /// Two fusion candidates are control flow equivalent if when one executes,
6303cdf8794SKit Barton   /// the other is guaranteed to execute. This is determined using dominators
6313cdf8794SKit Barton   /// and post-dominators: if A dominates B and B post-dominates A then A and B
6323cdf8794SKit Barton   /// are control-flow equivalent.
isControlFlowEquivalent__anon08ec0c300111::LoopFuser6333cdf8794SKit Barton   bool isControlFlowEquivalent(const FusionCandidate &FC0,
6343cdf8794SKit Barton                                const FusionCandidate &FC1) const {
6353cdf8794SKit Barton     assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");
6363cdf8794SKit Barton 
637aaf7f05aSWhitney Tsang     return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(),
638aaf7f05aSWhitney Tsang                                      DT, PDT);
6393cdf8794SKit Barton   }
6403cdf8794SKit Barton 
6413cdf8794SKit Barton   /// Iterate over all loops in the given loop set and identify the loops that
6423cdf8794SKit Barton   /// are eligible for fusion. Place all eligible fusion candidates into Control
6433cdf8794SKit Barton   /// Flow Equivalent sets, sorted by dominance.
collectFusionCandidates__anon08ec0c300111::LoopFuser6443cdf8794SKit Barton   void collectFusionCandidates(const LoopVector &LV) {
6453cdf8794SKit Barton     for (Loop *L : LV) {
64638a82179SSidharth Baveja       TTI::PeelingPreferences PP =
64738a82179SSidharth Baveja           gatherPeelingPreferences(L, SE, TTI, None, None);
648a73e4ce6SAnna Thomas       FusionCandidate CurrCand(L, DT, &PDT, ORE, PP);
649de0b6339SKit Barton       if (!CurrCand.isEligibleForFusion(SE))
6503cdf8794SKit Barton         continue;
6513cdf8794SKit Barton 
6523cdf8794SKit Barton       // Go through each list in FusionCandidates and determine if L is control
6533cdf8794SKit Barton       // flow equivalent with the first loop in that list. If it is, append LV.
6543cdf8794SKit Barton       // If not, go to the next list.
6553cdf8794SKit Barton       // If no suitable list is found, start another list and add it to
6563cdf8794SKit Barton       // FusionCandidates.
6573cdf8794SKit Barton       bool FoundSet = false;
6583cdf8794SKit Barton 
6593cdf8794SKit Barton       for (auto &CurrCandSet : FusionCandidates) {
6603cdf8794SKit Barton         if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) {
6613cdf8794SKit Barton           CurrCandSet.insert(CurrCand);
6623cdf8794SKit Barton           FoundSet = true;
6633cdf8794SKit Barton #ifndef NDEBUG
6643cdf8794SKit Barton           if (VerboseFusionDebugging)
6653cdf8794SKit Barton             LLVM_DEBUG(dbgs() << "Adding " << CurrCand
6663cdf8794SKit Barton                               << " to existing candidate set\n");
6673cdf8794SKit Barton #endif
6683cdf8794SKit Barton           break;
6693cdf8794SKit Barton         }
6703cdf8794SKit Barton       }
6713cdf8794SKit Barton       if (!FoundSet) {
6723cdf8794SKit Barton         // No set was found. Create a new set and add to FusionCandidates
6733cdf8794SKit Barton #ifndef NDEBUG
6743cdf8794SKit Barton         if (VerboseFusionDebugging)
6753cdf8794SKit Barton           LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n");
6763cdf8794SKit Barton #endif
6773cdf8794SKit Barton         FusionCandidateSet NewCandSet;
6783cdf8794SKit Barton         NewCandSet.insert(CurrCand);
6793cdf8794SKit Barton         FusionCandidates.push_back(NewCandSet);
6803cdf8794SKit Barton       }
6813cdf8794SKit Barton       NumFusionCandidates++;
6823cdf8794SKit Barton     }
6833cdf8794SKit Barton   }
6843cdf8794SKit Barton 
6853cdf8794SKit Barton   /// Determine if it is beneficial to fuse two loops.
6863cdf8794SKit Barton   ///
6873cdf8794SKit Barton   /// For now, this method simply returns true because we want to fuse as much
6883cdf8794SKit Barton   /// as possible (primarily to test the pass). This method will evolve, over
6893cdf8794SKit Barton   /// time, to add heuristics for profitability of fusion.
isBeneficialFusion__anon08ec0c300111::LoopFuser6903cdf8794SKit Barton   bool isBeneficialFusion(const FusionCandidate &FC0,
6913cdf8794SKit Barton                           const FusionCandidate &FC1) {
6923cdf8794SKit Barton     return true;
6933cdf8794SKit Barton   }
6943cdf8794SKit Barton 
6953cdf8794SKit Barton   /// Determine if two fusion candidates have the same trip count (i.e., they
6963cdf8794SKit Barton   /// execute the same number of iterations).
6973cdf8794SKit Barton   ///
69838a82179SSidharth Baveja   /// This function will return a pair of values. The first is a boolean,
69938a82179SSidharth Baveja   /// stating whether or not the two candidates are known at compile time to
70038a82179SSidharth Baveja   /// have the same TripCount. The second is the difference in the two
70138a82179SSidharth Baveja   /// TripCounts. This information can be used later to determine whether or not
70238a82179SSidharth Baveja   /// peeling can be performed on either one of the candiates.
70338a82179SSidharth Baveja   std::pair<bool, Optional<unsigned>>
haveIdenticalTripCounts__anon08ec0c300111::LoopFuser70438a82179SSidharth Baveja   haveIdenticalTripCounts(const FusionCandidate &FC0,
7053cdf8794SKit Barton                           const FusionCandidate &FC1) const {
70638a82179SSidharth Baveja 
7073cdf8794SKit Barton     const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);
7083cdf8794SKit Barton     if (isa<SCEVCouldNotCompute>(TripCount0)) {
7093cdf8794SKit Barton       UncomputableTripCount++;
7103cdf8794SKit Barton       LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
71138a82179SSidharth Baveja       return {false, None};
7123cdf8794SKit Barton     }
7133cdf8794SKit Barton 
7143cdf8794SKit Barton     const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);
7153cdf8794SKit Barton     if (isa<SCEVCouldNotCompute>(TripCount1)) {
7163cdf8794SKit Barton       UncomputableTripCount++;
7173cdf8794SKit Barton       LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
71838a82179SSidharth Baveja       return {false, None};
7193cdf8794SKit Barton     }
72038a82179SSidharth Baveja 
7213cdf8794SKit Barton     LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "
7223cdf8794SKit Barton                       << *TripCount1 << " are "
7233cdf8794SKit Barton                       << (TripCount0 == TripCount1 ? "identical" : "different")
7243cdf8794SKit Barton                       << "\n");
7253cdf8794SKit Barton 
72638a82179SSidharth Baveja     if (TripCount0 == TripCount1)
72738a82179SSidharth Baveja       return {true, 0};
72838a82179SSidharth Baveja 
72938a82179SSidharth Baveja     LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, "
73038a82179SSidharth Baveja                          "determining the difference between trip counts\n");
73138a82179SSidharth Baveja 
73238a82179SSidharth Baveja     // Currently only considering loops with a single exit point
73338a82179SSidharth Baveja     // and a non-constant trip count.
73438a82179SSidharth Baveja     const unsigned TC0 = SE.getSmallConstantTripCount(FC0.L);
73538a82179SSidharth Baveja     const unsigned TC1 = SE.getSmallConstantTripCount(FC1.L);
73638a82179SSidharth Baveja 
73738a82179SSidharth Baveja     // If any of the tripcounts are zero that means that loop(s) do not have
73838a82179SSidharth Baveja     // a single exit or a constant tripcount.
73938a82179SSidharth Baveja     if (TC0 == 0 || TC1 == 0) {
74038a82179SSidharth Baveja       LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not "
74138a82179SSidharth Baveja                            "have a constant number of iterations. Peeling "
74238a82179SSidharth Baveja                            "is not benefical\n");
74338a82179SSidharth Baveja       return {false, None};
74438a82179SSidharth Baveja     }
74538a82179SSidharth Baveja 
74638a82179SSidharth Baveja     Optional<unsigned> Difference = None;
74738a82179SSidharth Baveja     int Diff = TC0 - TC1;
74838a82179SSidharth Baveja 
74938a82179SSidharth Baveja     if (Diff > 0)
75038a82179SSidharth Baveja       Difference = Diff;
75138a82179SSidharth Baveja     else {
75238a82179SSidharth Baveja       LLVM_DEBUG(
75338a82179SSidharth Baveja           dbgs() << "Difference is less than 0. FC1 (second loop) has more "
75438a82179SSidharth Baveja                     "iterations than the first one. Currently not supported\n");
75538a82179SSidharth Baveja     }
75638a82179SSidharth Baveja 
75738a82179SSidharth Baveja     LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference
75838a82179SSidharth Baveja                       << "\n");
75938a82179SSidharth Baveja 
76038a82179SSidharth Baveja     return {false, Difference};
76138a82179SSidharth Baveja   }
76238a82179SSidharth Baveja 
peelFusionCandidate__anon08ec0c300111::LoopFuser76338a82179SSidharth Baveja   void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1,
76438a82179SSidharth Baveja                            unsigned PeelCount) {
76538a82179SSidharth Baveja     assert(FC0.AbleToPeel && "Should be able to peel loop");
76638a82179SSidharth Baveja 
76738a82179SSidharth Baveja     LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount
76838a82179SSidharth Baveja                       << " iterations of the first loop. \n");
76938a82179SSidharth Baveja 
770bc48a266SAnna Thomas     FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, DT, &AC, true);
77138a82179SSidharth Baveja     if (FC0.Peeled) {
77238a82179SSidharth Baveja       LLVM_DEBUG(dbgs() << "Done Peeling\n");
77338a82179SSidharth Baveja 
77438a82179SSidharth Baveja #ifndef NDEBUG
77538a82179SSidharth Baveja       auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1);
77638a82179SSidharth Baveja 
77738a82179SSidharth Baveja       assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 &&
77838a82179SSidharth Baveja              "Loops should have identical trip counts after peeling");
77938a82179SSidharth Baveja #endif
78038a82179SSidharth Baveja 
78138a82179SSidharth Baveja       FC0.PP.PeelCount += PeelCount;
78238a82179SSidharth Baveja 
78338a82179SSidharth Baveja       // Peeling does not update the PDT
78438a82179SSidharth Baveja       PDT.recalculate(*FC0.Preheader->getParent());
78538a82179SSidharth Baveja 
78638a82179SSidharth Baveja       FC0.updateAfterPeeling();
78738a82179SSidharth Baveja 
78838a82179SSidharth Baveja       // In this case the iterations of the loop are constant, so the first
78938a82179SSidharth Baveja       // loop will execute completely (will not jump from one of
79038a82179SSidharth Baveja       // the peeled blocks to the second loop). Here we are updating the
79138a82179SSidharth Baveja       // branch conditions of each of the peeled blocks, such that it will
79238a82179SSidharth Baveja       // branch to its successor which is not the preheader of the second loop
79338a82179SSidharth Baveja       // in the case of unguarded loops, or the succesors of the exit block of
79438a82179SSidharth Baveja       // the first loop otherwise. Doing this update will ensure that the entry
79538a82179SSidharth Baveja       // block of the first loop dominates the entry block of the second loop.
79638a82179SSidharth Baveja       BasicBlock *BB =
79738a82179SSidharth Baveja           FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader;
79838a82179SSidharth Baveja       if (BB) {
79938a82179SSidharth Baveja         SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
80038a82179SSidharth Baveja         SmallVector<Instruction *, 8> WorkList;
80138a82179SSidharth Baveja         for (BasicBlock *Pred : predecessors(BB)) {
80238a82179SSidharth Baveja           if (Pred != FC0.ExitBlock) {
80338a82179SSidharth Baveja             WorkList.emplace_back(Pred->getTerminator());
80438a82179SSidharth Baveja             TreeUpdates.emplace_back(
80538a82179SSidharth Baveja                 DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB));
80638a82179SSidharth Baveja           }
80738a82179SSidharth Baveja         }
80838a82179SSidharth Baveja         // Cannot modify the predecessors inside the above loop as it will cause
80938a82179SSidharth Baveja         // the iterators to be nullptrs, causing memory errors.
81038a82179SSidharth Baveja         for (Instruction *CurrentBranch: WorkList) {
81138a82179SSidharth Baveja           BasicBlock *Succ = CurrentBranch->getSuccessor(0);
81238a82179SSidharth Baveja           if (Succ == BB)
81338a82179SSidharth Baveja             Succ = CurrentBranch->getSuccessor(1);
81438a82179SSidharth Baveja           ReplaceInstWithInst(CurrentBranch, BranchInst::Create(Succ));
81538a82179SSidharth Baveja         }
81638a82179SSidharth Baveja 
81738a82179SSidharth Baveja         DTU.applyUpdates(TreeUpdates);
81838a82179SSidharth Baveja         DTU.flush();
81938a82179SSidharth Baveja       }
82038a82179SSidharth Baveja       LLVM_DEBUG(
82138a82179SSidharth Baveja           dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount
82238a82179SSidharth Baveja                  << " iterations from the first loop.\n"
82338a82179SSidharth Baveja                     "Both Loops have the same number of iterations now.\n");
82438a82179SSidharth Baveja     }
8253cdf8794SKit Barton   }
8263cdf8794SKit Barton 
8273cdf8794SKit Barton   /// Walk each set of control flow equivalent fusion candidates and attempt to
8283cdf8794SKit Barton   /// fuse them. This does a single linear traversal of all candidates in the
8293cdf8794SKit Barton   /// set. The conditions for legal fusion are checked at this point. If a pair
8303cdf8794SKit Barton   /// of fusion candidates passes all legality checks, they are fused together
8313cdf8794SKit Barton   /// and a new fusion candidate is created and added to the FusionCandidateSet.
8323cdf8794SKit Barton   /// The original fusion candidates are then removed, as they are no longer
8333cdf8794SKit Barton   /// valid.
fuseCandidates__anon08ec0c300111::LoopFuser8343cdf8794SKit Barton   bool fuseCandidates() {
8353cdf8794SKit Barton     bool Fused = false;
8363cdf8794SKit Barton     LLVM_DEBUG(printFusionCandidates(FusionCandidates));
8373cdf8794SKit Barton     for (auto &CandidateSet : FusionCandidates) {
8383cdf8794SKit Barton       if (CandidateSet.size() < 2)
8393cdf8794SKit Barton         continue;
8403cdf8794SKit Barton 
8413cdf8794SKit Barton       LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n"
8423cdf8794SKit Barton                         << CandidateSet << "\n");
8433cdf8794SKit Barton 
8443cdf8794SKit Barton       for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) {
8453cdf8794SKit Barton         assert(!LDT.isRemovedLoop(FC0->L) &&
8463cdf8794SKit Barton                "Should not have removed loops in CandidateSet!");
8473cdf8794SKit Barton         auto FC1 = FC0;
8483cdf8794SKit Barton         for (++FC1; FC1 != CandidateSet.end(); ++FC1) {
8493cdf8794SKit Barton           assert(!LDT.isRemovedLoop(FC1->L) &&
8503cdf8794SKit Barton                  "Should not have removed loops in CandidateSet!");
8513cdf8794SKit Barton 
8523cdf8794SKit Barton           LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump();
8533cdf8794SKit Barton                      dbgs() << " with\n"; FC1->dump(); dbgs() << "\n");
8543cdf8794SKit Barton 
8553cdf8794SKit Barton           FC0->verify();
8563cdf8794SKit Barton           FC1->verify();
8573cdf8794SKit Barton 
85838a82179SSidharth Baveja           // Check if the candidates have identical tripcounts (first value of
85938a82179SSidharth Baveja           // pair), and if not check the difference in the tripcounts between
86038a82179SSidharth Baveja           // the loops (second value of pair). The difference is not equal to
86138a82179SSidharth Baveja           // None iff the loops iterate a constant number of times, and have a
86238a82179SSidharth Baveja           // single exit.
86338a82179SSidharth Baveja           std::pair<bool, Optional<unsigned>> IdenticalTripCountRes =
86438a82179SSidharth Baveja               haveIdenticalTripCounts(*FC0, *FC1);
86538a82179SSidharth Baveja           bool SameTripCount = IdenticalTripCountRes.first;
86638a82179SSidharth Baveja           Optional<unsigned> TCDifference = IdenticalTripCountRes.second;
86738a82179SSidharth Baveja 
86838a82179SSidharth Baveja           // Here we are checking that FC0 (the first loop) can be peeled, and
86938a82179SSidharth Baveja           // both loops have different tripcounts.
87038a82179SSidharth Baveja           if (FC0->AbleToPeel && !SameTripCount && TCDifference) {
87138a82179SSidharth Baveja             if (*TCDifference > FusionPeelMaxCount) {
87238a82179SSidharth Baveja               LLVM_DEBUG(dbgs()
87338a82179SSidharth Baveja                          << "Difference in loop trip counts: " << *TCDifference
87438a82179SSidharth Baveja                          << " is greater than maximum peel count specificed: "
87538a82179SSidharth Baveja                          << FusionPeelMaxCount << "\n");
87638a82179SSidharth Baveja             } else {
87738a82179SSidharth Baveja               // Dependent on peeling being performed on the first loop, and
87838a82179SSidharth Baveja               // assuming all other conditions for fusion return true.
87938a82179SSidharth Baveja               SameTripCount = true;
88038a82179SSidharth Baveja             }
88138a82179SSidharth Baveja           }
88238a82179SSidharth Baveja 
88338a82179SSidharth Baveja           if (!SameTripCount) {
8843cdf8794SKit Barton             LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
8853cdf8794SKit Barton                                  "counts. Not fusing.\n");
886de0b6339SKit Barton             reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
887de0b6339SKit Barton                                                        NonEqualTripCount);
8883cdf8794SKit Barton             continue;
8893cdf8794SKit Barton           }
8903cdf8794SKit Barton 
8913cdf8794SKit Barton           if (!isAdjacent(*FC0, *FC1)) {
8923cdf8794SKit Barton             LLVM_DEBUG(dbgs()
8933cdf8794SKit Barton                        << "Fusion candidates are not adjacent. Not fusing.\n");
894de0b6339SKit Barton             reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent);
8953cdf8794SKit Barton             continue;
8963cdf8794SKit Barton           }
8973cdf8794SKit Barton 
8986a82ace5STa-Wei Tu           if (!FC0->GuardBranch && FC1->GuardBranch) {
8996a82ace5STa-Wei Tu             LLVM_DEBUG(dbgs() << "The second candidate is guarded while the "
9006a82ace5STa-Wei Tu                                  "first one is not. Not fusing.\n");
9016a82ace5STa-Wei Tu             reportLoopFusion<OptimizationRemarkMissed>(
9026a82ace5STa-Wei Tu                 *FC0, *FC1, OnlySecondCandidateIsGuarded);
9036a82ace5STa-Wei Tu             continue;
9046a82ace5STa-Wei Tu           }
9056a82ace5STa-Wei Tu 
90650bc6104SKit Barton           // Ensure that FC0 and FC1 have identical guards.
90750bc6104SKit Barton           // If one (or both) are not guarded, this check is not necessary.
90850bc6104SKit Barton           if (FC0->GuardBranch && FC1->GuardBranch &&
90938a82179SSidharth Baveja               !haveIdenticalGuards(*FC0, *FC1) && !TCDifference) {
91050bc6104SKit Barton             LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
91150bc6104SKit Barton                                  "guards. Not Fusing.\n");
91250bc6104SKit Barton             reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
91350bc6104SKit Barton                                                        NonIdenticalGuards);
91450bc6104SKit Barton             continue;
91550bc6104SKit Barton           }
91650bc6104SKit Barton 
917da58e68fSWhitney Tsang           if (!isSafeToMoveBefore(*FC1->Preheader,
918082e3952SSharmaRithik                                   *FC0->Preheader->getTerminator(), DT, &PDT,
919082e3952SSharmaRithik                                   &DI)) {
920da58e68fSWhitney Tsang             LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
921da58e68fSWhitney Tsang                                  "instructions in preheader. Not fusing.\n");
922de0b6339SKit Barton             reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
923de0b6339SKit Barton                                                        NonEmptyPreheader);
9243cdf8794SKit Barton             continue;
9253cdf8794SKit Barton           }
9263cdf8794SKit Barton 
927e44f4a8aSWhitney Tsang           if (FC0->GuardBranch) {
928e44f4a8aSWhitney Tsang             assert(FC1->GuardBranch && "Expecting valid FC1 guard branch");
929e44f4a8aSWhitney Tsang 
930e44f4a8aSWhitney Tsang             if (!isSafeToMoveBefore(*FC0->ExitBlock,
931e44f4a8aSWhitney Tsang                                     *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT,
932082e3952SSharmaRithik                                     &PDT, &DI)) {
933e44f4a8aSWhitney Tsang               LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
934e44f4a8aSWhitney Tsang                                    "instructions in exit block. Not fusing.\n");
93550bc6104SKit Barton               reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
93650bc6104SKit Barton                                                          NonEmptyExitBlock);
93750bc6104SKit Barton               continue;
93850bc6104SKit Barton             }
93950bc6104SKit Barton 
940e44f4a8aSWhitney Tsang             if (!isSafeToMoveBefore(
941e44f4a8aSWhitney Tsang                     *FC1->GuardBranch->getParent(),
942082e3952SSharmaRithik                     *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT,
943082e3952SSharmaRithik                     &DI)) {
944e44f4a8aSWhitney Tsang               LLVM_DEBUG(dbgs()
945e44f4a8aSWhitney Tsang                          << "Fusion candidate contains unsafe "
946e44f4a8aSWhitney Tsang                             "instructions in guard block. Not fusing.\n");
94750bc6104SKit Barton               reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
94850bc6104SKit Barton                                                          NonEmptyGuardBlock);
94950bc6104SKit Barton               continue;
95050bc6104SKit Barton             }
951e44f4a8aSWhitney Tsang           }
95250bc6104SKit Barton 
95350bc6104SKit Barton           // Check the dependencies across the loops and do not fuse if it would
95450bc6104SKit Barton           // violate them.
9553cdf8794SKit Barton           if (!dependencesAllowFusion(*FC0, *FC1)) {
9563cdf8794SKit Barton             LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
957de0b6339SKit Barton             reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
958de0b6339SKit Barton                                                        InvalidDependencies);
9593cdf8794SKit Barton             continue;
9603cdf8794SKit Barton           }
9613cdf8794SKit Barton 
9623cdf8794SKit Barton           bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1);
9633cdf8794SKit Barton           LLVM_DEBUG(dbgs()
9643cdf8794SKit Barton                      << "\tFusion appears to be "
9653cdf8794SKit Barton                      << (BeneficialToFuse ? "" : "un") << "profitable!\n");
966de0b6339SKit Barton           if (!BeneficialToFuse) {
967de0b6339SKit Barton             reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
968de0b6339SKit Barton                                                        FusionNotBeneficial);
9693cdf8794SKit Barton             continue;
970de0b6339SKit Barton           }
9713cdf8794SKit Barton           // All analysis has completed and has determined that fusion is legal
9723cdf8794SKit Barton           // and profitable. At this point, start transforming the code and
9733cdf8794SKit Barton           // perform fusion.
9743cdf8794SKit Barton 
9753cdf8794SKit Barton           LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "
9763cdf8794SKit Barton                             << *FC1 << "\n");
9773cdf8794SKit Barton 
97838a82179SSidharth Baveja           FusionCandidate FC0Copy = *FC0;
97938a82179SSidharth Baveja           // Peel the loop after determining that fusion is legal. The Loops
98038a82179SSidharth Baveja           // will still be safe to fuse after the peeling is performed.
98138a82179SSidharth Baveja           bool Peel = TCDifference && *TCDifference > 0;
98238a82179SSidharth Baveja           if (Peel)
98338a82179SSidharth Baveja             peelFusionCandidate(FC0Copy, *FC1, *TCDifference);
98438a82179SSidharth Baveja 
9853cdf8794SKit Barton           // Report fusion to the Optimization Remarks.
9863cdf8794SKit Barton           // Note this needs to be done *before* performFusion because
9873cdf8794SKit Barton           // performFusion will change the original loops, making it not
9883cdf8794SKit Barton           // possible to identify them after fusion is complete.
98938a82179SSidharth Baveja           reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1,
99038a82179SSidharth Baveja                                                FuseCounter);
9913cdf8794SKit Barton 
99238a82179SSidharth Baveja           FusionCandidate FusedCand(
993a73e4ce6SAnna Thomas               performFusion((Peel ? FC0Copy : *FC0), *FC1), DT, &PDT, ORE,
99438a82179SSidharth Baveja               FC0Copy.PP);
9953cdf8794SKit Barton           FusedCand.verify();
996de0b6339SKit Barton           assert(FusedCand.isEligibleForFusion(SE) &&
9973cdf8794SKit Barton                  "Fused candidate should be eligible for fusion!");
9983cdf8794SKit Barton 
9993cdf8794SKit Barton           // Notify the loop-depth-tree that these loops are not valid objects
10003cdf8794SKit Barton           LDT.removeLoop(FC1->L);
10013cdf8794SKit Barton 
10023cdf8794SKit Barton           CandidateSet.erase(FC0);
10033cdf8794SKit Barton           CandidateSet.erase(FC1);
10043cdf8794SKit Barton 
10053cdf8794SKit Barton           auto InsertPos = CandidateSet.insert(FusedCand);
10063cdf8794SKit Barton 
10073cdf8794SKit Barton           assert(InsertPos.second &&
10083cdf8794SKit Barton                  "Unable to insert TargetCandidate in CandidateSet!");
10093cdf8794SKit Barton 
10103cdf8794SKit Barton           // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations
10113cdf8794SKit Barton           // of the FC1 loop will attempt to fuse the new (fused) loop with the
10123cdf8794SKit Barton           // remaining candidates in the current candidate set.
10133cdf8794SKit Barton           FC0 = FC1 = InsertPos.first;
10143cdf8794SKit Barton 
10153cdf8794SKit Barton           LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet
10163cdf8794SKit Barton                             << "\n");
10173cdf8794SKit Barton 
10183cdf8794SKit Barton           Fused = true;
10193cdf8794SKit Barton         }
10203cdf8794SKit Barton       }
10213cdf8794SKit Barton     }
10223cdf8794SKit Barton     return Fused;
10233cdf8794SKit Barton   }
10243cdf8794SKit Barton 
10253cdf8794SKit Barton   /// Rewrite all additive recurrences in a SCEV to use a new loop.
10263cdf8794SKit Barton   class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> {
10273cdf8794SKit Barton   public:
AddRecLoopReplacer(ScalarEvolution & SE,const Loop & OldL,const Loop & NewL,bool UseMax=true)10283cdf8794SKit Barton     AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL,
10293cdf8794SKit Barton                        bool UseMax = true)
10303cdf8794SKit Barton         : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL),
10313cdf8794SKit Barton           NewL(NewL) {}
10323cdf8794SKit Barton 
visitAddRecExpr(const SCEVAddRecExpr * Expr)10333cdf8794SKit Barton     const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
10343cdf8794SKit Barton       const Loop *ExprL = Expr->getLoop();
10353cdf8794SKit Barton       SmallVector<const SCEV *, 2> Operands;
10363cdf8794SKit Barton       if (ExprL == &OldL) {
10373cdf8794SKit Barton         Operands.append(Expr->op_begin(), Expr->op_end());
10383cdf8794SKit Barton         return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags());
10393cdf8794SKit Barton       }
10403cdf8794SKit Barton 
10413cdf8794SKit Barton       if (OldL.contains(ExprL)) {
10423cdf8794SKit Barton         bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE));
10433cdf8794SKit Barton         if (!UseMax || !Pos || !Expr->isAffine()) {
10443cdf8794SKit Barton           Valid = false;
10453cdf8794SKit Barton           return Expr;
10463cdf8794SKit Barton         }
10473cdf8794SKit Barton         return visit(Expr->getStart());
10483cdf8794SKit Barton       }
10493cdf8794SKit Barton 
10503cdf8794SKit Barton       for (const SCEV *Op : Expr->operands())
10513cdf8794SKit Barton         Operands.push_back(visit(Op));
10523cdf8794SKit Barton       return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags());
10533cdf8794SKit Barton     }
10543cdf8794SKit Barton 
wasValidSCEV() const10553cdf8794SKit Barton     bool wasValidSCEV() const { return Valid; }
10563cdf8794SKit Barton 
10573cdf8794SKit Barton   private:
10583cdf8794SKit Barton     bool Valid, UseMax;
10593cdf8794SKit Barton     const Loop &OldL, &NewL;
10603cdf8794SKit Barton   };
10613cdf8794SKit Barton 
10623cdf8794SKit Barton   /// Return false if the access functions of \p I0 and \p I1 could cause
10633cdf8794SKit Barton   /// a negative dependence.
accessDiffIsPositive__anon08ec0c300111::LoopFuser10643cdf8794SKit Barton   bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0,
10653cdf8794SKit Barton                             Instruction &I1, bool EqualIsInvalid) {
10663cdf8794SKit Barton     Value *Ptr0 = getLoadStorePointerOperand(&I0);
10673cdf8794SKit Barton     Value *Ptr1 = getLoadStorePointerOperand(&I1);
10683cdf8794SKit Barton     if (!Ptr0 || !Ptr1)
10693cdf8794SKit Barton       return false;
10703cdf8794SKit Barton 
10713cdf8794SKit Barton     const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0);
10723cdf8794SKit Barton     const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1);
10733cdf8794SKit Barton #ifndef NDEBUG
10743cdf8794SKit Barton     if (VerboseFusionDebugging)
10753cdf8794SKit Barton       LLVM_DEBUG(dbgs() << "    Access function check: " << *SCEVPtr0 << " vs "
10763cdf8794SKit Barton                         << *SCEVPtr1 << "\n");
10773cdf8794SKit Barton #endif
10783cdf8794SKit Barton     AddRecLoopReplacer Rewriter(SE, L0, L1);
10793cdf8794SKit Barton     SCEVPtr0 = Rewriter.visit(SCEVPtr0);
10803cdf8794SKit Barton #ifndef NDEBUG
10813cdf8794SKit Barton     if (VerboseFusionDebugging)
10823cdf8794SKit Barton       LLVM_DEBUG(dbgs() << "    Access function after rewrite: " << *SCEVPtr0
10833cdf8794SKit Barton                         << " [Valid: " << Rewriter.wasValidSCEV() << "]\n");
10843cdf8794SKit Barton #endif
10853cdf8794SKit Barton     if (!Rewriter.wasValidSCEV())
10863cdf8794SKit Barton       return false;
10873cdf8794SKit Barton 
10883cdf8794SKit Barton     // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by
10893cdf8794SKit Barton     //       L0) and the other is not. We could check if it is monotone and test
10903cdf8794SKit Barton     //       the beginning and end value instead.
10913cdf8794SKit Barton 
10923cdf8794SKit Barton     BasicBlock *L0Header = L0.getHeader();
10933cdf8794SKit Barton     auto HasNonLinearDominanceRelation = [&](const SCEV *S) {
10943cdf8794SKit Barton       const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S);
10953cdf8794SKit Barton       if (!AddRec)
10963cdf8794SKit Barton         return false;
10973cdf8794SKit Barton       return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) &&
10983cdf8794SKit Barton              !DT.dominates(AddRec->getLoop()->getHeader(), L0Header);
10993cdf8794SKit Barton     };
11003cdf8794SKit Barton     if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation))
11013cdf8794SKit Barton       return false;
11023cdf8794SKit Barton 
11033cdf8794SKit Barton     ICmpInst::Predicate Pred =
11043cdf8794SKit Barton         EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE;
11053cdf8794SKit Barton     bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1);
11063cdf8794SKit Barton #ifndef NDEBUG
11073cdf8794SKit Barton     if (VerboseFusionDebugging)
11083cdf8794SKit Barton       LLVM_DEBUG(dbgs() << "    Relation: " << *SCEVPtr0
11093cdf8794SKit Barton                         << (IsAlwaysGE ? "  >=  " : "  may <  ") << *SCEVPtr1
11103cdf8794SKit Barton                         << "\n");
11113cdf8794SKit Barton #endif
11123cdf8794SKit Barton     return IsAlwaysGE;
11133cdf8794SKit Barton   }
11143cdf8794SKit Barton 
11153cdf8794SKit Barton   /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in
11163cdf8794SKit Barton   /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses
11173cdf8794SKit Barton   /// specified by @p DepChoice are used to determine this.
dependencesAllowFusion__anon08ec0c300111::LoopFuser11183cdf8794SKit Barton   bool dependencesAllowFusion(const FusionCandidate &FC0,
11193cdf8794SKit Barton                               const FusionCandidate &FC1, Instruction &I0,
11203cdf8794SKit Barton                               Instruction &I1, bool AnyDep,
11213cdf8794SKit Barton                               FusionDependenceAnalysisChoice DepChoice) {
11223cdf8794SKit Barton #ifndef NDEBUG
11233cdf8794SKit Barton     if (VerboseFusionDebugging) {
11243cdf8794SKit Barton       LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : "
11253cdf8794SKit Barton                         << DepChoice << "\n");
11263cdf8794SKit Barton     }
11273cdf8794SKit Barton #endif
11283cdf8794SKit Barton     switch (DepChoice) {
11293cdf8794SKit Barton     case FUSION_DEPENDENCE_ANALYSIS_SCEV:
11303cdf8794SKit Barton       return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep);
11313cdf8794SKit Barton     case FUSION_DEPENDENCE_ANALYSIS_DA: {
11323cdf8794SKit Barton       auto DepResult = DI.depends(&I0, &I1, true);
11333cdf8794SKit Barton       if (!DepResult)
11343cdf8794SKit Barton         return true;
11353cdf8794SKit Barton #ifndef NDEBUG
11363cdf8794SKit Barton       if (VerboseFusionDebugging) {
11373cdf8794SKit Barton         LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs());
11383cdf8794SKit Barton                    dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: "
11393cdf8794SKit Barton                           << (DepResult->isOrdered() ? "true" : "false")
11403cdf8794SKit Barton                           << "]\n");
11413cdf8794SKit Barton         LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels()
11423cdf8794SKit Barton                           << "\n");
11433cdf8794SKit Barton       }
11443cdf8794SKit Barton #endif
11453cdf8794SKit Barton 
11463cdf8794SKit Barton       if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor())
11473cdf8794SKit Barton         LLVM_DEBUG(
11483cdf8794SKit Barton             dbgs() << "TODO: Implement pred/succ dependence handling!\n");
11493cdf8794SKit Barton 
11503cdf8794SKit Barton       // TODO: Can we actually use the dependence info analysis here?
11513cdf8794SKit Barton       return false;
11523cdf8794SKit Barton     }
11533cdf8794SKit Barton 
11543cdf8794SKit Barton     case FUSION_DEPENDENCE_ANALYSIS_ALL:
11553cdf8794SKit Barton       return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
11563cdf8794SKit Barton                                     FUSION_DEPENDENCE_ANALYSIS_SCEV) ||
11573cdf8794SKit Barton              dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
11583cdf8794SKit Barton                                     FUSION_DEPENDENCE_ANALYSIS_DA);
11593cdf8794SKit Barton     }
11603cdf8794SKit Barton 
11613cdf8794SKit Barton     llvm_unreachable("Unknown fusion dependence analysis choice!");
11623cdf8794SKit Barton   }
11633cdf8794SKit Barton 
11643cdf8794SKit Barton   /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused.
dependencesAllowFusion__anon08ec0c300111::LoopFuser11653cdf8794SKit Barton   bool dependencesAllowFusion(const FusionCandidate &FC0,
11663cdf8794SKit Barton                               const FusionCandidate &FC1) {
11673cdf8794SKit Barton     LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1
11683cdf8794SKit Barton                       << "\n");
11693cdf8794SKit Barton     assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());
117050bc6104SKit Barton     assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));
11713cdf8794SKit Barton 
11723cdf8794SKit Barton     for (Instruction *WriteL0 : FC0.MemWrites) {
11733cdf8794SKit Barton       for (Instruction *WriteL1 : FC1.MemWrites)
11743cdf8794SKit Barton         if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
11753cdf8794SKit Barton                                     /* AnyDep */ false,
11763cdf8794SKit Barton                                     FusionDependenceAnalysis)) {
11773cdf8794SKit Barton           InvalidDependencies++;
11783cdf8794SKit Barton           return false;
11793cdf8794SKit Barton         }
11803cdf8794SKit Barton       for (Instruction *ReadL1 : FC1.MemReads)
11813cdf8794SKit Barton         if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1,
11823cdf8794SKit Barton                                     /* AnyDep */ false,
11833cdf8794SKit Barton                                     FusionDependenceAnalysis)) {
11843cdf8794SKit Barton           InvalidDependencies++;
11853cdf8794SKit Barton           return false;
11863cdf8794SKit Barton         }
11873cdf8794SKit Barton     }
11883cdf8794SKit Barton 
11893cdf8794SKit Barton     for (Instruction *WriteL1 : FC1.MemWrites) {
11903cdf8794SKit Barton       for (Instruction *WriteL0 : FC0.MemWrites)
11913cdf8794SKit Barton         if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
11923cdf8794SKit Barton                                     /* AnyDep */ false,
11933cdf8794SKit Barton                                     FusionDependenceAnalysis)) {
11943cdf8794SKit Barton           InvalidDependencies++;
11953cdf8794SKit Barton           return false;
11963cdf8794SKit Barton         }
11973cdf8794SKit Barton       for (Instruction *ReadL0 : FC0.MemReads)
11983cdf8794SKit Barton         if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1,
11993cdf8794SKit Barton                                     /* AnyDep */ false,
12003cdf8794SKit Barton                                     FusionDependenceAnalysis)) {
12013cdf8794SKit Barton           InvalidDependencies++;
12023cdf8794SKit Barton           return false;
12033cdf8794SKit Barton         }
12043cdf8794SKit Barton     }
12053cdf8794SKit Barton 
12063cdf8794SKit Barton     // Walk through all uses in FC1. For each use, find the reaching def. If the
12073cdf8794SKit Barton     // def is located in FC0 then it is is not safe to fuse.
12083cdf8794SKit Barton     for (BasicBlock *BB : FC1.L->blocks())
12093cdf8794SKit Barton       for (Instruction &I : *BB)
12103cdf8794SKit Barton         for (auto &Op : I.operands())
12113cdf8794SKit Barton           if (Instruction *Def = dyn_cast<Instruction>(Op))
12123cdf8794SKit Barton             if (FC0.L->contains(Def->getParent())) {
12133cdf8794SKit Barton               InvalidDependencies++;
12143cdf8794SKit Barton               return false;
12153cdf8794SKit Barton             }
12163cdf8794SKit Barton 
12173cdf8794SKit Barton     return true;
12183cdf8794SKit Barton   }
12193cdf8794SKit Barton 
122050bc6104SKit Barton   /// Determine if two fusion candidates are adjacent in the CFG.
122150bc6104SKit Barton   ///
122250bc6104SKit Barton   /// This method will determine if there are additional basic blocks in the CFG
122350bc6104SKit Barton   /// between the exit of \p FC0 and the entry of \p FC1.
122450bc6104SKit Barton   /// If the two candidates are guarded loops, then it checks whether the
122550bc6104SKit Barton   /// non-loop successor of the \p FC0 guard branch is the entry block of \p
122650bc6104SKit Barton   /// FC1. If not, then the loops are not adjacent. If the two candidates are
122750bc6104SKit Barton   /// not guarded loops, then it checks whether the exit block of \p FC0 is the
122850bc6104SKit Barton   /// preheader of \p FC1.
isAdjacent__anon08ec0c300111::LoopFuser12293cdf8794SKit Barton   bool isAdjacent(const FusionCandidate &FC0,
12303cdf8794SKit Barton                   const FusionCandidate &FC1) const {
123150bc6104SKit Barton     // If the successor of the guard branch is FC1, then the loops are adjacent
123250bc6104SKit Barton     if (FC0.GuardBranch)
123350bc6104SKit Barton       return FC0.getNonLoopBlock() == FC1.getEntryBlock();
123450bc6104SKit Barton     else
123550bc6104SKit Barton       return FC0.ExitBlock == FC1.getEntryBlock();
123650bc6104SKit Barton   }
123750bc6104SKit Barton 
123850bc6104SKit Barton   /// Determine if two fusion candidates have identical guards
123950bc6104SKit Barton   ///
124050bc6104SKit Barton   /// This method will determine if two fusion candidates have the same guards.
124150bc6104SKit Barton   /// The guards are considered the same if:
124250bc6104SKit Barton   ///   1. The instructions to compute the condition used in the compare are
124350bc6104SKit Barton   ///      identical.
124450bc6104SKit Barton   ///   2. The successors of the guard have the same flow into/around the loop.
124550bc6104SKit Barton   /// If the compare instructions are identical, then the first successor of the
124650bc6104SKit Barton   /// guard must go to the same place (either the preheader of the loop or the
124750bc6104SKit Barton   /// NonLoopBlock). In other words, the the first successor of both loops must
124850bc6104SKit Barton   /// both go into the loop (i.e., the preheader) or go around the loop (i.e.,
124950bc6104SKit Barton   /// the NonLoopBlock). The same must be true for the second successor.
haveIdenticalGuards__anon08ec0c300111::LoopFuser125050bc6104SKit Barton   bool haveIdenticalGuards(const FusionCandidate &FC0,
125150bc6104SKit Barton                            const FusionCandidate &FC1) const {
125250bc6104SKit Barton     assert(FC0.GuardBranch && FC1.GuardBranch &&
125350bc6104SKit Barton            "Expecting FC0 and FC1 to be guarded loops.");
125450bc6104SKit Barton 
125550bc6104SKit Barton     if (auto FC0CmpInst =
125650bc6104SKit Barton             dyn_cast<Instruction>(FC0.GuardBranch->getCondition()))
125750bc6104SKit Barton       if (auto FC1CmpInst =
125850bc6104SKit Barton               dyn_cast<Instruction>(FC1.GuardBranch->getCondition()))
125950bc6104SKit Barton         if (!FC0CmpInst->isIdenticalTo(FC1CmpInst))
126050bc6104SKit Barton           return false;
126150bc6104SKit Barton 
126250bc6104SKit Barton     // The compare instructions are identical.
126350bc6104SKit Barton     // Now make sure the successor of the guards have the same flow into/around
126450bc6104SKit Barton     // the loop
126550bc6104SKit Barton     if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader)
126650bc6104SKit Barton       return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader);
126750bc6104SKit Barton     else
126850bc6104SKit Barton       return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
126950bc6104SKit Barton   }
127050bc6104SKit Barton 
127138a82179SSidharth Baveja   /// Modify the latch branch of FC to be unconditional since successors of the
127238a82179SSidharth Baveja   /// branch are the same.
simplifyLatchBranch__anon08ec0c300111::LoopFuser127336bdc3dcSWhitney Tsang   void simplifyLatchBranch(const FusionCandidate &FC) const {
127436bdc3dcSWhitney Tsang     BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator());
127536bdc3dcSWhitney Tsang     if (FCLatchBranch) {
127636bdc3dcSWhitney Tsang       assert(FCLatchBranch->isConditional() &&
127736bdc3dcSWhitney Tsang              FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) &&
127836bdc3dcSWhitney Tsang              "Expecting the two successors of FCLatchBranch to be the same");
127938a82179SSidharth Baveja       BranchInst *NewBranch =
128038a82179SSidharth Baveja           BranchInst::Create(FCLatchBranch->getSuccessor(0));
128138a82179SSidharth Baveja       ReplaceInstWithInst(FCLatchBranch, NewBranch);
128236bdc3dcSWhitney Tsang     }
128336bdc3dcSWhitney Tsang   }
128436bdc3dcSWhitney Tsang 
128536bdc3dcSWhitney Tsang   /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique
128636bdc3dcSWhitney Tsang   /// successor, then merge FC0.Latch with its unique successor.
mergeLatch__anon08ec0c300111::LoopFuser128736bdc3dcSWhitney Tsang   void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) {
128878dc6498SWhitney Tsang     moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI);
128936bdc3dcSWhitney Tsang     if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) {
129036bdc3dcSWhitney Tsang       MergeBlockIntoPredecessor(Succ, &DTU, &LI);
129136bdc3dcSWhitney Tsang       DTU.flush();
129236bdc3dcSWhitney Tsang     }
129336bdc3dcSWhitney Tsang   }
129436bdc3dcSWhitney Tsang 
12953cdf8794SKit Barton   /// Fuse two fusion candidates, creating a new fused loop.
12963cdf8794SKit Barton   ///
12973cdf8794SKit Barton   /// This method contains the mechanics of fusing two loops, represented by \p
12983cdf8794SKit Barton   /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1
12993cdf8794SKit Barton   /// postdominates \p FC0 (making them control flow equivalent). It also
13003cdf8794SKit Barton   /// assumes that the other conditions for fusion have been met: adjacent,
13013cdf8794SKit Barton   /// identical trip counts, and no negative distance dependencies exist that
13023cdf8794SKit Barton   /// would prevent fusion. Thus, there is no checking for these conditions in
13033cdf8794SKit Barton   /// this method.
13043cdf8794SKit Barton   ///
13053cdf8794SKit Barton   /// Fusion is performed by rewiring the CFG to update successor blocks of the
13063cdf8794SKit Barton   /// components of tho loop. Specifically, the following changes are done:
13073cdf8794SKit Barton   ///
13083cdf8794SKit Barton   ///   1. The preheader of \p FC1 is removed as it is no longer necessary
13093cdf8794SKit Barton   ///   (because it is currently only a single statement block).
13103cdf8794SKit Barton   ///   2. The latch of \p FC0 is modified to jump to the header of \p FC1.
13113cdf8794SKit Barton   ///   3. The latch of \p FC1 i modified to jump to the header of \p FC0.
13123cdf8794SKit Barton   ///   4. All blocks from \p FC1 are removed from FC1 and added to FC0.
13133cdf8794SKit Barton   ///
13143cdf8794SKit Barton   /// All of these modifications are done with dominator tree updates, thus
13153cdf8794SKit Barton   /// keeping the dominator (and post dominator) information up-to-date.
13163cdf8794SKit Barton   ///
13173cdf8794SKit Barton   /// This can be improved in the future by actually merging blocks during
13183cdf8794SKit Barton   /// fusion. For example, the preheader of \p FC1 can be merged with the
13193cdf8794SKit Barton   /// preheader of \p FC0. This would allow loops with more than a single
13203cdf8794SKit Barton   /// statement in the preheader to be fused. Similarly, the latch blocks of the
13213cdf8794SKit Barton   /// two loops could also be fused into a single block. This will require
13223cdf8794SKit Barton   /// analysis to prove it is safe to move the contents of the block past
13233cdf8794SKit Barton   /// existing code, which currently has not been implemented.
performFusion__anon08ec0c300111::LoopFuser13243cdf8794SKit Barton   Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) {
13253cdf8794SKit Barton     assert(FC0.isValid() && FC1.isValid() &&
13263cdf8794SKit Barton            "Expecting valid fusion candidates");
13273cdf8794SKit Barton 
13283cdf8794SKit Barton     LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
13293cdf8794SKit Barton                dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
13303cdf8794SKit Barton 
1331da58e68fSWhitney Tsang     // Move instructions from the preheader of FC1 to the end of the preheader
1332da58e68fSWhitney Tsang     // of FC0.
1333da58e68fSWhitney Tsang     moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI);
1334da58e68fSWhitney Tsang 
133550bc6104SKit Barton     // Fusing guarded loops is handled slightly differently than non-guarded
133650bc6104SKit Barton     // loops and has been broken out into a separate method instead of trying to
133750bc6104SKit Barton     // intersperse the logic within a single method.
133850bc6104SKit Barton     if (FC0.GuardBranch)
133950bc6104SKit Barton       return fuseGuardedLoops(FC0, FC1);
134050bc6104SKit Barton 
134138a82179SSidharth Baveja     assert(FC1.Preheader ==
134238a82179SSidharth Baveja            (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock));
13433cdf8794SKit Barton     assert(FC1.Preheader->size() == 1 &&
13443cdf8794SKit Barton            FC1.Preheader->getSingleSuccessor() == FC1.Header);
13453cdf8794SKit Barton 
13463cdf8794SKit Barton     // Remember the phi nodes originally in the header of FC0 in order to rewire
13473cdf8794SKit Barton     // them later. However, this is only necessary if the new loop carried
13483cdf8794SKit Barton     // values might not dominate the exiting branch. While we do not generally
13493cdf8794SKit Barton     // test if this is the case but simply insert intermediate phi nodes, we
13503cdf8794SKit Barton     // need to make sure these intermediate phi nodes have different
13513cdf8794SKit Barton     // predecessors. To this end, we filter the special case where the exiting
13523cdf8794SKit Barton     // block is the latch block of the first loop. Nothing needs to be done
13533cdf8794SKit Barton     // anyway as all loop carried values dominate the latch and thereby also the
13543cdf8794SKit Barton     // exiting branch.
13553cdf8794SKit Barton     SmallVector<PHINode *, 8> OriginalFC0PHIs;
13563cdf8794SKit Barton     if (FC0.ExitingBlock != FC0.Latch)
13573cdf8794SKit Barton       for (PHINode &PHI : FC0.Header->phis())
13583cdf8794SKit Barton         OriginalFC0PHIs.push_back(&PHI);
13593cdf8794SKit Barton 
13603cdf8794SKit Barton     // Replace incoming blocks for header PHIs first.
13613cdf8794SKit Barton     FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
13623cdf8794SKit Barton     FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
13633cdf8794SKit Barton 
13643cdf8794SKit Barton     // Then modify the control flow and update DT and PDT.
13658a268becSFangrui Song     SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
13663cdf8794SKit Barton 
13673cdf8794SKit Barton     // The old exiting block of the first loop (FC0) has to jump to the header
13683cdf8794SKit Barton     // of the second as we need to execute the code in the second header block
13693cdf8794SKit Barton     // regardless of the trip count. That is, if the trip count is 0, so the
13703cdf8794SKit Barton     // back edge is never taken, we still have to execute both loop headers,
13713cdf8794SKit Barton     // especially (but not only!) if the second is a do-while style loop.
13723cdf8794SKit Barton     // However, doing so might invalidate the phi nodes of the first loop as
13733cdf8794SKit Barton     // the new values do only need to dominate their latch and not the exiting
13743cdf8794SKit Barton     // predicate. To remedy this potential problem we always introduce phi
13753cdf8794SKit Barton     // nodes in the header of the second loop later that select the loop carried
13763cdf8794SKit Barton     // value, if the second header was reached through an old latch of the
13773cdf8794SKit Barton     // first, or undef otherwise. This is sound as exiting the first implies the
13783cdf8794SKit Barton     // second will exit too, __without__ taking the back-edge. [Their
13793cdf8794SKit Barton     // trip-counts are equal after all.
13803cdf8794SKit Barton     // KB: Would this sequence be simpler to just just make FC0.ExitingBlock go
13813cdf8794SKit Barton     // to FC1.Header? I think this is basically what the three sequences are
13823cdf8794SKit Barton     // trying to accomplish; however, doing this directly in the CFG may mean
13833cdf8794SKit Barton     // the DT/PDT becomes invalid
138438a82179SSidharth Baveja     if (!FC0.Peeled) {
13853cdf8794SKit Barton       FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader,
13863cdf8794SKit Barton                                                            FC1.Header);
13873cdf8794SKit Barton       TreeUpdates.emplace_back(DominatorTree::UpdateType(
13883cdf8794SKit Barton           DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader));
13893cdf8794SKit Barton       TreeUpdates.emplace_back(DominatorTree::UpdateType(
13903cdf8794SKit Barton           DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
139138a82179SSidharth Baveja     } else {
139238a82179SSidharth Baveja       TreeUpdates.emplace_back(DominatorTree::UpdateType(
139338a82179SSidharth Baveja           DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader));
139438a82179SSidharth Baveja 
139538a82179SSidharth Baveja       // Remove the ExitBlock of the first Loop (also not needed)
139638a82179SSidharth Baveja       FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
139738a82179SSidharth Baveja                                                            FC1.Header);
139838a82179SSidharth Baveja       TreeUpdates.emplace_back(DominatorTree::UpdateType(
139938a82179SSidharth Baveja           DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
140038a82179SSidharth Baveja       FC0.ExitBlock->getTerminator()->eraseFromParent();
140138a82179SSidharth Baveja       TreeUpdates.emplace_back(DominatorTree::UpdateType(
140238a82179SSidharth Baveja           DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
140338a82179SSidharth Baveja       new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
140438a82179SSidharth Baveja     }
14053cdf8794SKit Barton 
14063cdf8794SKit Barton     // The pre-header of L1 is not necessary anymore.
14070888eaf3SKazu Hirata     assert(pred_empty(FC1.Preheader));
14083cdf8794SKit Barton     FC1.Preheader->getTerminator()->eraseFromParent();
14093cdf8794SKit Barton     new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
14103cdf8794SKit Barton     TreeUpdates.emplace_back(DominatorTree::UpdateType(
14113cdf8794SKit Barton         DominatorTree::Delete, FC1.Preheader, FC1.Header));
14123cdf8794SKit Barton 
14133cdf8794SKit Barton     // Moves the phi nodes from the second to the first loops header block.
14143cdf8794SKit Barton     while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
14153cdf8794SKit Barton       if (SE.isSCEVable(PHI->getType()))
14163cdf8794SKit Barton         SE.forgetValue(PHI);
14173cdf8794SKit Barton       if (PHI->hasNUsesOrMore(1))
14183cdf8794SKit Barton         PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
14193cdf8794SKit Barton       else
14203cdf8794SKit Barton         PHI->eraseFromParent();
14213cdf8794SKit Barton     }
14223cdf8794SKit Barton 
14233cdf8794SKit Barton     // Introduce new phi nodes in the second loop header to ensure
14243cdf8794SKit Barton     // exiting the first and jumping to the header of the second does not break
14253cdf8794SKit Barton     // the SSA property of the phis originally in the first loop. See also the
14263cdf8794SKit Barton     // comment above.
14273cdf8794SKit Barton     Instruction *L1HeaderIP = &FC1.Header->front();
14283cdf8794SKit Barton     for (PHINode *LCPHI : OriginalFC0PHIs) {
14293cdf8794SKit Barton       int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
14303cdf8794SKit Barton       assert(L1LatchBBIdx >= 0 &&
14313cdf8794SKit Barton              "Expected loop carried value to be rewired at this point!");
14323cdf8794SKit Barton 
14333cdf8794SKit Barton       Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
14343cdf8794SKit Barton 
14353cdf8794SKit Barton       PHINode *L1HeaderPHI = PHINode::Create(
14363cdf8794SKit Barton           LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
14373cdf8794SKit Barton       L1HeaderPHI->addIncoming(LCV, FC0.Latch);
14383cdf8794SKit Barton       L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
14393cdf8794SKit Barton                                FC0.ExitingBlock);
14403cdf8794SKit Barton 
14413cdf8794SKit Barton       LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
14423cdf8794SKit Barton     }
14433cdf8794SKit Barton 
14443cdf8794SKit Barton     // Replace latch terminator destinations.
14453cdf8794SKit Barton     FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
14463cdf8794SKit Barton     FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
14473cdf8794SKit Barton 
144838a82179SSidharth Baveja     // Modify the latch branch of FC0 to be unconditional as both successors of
144936bdc3dcSWhitney Tsang     // the branch are the same.
145036bdc3dcSWhitney Tsang     simplifyLatchBranch(FC0);
145136bdc3dcSWhitney Tsang 
14523cdf8794SKit Barton     // If FC0.Latch and FC0.ExitingBlock are the same then we have already
14533cdf8794SKit Barton     // performed the updates above.
14543cdf8794SKit Barton     if (FC0.Latch != FC0.ExitingBlock)
14553cdf8794SKit Barton       TreeUpdates.emplace_back(DominatorTree::UpdateType(
14563cdf8794SKit Barton           DominatorTree::Insert, FC0.Latch, FC1.Header));
14573cdf8794SKit Barton 
14583cdf8794SKit Barton     TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
14593cdf8794SKit Barton                                                        FC0.Latch, FC0.Header));
14603cdf8794SKit Barton     TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
14613cdf8794SKit Barton                                                        FC1.Latch, FC0.Header));
14623cdf8794SKit Barton     TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
14633cdf8794SKit Barton                                                        FC1.Latch, FC1.Header));
14643cdf8794SKit Barton 
14653cdf8794SKit Barton     // Update DT/PDT
14663cdf8794SKit Barton     DTU.applyUpdates(TreeUpdates);
14673cdf8794SKit Barton 
14683cdf8794SKit Barton     LI.removeBlock(FC1.Preheader);
14693cdf8794SKit Barton     DTU.deleteBB(FC1.Preheader);
147038a82179SSidharth Baveja     if (FC0.Peeled) {
147138a82179SSidharth Baveja       LI.removeBlock(FC0.ExitBlock);
147238a82179SSidharth Baveja       DTU.deleteBB(FC0.ExitBlock);
147338a82179SSidharth Baveja     }
147438a82179SSidharth Baveja 
14753cdf8794SKit Barton     DTU.flush();
14763cdf8794SKit Barton 
14773cdf8794SKit Barton     // Is there a way to keep SE up-to-date so we don't need to forget the loops
14783cdf8794SKit Barton     // and rebuild the information in subsequent passes of fusion?
147936bdc3dcSWhitney Tsang     // Note: Need to forget the loops before merging the loop latches, as
148036bdc3dcSWhitney Tsang     // mergeLatch may remove the only block in FC1.
14813cdf8794SKit Barton     SE.forgetLoop(FC1.L);
14823cdf8794SKit Barton     SE.forgetLoop(FC0.L);
14833cdf8794SKit Barton 
148436bdc3dcSWhitney Tsang     // Move instructions from FC0.Latch to FC1.Latch.
148536bdc3dcSWhitney Tsang     // Note: mergeLatch requires an updated DT.
148636bdc3dcSWhitney Tsang     mergeLatch(FC0, FC1);
148736bdc3dcSWhitney Tsang 
14883cdf8794SKit Barton     // Merge the loops.
1489530c5af6SKazu Hirata     SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
14903cdf8794SKit Barton     for (BasicBlock *BB : Blocks) {
14913cdf8794SKit Barton       FC0.L->addBlockEntry(BB);
14923cdf8794SKit Barton       FC1.L->removeBlockFromLoop(BB);
14933cdf8794SKit Barton       if (LI.getLoopFor(BB) != FC1.L)
14943cdf8794SKit Barton         continue;
14953cdf8794SKit Barton       LI.changeLoopFor(BB, FC0.L);
14963cdf8794SKit Barton     }
149789c1e35fSStefanos Baziotis     while (!FC1.L->isInnermost()) {
14983cdf8794SKit Barton       const auto &ChildLoopIt = FC1.L->begin();
14993cdf8794SKit Barton       Loop *ChildLoop = *ChildLoopIt;
15003cdf8794SKit Barton       FC1.L->removeChildLoop(ChildLoopIt);
15013cdf8794SKit Barton       FC0.L->addChildLoop(ChildLoop);
15023cdf8794SKit Barton     }
15033cdf8794SKit Barton 
15043cdf8794SKit Barton     // Delete the now empty loop L1.
15053cdf8794SKit Barton     LI.erase(FC1.L);
15063cdf8794SKit Barton 
15073cdf8794SKit Barton #ifndef NDEBUG
15083cdf8794SKit Barton     assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
15093cdf8794SKit Barton     assert(DT.verify(DominatorTree::VerificationLevel::Fast));
15103cdf8794SKit Barton     assert(PDT.verify());
15113cdf8794SKit Barton     LI.verify(DT);
15123cdf8794SKit Barton     SE.verify();
15133cdf8794SKit Barton #endif
15143cdf8794SKit Barton 
15153cdf8794SKit Barton     LLVM_DEBUG(dbgs() << "Fusion done:\n");
15163cdf8794SKit Barton 
15173cdf8794SKit Barton     return FC0.L;
15183cdf8794SKit Barton   }
1519de0b6339SKit Barton 
1520de0b6339SKit Barton   /// Report details on loop fusion opportunities.
1521de0b6339SKit Barton   ///
1522de0b6339SKit Barton   /// This template function can be used to report both successful and missed
1523de0b6339SKit Barton   /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should
1524de0b6339SKit Barton   /// be one of:
1525de0b6339SKit Barton   ///   - OptimizationRemarkMissed to report when loop fusion is unsuccessful
1526de0b6339SKit Barton   ///     given two valid fusion candidates.
1527de0b6339SKit Barton   ///   - OptimizationRemark to report successful fusion of two fusion
1528de0b6339SKit Barton   ///     candidates.
1529de0b6339SKit Barton   /// The remarks will be printed using the form:
1530de0b6339SKit Barton   ///    <path/filename>:<line number>:<column number>: [<function name>]:
1531de0b6339SKit Barton   ///       <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
1532de0b6339SKit Barton   template <typename RemarkKind>
reportLoopFusion__anon08ec0c300111::LoopFuser1533de0b6339SKit Barton   void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
1534de0b6339SKit Barton                         llvm::Statistic &Stat) {
1535de0b6339SKit Barton     assert(FC0.Preheader && FC1.Preheader &&
1536de0b6339SKit Barton            "Expecting valid fusion candidates");
1537de0b6339SKit Barton     using namespace ore;
153818839be9SFangrui Song #if LLVM_ENABLE_STATS
1539de0b6339SKit Barton     ++Stat;
1540de0b6339SKit Barton     ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(),
1541de0b6339SKit Barton                         FC0.Preheader)
1542de0b6339SKit Barton              << "[" << FC0.Preheader->getParent()->getName()
1543de0b6339SKit Barton              << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName()))
1544de0b6339SKit Barton              << " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))
1545de0b6339SKit Barton              << ": " << Stat.getDesc());
154618839be9SFangrui Song #endif
1547de0b6339SKit Barton   }
154850bc6104SKit Barton 
154950bc6104SKit Barton   /// Fuse two guarded fusion candidates, creating a new fused loop.
155050bc6104SKit Barton   ///
155150bc6104SKit Barton   /// Fusing guarded loops is handled much the same way as fusing non-guarded
155250bc6104SKit Barton   /// loops. The rewiring of the CFG is slightly different though, because of
155350bc6104SKit Barton   /// the presence of the guards around the loops and the exit blocks after the
155450bc6104SKit Barton   /// loop body. As such, the new loop is rewired as follows:
155550bc6104SKit Barton   ///    1. Keep the guard branch from FC0 and use the non-loop block target
155650bc6104SKit Barton   /// from the FC1 guard branch.
155750bc6104SKit Barton   ///    2. Remove the exit block from FC0 (this exit block should be empty
155850bc6104SKit Barton   /// right now).
155950bc6104SKit Barton   ///    3. Remove the guard branch for FC1
156050bc6104SKit Barton   ///    4. Remove the preheader for FC1.
156150bc6104SKit Barton   /// The exit block successor for the latch of FC0 is updated to be the header
156250bc6104SKit Barton   /// of FC1 and the non-exit block successor of the latch of FC1 is updated to
156350bc6104SKit Barton   /// be the header of FC0, thus creating the fused loop.
fuseGuardedLoops__anon08ec0c300111::LoopFuser156450bc6104SKit Barton   Loop *fuseGuardedLoops(const FusionCandidate &FC0,
156550bc6104SKit Barton                          const FusionCandidate &FC1) {
156650bc6104SKit Barton     assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops");
156750bc6104SKit Barton 
156850bc6104SKit Barton     BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();
156950bc6104SKit Barton     BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
157050bc6104SKit Barton     BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
157150bc6104SKit Barton     BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
157238a82179SSidharth Baveja     BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor();
157350bc6104SKit Barton 
1574e44f4a8aSWhitney Tsang     // Move instructions from the exit block of FC0 to the beginning of the exit
157538a82179SSidharth Baveja     // block of FC1, in the case that the FC0 loop has not been peeled. In the
157638a82179SSidharth Baveja     // case that FC0 loop is peeled, then move the instructions of the successor
157738a82179SSidharth Baveja     // of the FC0 Exit block to the beginning of the exit block of FC1.
157838a82179SSidharth Baveja     moveInstructionsToTheBeginning(
157938a82179SSidharth Baveja         (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock,
158038a82179SSidharth Baveja         DT, PDT, DI);
1581e44f4a8aSWhitney Tsang 
1582e44f4a8aSWhitney Tsang     // Move instructions from the guard block of FC1 to the end of the guard
1583e44f4a8aSWhitney Tsang     // block of FC0.
1584e44f4a8aSWhitney Tsang     moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI);
1585e44f4a8aSWhitney Tsang 
158650bc6104SKit Barton     assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");
158750bc6104SKit Barton 
158850bc6104SKit Barton     SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
158950bc6104SKit Barton 
159050bc6104SKit Barton     ////////////////////////////////////////////////////////////////////////////
159150bc6104SKit Barton     // Update the Loop Guard
159250bc6104SKit Barton     ////////////////////////////////////////////////////////////////////////////
159350bc6104SKit Barton     // The guard for FC0 is updated to guard both FC0 and FC1. This is done by
159450bc6104SKit Barton     // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.
159550bc6104SKit Barton     // Thus, one path from the guard goes to the preheader for FC0 (and thus
159650bc6104SKit Barton     // executes the new fused loop) and the other path goes to the NonLoopBlock
159750bc6104SKit Barton     // for FC1 (where FC1 guard would have gone if FC1 was not executed).
159801e64c97SWhitney Tsang     FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock);
159950bc6104SKit Barton     FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
160038a82179SSidharth Baveja 
160138a82179SSidharth Baveja     BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock;
160238a82179SSidharth Baveja     BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header);
160350bc6104SKit Barton 
160450bc6104SKit Barton     // The guard of FC1 is not necessary anymore.
160550bc6104SKit Barton     FC1.GuardBranch->eraseFromParent();
160650bc6104SKit Barton     new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock);
160750bc6104SKit Barton 
160850bc6104SKit Barton     TreeUpdates.emplace_back(DominatorTree::UpdateType(
160950bc6104SKit Barton         DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));
161050bc6104SKit Barton     TreeUpdates.emplace_back(DominatorTree::UpdateType(
161150bc6104SKit Barton         DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));
161250bc6104SKit Barton     TreeUpdates.emplace_back(DominatorTree::UpdateType(
161350bc6104SKit Barton         DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));
161450bc6104SKit Barton     TreeUpdates.emplace_back(DominatorTree::UpdateType(
161550bc6104SKit Barton         DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
161650bc6104SKit Barton 
161738a82179SSidharth Baveja     if (FC0.Peeled) {
161838a82179SSidharth Baveja       // Remove the Block after the ExitBlock of FC0
161938a82179SSidharth Baveja       TreeUpdates.emplace_back(DominatorTree::UpdateType(
162038a82179SSidharth Baveja           DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock));
162138a82179SSidharth Baveja       FC0ExitBlockSuccessor->getTerminator()->eraseFromParent();
162238a82179SSidharth Baveja       new UnreachableInst(FC0ExitBlockSuccessor->getContext(),
162338a82179SSidharth Baveja                           FC0ExitBlockSuccessor);
162438a82179SSidharth Baveja     }
162538a82179SSidharth Baveja 
16260888eaf3SKazu Hirata     assert(pred_empty(FC1GuardBlock) &&
162750bc6104SKit Barton            "Expecting guard block to have no predecessors");
16280888eaf3SKazu Hirata     assert(succ_empty(FC1GuardBlock) &&
162950bc6104SKit Barton            "Expecting guard block to have no successors");
163050bc6104SKit Barton 
163150bc6104SKit Barton     // Remember the phi nodes originally in the header of FC0 in order to rewire
163250bc6104SKit Barton     // them later. However, this is only necessary if the new loop carried
163350bc6104SKit Barton     // values might not dominate the exiting branch. While we do not generally
163450bc6104SKit Barton     // test if this is the case but simply insert intermediate phi nodes, we
163550bc6104SKit Barton     // need to make sure these intermediate phi nodes have different
163650bc6104SKit Barton     // predecessors. To this end, we filter the special case where the exiting
163750bc6104SKit Barton     // block is the latch block of the first loop. Nothing needs to be done
163850bc6104SKit Barton     // anyway as all loop carried values dominate the latch and thereby also the
163950bc6104SKit Barton     // exiting branch.
164050bc6104SKit Barton     // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch
164150bc6104SKit Barton     // (because the loops are rotated. Thus, nothing will ever be added to
164250bc6104SKit Barton     // OriginalFC0PHIs.
164350bc6104SKit Barton     SmallVector<PHINode *, 8> OriginalFC0PHIs;
164450bc6104SKit Barton     if (FC0.ExitingBlock != FC0.Latch)
164550bc6104SKit Barton       for (PHINode &PHI : FC0.Header->phis())
164650bc6104SKit Barton         OriginalFC0PHIs.push_back(&PHI);
164750bc6104SKit Barton 
164850bc6104SKit Barton     assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!");
164950bc6104SKit Barton 
165050bc6104SKit Barton     // Replace incoming blocks for header PHIs first.
165150bc6104SKit Barton     FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
165250bc6104SKit Barton     FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
165350bc6104SKit Barton 
165450bc6104SKit Barton     // The old exiting block of the first loop (FC0) has to jump to the header
165550bc6104SKit Barton     // of the second as we need to execute the code in the second header block
165650bc6104SKit Barton     // regardless of the trip count. That is, if the trip count is 0, so the
165750bc6104SKit Barton     // back edge is never taken, we still have to execute both loop headers,
165850bc6104SKit Barton     // especially (but not only!) if the second is a do-while style loop.
165950bc6104SKit Barton     // However, doing so might invalidate the phi nodes of the first loop as
166050bc6104SKit Barton     // the new values do only need to dominate their latch and not the exiting
166150bc6104SKit Barton     // predicate. To remedy this potential problem we always introduce phi
166250bc6104SKit Barton     // nodes in the header of the second loop later that select the loop carried
166350bc6104SKit Barton     // value, if the second header was reached through an old latch of the
166450bc6104SKit Barton     // first, or undef otherwise. This is sound as exiting the first implies the
166550bc6104SKit Barton     // second will exit too, __without__ taking the back-edge (their
166650bc6104SKit Barton     // trip-counts are equal after all).
166750bc6104SKit Barton     FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
166850bc6104SKit Barton                                                          FC1.Header);
166950bc6104SKit Barton 
167050bc6104SKit Barton     TreeUpdates.emplace_back(DominatorTree::UpdateType(
167150bc6104SKit Barton         DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
167250bc6104SKit Barton     TreeUpdates.emplace_back(DominatorTree::UpdateType(
167350bc6104SKit Barton         DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
167450bc6104SKit Barton 
167550bc6104SKit Barton     // Remove FC0 Exit Block
167650bc6104SKit Barton     // The exit block for FC0 is no longer needed since control will flow
167750bc6104SKit Barton     // directly to the header of FC1. Since it is an empty block, it can be
167850bc6104SKit Barton     // removed at this point.
167950bc6104SKit Barton     // TODO: In the future, we can handle non-empty exit blocks my merging any
168050bc6104SKit Barton     // instructions from FC0 exit block into FC1 exit block prior to removing
168150bc6104SKit Barton     // the block.
16820888eaf3SKazu Hirata     assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty");
168350bc6104SKit Barton     FC0.ExitBlock->getTerminator()->eraseFromParent();
168450bc6104SKit Barton     new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
168550bc6104SKit Barton 
168650bc6104SKit Barton     // Remove FC1 Preheader
168750bc6104SKit Barton     // The pre-header of L1 is not necessary anymore.
16880888eaf3SKazu Hirata     assert(pred_empty(FC1.Preheader));
168950bc6104SKit Barton     FC1.Preheader->getTerminator()->eraseFromParent();
169050bc6104SKit Barton     new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
169150bc6104SKit Barton     TreeUpdates.emplace_back(DominatorTree::UpdateType(
169250bc6104SKit Barton         DominatorTree::Delete, FC1.Preheader, FC1.Header));
169350bc6104SKit Barton 
169450bc6104SKit Barton     // Moves the phi nodes from the second to the first loops header block.
169550bc6104SKit Barton     while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
169650bc6104SKit Barton       if (SE.isSCEVable(PHI->getType()))
169750bc6104SKit Barton         SE.forgetValue(PHI);
169850bc6104SKit Barton       if (PHI->hasNUsesOrMore(1))
169950bc6104SKit Barton         PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
170050bc6104SKit Barton       else
170150bc6104SKit Barton         PHI->eraseFromParent();
170250bc6104SKit Barton     }
170350bc6104SKit Barton 
170450bc6104SKit Barton     // Introduce new phi nodes in the second loop header to ensure
170550bc6104SKit Barton     // exiting the first and jumping to the header of the second does not break
170650bc6104SKit Barton     // the SSA property of the phis originally in the first loop. See also the
170750bc6104SKit Barton     // comment above.
170850bc6104SKit Barton     Instruction *L1HeaderIP = &FC1.Header->front();
170950bc6104SKit Barton     for (PHINode *LCPHI : OriginalFC0PHIs) {
171050bc6104SKit Barton       int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
171150bc6104SKit Barton       assert(L1LatchBBIdx >= 0 &&
171250bc6104SKit Barton              "Expected loop carried value to be rewired at this point!");
171350bc6104SKit Barton 
171450bc6104SKit Barton       Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
171550bc6104SKit Barton 
171650bc6104SKit Barton       PHINode *L1HeaderPHI = PHINode::Create(
171750bc6104SKit Barton           LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
171850bc6104SKit Barton       L1HeaderPHI->addIncoming(LCV, FC0.Latch);
171950bc6104SKit Barton       L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
172050bc6104SKit Barton                                FC0.ExitingBlock);
172150bc6104SKit Barton 
172250bc6104SKit Barton       LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
172350bc6104SKit Barton     }
172450bc6104SKit Barton 
172550bc6104SKit Barton     // Update the latches
172650bc6104SKit Barton 
172750bc6104SKit Barton     // Replace latch terminator destinations.
172850bc6104SKit Barton     FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
172950bc6104SKit Barton     FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
173050bc6104SKit Barton 
173138a82179SSidharth Baveja     // Modify the latch branch of FC0 to be unconditional as both successors of
173236bdc3dcSWhitney Tsang     // the branch are the same.
173336bdc3dcSWhitney Tsang     simplifyLatchBranch(FC0);
173436bdc3dcSWhitney Tsang 
173550bc6104SKit Barton     // If FC0.Latch and FC0.ExitingBlock are the same then we have already
173650bc6104SKit Barton     // performed the updates above.
173750bc6104SKit Barton     if (FC0.Latch != FC0.ExitingBlock)
173850bc6104SKit Barton       TreeUpdates.emplace_back(DominatorTree::UpdateType(
173950bc6104SKit Barton           DominatorTree::Insert, FC0.Latch, FC1.Header));
174050bc6104SKit Barton 
174150bc6104SKit Barton     TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
174250bc6104SKit Barton                                                        FC0.Latch, FC0.Header));
174350bc6104SKit Barton     TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
174450bc6104SKit Barton                                                        FC1.Latch, FC0.Header));
174550bc6104SKit Barton     TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
174650bc6104SKit Barton                                                        FC1.Latch, FC1.Header));
174750bc6104SKit Barton 
174850bc6104SKit Barton     // All done
174950bc6104SKit Barton     // Apply the updates to the Dominator Tree and cleanup.
175050bc6104SKit Barton 
17510888eaf3SKazu Hirata     assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!");
17520888eaf3SKazu Hirata     assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!");
175350bc6104SKit Barton 
175450bc6104SKit Barton     // Update DT/PDT
175550bc6104SKit Barton     DTU.applyUpdates(TreeUpdates);
175650bc6104SKit Barton 
1757f5224d43SDiego Caballero     LI.removeBlock(FC1GuardBlock);
175850bc6104SKit Barton     LI.removeBlock(FC1.Preheader);
1759f5224d43SDiego Caballero     LI.removeBlock(FC0.ExitBlock);
176038a82179SSidharth Baveja     if (FC0.Peeled) {
176138a82179SSidharth Baveja       LI.removeBlock(FC0ExitBlockSuccessor);
176238a82179SSidharth Baveja       DTU.deleteBB(FC0ExitBlockSuccessor);
176338a82179SSidharth Baveja     }
1764f5224d43SDiego Caballero     DTU.deleteBB(FC1GuardBlock);
176550bc6104SKit Barton     DTU.deleteBB(FC1.Preheader);
176650bc6104SKit Barton     DTU.deleteBB(FC0.ExitBlock);
176750bc6104SKit Barton     DTU.flush();
176850bc6104SKit Barton 
176950bc6104SKit Barton     // Is there a way to keep SE up-to-date so we don't need to forget the loops
177050bc6104SKit Barton     // and rebuild the information in subsequent passes of fusion?
177136bdc3dcSWhitney Tsang     // Note: Need to forget the loops before merging the loop latches, as
177236bdc3dcSWhitney Tsang     // mergeLatch may remove the only block in FC1.
177350bc6104SKit Barton     SE.forgetLoop(FC1.L);
177450bc6104SKit Barton     SE.forgetLoop(FC0.L);
177550bc6104SKit Barton 
177636bdc3dcSWhitney Tsang     // Move instructions from FC0.Latch to FC1.Latch.
177736bdc3dcSWhitney Tsang     // Note: mergeLatch requires an updated DT.
177836bdc3dcSWhitney Tsang     mergeLatch(FC0, FC1);
177936bdc3dcSWhitney Tsang 
178050bc6104SKit Barton     // Merge the loops.
1781530c5af6SKazu Hirata     SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
178250bc6104SKit Barton     for (BasicBlock *BB : Blocks) {
178350bc6104SKit Barton       FC0.L->addBlockEntry(BB);
178450bc6104SKit Barton       FC1.L->removeBlockFromLoop(BB);
178550bc6104SKit Barton       if (LI.getLoopFor(BB) != FC1.L)
178650bc6104SKit Barton         continue;
178750bc6104SKit Barton       LI.changeLoopFor(BB, FC0.L);
178850bc6104SKit Barton     }
178989c1e35fSStefanos Baziotis     while (!FC1.L->isInnermost()) {
179050bc6104SKit Barton       const auto &ChildLoopIt = FC1.L->begin();
179150bc6104SKit Barton       Loop *ChildLoop = *ChildLoopIt;
179250bc6104SKit Barton       FC1.L->removeChildLoop(ChildLoopIt);
179350bc6104SKit Barton       FC0.L->addChildLoop(ChildLoop);
179450bc6104SKit Barton     }
179550bc6104SKit Barton 
179650bc6104SKit Barton     // Delete the now empty loop L1.
179750bc6104SKit Barton     LI.erase(FC1.L);
179850bc6104SKit Barton 
179950bc6104SKit Barton #ifndef NDEBUG
180050bc6104SKit Barton     assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
180150bc6104SKit Barton     assert(DT.verify(DominatorTree::VerificationLevel::Fast));
180250bc6104SKit Barton     assert(PDT.verify());
180350bc6104SKit Barton     LI.verify(DT);
180450bc6104SKit Barton     SE.verify();
180550bc6104SKit Barton #endif
180650bc6104SKit Barton 
180750bc6104SKit Barton     LLVM_DEBUG(dbgs() << "Fusion done:\n");
180850bc6104SKit Barton 
180950bc6104SKit Barton     return FC0.L;
181050bc6104SKit Barton   }
18113cdf8794SKit Barton };
18123cdf8794SKit Barton 
18133cdf8794SKit Barton struct LoopFuseLegacy : public FunctionPass {
18143cdf8794SKit Barton 
18153cdf8794SKit Barton   static char ID;
18163cdf8794SKit Barton 
LoopFuseLegacy__anon08ec0c300111::LoopFuseLegacy18173cdf8794SKit Barton   LoopFuseLegacy() : FunctionPass(ID) {
18183cdf8794SKit Barton     initializeLoopFuseLegacyPass(*PassRegistry::getPassRegistry());
18193cdf8794SKit Barton   }
18203cdf8794SKit Barton 
getAnalysisUsage__anon08ec0c300111::LoopFuseLegacy18213cdf8794SKit Barton   void getAnalysisUsage(AnalysisUsage &AU) const override {
18223cdf8794SKit Barton     AU.addRequiredID(LoopSimplifyID);
18233cdf8794SKit Barton     AU.addRequired<ScalarEvolutionWrapperPass>();
18243cdf8794SKit Barton     AU.addRequired<LoopInfoWrapperPass>();
18253cdf8794SKit Barton     AU.addRequired<DominatorTreeWrapperPass>();
18263cdf8794SKit Barton     AU.addRequired<PostDominatorTreeWrapperPass>();
18273cdf8794SKit Barton     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
18283cdf8794SKit Barton     AU.addRequired<DependenceAnalysisWrapperPass>();
182938a82179SSidharth Baveja     AU.addRequired<AssumptionCacheTracker>();
183038a82179SSidharth Baveja     AU.addRequired<TargetTransformInfoWrapperPass>();
18313cdf8794SKit Barton 
18323cdf8794SKit Barton     AU.addPreserved<ScalarEvolutionWrapperPass>();
18333cdf8794SKit Barton     AU.addPreserved<LoopInfoWrapperPass>();
18343cdf8794SKit Barton     AU.addPreserved<DominatorTreeWrapperPass>();
18353cdf8794SKit Barton     AU.addPreserved<PostDominatorTreeWrapperPass>();
18363cdf8794SKit Barton   }
18373cdf8794SKit Barton 
runOnFunction__anon08ec0c300111::LoopFuseLegacy18383cdf8794SKit Barton   bool runOnFunction(Function &F) override {
18393cdf8794SKit Barton     if (skipFunction(F))
18403cdf8794SKit Barton       return false;
18413cdf8794SKit Barton     auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
18423cdf8794SKit Barton     auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
18433cdf8794SKit Barton     auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
18443cdf8794SKit Barton     auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
18453cdf8794SKit Barton     auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
18463cdf8794SKit Barton     auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
184738a82179SSidharth Baveja     auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
184838a82179SSidharth Baveja     const TargetTransformInfo &TTI =
184938a82179SSidharth Baveja         getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
18508a268becSFangrui Song     const DataLayout &DL = F.getParent()->getDataLayout();
185138a82179SSidharth Baveja 
185238a82179SSidharth Baveja     LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
18533cdf8794SKit Barton     return LF.fuseLoops(F);
18543cdf8794SKit Barton   }
18553cdf8794SKit Barton };
1856dc5f805dSBenjamin Kramer } // namespace
18573cdf8794SKit Barton 
run(Function & F,FunctionAnalysisManager & AM)18583cdf8794SKit Barton PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {
18593cdf8794SKit Barton   auto &LI = AM.getResult<LoopAnalysis>(F);
18603cdf8794SKit Barton   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
18613cdf8794SKit Barton   auto &DI = AM.getResult<DependenceAnalysis>(F);
18623cdf8794SKit Barton   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
18633cdf8794SKit Barton   auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
18643cdf8794SKit Barton   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
186538a82179SSidharth Baveja   auto &AC = AM.getResult<AssumptionAnalysis>(F);
186638a82179SSidharth Baveja   const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
18678a268becSFangrui Song   const DataLayout &DL = F.getParent()->getDataLayout();
186838a82179SSidharth Baveja 
186938a82179SSidharth Baveja   LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
18703cdf8794SKit Barton   bool Changed = LF.fuseLoops(F);
18713cdf8794SKit Barton   if (!Changed)
18723cdf8794SKit Barton     return PreservedAnalyses::all();
18733cdf8794SKit Barton 
18743cdf8794SKit Barton   PreservedAnalyses PA;
18753cdf8794SKit Barton   PA.preserve<DominatorTreeAnalysis>();
18763cdf8794SKit Barton   PA.preserve<PostDominatorTreeAnalysis>();
18773cdf8794SKit Barton   PA.preserve<ScalarEvolutionAnalysis>();
18783cdf8794SKit Barton   PA.preserve<LoopAnalysis>();
18793cdf8794SKit Barton   return PA;
18803cdf8794SKit Barton }
18813cdf8794SKit Barton 
18823cdf8794SKit Barton char LoopFuseLegacy::ID = 0;
18833cdf8794SKit Barton 
18843cdf8794SKit Barton INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false,
18853cdf8794SKit Barton                       false)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)18863cdf8794SKit Barton INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
18873cdf8794SKit Barton INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
18883cdf8794SKit Barton INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
18893cdf8794SKit Barton INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
18903cdf8794SKit Barton INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
18913cdf8794SKit Barton INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
189238a82179SSidharth Baveja INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
189338a82179SSidharth Baveja INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
18943cdf8794SKit Barton INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false)
18953cdf8794SKit Barton 
18963cdf8794SKit Barton FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); }
1897