13cdf8794SKit Barton //===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===//
23cdf8794SKit Barton //
33cdf8794SKit Barton // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
43cdf8794SKit Barton // See https://llvm.org/LICENSE.txt for license information.
53cdf8794SKit Barton // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
63cdf8794SKit Barton //
73cdf8794SKit Barton //===----------------------------------------------------------------------===//
83cdf8794SKit Barton ///
93cdf8794SKit Barton /// \file
103cdf8794SKit Barton /// This file implements the loop fusion pass.
113cdf8794SKit Barton /// The implementation is largely based on the following document:
123cdf8794SKit Barton ///
133cdf8794SKit Barton /// Code Transformations to Augment the Scope of Loop Fusion in a
143cdf8794SKit Barton /// Production Compiler
153cdf8794SKit Barton /// Christopher Mark Barton
163cdf8794SKit Barton /// MSc Thesis
173cdf8794SKit Barton /// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf
183cdf8794SKit Barton ///
193cdf8794SKit Barton /// The general approach taken is to collect sets of control flow equivalent
203cdf8794SKit Barton /// loops and test whether they can be fused. The necessary conditions for
213cdf8794SKit Barton /// fusion are:
223cdf8794SKit Barton /// 1. The loops must be adjacent (there cannot be any statements between
233cdf8794SKit Barton /// the two loops).
243cdf8794SKit Barton /// 2. The loops must be conforming (they must execute the same number of
253cdf8794SKit Barton /// iterations).
263cdf8794SKit Barton /// 3. The loops must be control flow equivalent (if one loop executes, the
273cdf8794SKit Barton /// other is guaranteed to execute).
283cdf8794SKit Barton /// 4. There cannot be any negative distance dependencies between the loops.
293cdf8794SKit Barton /// If all of these conditions are satisfied, it is safe to fuse the loops.
303cdf8794SKit Barton ///
313cdf8794SKit Barton /// This implementation creates FusionCandidates that represent the loop and the
323cdf8794SKit Barton /// necessary information needed by fusion. It then operates on the fusion
333cdf8794SKit Barton /// candidates, first confirming that the candidate is eligible for fusion. The
343cdf8794SKit Barton /// candidates are then collected into control flow equivalent sets, sorted in
353cdf8794SKit Barton /// dominance order. Each set of control flow equivalent candidates is then
363cdf8794SKit Barton /// traversed, attempting to fuse pairs of candidates in the set. If all
373cdf8794SKit Barton /// requirements for fusion are met, the two candidates are fused, creating a
383cdf8794SKit Barton /// new (fused) candidate which is then added back into the set to consider for
393cdf8794SKit Barton /// additional fusion.
403cdf8794SKit Barton ///
413cdf8794SKit Barton /// This implementation currently does not make any modifications to remove
423cdf8794SKit Barton /// conditions for fusion. Code transformations to make loops conform to each of
433cdf8794SKit Barton /// the conditions for fusion are discussed in more detail in the document
443cdf8794SKit Barton /// above. These can be added to the current implementation in the future.
453cdf8794SKit Barton //===----------------------------------------------------------------------===//
463cdf8794SKit Barton
473cdf8794SKit Barton #include "llvm/Transforms/Scalar/LoopFuse.h"
483cdf8794SKit Barton #include "llvm/ADT/Statistic.h"
4938a82179SSidharth Baveja #include "llvm/Analysis/AssumptionCache.h"
503cdf8794SKit Barton #include "llvm/Analysis/DependenceAnalysis.h"
513cdf8794SKit Barton #include "llvm/Analysis/DomTreeUpdater.h"
523cdf8794SKit Barton #include "llvm/Analysis/LoopInfo.h"
533cdf8794SKit Barton #include "llvm/Analysis/OptimizationRemarkEmitter.h"
543cdf8794SKit Barton #include "llvm/Analysis/PostDominators.h"
553cdf8794SKit Barton #include "llvm/Analysis/ScalarEvolution.h"
563cdf8794SKit Barton #include "llvm/Analysis/ScalarEvolutionExpressions.h"
5738a82179SSidharth Baveja #include "llvm/Analysis/TargetTransformInfo.h"
583cdf8794SKit Barton #include "llvm/IR/Function.h"
593cdf8794SKit Barton #include "llvm/IR/Verifier.h"
6005da2fe5SReid Kleckner #include "llvm/InitializePasses.h"
613cdf8794SKit Barton #include "llvm/Pass.h"
624c1a1d3cSReid Kleckner #include "llvm/Support/CommandLine.h"
633cdf8794SKit Barton #include "llvm/Support/Debug.h"
643cdf8794SKit Barton #include "llvm/Support/raw_ostream.h"
653cdf8794SKit Barton #include "llvm/Transforms/Scalar.h"
663cdf8794SKit Barton #include "llvm/Transforms/Utils.h"
673cdf8794SKit Barton #include "llvm/Transforms/Utils/BasicBlockUtils.h"
68aaf7f05aSWhitney Tsang #include "llvm/Transforms/Utils/CodeMoverUtils.h"
69b7cfa6caSSidharth Baveja #include "llvm/Transforms/Utils/LoopPeel.h"
703cdf8794SKit Barton
713cdf8794SKit Barton using namespace llvm;
723cdf8794SKit Barton
733cdf8794SKit Barton #define DEBUG_TYPE "loop-fusion"
743cdf8794SKit Barton
75de0b6339SKit Barton STATISTIC(FuseCounter, "Loops fused");
763cdf8794SKit Barton STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion");
773cdf8794SKit Barton STATISTIC(InvalidPreheader, "Loop has invalid preheader");
783cdf8794SKit Barton STATISTIC(InvalidHeader, "Loop has invalid header");
793cdf8794SKit Barton STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks");
803cdf8794SKit Barton STATISTIC(InvalidExitBlock, "Loop has invalid exit block");
813cdf8794SKit Barton STATISTIC(InvalidLatch, "Loop has invalid latch");
823cdf8794SKit Barton STATISTIC(InvalidLoop, "Loop is invalid");
833cdf8794SKit Barton STATISTIC(AddressTakenBB, "Basic block has address taken");
843cdf8794SKit Barton STATISTIC(MayThrowException, "Loop may throw an exception");
853cdf8794SKit Barton STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access");
863cdf8794SKit Barton STATISTIC(NotSimplifiedForm, "Loop is not in simplified form");
873cdf8794SKit Barton STATISTIC(InvalidDependencies, "Dependencies prevent fusion");
88de0b6339SKit Barton STATISTIC(UnknownTripCount, "Loop has unknown trip count");
893cdf8794SKit Barton STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");
90de0b6339SKit Barton STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");
91de0b6339SKit Barton STATISTIC(NonAdjacent, "Loops are not adjacent");
92da58e68fSWhitney Tsang STATISTIC(
93da58e68fSWhitney Tsang NonEmptyPreheader,
94da58e68fSWhitney Tsang "Loop has a non-empty preheader with instructions that cannot be moved");
95de0b6339SKit Barton STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
9650bc6104SKit Barton STATISTIC(NonIdenticalGuards, "Candidates have different guards");
97e44f4a8aSWhitney Tsang STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with "
98e44f4a8aSWhitney Tsang "instructions that cannot be moved");
99e44f4a8aSWhitney Tsang STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with "
100e44f4a8aSWhitney Tsang "instructions that cannot be moved");
101ff07fc66SKit Barton STATISTIC(NotRotated, "Candidate is not rotated");
1026a82ace5STa-Wei Tu STATISTIC(OnlySecondCandidateIsGuarded,
1036a82ace5STa-Wei Tu "The second candidate is guarded while the first one is not");
1043cdf8794SKit Barton
1053cdf8794SKit Barton enum FusionDependenceAnalysisChoice {
1063cdf8794SKit Barton FUSION_DEPENDENCE_ANALYSIS_SCEV,
1073cdf8794SKit Barton FUSION_DEPENDENCE_ANALYSIS_DA,
1083cdf8794SKit Barton FUSION_DEPENDENCE_ANALYSIS_ALL,
1093cdf8794SKit Barton };
1103cdf8794SKit Barton
1113cdf8794SKit Barton static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis(
1123cdf8794SKit Barton "loop-fusion-dependence-analysis",
1133cdf8794SKit Barton cl::desc("Which dependence analysis should loop fusion use?"),
1143cdf8794SKit Barton cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev",
1153cdf8794SKit Barton "Use the scalar evolution interface"),
1163cdf8794SKit Barton clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da",
1173cdf8794SKit Barton "Use the dependence analysis interface"),
1183cdf8794SKit Barton clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all",
1193cdf8794SKit Barton "Use all available analyses")),
120*95a13425SFangrui Song cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL));
1213cdf8794SKit Barton
12238a82179SSidharth Baveja static cl::opt<unsigned> FusionPeelMaxCount(
12338a82179SSidharth Baveja "loop-fusion-peel-max-count", cl::init(0), cl::Hidden,
12438a82179SSidharth Baveja cl::desc("Max number of iterations to be peeled from a loop, such that "
12538a82179SSidharth Baveja "fusion can take place"));
12638a82179SSidharth Baveja
1273cdf8794SKit Barton #ifndef NDEBUG
1283cdf8794SKit Barton static cl::opt<bool>
1293cdf8794SKit Barton VerboseFusionDebugging("loop-fusion-verbose-debug",
1303cdf8794SKit Barton cl::desc("Enable verbose debugging for Loop Fusion"),
13136c7d79dSFangrui Song cl::Hidden, cl::init(false));
1323cdf8794SKit Barton #endif
1333cdf8794SKit Barton
134dc5f805dSBenjamin Kramer namespace {
1353cdf8794SKit Barton /// This class is used to represent a candidate for loop fusion. When it is
1363cdf8794SKit Barton /// constructed, it checks the conditions for loop fusion to ensure that it
1373cdf8794SKit Barton /// represents a valid candidate. It caches several parts of a loop that are
1383cdf8794SKit Barton /// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead
1393cdf8794SKit Barton /// of continually querying the underlying Loop to retrieve these values. It is
1403cdf8794SKit Barton /// assumed these will not change throughout loop fusion.
1413cdf8794SKit Barton ///
1423cdf8794SKit Barton /// The invalidate method should be used to indicate that the FusionCandidate is
1433cdf8794SKit Barton /// no longer a valid candidate for fusion. Similarly, the isValid() method can
1443cdf8794SKit Barton /// be used to ensure that the FusionCandidate is still valid for fusion.
1453cdf8794SKit Barton struct FusionCandidate {
1463cdf8794SKit Barton /// Cache of parts of the loop used throughout loop fusion. These should not
1473cdf8794SKit Barton /// need to change throughout the analysis and transformation.
1483cdf8794SKit Barton /// These parts are cached to avoid repeatedly looking up in the Loop class.
1493cdf8794SKit Barton
1503cdf8794SKit Barton /// Preheader of the loop this candidate represents
1513cdf8794SKit Barton BasicBlock *Preheader;
1523cdf8794SKit Barton /// Header of the loop this candidate represents
1533cdf8794SKit Barton BasicBlock *Header;
1543cdf8794SKit Barton /// Blocks in the loop that exit the loop
1553cdf8794SKit Barton BasicBlock *ExitingBlock;
1563cdf8794SKit Barton /// The successor block of this loop (where the exiting blocks go to)
1573cdf8794SKit Barton BasicBlock *ExitBlock;
1583cdf8794SKit Barton /// Latch of the loop
1593cdf8794SKit Barton BasicBlock *Latch;
1603cdf8794SKit Barton /// The loop that this fusion candidate represents
1613cdf8794SKit Barton Loop *L;
1623cdf8794SKit Barton /// Vector of instructions in this loop that read from memory
1633cdf8794SKit Barton SmallVector<Instruction *, 16> MemReads;
1643cdf8794SKit Barton /// Vector of instructions in this loop that write to memory
1653cdf8794SKit Barton SmallVector<Instruction *, 16> MemWrites;
1663cdf8794SKit Barton /// Are all of the members of this fusion candidate still valid
1673cdf8794SKit Barton bool Valid;
16850bc6104SKit Barton /// Guard branch of the loop, if it exists
16950bc6104SKit Barton BranchInst *GuardBranch;
17038a82179SSidharth Baveja /// Peeling Paramaters of the Loop.
17138a82179SSidharth Baveja TTI::PeelingPreferences PP;
17238a82179SSidharth Baveja /// Can you Peel this Loop?
17338a82179SSidharth Baveja bool AbleToPeel;
17438a82179SSidharth Baveja /// Has this loop been Peeled
17538a82179SSidharth Baveja bool Peeled;
1763cdf8794SKit Barton
1773cdf8794SKit Barton /// Dominator and PostDominator trees are needed for the
1783cdf8794SKit Barton /// FusionCandidateCompare function, required by FusionCandidateSet to
1793cdf8794SKit Barton /// determine where the FusionCandidate should be inserted into the set. These
1803cdf8794SKit Barton /// are used to establish ordering of the FusionCandidates based on dominance.
181a73e4ce6SAnna Thomas DominatorTree &DT;
1823cdf8794SKit Barton const PostDominatorTree *PDT;
1833cdf8794SKit Barton
184de0b6339SKit Barton OptimizationRemarkEmitter &ORE;
185de0b6339SKit Barton
FusionCandidate__anon08ec0c300111::FusionCandidate186a73e4ce6SAnna Thomas FusionCandidate(Loop *L, DominatorTree &DT,
18738a82179SSidharth Baveja const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE,
18838a82179SSidharth Baveja TTI::PeelingPreferences PP)
1893cdf8794SKit Barton : Preheader(L->getLoopPreheader()), Header(L->getHeader()),
1903cdf8794SKit Barton ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
1913db1cf7aSKit Barton Latch(L->getLoopLatch()), L(L), Valid(true),
19238a82179SSidharth Baveja GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)),
19338a82179SSidharth Baveja Peeled(false), DT(DT), PDT(PDT), ORE(ORE) {
1943cdf8794SKit Barton
1953cdf8794SKit Barton // Walk over all blocks in the loop and check for conditions that may
1963cdf8794SKit Barton // prevent fusion. For each block, walk over all instructions and collect
1973cdf8794SKit Barton // the memory reads and writes If any instructions that prevent fusion are
1983cdf8794SKit Barton // found, invalidate this object and return.
1993cdf8794SKit Barton for (BasicBlock *BB : L->blocks()) {
2003cdf8794SKit Barton if (BB->hasAddressTaken()) {
2013cdf8794SKit Barton invalidate();
202de0b6339SKit Barton reportInvalidCandidate(AddressTakenBB);
2033cdf8794SKit Barton return;
2043cdf8794SKit Barton }
2053cdf8794SKit Barton
2063cdf8794SKit Barton for (Instruction &I : *BB) {
2073cdf8794SKit Barton if (I.mayThrow()) {
2083cdf8794SKit Barton invalidate();
209de0b6339SKit Barton reportInvalidCandidate(MayThrowException);
2103cdf8794SKit Barton return;
2113cdf8794SKit Barton }
2123cdf8794SKit Barton if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
2133cdf8794SKit Barton if (SI->isVolatile()) {
2143cdf8794SKit Barton invalidate();
215de0b6339SKit Barton reportInvalidCandidate(ContainsVolatileAccess);
2163cdf8794SKit Barton return;
2173cdf8794SKit Barton }
2183cdf8794SKit Barton }
2193cdf8794SKit Barton if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
2203cdf8794SKit Barton if (LI->isVolatile()) {
2213cdf8794SKit Barton invalidate();
222de0b6339SKit Barton reportInvalidCandidate(ContainsVolatileAccess);
2233cdf8794SKit Barton return;
2243cdf8794SKit Barton }
2253cdf8794SKit Barton }
2263cdf8794SKit Barton if (I.mayWriteToMemory())
2273cdf8794SKit Barton MemWrites.push_back(&I);
2283cdf8794SKit Barton if (I.mayReadFromMemory())
2293cdf8794SKit Barton MemReads.push_back(&I);
2303cdf8794SKit Barton }
2313cdf8794SKit Barton }
2323cdf8794SKit Barton }
2333cdf8794SKit Barton
2343cdf8794SKit Barton /// Check if all members of the class are valid.
isValid__anon08ec0c300111::FusionCandidate2353cdf8794SKit Barton bool isValid() const {
2363cdf8794SKit Barton return Preheader && Header && ExitingBlock && ExitBlock && Latch && L &&
2373cdf8794SKit Barton !L->isInvalid() && Valid;
2383cdf8794SKit Barton }
2393cdf8794SKit Barton
2403cdf8794SKit Barton /// Verify that all members are in sync with the Loop object.
verify__anon08ec0c300111::FusionCandidate2413cdf8794SKit Barton void verify() const {
2423cdf8794SKit Barton assert(isValid() && "Candidate is not valid!!");
2433cdf8794SKit Barton assert(!L->isInvalid() && "Loop is invalid!");
2443cdf8794SKit Barton assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync");
2453cdf8794SKit Barton assert(Header == L->getHeader() && "Header is out of sync");
2463cdf8794SKit Barton assert(ExitingBlock == L->getExitingBlock() &&
2473cdf8794SKit Barton "Exiting Blocks is out of sync");
2483cdf8794SKit Barton assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync");
2493cdf8794SKit Barton assert(Latch == L->getLoopLatch() && "Latch is out of sync");
2503cdf8794SKit Barton }
2513cdf8794SKit Barton
25250bc6104SKit Barton /// Get the entry block for this fusion candidate.
25350bc6104SKit Barton ///
25450bc6104SKit Barton /// If this fusion candidate represents a guarded loop, the entry block is the
25550bc6104SKit Barton /// loop guard block. If it represents an unguarded loop, the entry block is
25650bc6104SKit Barton /// the preheader of the loop.
getEntryBlock__anon08ec0c300111::FusionCandidate25750bc6104SKit Barton BasicBlock *getEntryBlock() const {
25850bc6104SKit Barton if (GuardBranch)
25950bc6104SKit Barton return GuardBranch->getParent();
26050bc6104SKit Barton else
26150bc6104SKit Barton return Preheader;
26250bc6104SKit Barton }
26350bc6104SKit Barton
26438a82179SSidharth Baveja /// After Peeling the loop is modified quite a bit, hence all of the Blocks
26538a82179SSidharth Baveja /// need to be updated accordingly.
updateAfterPeeling__anon08ec0c300111::FusionCandidate26638a82179SSidharth Baveja void updateAfterPeeling() {
26738a82179SSidharth Baveja Preheader = L->getLoopPreheader();
26838a82179SSidharth Baveja Header = L->getHeader();
26938a82179SSidharth Baveja ExitingBlock = L->getExitingBlock();
27038a82179SSidharth Baveja ExitBlock = L->getExitBlock();
27138a82179SSidharth Baveja Latch = L->getLoopLatch();
27238a82179SSidharth Baveja verify();
27338a82179SSidharth Baveja }
27438a82179SSidharth Baveja
27550bc6104SKit Barton /// Given a guarded loop, get the successor of the guard that is not in the
27650bc6104SKit Barton /// loop.
27750bc6104SKit Barton ///
27850bc6104SKit Barton /// This method returns the successor of the loop guard that is not located
27950bc6104SKit Barton /// within the loop (i.e., the successor of the guard that is not the
28050bc6104SKit Barton /// preheader).
28150bc6104SKit Barton /// This method is only valid for guarded loops.
getNonLoopBlock__anon08ec0c300111::FusionCandidate28250bc6104SKit Barton BasicBlock *getNonLoopBlock() const {
28350bc6104SKit Barton assert(GuardBranch && "Only valid on guarded loops.");
28450bc6104SKit Barton assert(GuardBranch->isConditional() &&
28550bc6104SKit Barton "Expecting guard to be a conditional branch.");
28638a82179SSidharth Baveja if (Peeled)
28738a82179SSidharth Baveja return GuardBranch->getSuccessor(1);
28850bc6104SKit Barton return (GuardBranch->getSuccessor(0) == Preheader)
28950bc6104SKit Barton ? GuardBranch->getSuccessor(1)
29050bc6104SKit Barton : GuardBranch->getSuccessor(0);
29150bc6104SKit Barton }
29250bc6104SKit Barton
2933cdf8794SKit Barton #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dump__anon08ec0c300111::FusionCandidate2943cdf8794SKit Barton LLVM_DUMP_METHOD void dump() const {
295d1f41b2cSWhitney Tsang dbgs() << "\tGuardBranch: ";
296d1f41b2cSWhitney Tsang if (GuardBranch)
297d1f41b2cSWhitney Tsang dbgs() << *GuardBranch;
298d1f41b2cSWhitney Tsang else
299d1f41b2cSWhitney Tsang dbgs() << "nullptr";
300d1f41b2cSWhitney Tsang dbgs() << "\n"
30150bc6104SKit Barton << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"
30250bc6104SKit Barton << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
3033cdf8794SKit Barton << "\n"
3043cdf8794SKit Barton << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"
3053cdf8794SKit Barton << "\tExitingBB: "
3063cdf8794SKit Barton << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"
3073cdf8794SKit Barton << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")
3083cdf8794SKit Barton << "\n"
30950bc6104SKit Barton << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"
31050bc6104SKit Barton << "\tEntryBlock: "
31150bc6104SKit Barton << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")
31250bc6104SKit Barton << "\n";
3133cdf8794SKit Barton }
3143cdf8794SKit Barton #endif
3153cdf8794SKit Barton
316de0b6339SKit Barton /// Determine if a fusion candidate (representing a loop) is eligible for
317de0b6339SKit Barton /// fusion. Note that this only checks whether a single loop can be fused - it
318de0b6339SKit Barton /// does not check whether it is *legal* to fuse two loops together.
isEligibleForFusion__anon08ec0c300111::FusionCandidate319de0b6339SKit Barton bool isEligibleForFusion(ScalarEvolution &SE) const {
320de0b6339SKit Barton if (!isValid()) {
321de0b6339SKit Barton LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");
322de0b6339SKit Barton if (!Preheader)
323de0b6339SKit Barton ++InvalidPreheader;
324de0b6339SKit Barton if (!Header)
325de0b6339SKit Barton ++InvalidHeader;
326de0b6339SKit Barton if (!ExitingBlock)
327de0b6339SKit Barton ++InvalidExitingBlock;
328de0b6339SKit Barton if (!ExitBlock)
329de0b6339SKit Barton ++InvalidExitBlock;
330de0b6339SKit Barton if (!Latch)
331de0b6339SKit Barton ++InvalidLatch;
332de0b6339SKit Barton if (L->isInvalid())
333de0b6339SKit Barton ++InvalidLoop;
334de0b6339SKit Barton
335de0b6339SKit Barton return false;
336de0b6339SKit Barton }
337de0b6339SKit Barton
338de0b6339SKit Barton // Require ScalarEvolution to be able to determine a trip count.
339de0b6339SKit Barton if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
340de0b6339SKit Barton LLVM_DEBUG(dbgs() << "Loop " << L->getName()
341de0b6339SKit Barton << " trip count not computable!\n");
342de0b6339SKit Barton return reportInvalidCandidate(UnknownTripCount);
343de0b6339SKit Barton }
344de0b6339SKit Barton
345de0b6339SKit Barton if (!L->isLoopSimplifyForm()) {
346de0b6339SKit Barton LLVM_DEBUG(dbgs() << "Loop " << L->getName()
347de0b6339SKit Barton << " is not in simplified form!\n");
348de0b6339SKit Barton return reportInvalidCandidate(NotSimplifiedForm);
349de0b6339SKit Barton }
350de0b6339SKit Barton
3513db1cf7aSKit Barton if (!L->isRotatedForm()) {
352ff07fc66SKit Barton LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n");
353ff07fc66SKit Barton return reportInvalidCandidate(NotRotated);
354ff07fc66SKit Barton }
355ff07fc66SKit Barton
356de0b6339SKit Barton return true;
357de0b6339SKit Barton }
358de0b6339SKit Barton
3593cdf8794SKit Barton private:
3603cdf8794SKit Barton // This is only used internally for now, to clear the MemWrites and MemReads
3613cdf8794SKit Barton // list and setting Valid to false. I can't envision other uses of this right
3623cdf8794SKit Barton // now, since once FusionCandidates are put into the FusionCandidateSet they
3633cdf8794SKit Barton // are immutable. Thus, any time we need to change/update a FusionCandidate,
3643cdf8794SKit Barton // we must create a new one and insert it into the FusionCandidateSet to
3653cdf8794SKit Barton // ensure the FusionCandidateSet remains ordered correctly.
invalidate__anon08ec0c300111::FusionCandidate3663cdf8794SKit Barton void invalidate() {
3673cdf8794SKit Barton MemWrites.clear();
3683cdf8794SKit Barton MemReads.clear();
3693cdf8794SKit Barton Valid = false;
3703cdf8794SKit Barton }
371de0b6339SKit Barton
reportInvalidCandidate__anon08ec0c300111::FusionCandidate372de0b6339SKit Barton bool reportInvalidCandidate(llvm::Statistic &Stat) const {
373de0b6339SKit Barton using namespace ore;
374de0b6339SKit Barton assert(L && Preheader && "Fusion candidate not initialized properly!");
37518839be9SFangrui Song #if LLVM_ENABLE_STATS
376de0b6339SKit Barton ++Stat;
377de0b6339SKit Barton ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(),
378de0b6339SKit Barton L->getStartLoc(), Preheader)
379de0b6339SKit Barton << "[" << Preheader->getParent()->getName() << "]: "
380de0b6339SKit Barton << "Loop is not a candidate for fusion: " << Stat.getDesc());
38118839be9SFangrui Song #endif
382de0b6339SKit Barton return false;
383de0b6339SKit Barton }
3843cdf8794SKit Barton };
3853cdf8794SKit Barton
3863cdf8794SKit Barton struct FusionCandidateCompare {
3873cdf8794SKit Barton /// Comparison functor to sort two Control Flow Equivalent fusion candidates
3883cdf8794SKit Barton /// into dominance order.
3893cdf8794SKit Barton /// If LHS dominates RHS and RHS post-dominates LHS, return true;
3903cdf8794SKit Barton /// IF RHS dominates LHS and LHS post-dominates RHS, return false;
operator ()__anon08ec0c300111::FusionCandidateCompare3913cdf8794SKit Barton bool operator()(const FusionCandidate &LHS,
3923cdf8794SKit Barton const FusionCandidate &RHS) const {
393a73e4ce6SAnna Thomas const DominatorTree *DT = &(LHS.DT);
3943cdf8794SKit Barton
39550bc6104SKit Barton BasicBlock *LHSEntryBlock = LHS.getEntryBlock();
39650bc6104SKit Barton BasicBlock *RHSEntryBlock = RHS.getEntryBlock();
39750bc6104SKit Barton
3983cdf8794SKit Barton // Do not save PDT to local variable as it is only used in asserts and thus
3993cdf8794SKit Barton // will trigger an unused variable warning if building without asserts.
4001ee1da1eSJordan Rupprecht assert(DT && LHS.PDT && "Expecting valid dominator tree");
4013cdf8794SKit Barton
4027b619202SRichard Trieu // Do this compare first so if LHS == RHS, function returns false.
40350bc6104SKit Barton if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {
4043cdf8794SKit Barton // RHS dominates LHS
4053cdf8794SKit Barton // Verify LHS post-dominates RHS
4061ee1da1eSJordan Rupprecht assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));
4073cdf8794SKit Barton return false;
4083cdf8794SKit Barton }
4097b619202SRichard Trieu
41050bc6104SKit Barton if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {
4117b619202SRichard Trieu // Verify RHS Postdominates LHS
4121ee1da1eSJordan Rupprecht assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));
4137b619202SRichard Trieu return true;
4147b619202SRichard Trieu }
4157b619202SRichard Trieu
4163cdf8794SKit Barton // If LHS does not dominate RHS and RHS does not dominate LHS then there is
4173cdf8794SKit Barton // no dominance relationship between the two FusionCandidates. Thus, they
4183cdf8794SKit Barton // should not be in the same set together.
4193cdf8794SKit Barton llvm_unreachable(
4203cdf8794SKit Barton "No dominance relationship between these fusion candidates!");
4213cdf8794SKit Barton }
4223cdf8794SKit Barton };
4233cdf8794SKit Barton
4243cdf8794SKit Barton using LoopVector = SmallVector<Loop *, 4>;
4253cdf8794SKit Barton
4263cdf8794SKit Barton // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance
4273cdf8794SKit Barton // order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0
4283cdf8794SKit Barton // dominates FC1 and FC1 post-dominates FC0.
4293cdf8794SKit Barton // std::set was chosen because we want a sorted data structure with stable
4303cdf8794SKit Barton // iterators. A subsequent patch to loop fusion will enable fusing non-ajdacent
4313cdf8794SKit Barton // loops by moving intervening code around. When this intervening code contains
4323cdf8794SKit Barton // loops, those loops will be moved also. The corresponding FusionCandidates
4333cdf8794SKit Barton // will also need to be moved accordingly. As this is done, having stable
4343cdf8794SKit Barton // iterators will simplify the logic. Similarly, having an efficient insert that
4353cdf8794SKit Barton // keeps the FusionCandidateSet sorted will also simplify the implementation.
4363cdf8794SKit Barton using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
4373cdf8794SKit Barton using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;
4383cdf8794SKit Barton
439eb70ac02SFangrui Song #if !defined(NDEBUG)
operator <<(llvm::raw_ostream & OS,const FusionCandidate & FC)440eb70ac02SFangrui Song static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
44169640273SFangrui Song const FusionCandidate &FC) {
44269640273SFangrui Song if (FC.isValid())
44369640273SFangrui Song OS << FC.Preheader->getName();
44469640273SFangrui Song else
44569640273SFangrui Song OS << "<Invalid>";
44669640273SFangrui Song
44769640273SFangrui Song return OS;
44869640273SFangrui Song }
44969640273SFangrui Song
operator <<(llvm::raw_ostream & OS,const FusionCandidateSet & CandSet)45069640273SFangrui Song static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
4513cdf8794SKit Barton const FusionCandidateSet &CandSet) {
452eb70ac02SFangrui Song for (const FusionCandidate &FC : CandSet)
453eb70ac02SFangrui Song OS << FC << '\n';
4543cdf8794SKit Barton
4553cdf8794SKit Barton return OS;
4563cdf8794SKit Barton }
4573cdf8794SKit Barton
4583cdf8794SKit Barton static void
printFusionCandidates(const FusionCandidateCollection & FusionCandidates)4593cdf8794SKit Barton printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
4608e64f0a6SKit Barton dbgs() << "Fusion Candidates: \n";
4613cdf8794SKit Barton for (const auto &CandidateSet : FusionCandidates) {
4623cdf8794SKit Barton dbgs() << "*** Fusion Candidate Set ***\n";
4633cdf8794SKit Barton dbgs() << CandidateSet;
4643cdf8794SKit Barton dbgs() << "****************************\n";
4653cdf8794SKit Barton }
4663cdf8794SKit Barton }
4673cdf8794SKit Barton #endif
4683cdf8794SKit Barton
4693cdf8794SKit Barton /// Collect all loops in function at the same nest level, starting at the
4703cdf8794SKit Barton /// outermost level.
4713cdf8794SKit Barton ///
4723cdf8794SKit Barton /// This data structure collects all loops at the same nest level for a
4733cdf8794SKit Barton /// given function (specified by the LoopInfo object). It starts at the
4743cdf8794SKit Barton /// outermost level.
4753cdf8794SKit Barton struct LoopDepthTree {
4763cdf8794SKit Barton using LoopsOnLevelTy = SmallVector<LoopVector, 4>;
4773cdf8794SKit Barton using iterator = LoopsOnLevelTy::iterator;
4783cdf8794SKit Barton using const_iterator = LoopsOnLevelTy::const_iterator;
4793cdf8794SKit Barton
LoopDepthTree__anon08ec0c300111::LoopDepthTree4803cdf8794SKit Barton LoopDepthTree(LoopInfo &LI) : Depth(1) {
4813cdf8794SKit Barton if (!LI.empty())
4823cdf8794SKit Barton LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend()));
4833cdf8794SKit Barton }
4843cdf8794SKit Barton
4853cdf8794SKit Barton /// Test whether a given loop has been removed from the function, and thus is
4863cdf8794SKit Barton /// no longer valid.
isRemovedLoop__anon08ec0c300111::LoopDepthTree4873cdf8794SKit Barton bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); }
4883cdf8794SKit Barton
4893cdf8794SKit Barton /// Record that a given loop has been removed from the function and is no
4903cdf8794SKit Barton /// longer valid.
removeLoop__anon08ec0c300111::LoopDepthTree4913cdf8794SKit Barton void removeLoop(const Loop *L) { RemovedLoops.insert(L); }
4923cdf8794SKit Barton
4933cdf8794SKit Barton /// Descend the tree to the next (inner) nesting level
descend__anon08ec0c300111::LoopDepthTree4943cdf8794SKit Barton void descend() {
4953cdf8794SKit Barton LoopsOnLevelTy LoopsOnNextLevel;
4963cdf8794SKit Barton
4973cdf8794SKit Barton for (const LoopVector &LV : *this)
4983cdf8794SKit Barton for (Loop *L : LV)
4993cdf8794SKit Barton if (!isRemovedLoop(L) && L->begin() != L->end())
5003cdf8794SKit Barton LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end()));
5013cdf8794SKit Barton
5023cdf8794SKit Barton LoopsOnLevel = LoopsOnNextLevel;
5033cdf8794SKit Barton RemovedLoops.clear();
5043cdf8794SKit Barton Depth++;
5053cdf8794SKit Barton }
5063cdf8794SKit Barton
empty__anon08ec0c300111::LoopDepthTree5073cdf8794SKit Barton bool empty() const { return size() == 0; }
size__anon08ec0c300111::LoopDepthTree5083cdf8794SKit Barton size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); }
getDepth__anon08ec0c300111::LoopDepthTree5093cdf8794SKit Barton unsigned getDepth() const { return Depth; }
5103cdf8794SKit Barton
begin__anon08ec0c300111::LoopDepthTree5113cdf8794SKit Barton iterator begin() { return LoopsOnLevel.begin(); }
end__anon08ec0c300111::LoopDepthTree5123cdf8794SKit Barton iterator end() { return LoopsOnLevel.end(); }
begin__anon08ec0c300111::LoopDepthTree5133cdf8794SKit Barton const_iterator begin() const { return LoopsOnLevel.begin(); }
end__anon08ec0c300111::LoopDepthTree5143cdf8794SKit Barton const_iterator end() const { return LoopsOnLevel.end(); }
5153cdf8794SKit Barton
5163cdf8794SKit Barton private:
5173cdf8794SKit Barton /// Set of loops that have been removed from the function and are no longer
5183cdf8794SKit Barton /// valid.
5193cdf8794SKit Barton SmallPtrSet<const Loop *, 8> RemovedLoops;
5203cdf8794SKit Barton
5213cdf8794SKit Barton /// Depth of the current level, starting at 1 (outermost loops).
5223cdf8794SKit Barton unsigned Depth;
5233cdf8794SKit Barton
5243cdf8794SKit Barton /// Vector of loops at the current depth level that have the same parent loop
5253cdf8794SKit Barton LoopsOnLevelTy LoopsOnLevel;
5263cdf8794SKit Barton };
5273cdf8794SKit Barton
5283cdf8794SKit Barton #ifndef NDEBUG
printLoopVector(const LoopVector & LV)5293cdf8794SKit Barton static void printLoopVector(const LoopVector &LV) {
5303cdf8794SKit Barton dbgs() << "****************************\n";
5313cdf8794SKit Barton for (auto L : LV)
5323cdf8794SKit Barton printLoop(*L, dbgs());
5333cdf8794SKit Barton dbgs() << "****************************\n";
5343cdf8794SKit Barton }
5353cdf8794SKit Barton #endif
5363cdf8794SKit Barton
5373cdf8794SKit Barton struct LoopFuser {
5383cdf8794SKit Barton private:
5393cdf8794SKit Barton // Sets of control flow equivalent fusion candidates for a given nest level.
5403cdf8794SKit Barton FusionCandidateCollection FusionCandidates;
5413cdf8794SKit Barton
5423cdf8794SKit Barton LoopDepthTree LDT;
5433cdf8794SKit Barton DomTreeUpdater DTU;
5443cdf8794SKit Barton
5453cdf8794SKit Barton LoopInfo &LI;
5463cdf8794SKit Barton DominatorTree &DT;
5473cdf8794SKit Barton DependenceInfo &DI;
5483cdf8794SKit Barton ScalarEvolution &SE;
5493cdf8794SKit Barton PostDominatorTree &PDT;
5503cdf8794SKit Barton OptimizationRemarkEmitter &ORE;
55138a82179SSidharth Baveja AssumptionCache &AC;
55238a82179SSidharth Baveja
55338a82179SSidharth Baveja const TargetTransformInfo &TTI;
5543cdf8794SKit Barton
5553cdf8794SKit Barton public:
LoopFuser__anon08ec0c300111::LoopFuser5563cdf8794SKit Barton LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
5573cdf8794SKit Barton ScalarEvolution &SE, PostDominatorTree &PDT,
55838a82179SSidharth Baveja OptimizationRemarkEmitter &ORE, const DataLayout &DL,
55938a82179SSidharth Baveja AssumptionCache &AC, const TargetTransformInfo &TTI)
5603cdf8794SKit Barton : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
56138a82179SSidharth Baveja DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {}
5623cdf8794SKit Barton
5633cdf8794SKit Barton /// This is the main entry point for loop fusion. It will traverse the
5643cdf8794SKit Barton /// specified function and collect candidate loops to fuse, starting at the
5653cdf8794SKit Barton /// outermost nesting level and working inwards.
fuseLoops__anon08ec0c300111::LoopFuser5663cdf8794SKit Barton bool fuseLoops(Function &F) {
5673cdf8794SKit Barton #ifndef NDEBUG
5683cdf8794SKit Barton if (VerboseFusionDebugging) {
5693cdf8794SKit Barton LI.print(dbgs());
5703cdf8794SKit Barton }
5713cdf8794SKit Barton #endif
5723cdf8794SKit Barton
5733cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName()
5743cdf8794SKit Barton << "\n");
5753cdf8794SKit Barton bool Changed = false;
5763cdf8794SKit Barton
5773cdf8794SKit Barton while (!LDT.empty()) {
5783cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth "
5793cdf8794SKit Barton << LDT.getDepth() << "\n";);
5803cdf8794SKit Barton
5813cdf8794SKit Barton for (const LoopVector &LV : LDT) {
5823cdf8794SKit Barton assert(LV.size() > 0 && "Empty loop set was build!");
5833cdf8794SKit Barton
5843cdf8794SKit Barton // Skip singleton loop sets as they do not offer fusion opportunities on
5853cdf8794SKit Barton // this level.
5863cdf8794SKit Barton if (LV.size() == 1)
5873cdf8794SKit Barton continue;
5883cdf8794SKit Barton #ifndef NDEBUG
5893cdf8794SKit Barton if (VerboseFusionDebugging) {
5903cdf8794SKit Barton LLVM_DEBUG({
5913cdf8794SKit Barton dbgs() << " Visit loop set (#" << LV.size() << "):\n";
5923cdf8794SKit Barton printLoopVector(LV);
5933cdf8794SKit Barton });
5943cdf8794SKit Barton }
5953cdf8794SKit Barton #endif
5963cdf8794SKit Barton
5973cdf8794SKit Barton collectFusionCandidates(LV);
5983cdf8794SKit Barton Changed |= fuseCandidates();
5993cdf8794SKit Barton }
6003cdf8794SKit Barton
6013cdf8794SKit Barton // Finished analyzing candidates at this level.
6023cdf8794SKit Barton // Descend to the next level and clear all of the candidates currently
6033cdf8794SKit Barton // collected. Note that it will not be possible to fuse any of the
6043cdf8794SKit Barton // existing candidates with new candidates because the new candidates will
6053cdf8794SKit Barton // be at a different nest level and thus not be control flow equivalent
6063cdf8794SKit Barton // with all of the candidates collected so far.
6073cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Descend one level!\n");
6083cdf8794SKit Barton LDT.descend();
6093cdf8794SKit Barton FusionCandidates.clear();
6103cdf8794SKit Barton }
6113cdf8794SKit Barton
6123cdf8794SKit Barton if (Changed)
6133cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump(););
6143cdf8794SKit Barton
6153cdf8794SKit Barton #ifndef NDEBUG
6163cdf8794SKit Barton assert(DT.verify());
6173cdf8794SKit Barton assert(PDT.verify());
6183cdf8794SKit Barton LI.verify(DT);
6193cdf8794SKit Barton SE.verify();
6203cdf8794SKit Barton #endif
6213cdf8794SKit Barton
6223cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Loop Fusion complete\n");
6233cdf8794SKit Barton return Changed;
6243cdf8794SKit Barton }
6253cdf8794SKit Barton
6263cdf8794SKit Barton private:
6273cdf8794SKit Barton /// Determine if two fusion candidates are control flow equivalent.
6283cdf8794SKit Barton ///
6293cdf8794SKit Barton /// Two fusion candidates are control flow equivalent if when one executes,
6303cdf8794SKit Barton /// the other is guaranteed to execute. This is determined using dominators
6313cdf8794SKit Barton /// and post-dominators: if A dominates B and B post-dominates A then A and B
6323cdf8794SKit Barton /// are control-flow equivalent.
isControlFlowEquivalent__anon08ec0c300111::LoopFuser6333cdf8794SKit Barton bool isControlFlowEquivalent(const FusionCandidate &FC0,
6343cdf8794SKit Barton const FusionCandidate &FC1) const {
6353cdf8794SKit Barton assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");
6363cdf8794SKit Barton
637aaf7f05aSWhitney Tsang return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(),
638aaf7f05aSWhitney Tsang DT, PDT);
6393cdf8794SKit Barton }
6403cdf8794SKit Barton
6413cdf8794SKit Barton /// Iterate over all loops in the given loop set and identify the loops that
6423cdf8794SKit Barton /// are eligible for fusion. Place all eligible fusion candidates into Control
6433cdf8794SKit Barton /// Flow Equivalent sets, sorted by dominance.
collectFusionCandidates__anon08ec0c300111::LoopFuser6443cdf8794SKit Barton void collectFusionCandidates(const LoopVector &LV) {
6453cdf8794SKit Barton for (Loop *L : LV) {
64638a82179SSidharth Baveja TTI::PeelingPreferences PP =
64738a82179SSidharth Baveja gatherPeelingPreferences(L, SE, TTI, None, None);
648a73e4ce6SAnna Thomas FusionCandidate CurrCand(L, DT, &PDT, ORE, PP);
649de0b6339SKit Barton if (!CurrCand.isEligibleForFusion(SE))
6503cdf8794SKit Barton continue;
6513cdf8794SKit Barton
6523cdf8794SKit Barton // Go through each list in FusionCandidates and determine if L is control
6533cdf8794SKit Barton // flow equivalent with the first loop in that list. If it is, append LV.
6543cdf8794SKit Barton // If not, go to the next list.
6553cdf8794SKit Barton // If no suitable list is found, start another list and add it to
6563cdf8794SKit Barton // FusionCandidates.
6573cdf8794SKit Barton bool FoundSet = false;
6583cdf8794SKit Barton
6593cdf8794SKit Barton for (auto &CurrCandSet : FusionCandidates) {
6603cdf8794SKit Barton if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) {
6613cdf8794SKit Barton CurrCandSet.insert(CurrCand);
6623cdf8794SKit Barton FoundSet = true;
6633cdf8794SKit Barton #ifndef NDEBUG
6643cdf8794SKit Barton if (VerboseFusionDebugging)
6653cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Adding " << CurrCand
6663cdf8794SKit Barton << " to existing candidate set\n");
6673cdf8794SKit Barton #endif
6683cdf8794SKit Barton break;
6693cdf8794SKit Barton }
6703cdf8794SKit Barton }
6713cdf8794SKit Barton if (!FoundSet) {
6723cdf8794SKit Barton // No set was found. Create a new set and add to FusionCandidates
6733cdf8794SKit Barton #ifndef NDEBUG
6743cdf8794SKit Barton if (VerboseFusionDebugging)
6753cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n");
6763cdf8794SKit Barton #endif
6773cdf8794SKit Barton FusionCandidateSet NewCandSet;
6783cdf8794SKit Barton NewCandSet.insert(CurrCand);
6793cdf8794SKit Barton FusionCandidates.push_back(NewCandSet);
6803cdf8794SKit Barton }
6813cdf8794SKit Barton NumFusionCandidates++;
6823cdf8794SKit Barton }
6833cdf8794SKit Barton }
6843cdf8794SKit Barton
6853cdf8794SKit Barton /// Determine if it is beneficial to fuse two loops.
6863cdf8794SKit Barton ///
6873cdf8794SKit Barton /// For now, this method simply returns true because we want to fuse as much
6883cdf8794SKit Barton /// as possible (primarily to test the pass). This method will evolve, over
6893cdf8794SKit Barton /// time, to add heuristics for profitability of fusion.
isBeneficialFusion__anon08ec0c300111::LoopFuser6903cdf8794SKit Barton bool isBeneficialFusion(const FusionCandidate &FC0,
6913cdf8794SKit Barton const FusionCandidate &FC1) {
6923cdf8794SKit Barton return true;
6933cdf8794SKit Barton }
6943cdf8794SKit Barton
6953cdf8794SKit Barton /// Determine if two fusion candidates have the same trip count (i.e., they
6963cdf8794SKit Barton /// execute the same number of iterations).
6973cdf8794SKit Barton ///
69838a82179SSidharth Baveja /// This function will return a pair of values. The first is a boolean,
69938a82179SSidharth Baveja /// stating whether or not the two candidates are known at compile time to
70038a82179SSidharth Baveja /// have the same TripCount. The second is the difference in the two
70138a82179SSidharth Baveja /// TripCounts. This information can be used later to determine whether or not
70238a82179SSidharth Baveja /// peeling can be performed on either one of the candiates.
70338a82179SSidharth Baveja std::pair<bool, Optional<unsigned>>
haveIdenticalTripCounts__anon08ec0c300111::LoopFuser70438a82179SSidharth Baveja haveIdenticalTripCounts(const FusionCandidate &FC0,
7053cdf8794SKit Barton const FusionCandidate &FC1) const {
70638a82179SSidharth Baveja
7073cdf8794SKit Barton const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);
7083cdf8794SKit Barton if (isa<SCEVCouldNotCompute>(TripCount0)) {
7093cdf8794SKit Barton UncomputableTripCount++;
7103cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
71138a82179SSidharth Baveja return {false, None};
7123cdf8794SKit Barton }
7133cdf8794SKit Barton
7143cdf8794SKit Barton const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);
7153cdf8794SKit Barton if (isa<SCEVCouldNotCompute>(TripCount1)) {
7163cdf8794SKit Barton UncomputableTripCount++;
7173cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
71838a82179SSidharth Baveja return {false, None};
7193cdf8794SKit Barton }
72038a82179SSidharth Baveja
7213cdf8794SKit Barton LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "
7223cdf8794SKit Barton << *TripCount1 << " are "
7233cdf8794SKit Barton << (TripCount0 == TripCount1 ? "identical" : "different")
7243cdf8794SKit Barton << "\n");
7253cdf8794SKit Barton
72638a82179SSidharth Baveja if (TripCount0 == TripCount1)
72738a82179SSidharth Baveja return {true, 0};
72838a82179SSidharth Baveja
72938a82179SSidharth Baveja LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, "
73038a82179SSidharth Baveja "determining the difference between trip counts\n");
73138a82179SSidharth Baveja
73238a82179SSidharth Baveja // Currently only considering loops with a single exit point
73338a82179SSidharth Baveja // and a non-constant trip count.
73438a82179SSidharth Baveja const unsigned TC0 = SE.getSmallConstantTripCount(FC0.L);
73538a82179SSidharth Baveja const unsigned TC1 = SE.getSmallConstantTripCount(FC1.L);
73638a82179SSidharth Baveja
73738a82179SSidharth Baveja // If any of the tripcounts are zero that means that loop(s) do not have
73838a82179SSidharth Baveja // a single exit or a constant tripcount.
73938a82179SSidharth Baveja if (TC0 == 0 || TC1 == 0) {
74038a82179SSidharth Baveja LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not "
74138a82179SSidharth Baveja "have a constant number of iterations. Peeling "
74238a82179SSidharth Baveja "is not benefical\n");
74338a82179SSidharth Baveja return {false, None};
74438a82179SSidharth Baveja }
74538a82179SSidharth Baveja
74638a82179SSidharth Baveja Optional<unsigned> Difference = None;
74738a82179SSidharth Baveja int Diff = TC0 - TC1;
74838a82179SSidharth Baveja
74938a82179SSidharth Baveja if (Diff > 0)
75038a82179SSidharth Baveja Difference = Diff;
75138a82179SSidharth Baveja else {
75238a82179SSidharth Baveja LLVM_DEBUG(
75338a82179SSidharth Baveja dbgs() << "Difference is less than 0. FC1 (second loop) has more "
75438a82179SSidharth Baveja "iterations than the first one. Currently not supported\n");
75538a82179SSidharth Baveja }
75638a82179SSidharth Baveja
75738a82179SSidharth Baveja LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference
75838a82179SSidharth Baveja << "\n");
75938a82179SSidharth Baveja
76038a82179SSidharth Baveja return {false, Difference};
76138a82179SSidharth Baveja }
76238a82179SSidharth Baveja
peelFusionCandidate__anon08ec0c300111::LoopFuser76338a82179SSidharth Baveja void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1,
76438a82179SSidharth Baveja unsigned PeelCount) {
76538a82179SSidharth Baveja assert(FC0.AbleToPeel && "Should be able to peel loop");
76638a82179SSidharth Baveja
76738a82179SSidharth Baveja LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount
76838a82179SSidharth Baveja << " iterations of the first loop. \n");
76938a82179SSidharth Baveja
770bc48a266SAnna Thomas FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, DT, &AC, true);
77138a82179SSidharth Baveja if (FC0.Peeled) {
77238a82179SSidharth Baveja LLVM_DEBUG(dbgs() << "Done Peeling\n");
77338a82179SSidharth Baveja
77438a82179SSidharth Baveja #ifndef NDEBUG
77538a82179SSidharth Baveja auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1);
77638a82179SSidharth Baveja
77738a82179SSidharth Baveja assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 &&
77838a82179SSidharth Baveja "Loops should have identical trip counts after peeling");
77938a82179SSidharth Baveja #endif
78038a82179SSidharth Baveja
78138a82179SSidharth Baveja FC0.PP.PeelCount += PeelCount;
78238a82179SSidharth Baveja
78338a82179SSidharth Baveja // Peeling does not update the PDT
78438a82179SSidharth Baveja PDT.recalculate(*FC0.Preheader->getParent());
78538a82179SSidharth Baveja
78638a82179SSidharth Baveja FC0.updateAfterPeeling();
78738a82179SSidharth Baveja
78838a82179SSidharth Baveja // In this case the iterations of the loop are constant, so the first
78938a82179SSidharth Baveja // loop will execute completely (will not jump from one of
79038a82179SSidharth Baveja // the peeled blocks to the second loop). Here we are updating the
79138a82179SSidharth Baveja // branch conditions of each of the peeled blocks, such that it will
79238a82179SSidharth Baveja // branch to its successor which is not the preheader of the second loop
79338a82179SSidharth Baveja // in the case of unguarded loops, or the succesors of the exit block of
79438a82179SSidharth Baveja // the first loop otherwise. Doing this update will ensure that the entry
79538a82179SSidharth Baveja // block of the first loop dominates the entry block of the second loop.
79638a82179SSidharth Baveja BasicBlock *BB =
79738a82179SSidharth Baveja FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader;
79838a82179SSidharth Baveja if (BB) {
79938a82179SSidharth Baveja SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
80038a82179SSidharth Baveja SmallVector<Instruction *, 8> WorkList;
80138a82179SSidharth Baveja for (BasicBlock *Pred : predecessors(BB)) {
80238a82179SSidharth Baveja if (Pred != FC0.ExitBlock) {
80338a82179SSidharth Baveja WorkList.emplace_back(Pred->getTerminator());
80438a82179SSidharth Baveja TreeUpdates.emplace_back(
80538a82179SSidharth Baveja DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB));
80638a82179SSidharth Baveja }
80738a82179SSidharth Baveja }
80838a82179SSidharth Baveja // Cannot modify the predecessors inside the above loop as it will cause
80938a82179SSidharth Baveja // the iterators to be nullptrs, causing memory errors.
81038a82179SSidharth Baveja for (Instruction *CurrentBranch: WorkList) {
81138a82179SSidharth Baveja BasicBlock *Succ = CurrentBranch->getSuccessor(0);
81238a82179SSidharth Baveja if (Succ == BB)
81338a82179SSidharth Baveja Succ = CurrentBranch->getSuccessor(1);
81438a82179SSidharth Baveja ReplaceInstWithInst(CurrentBranch, BranchInst::Create(Succ));
81538a82179SSidharth Baveja }
81638a82179SSidharth Baveja
81738a82179SSidharth Baveja DTU.applyUpdates(TreeUpdates);
81838a82179SSidharth Baveja DTU.flush();
81938a82179SSidharth Baveja }
82038a82179SSidharth Baveja LLVM_DEBUG(
82138a82179SSidharth Baveja dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount
82238a82179SSidharth Baveja << " iterations from the first loop.\n"
82338a82179SSidharth Baveja "Both Loops have the same number of iterations now.\n");
82438a82179SSidharth Baveja }
8253cdf8794SKit Barton }
8263cdf8794SKit Barton
8273cdf8794SKit Barton /// Walk each set of control flow equivalent fusion candidates and attempt to
8283cdf8794SKit Barton /// fuse them. This does a single linear traversal of all candidates in the
8293cdf8794SKit Barton /// set. The conditions for legal fusion are checked at this point. If a pair
8303cdf8794SKit Barton /// of fusion candidates passes all legality checks, they are fused together
8313cdf8794SKit Barton /// and a new fusion candidate is created and added to the FusionCandidateSet.
8323cdf8794SKit Barton /// The original fusion candidates are then removed, as they are no longer
8333cdf8794SKit Barton /// valid.
fuseCandidates__anon08ec0c300111::LoopFuser8343cdf8794SKit Barton bool fuseCandidates() {
8353cdf8794SKit Barton bool Fused = false;
8363cdf8794SKit Barton LLVM_DEBUG(printFusionCandidates(FusionCandidates));
8373cdf8794SKit Barton for (auto &CandidateSet : FusionCandidates) {
8383cdf8794SKit Barton if (CandidateSet.size() < 2)
8393cdf8794SKit Barton continue;
8403cdf8794SKit Barton
8413cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n"
8423cdf8794SKit Barton << CandidateSet << "\n");
8433cdf8794SKit Barton
8443cdf8794SKit Barton for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) {
8453cdf8794SKit Barton assert(!LDT.isRemovedLoop(FC0->L) &&
8463cdf8794SKit Barton "Should not have removed loops in CandidateSet!");
8473cdf8794SKit Barton auto FC1 = FC0;
8483cdf8794SKit Barton for (++FC1; FC1 != CandidateSet.end(); ++FC1) {
8493cdf8794SKit Barton assert(!LDT.isRemovedLoop(FC1->L) &&
8503cdf8794SKit Barton "Should not have removed loops in CandidateSet!");
8513cdf8794SKit Barton
8523cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump();
8533cdf8794SKit Barton dbgs() << " with\n"; FC1->dump(); dbgs() << "\n");
8543cdf8794SKit Barton
8553cdf8794SKit Barton FC0->verify();
8563cdf8794SKit Barton FC1->verify();
8573cdf8794SKit Barton
85838a82179SSidharth Baveja // Check if the candidates have identical tripcounts (first value of
85938a82179SSidharth Baveja // pair), and if not check the difference in the tripcounts between
86038a82179SSidharth Baveja // the loops (second value of pair). The difference is not equal to
86138a82179SSidharth Baveja // None iff the loops iterate a constant number of times, and have a
86238a82179SSidharth Baveja // single exit.
86338a82179SSidharth Baveja std::pair<bool, Optional<unsigned>> IdenticalTripCountRes =
86438a82179SSidharth Baveja haveIdenticalTripCounts(*FC0, *FC1);
86538a82179SSidharth Baveja bool SameTripCount = IdenticalTripCountRes.first;
86638a82179SSidharth Baveja Optional<unsigned> TCDifference = IdenticalTripCountRes.second;
86738a82179SSidharth Baveja
86838a82179SSidharth Baveja // Here we are checking that FC0 (the first loop) can be peeled, and
86938a82179SSidharth Baveja // both loops have different tripcounts.
87038a82179SSidharth Baveja if (FC0->AbleToPeel && !SameTripCount && TCDifference) {
87138a82179SSidharth Baveja if (*TCDifference > FusionPeelMaxCount) {
87238a82179SSidharth Baveja LLVM_DEBUG(dbgs()
87338a82179SSidharth Baveja << "Difference in loop trip counts: " << *TCDifference
87438a82179SSidharth Baveja << " is greater than maximum peel count specificed: "
87538a82179SSidharth Baveja << FusionPeelMaxCount << "\n");
87638a82179SSidharth Baveja } else {
87738a82179SSidharth Baveja // Dependent on peeling being performed on the first loop, and
87838a82179SSidharth Baveja // assuming all other conditions for fusion return true.
87938a82179SSidharth Baveja SameTripCount = true;
88038a82179SSidharth Baveja }
88138a82179SSidharth Baveja }
88238a82179SSidharth Baveja
88338a82179SSidharth Baveja if (!SameTripCount) {
8843cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
8853cdf8794SKit Barton "counts. Not fusing.\n");
886de0b6339SKit Barton reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
887de0b6339SKit Barton NonEqualTripCount);
8883cdf8794SKit Barton continue;
8893cdf8794SKit Barton }
8903cdf8794SKit Barton
8913cdf8794SKit Barton if (!isAdjacent(*FC0, *FC1)) {
8923cdf8794SKit Barton LLVM_DEBUG(dbgs()
8933cdf8794SKit Barton << "Fusion candidates are not adjacent. Not fusing.\n");
894de0b6339SKit Barton reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent);
8953cdf8794SKit Barton continue;
8963cdf8794SKit Barton }
8973cdf8794SKit Barton
8986a82ace5STa-Wei Tu if (!FC0->GuardBranch && FC1->GuardBranch) {
8996a82ace5STa-Wei Tu LLVM_DEBUG(dbgs() << "The second candidate is guarded while the "
9006a82ace5STa-Wei Tu "first one is not. Not fusing.\n");
9016a82ace5STa-Wei Tu reportLoopFusion<OptimizationRemarkMissed>(
9026a82ace5STa-Wei Tu *FC0, *FC1, OnlySecondCandidateIsGuarded);
9036a82ace5STa-Wei Tu continue;
9046a82ace5STa-Wei Tu }
9056a82ace5STa-Wei Tu
90650bc6104SKit Barton // Ensure that FC0 and FC1 have identical guards.
90750bc6104SKit Barton // If one (or both) are not guarded, this check is not necessary.
90850bc6104SKit Barton if (FC0->GuardBranch && FC1->GuardBranch &&
90938a82179SSidharth Baveja !haveIdenticalGuards(*FC0, *FC1) && !TCDifference) {
91050bc6104SKit Barton LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
91150bc6104SKit Barton "guards. Not Fusing.\n");
91250bc6104SKit Barton reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
91350bc6104SKit Barton NonIdenticalGuards);
91450bc6104SKit Barton continue;
91550bc6104SKit Barton }
91650bc6104SKit Barton
917da58e68fSWhitney Tsang if (!isSafeToMoveBefore(*FC1->Preheader,
918082e3952SSharmaRithik *FC0->Preheader->getTerminator(), DT, &PDT,
919082e3952SSharmaRithik &DI)) {
920da58e68fSWhitney Tsang LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
921da58e68fSWhitney Tsang "instructions in preheader. Not fusing.\n");
922de0b6339SKit Barton reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
923de0b6339SKit Barton NonEmptyPreheader);
9243cdf8794SKit Barton continue;
9253cdf8794SKit Barton }
9263cdf8794SKit Barton
927e44f4a8aSWhitney Tsang if (FC0->GuardBranch) {
928e44f4a8aSWhitney Tsang assert(FC1->GuardBranch && "Expecting valid FC1 guard branch");
929e44f4a8aSWhitney Tsang
930e44f4a8aSWhitney Tsang if (!isSafeToMoveBefore(*FC0->ExitBlock,
931e44f4a8aSWhitney Tsang *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT,
932082e3952SSharmaRithik &PDT, &DI)) {
933e44f4a8aSWhitney Tsang LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
934e44f4a8aSWhitney Tsang "instructions in exit block. Not fusing.\n");
93550bc6104SKit Barton reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
93650bc6104SKit Barton NonEmptyExitBlock);
93750bc6104SKit Barton continue;
93850bc6104SKit Barton }
93950bc6104SKit Barton
940e44f4a8aSWhitney Tsang if (!isSafeToMoveBefore(
941e44f4a8aSWhitney Tsang *FC1->GuardBranch->getParent(),
942082e3952SSharmaRithik *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT,
943082e3952SSharmaRithik &DI)) {
944e44f4a8aSWhitney Tsang LLVM_DEBUG(dbgs()
945e44f4a8aSWhitney Tsang << "Fusion candidate contains unsafe "
946e44f4a8aSWhitney Tsang "instructions in guard block. Not fusing.\n");
94750bc6104SKit Barton reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
94850bc6104SKit Barton NonEmptyGuardBlock);
94950bc6104SKit Barton continue;
95050bc6104SKit Barton }
951e44f4a8aSWhitney Tsang }
95250bc6104SKit Barton
95350bc6104SKit Barton // Check the dependencies across the loops and do not fuse if it would
95450bc6104SKit Barton // violate them.
9553cdf8794SKit Barton if (!dependencesAllowFusion(*FC0, *FC1)) {
9563cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
957de0b6339SKit Barton reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
958de0b6339SKit Barton InvalidDependencies);
9593cdf8794SKit Barton continue;
9603cdf8794SKit Barton }
9613cdf8794SKit Barton
9623cdf8794SKit Barton bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1);
9633cdf8794SKit Barton LLVM_DEBUG(dbgs()
9643cdf8794SKit Barton << "\tFusion appears to be "
9653cdf8794SKit Barton << (BeneficialToFuse ? "" : "un") << "profitable!\n");
966de0b6339SKit Barton if (!BeneficialToFuse) {
967de0b6339SKit Barton reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
968de0b6339SKit Barton FusionNotBeneficial);
9693cdf8794SKit Barton continue;
970de0b6339SKit Barton }
9713cdf8794SKit Barton // All analysis has completed and has determined that fusion is legal
9723cdf8794SKit Barton // and profitable. At this point, start transforming the code and
9733cdf8794SKit Barton // perform fusion.
9743cdf8794SKit Barton
9753cdf8794SKit Barton LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "
9763cdf8794SKit Barton << *FC1 << "\n");
9773cdf8794SKit Barton
97838a82179SSidharth Baveja FusionCandidate FC0Copy = *FC0;
97938a82179SSidharth Baveja // Peel the loop after determining that fusion is legal. The Loops
98038a82179SSidharth Baveja // will still be safe to fuse after the peeling is performed.
98138a82179SSidharth Baveja bool Peel = TCDifference && *TCDifference > 0;
98238a82179SSidharth Baveja if (Peel)
98338a82179SSidharth Baveja peelFusionCandidate(FC0Copy, *FC1, *TCDifference);
98438a82179SSidharth Baveja
9853cdf8794SKit Barton // Report fusion to the Optimization Remarks.
9863cdf8794SKit Barton // Note this needs to be done *before* performFusion because
9873cdf8794SKit Barton // performFusion will change the original loops, making it not
9883cdf8794SKit Barton // possible to identify them after fusion is complete.
98938a82179SSidharth Baveja reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1,
99038a82179SSidharth Baveja FuseCounter);
9913cdf8794SKit Barton
99238a82179SSidharth Baveja FusionCandidate FusedCand(
993a73e4ce6SAnna Thomas performFusion((Peel ? FC0Copy : *FC0), *FC1), DT, &PDT, ORE,
99438a82179SSidharth Baveja FC0Copy.PP);
9953cdf8794SKit Barton FusedCand.verify();
996de0b6339SKit Barton assert(FusedCand.isEligibleForFusion(SE) &&
9973cdf8794SKit Barton "Fused candidate should be eligible for fusion!");
9983cdf8794SKit Barton
9993cdf8794SKit Barton // Notify the loop-depth-tree that these loops are not valid objects
10003cdf8794SKit Barton LDT.removeLoop(FC1->L);
10013cdf8794SKit Barton
10023cdf8794SKit Barton CandidateSet.erase(FC0);
10033cdf8794SKit Barton CandidateSet.erase(FC1);
10043cdf8794SKit Barton
10053cdf8794SKit Barton auto InsertPos = CandidateSet.insert(FusedCand);
10063cdf8794SKit Barton
10073cdf8794SKit Barton assert(InsertPos.second &&
10083cdf8794SKit Barton "Unable to insert TargetCandidate in CandidateSet!");
10093cdf8794SKit Barton
10103cdf8794SKit Barton // Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations
10113cdf8794SKit Barton // of the FC1 loop will attempt to fuse the new (fused) loop with the
10123cdf8794SKit Barton // remaining candidates in the current candidate set.
10133cdf8794SKit Barton FC0 = FC1 = InsertPos.first;
10143cdf8794SKit Barton
10153cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet
10163cdf8794SKit Barton << "\n");
10173cdf8794SKit Barton
10183cdf8794SKit Barton Fused = true;
10193cdf8794SKit Barton }
10203cdf8794SKit Barton }
10213cdf8794SKit Barton }
10223cdf8794SKit Barton return Fused;
10233cdf8794SKit Barton }
10243cdf8794SKit Barton
10253cdf8794SKit Barton /// Rewrite all additive recurrences in a SCEV to use a new loop.
10263cdf8794SKit Barton class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> {
10273cdf8794SKit Barton public:
AddRecLoopReplacer(ScalarEvolution & SE,const Loop & OldL,const Loop & NewL,bool UseMax=true)10283cdf8794SKit Barton AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL,
10293cdf8794SKit Barton bool UseMax = true)
10303cdf8794SKit Barton : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL),
10313cdf8794SKit Barton NewL(NewL) {}
10323cdf8794SKit Barton
visitAddRecExpr(const SCEVAddRecExpr * Expr)10333cdf8794SKit Barton const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
10343cdf8794SKit Barton const Loop *ExprL = Expr->getLoop();
10353cdf8794SKit Barton SmallVector<const SCEV *, 2> Operands;
10363cdf8794SKit Barton if (ExprL == &OldL) {
10373cdf8794SKit Barton Operands.append(Expr->op_begin(), Expr->op_end());
10383cdf8794SKit Barton return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags());
10393cdf8794SKit Barton }
10403cdf8794SKit Barton
10413cdf8794SKit Barton if (OldL.contains(ExprL)) {
10423cdf8794SKit Barton bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE));
10433cdf8794SKit Barton if (!UseMax || !Pos || !Expr->isAffine()) {
10443cdf8794SKit Barton Valid = false;
10453cdf8794SKit Barton return Expr;
10463cdf8794SKit Barton }
10473cdf8794SKit Barton return visit(Expr->getStart());
10483cdf8794SKit Barton }
10493cdf8794SKit Barton
10503cdf8794SKit Barton for (const SCEV *Op : Expr->operands())
10513cdf8794SKit Barton Operands.push_back(visit(Op));
10523cdf8794SKit Barton return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags());
10533cdf8794SKit Barton }
10543cdf8794SKit Barton
wasValidSCEV() const10553cdf8794SKit Barton bool wasValidSCEV() const { return Valid; }
10563cdf8794SKit Barton
10573cdf8794SKit Barton private:
10583cdf8794SKit Barton bool Valid, UseMax;
10593cdf8794SKit Barton const Loop &OldL, &NewL;
10603cdf8794SKit Barton };
10613cdf8794SKit Barton
10623cdf8794SKit Barton /// Return false if the access functions of \p I0 and \p I1 could cause
10633cdf8794SKit Barton /// a negative dependence.
accessDiffIsPositive__anon08ec0c300111::LoopFuser10643cdf8794SKit Barton bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0,
10653cdf8794SKit Barton Instruction &I1, bool EqualIsInvalid) {
10663cdf8794SKit Barton Value *Ptr0 = getLoadStorePointerOperand(&I0);
10673cdf8794SKit Barton Value *Ptr1 = getLoadStorePointerOperand(&I1);
10683cdf8794SKit Barton if (!Ptr0 || !Ptr1)
10693cdf8794SKit Barton return false;
10703cdf8794SKit Barton
10713cdf8794SKit Barton const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0);
10723cdf8794SKit Barton const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1);
10733cdf8794SKit Barton #ifndef NDEBUG
10743cdf8794SKit Barton if (VerboseFusionDebugging)
10753cdf8794SKit Barton LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs "
10763cdf8794SKit Barton << *SCEVPtr1 << "\n");
10773cdf8794SKit Barton #endif
10783cdf8794SKit Barton AddRecLoopReplacer Rewriter(SE, L0, L1);
10793cdf8794SKit Barton SCEVPtr0 = Rewriter.visit(SCEVPtr0);
10803cdf8794SKit Barton #ifndef NDEBUG
10813cdf8794SKit Barton if (VerboseFusionDebugging)
10823cdf8794SKit Barton LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0
10833cdf8794SKit Barton << " [Valid: " << Rewriter.wasValidSCEV() << "]\n");
10843cdf8794SKit Barton #endif
10853cdf8794SKit Barton if (!Rewriter.wasValidSCEV())
10863cdf8794SKit Barton return false;
10873cdf8794SKit Barton
10883cdf8794SKit Barton // TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by
10893cdf8794SKit Barton // L0) and the other is not. We could check if it is monotone and test
10903cdf8794SKit Barton // the beginning and end value instead.
10913cdf8794SKit Barton
10923cdf8794SKit Barton BasicBlock *L0Header = L0.getHeader();
10933cdf8794SKit Barton auto HasNonLinearDominanceRelation = [&](const SCEV *S) {
10943cdf8794SKit Barton const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S);
10953cdf8794SKit Barton if (!AddRec)
10963cdf8794SKit Barton return false;
10973cdf8794SKit Barton return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) &&
10983cdf8794SKit Barton !DT.dominates(AddRec->getLoop()->getHeader(), L0Header);
10993cdf8794SKit Barton };
11003cdf8794SKit Barton if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation))
11013cdf8794SKit Barton return false;
11023cdf8794SKit Barton
11033cdf8794SKit Barton ICmpInst::Predicate Pred =
11043cdf8794SKit Barton EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE;
11053cdf8794SKit Barton bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1);
11063cdf8794SKit Barton #ifndef NDEBUG
11073cdf8794SKit Barton if (VerboseFusionDebugging)
11083cdf8794SKit Barton LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0
11093cdf8794SKit Barton << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1
11103cdf8794SKit Barton << "\n");
11113cdf8794SKit Barton #endif
11123cdf8794SKit Barton return IsAlwaysGE;
11133cdf8794SKit Barton }
11143cdf8794SKit Barton
11153cdf8794SKit Barton /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in
11163cdf8794SKit Barton /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses
11173cdf8794SKit Barton /// specified by @p DepChoice are used to determine this.
dependencesAllowFusion__anon08ec0c300111::LoopFuser11183cdf8794SKit Barton bool dependencesAllowFusion(const FusionCandidate &FC0,
11193cdf8794SKit Barton const FusionCandidate &FC1, Instruction &I0,
11203cdf8794SKit Barton Instruction &I1, bool AnyDep,
11213cdf8794SKit Barton FusionDependenceAnalysisChoice DepChoice) {
11223cdf8794SKit Barton #ifndef NDEBUG
11233cdf8794SKit Barton if (VerboseFusionDebugging) {
11243cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : "
11253cdf8794SKit Barton << DepChoice << "\n");
11263cdf8794SKit Barton }
11273cdf8794SKit Barton #endif
11283cdf8794SKit Barton switch (DepChoice) {
11293cdf8794SKit Barton case FUSION_DEPENDENCE_ANALYSIS_SCEV:
11303cdf8794SKit Barton return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep);
11313cdf8794SKit Barton case FUSION_DEPENDENCE_ANALYSIS_DA: {
11323cdf8794SKit Barton auto DepResult = DI.depends(&I0, &I1, true);
11333cdf8794SKit Barton if (!DepResult)
11343cdf8794SKit Barton return true;
11353cdf8794SKit Barton #ifndef NDEBUG
11363cdf8794SKit Barton if (VerboseFusionDebugging) {
11373cdf8794SKit Barton LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs());
11383cdf8794SKit Barton dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: "
11393cdf8794SKit Barton << (DepResult->isOrdered() ? "true" : "false")
11403cdf8794SKit Barton << "]\n");
11413cdf8794SKit Barton LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels()
11423cdf8794SKit Barton << "\n");
11433cdf8794SKit Barton }
11443cdf8794SKit Barton #endif
11453cdf8794SKit Barton
11463cdf8794SKit Barton if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor())
11473cdf8794SKit Barton LLVM_DEBUG(
11483cdf8794SKit Barton dbgs() << "TODO: Implement pred/succ dependence handling!\n");
11493cdf8794SKit Barton
11503cdf8794SKit Barton // TODO: Can we actually use the dependence info analysis here?
11513cdf8794SKit Barton return false;
11523cdf8794SKit Barton }
11533cdf8794SKit Barton
11543cdf8794SKit Barton case FUSION_DEPENDENCE_ANALYSIS_ALL:
11553cdf8794SKit Barton return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
11563cdf8794SKit Barton FUSION_DEPENDENCE_ANALYSIS_SCEV) ||
11573cdf8794SKit Barton dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
11583cdf8794SKit Barton FUSION_DEPENDENCE_ANALYSIS_DA);
11593cdf8794SKit Barton }
11603cdf8794SKit Barton
11613cdf8794SKit Barton llvm_unreachable("Unknown fusion dependence analysis choice!");
11623cdf8794SKit Barton }
11633cdf8794SKit Barton
11643cdf8794SKit Barton /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused.
dependencesAllowFusion__anon08ec0c300111::LoopFuser11653cdf8794SKit Barton bool dependencesAllowFusion(const FusionCandidate &FC0,
11663cdf8794SKit Barton const FusionCandidate &FC1) {
11673cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1
11683cdf8794SKit Barton << "\n");
11693cdf8794SKit Barton assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());
117050bc6104SKit Barton assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));
11713cdf8794SKit Barton
11723cdf8794SKit Barton for (Instruction *WriteL0 : FC0.MemWrites) {
11733cdf8794SKit Barton for (Instruction *WriteL1 : FC1.MemWrites)
11743cdf8794SKit Barton if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
11753cdf8794SKit Barton /* AnyDep */ false,
11763cdf8794SKit Barton FusionDependenceAnalysis)) {
11773cdf8794SKit Barton InvalidDependencies++;
11783cdf8794SKit Barton return false;
11793cdf8794SKit Barton }
11803cdf8794SKit Barton for (Instruction *ReadL1 : FC1.MemReads)
11813cdf8794SKit Barton if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1,
11823cdf8794SKit Barton /* AnyDep */ false,
11833cdf8794SKit Barton FusionDependenceAnalysis)) {
11843cdf8794SKit Barton InvalidDependencies++;
11853cdf8794SKit Barton return false;
11863cdf8794SKit Barton }
11873cdf8794SKit Barton }
11883cdf8794SKit Barton
11893cdf8794SKit Barton for (Instruction *WriteL1 : FC1.MemWrites) {
11903cdf8794SKit Barton for (Instruction *WriteL0 : FC0.MemWrites)
11913cdf8794SKit Barton if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
11923cdf8794SKit Barton /* AnyDep */ false,
11933cdf8794SKit Barton FusionDependenceAnalysis)) {
11943cdf8794SKit Barton InvalidDependencies++;
11953cdf8794SKit Barton return false;
11963cdf8794SKit Barton }
11973cdf8794SKit Barton for (Instruction *ReadL0 : FC0.MemReads)
11983cdf8794SKit Barton if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1,
11993cdf8794SKit Barton /* AnyDep */ false,
12003cdf8794SKit Barton FusionDependenceAnalysis)) {
12013cdf8794SKit Barton InvalidDependencies++;
12023cdf8794SKit Barton return false;
12033cdf8794SKit Barton }
12043cdf8794SKit Barton }
12053cdf8794SKit Barton
12063cdf8794SKit Barton // Walk through all uses in FC1. For each use, find the reaching def. If the
12073cdf8794SKit Barton // def is located in FC0 then it is is not safe to fuse.
12083cdf8794SKit Barton for (BasicBlock *BB : FC1.L->blocks())
12093cdf8794SKit Barton for (Instruction &I : *BB)
12103cdf8794SKit Barton for (auto &Op : I.operands())
12113cdf8794SKit Barton if (Instruction *Def = dyn_cast<Instruction>(Op))
12123cdf8794SKit Barton if (FC0.L->contains(Def->getParent())) {
12133cdf8794SKit Barton InvalidDependencies++;
12143cdf8794SKit Barton return false;
12153cdf8794SKit Barton }
12163cdf8794SKit Barton
12173cdf8794SKit Barton return true;
12183cdf8794SKit Barton }
12193cdf8794SKit Barton
122050bc6104SKit Barton /// Determine if two fusion candidates are adjacent in the CFG.
122150bc6104SKit Barton ///
122250bc6104SKit Barton /// This method will determine if there are additional basic blocks in the CFG
122350bc6104SKit Barton /// between the exit of \p FC0 and the entry of \p FC1.
122450bc6104SKit Barton /// If the two candidates are guarded loops, then it checks whether the
122550bc6104SKit Barton /// non-loop successor of the \p FC0 guard branch is the entry block of \p
122650bc6104SKit Barton /// FC1. If not, then the loops are not adjacent. If the two candidates are
122750bc6104SKit Barton /// not guarded loops, then it checks whether the exit block of \p FC0 is the
122850bc6104SKit Barton /// preheader of \p FC1.
isAdjacent__anon08ec0c300111::LoopFuser12293cdf8794SKit Barton bool isAdjacent(const FusionCandidate &FC0,
12303cdf8794SKit Barton const FusionCandidate &FC1) const {
123150bc6104SKit Barton // If the successor of the guard branch is FC1, then the loops are adjacent
123250bc6104SKit Barton if (FC0.GuardBranch)
123350bc6104SKit Barton return FC0.getNonLoopBlock() == FC1.getEntryBlock();
123450bc6104SKit Barton else
123550bc6104SKit Barton return FC0.ExitBlock == FC1.getEntryBlock();
123650bc6104SKit Barton }
123750bc6104SKit Barton
123850bc6104SKit Barton /// Determine if two fusion candidates have identical guards
123950bc6104SKit Barton ///
124050bc6104SKit Barton /// This method will determine if two fusion candidates have the same guards.
124150bc6104SKit Barton /// The guards are considered the same if:
124250bc6104SKit Barton /// 1. The instructions to compute the condition used in the compare are
124350bc6104SKit Barton /// identical.
124450bc6104SKit Barton /// 2. The successors of the guard have the same flow into/around the loop.
124550bc6104SKit Barton /// If the compare instructions are identical, then the first successor of the
124650bc6104SKit Barton /// guard must go to the same place (either the preheader of the loop or the
124750bc6104SKit Barton /// NonLoopBlock). In other words, the the first successor of both loops must
124850bc6104SKit Barton /// both go into the loop (i.e., the preheader) or go around the loop (i.e.,
124950bc6104SKit Barton /// the NonLoopBlock). The same must be true for the second successor.
haveIdenticalGuards__anon08ec0c300111::LoopFuser125050bc6104SKit Barton bool haveIdenticalGuards(const FusionCandidate &FC0,
125150bc6104SKit Barton const FusionCandidate &FC1) const {
125250bc6104SKit Barton assert(FC0.GuardBranch && FC1.GuardBranch &&
125350bc6104SKit Barton "Expecting FC0 and FC1 to be guarded loops.");
125450bc6104SKit Barton
125550bc6104SKit Barton if (auto FC0CmpInst =
125650bc6104SKit Barton dyn_cast<Instruction>(FC0.GuardBranch->getCondition()))
125750bc6104SKit Barton if (auto FC1CmpInst =
125850bc6104SKit Barton dyn_cast<Instruction>(FC1.GuardBranch->getCondition()))
125950bc6104SKit Barton if (!FC0CmpInst->isIdenticalTo(FC1CmpInst))
126050bc6104SKit Barton return false;
126150bc6104SKit Barton
126250bc6104SKit Barton // The compare instructions are identical.
126350bc6104SKit Barton // Now make sure the successor of the guards have the same flow into/around
126450bc6104SKit Barton // the loop
126550bc6104SKit Barton if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader)
126650bc6104SKit Barton return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader);
126750bc6104SKit Barton else
126850bc6104SKit Barton return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
126950bc6104SKit Barton }
127050bc6104SKit Barton
127138a82179SSidharth Baveja /// Modify the latch branch of FC to be unconditional since successors of the
127238a82179SSidharth Baveja /// branch are the same.
simplifyLatchBranch__anon08ec0c300111::LoopFuser127336bdc3dcSWhitney Tsang void simplifyLatchBranch(const FusionCandidate &FC) const {
127436bdc3dcSWhitney Tsang BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator());
127536bdc3dcSWhitney Tsang if (FCLatchBranch) {
127636bdc3dcSWhitney Tsang assert(FCLatchBranch->isConditional() &&
127736bdc3dcSWhitney Tsang FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) &&
127836bdc3dcSWhitney Tsang "Expecting the two successors of FCLatchBranch to be the same");
127938a82179SSidharth Baveja BranchInst *NewBranch =
128038a82179SSidharth Baveja BranchInst::Create(FCLatchBranch->getSuccessor(0));
128138a82179SSidharth Baveja ReplaceInstWithInst(FCLatchBranch, NewBranch);
128236bdc3dcSWhitney Tsang }
128336bdc3dcSWhitney Tsang }
128436bdc3dcSWhitney Tsang
128536bdc3dcSWhitney Tsang /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique
128636bdc3dcSWhitney Tsang /// successor, then merge FC0.Latch with its unique successor.
mergeLatch__anon08ec0c300111::LoopFuser128736bdc3dcSWhitney Tsang void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) {
128878dc6498SWhitney Tsang moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI);
128936bdc3dcSWhitney Tsang if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) {
129036bdc3dcSWhitney Tsang MergeBlockIntoPredecessor(Succ, &DTU, &LI);
129136bdc3dcSWhitney Tsang DTU.flush();
129236bdc3dcSWhitney Tsang }
129336bdc3dcSWhitney Tsang }
129436bdc3dcSWhitney Tsang
12953cdf8794SKit Barton /// Fuse two fusion candidates, creating a new fused loop.
12963cdf8794SKit Barton ///
12973cdf8794SKit Barton /// This method contains the mechanics of fusing two loops, represented by \p
12983cdf8794SKit Barton /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1
12993cdf8794SKit Barton /// postdominates \p FC0 (making them control flow equivalent). It also
13003cdf8794SKit Barton /// assumes that the other conditions for fusion have been met: adjacent,
13013cdf8794SKit Barton /// identical trip counts, and no negative distance dependencies exist that
13023cdf8794SKit Barton /// would prevent fusion. Thus, there is no checking for these conditions in
13033cdf8794SKit Barton /// this method.
13043cdf8794SKit Barton ///
13053cdf8794SKit Barton /// Fusion is performed by rewiring the CFG to update successor blocks of the
13063cdf8794SKit Barton /// components of tho loop. Specifically, the following changes are done:
13073cdf8794SKit Barton ///
13083cdf8794SKit Barton /// 1. The preheader of \p FC1 is removed as it is no longer necessary
13093cdf8794SKit Barton /// (because it is currently only a single statement block).
13103cdf8794SKit Barton /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1.
13113cdf8794SKit Barton /// 3. The latch of \p FC1 i modified to jump to the header of \p FC0.
13123cdf8794SKit Barton /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0.
13133cdf8794SKit Barton ///
13143cdf8794SKit Barton /// All of these modifications are done with dominator tree updates, thus
13153cdf8794SKit Barton /// keeping the dominator (and post dominator) information up-to-date.
13163cdf8794SKit Barton ///
13173cdf8794SKit Barton /// This can be improved in the future by actually merging blocks during
13183cdf8794SKit Barton /// fusion. For example, the preheader of \p FC1 can be merged with the
13193cdf8794SKit Barton /// preheader of \p FC0. This would allow loops with more than a single
13203cdf8794SKit Barton /// statement in the preheader to be fused. Similarly, the latch blocks of the
13213cdf8794SKit Barton /// two loops could also be fused into a single block. This will require
13223cdf8794SKit Barton /// analysis to prove it is safe to move the contents of the block past
13233cdf8794SKit Barton /// existing code, which currently has not been implemented.
performFusion__anon08ec0c300111::LoopFuser13243cdf8794SKit Barton Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) {
13253cdf8794SKit Barton assert(FC0.isValid() && FC1.isValid() &&
13263cdf8794SKit Barton "Expecting valid fusion candidates");
13273cdf8794SKit Barton
13283cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
13293cdf8794SKit Barton dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
13303cdf8794SKit Barton
1331da58e68fSWhitney Tsang // Move instructions from the preheader of FC1 to the end of the preheader
1332da58e68fSWhitney Tsang // of FC0.
1333da58e68fSWhitney Tsang moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI);
1334da58e68fSWhitney Tsang
133550bc6104SKit Barton // Fusing guarded loops is handled slightly differently than non-guarded
133650bc6104SKit Barton // loops and has been broken out into a separate method instead of trying to
133750bc6104SKit Barton // intersperse the logic within a single method.
133850bc6104SKit Barton if (FC0.GuardBranch)
133950bc6104SKit Barton return fuseGuardedLoops(FC0, FC1);
134050bc6104SKit Barton
134138a82179SSidharth Baveja assert(FC1.Preheader ==
134238a82179SSidharth Baveja (FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock));
13433cdf8794SKit Barton assert(FC1.Preheader->size() == 1 &&
13443cdf8794SKit Barton FC1.Preheader->getSingleSuccessor() == FC1.Header);
13453cdf8794SKit Barton
13463cdf8794SKit Barton // Remember the phi nodes originally in the header of FC0 in order to rewire
13473cdf8794SKit Barton // them later. However, this is only necessary if the new loop carried
13483cdf8794SKit Barton // values might not dominate the exiting branch. While we do not generally
13493cdf8794SKit Barton // test if this is the case but simply insert intermediate phi nodes, we
13503cdf8794SKit Barton // need to make sure these intermediate phi nodes have different
13513cdf8794SKit Barton // predecessors. To this end, we filter the special case where the exiting
13523cdf8794SKit Barton // block is the latch block of the first loop. Nothing needs to be done
13533cdf8794SKit Barton // anyway as all loop carried values dominate the latch and thereby also the
13543cdf8794SKit Barton // exiting branch.
13553cdf8794SKit Barton SmallVector<PHINode *, 8> OriginalFC0PHIs;
13563cdf8794SKit Barton if (FC0.ExitingBlock != FC0.Latch)
13573cdf8794SKit Barton for (PHINode &PHI : FC0.Header->phis())
13583cdf8794SKit Barton OriginalFC0PHIs.push_back(&PHI);
13593cdf8794SKit Barton
13603cdf8794SKit Barton // Replace incoming blocks for header PHIs first.
13613cdf8794SKit Barton FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
13623cdf8794SKit Barton FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
13633cdf8794SKit Barton
13643cdf8794SKit Barton // Then modify the control flow and update DT and PDT.
13658a268becSFangrui Song SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
13663cdf8794SKit Barton
13673cdf8794SKit Barton // The old exiting block of the first loop (FC0) has to jump to the header
13683cdf8794SKit Barton // of the second as we need to execute the code in the second header block
13693cdf8794SKit Barton // regardless of the trip count. That is, if the trip count is 0, so the
13703cdf8794SKit Barton // back edge is never taken, we still have to execute both loop headers,
13713cdf8794SKit Barton // especially (but not only!) if the second is a do-while style loop.
13723cdf8794SKit Barton // However, doing so might invalidate the phi nodes of the first loop as
13733cdf8794SKit Barton // the new values do only need to dominate their latch and not the exiting
13743cdf8794SKit Barton // predicate. To remedy this potential problem we always introduce phi
13753cdf8794SKit Barton // nodes in the header of the second loop later that select the loop carried
13763cdf8794SKit Barton // value, if the second header was reached through an old latch of the
13773cdf8794SKit Barton // first, or undef otherwise. This is sound as exiting the first implies the
13783cdf8794SKit Barton // second will exit too, __without__ taking the back-edge. [Their
13793cdf8794SKit Barton // trip-counts are equal after all.
13803cdf8794SKit Barton // KB: Would this sequence be simpler to just just make FC0.ExitingBlock go
13813cdf8794SKit Barton // to FC1.Header? I think this is basically what the three sequences are
13823cdf8794SKit Barton // trying to accomplish; however, doing this directly in the CFG may mean
13833cdf8794SKit Barton // the DT/PDT becomes invalid
138438a82179SSidharth Baveja if (!FC0.Peeled) {
13853cdf8794SKit Barton FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader,
13863cdf8794SKit Barton FC1.Header);
13873cdf8794SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(
13883cdf8794SKit Barton DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader));
13893cdf8794SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(
13903cdf8794SKit Barton DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
139138a82179SSidharth Baveja } else {
139238a82179SSidharth Baveja TreeUpdates.emplace_back(DominatorTree::UpdateType(
139338a82179SSidharth Baveja DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader));
139438a82179SSidharth Baveja
139538a82179SSidharth Baveja // Remove the ExitBlock of the first Loop (also not needed)
139638a82179SSidharth Baveja FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
139738a82179SSidharth Baveja FC1.Header);
139838a82179SSidharth Baveja TreeUpdates.emplace_back(DominatorTree::UpdateType(
139938a82179SSidharth Baveja DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
140038a82179SSidharth Baveja FC0.ExitBlock->getTerminator()->eraseFromParent();
140138a82179SSidharth Baveja TreeUpdates.emplace_back(DominatorTree::UpdateType(
140238a82179SSidharth Baveja DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
140338a82179SSidharth Baveja new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
140438a82179SSidharth Baveja }
14053cdf8794SKit Barton
14063cdf8794SKit Barton // The pre-header of L1 is not necessary anymore.
14070888eaf3SKazu Hirata assert(pred_empty(FC1.Preheader));
14083cdf8794SKit Barton FC1.Preheader->getTerminator()->eraseFromParent();
14093cdf8794SKit Barton new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
14103cdf8794SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(
14113cdf8794SKit Barton DominatorTree::Delete, FC1.Preheader, FC1.Header));
14123cdf8794SKit Barton
14133cdf8794SKit Barton // Moves the phi nodes from the second to the first loops header block.
14143cdf8794SKit Barton while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
14153cdf8794SKit Barton if (SE.isSCEVable(PHI->getType()))
14163cdf8794SKit Barton SE.forgetValue(PHI);
14173cdf8794SKit Barton if (PHI->hasNUsesOrMore(1))
14183cdf8794SKit Barton PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
14193cdf8794SKit Barton else
14203cdf8794SKit Barton PHI->eraseFromParent();
14213cdf8794SKit Barton }
14223cdf8794SKit Barton
14233cdf8794SKit Barton // Introduce new phi nodes in the second loop header to ensure
14243cdf8794SKit Barton // exiting the first and jumping to the header of the second does not break
14253cdf8794SKit Barton // the SSA property of the phis originally in the first loop. See also the
14263cdf8794SKit Barton // comment above.
14273cdf8794SKit Barton Instruction *L1HeaderIP = &FC1.Header->front();
14283cdf8794SKit Barton for (PHINode *LCPHI : OriginalFC0PHIs) {
14293cdf8794SKit Barton int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
14303cdf8794SKit Barton assert(L1LatchBBIdx >= 0 &&
14313cdf8794SKit Barton "Expected loop carried value to be rewired at this point!");
14323cdf8794SKit Barton
14333cdf8794SKit Barton Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
14343cdf8794SKit Barton
14353cdf8794SKit Barton PHINode *L1HeaderPHI = PHINode::Create(
14363cdf8794SKit Barton LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
14373cdf8794SKit Barton L1HeaderPHI->addIncoming(LCV, FC0.Latch);
14383cdf8794SKit Barton L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
14393cdf8794SKit Barton FC0.ExitingBlock);
14403cdf8794SKit Barton
14413cdf8794SKit Barton LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
14423cdf8794SKit Barton }
14433cdf8794SKit Barton
14443cdf8794SKit Barton // Replace latch terminator destinations.
14453cdf8794SKit Barton FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
14463cdf8794SKit Barton FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
14473cdf8794SKit Barton
144838a82179SSidharth Baveja // Modify the latch branch of FC0 to be unconditional as both successors of
144936bdc3dcSWhitney Tsang // the branch are the same.
145036bdc3dcSWhitney Tsang simplifyLatchBranch(FC0);
145136bdc3dcSWhitney Tsang
14523cdf8794SKit Barton // If FC0.Latch and FC0.ExitingBlock are the same then we have already
14533cdf8794SKit Barton // performed the updates above.
14543cdf8794SKit Barton if (FC0.Latch != FC0.ExitingBlock)
14553cdf8794SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(
14563cdf8794SKit Barton DominatorTree::Insert, FC0.Latch, FC1.Header));
14573cdf8794SKit Barton
14583cdf8794SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
14593cdf8794SKit Barton FC0.Latch, FC0.Header));
14603cdf8794SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
14613cdf8794SKit Barton FC1.Latch, FC0.Header));
14623cdf8794SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
14633cdf8794SKit Barton FC1.Latch, FC1.Header));
14643cdf8794SKit Barton
14653cdf8794SKit Barton // Update DT/PDT
14663cdf8794SKit Barton DTU.applyUpdates(TreeUpdates);
14673cdf8794SKit Barton
14683cdf8794SKit Barton LI.removeBlock(FC1.Preheader);
14693cdf8794SKit Barton DTU.deleteBB(FC1.Preheader);
147038a82179SSidharth Baveja if (FC0.Peeled) {
147138a82179SSidharth Baveja LI.removeBlock(FC0.ExitBlock);
147238a82179SSidharth Baveja DTU.deleteBB(FC0.ExitBlock);
147338a82179SSidharth Baveja }
147438a82179SSidharth Baveja
14753cdf8794SKit Barton DTU.flush();
14763cdf8794SKit Barton
14773cdf8794SKit Barton // Is there a way to keep SE up-to-date so we don't need to forget the loops
14783cdf8794SKit Barton // and rebuild the information in subsequent passes of fusion?
147936bdc3dcSWhitney Tsang // Note: Need to forget the loops before merging the loop latches, as
148036bdc3dcSWhitney Tsang // mergeLatch may remove the only block in FC1.
14813cdf8794SKit Barton SE.forgetLoop(FC1.L);
14823cdf8794SKit Barton SE.forgetLoop(FC0.L);
14833cdf8794SKit Barton
148436bdc3dcSWhitney Tsang // Move instructions from FC0.Latch to FC1.Latch.
148536bdc3dcSWhitney Tsang // Note: mergeLatch requires an updated DT.
148636bdc3dcSWhitney Tsang mergeLatch(FC0, FC1);
148736bdc3dcSWhitney Tsang
14883cdf8794SKit Barton // Merge the loops.
1489530c5af6SKazu Hirata SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
14903cdf8794SKit Barton for (BasicBlock *BB : Blocks) {
14913cdf8794SKit Barton FC0.L->addBlockEntry(BB);
14923cdf8794SKit Barton FC1.L->removeBlockFromLoop(BB);
14933cdf8794SKit Barton if (LI.getLoopFor(BB) != FC1.L)
14943cdf8794SKit Barton continue;
14953cdf8794SKit Barton LI.changeLoopFor(BB, FC0.L);
14963cdf8794SKit Barton }
149789c1e35fSStefanos Baziotis while (!FC1.L->isInnermost()) {
14983cdf8794SKit Barton const auto &ChildLoopIt = FC1.L->begin();
14993cdf8794SKit Barton Loop *ChildLoop = *ChildLoopIt;
15003cdf8794SKit Barton FC1.L->removeChildLoop(ChildLoopIt);
15013cdf8794SKit Barton FC0.L->addChildLoop(ChildLoop);
15023cdf8794SKit Barton }
15033cdf8794SKit Barton
15043cdf8794SKit Barton // Delete the now empty loop L1.
15053cdf8794SKit Barton LI.erase(FC1.L);
15063cdf8794SKit Barton
15073cdf8794SKit Barton #ifndef NDEBUG
15083cdf8794SKit Barton assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
15093cdf8794SKit Barton assert(DT.verify(DominatorTree::VerificationLevel::Fast));
15103cdf8794SKit Barton assert(PDT.verify());
15113cdf8794SKit Barton LI.verify(DT);
15123cdf8794SKit Barton SE.verify();
15133cdf8794SKit Barton #endif
15143cdf8794SKit Barton
15153cdf8794SKit Barton LLVM_DEBUG(dbgs() << "Fusion done:\n");
15163cdf8794SKit Barton
15173cdf8794SKit Barton return FC0.L;
15183cdf8794SKit Barton }
1519de0b6339SKit Barton
1520de0b6339SKit Barton /// Report details on loop fusion opportunities.
1521de0b6339SKit Barton ///
1522de0b6339SKit Barton /// This template function can be used to report both successful and missed
1523de0b6339SKit Barton /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should
1524de0b6339SKit Barton /// be one of:
1525de0b6339SKit Barton /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful
1526de0b6339SKit Barton /// given two valid fusion candidates.
1527de0b6339SKit Barton /// - OptimizationRemark to report successful fusion of two fusion
1528de0b6339SKit Barton /// candidates.
1529de0b6339SKit Barton /// The remarks will be printed using the form:
1530de0b6339SKit Barton /// <path/filename>:<line number>:<column number>: [<function name>]:
1531de0b6339SKit Barton /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
1532de0b6339SKit Barton template <typename RemarkKind>
reportLoopFusion__anon08ec0c300111::LoopFuser1533de0b6339SKit Barton void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
1534de0b6339SKit Barton llvm::Statistic &Stat) {
1535de0b6339SKit Barton assert(FC0.Preheader && FC1.Preheader &&
1536de0b6339SKit Barton "Expecting valid fusion candidates");
1537de0b6339SKit Barton using namespace ore;
153818839be9SFangrui Song #if LLVM_ENABLE_STATS
1539de0b6339SKit Barton ++Stat;
1540de0b6339SKit Barton ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(),
1541de0b6339SKit Barton FC0.Preheader)
1542de0b6339SKit Barton << "[" << FC0.Preheader->getParent()->getName()
1543de0b6339SKit Barton << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName()))
1544de0b6339SKit Barton << " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))
1545de0b6339SKit Barton << ": " << Stat.getDesc());
154618839be9SFangrui Song #endif
1547de0b6339SKit Barton }
154850bc6104SKit Barton
154950bc6104SKit Barton /// Fuse two guarded fusion candidates, creating a new fused loop.
155050bc6104SKit Barton ///
155150bc6104SKit Barton /// Fusing guarded loops is handled much the same way as fusing non-guarded
155250bc6104SKit Barton /// loops. The rewiring of the CFG is slightly different though, because of
155350bc6104SKit Barton /// the presence of the guards around the loops and the exit blocks after the
155450bc6104SKit Barton /// loop body. As such, the new loop is rewired as follows:
155550bc6104SKit Barton /// 1. Keep the guard branch from FC0 and use the non-loop block target
155650bc6104SKit Barton /// from the FC1 guard branch.
155750bc6104SKit Barton /// 2. Remove the exit block from FC0 (this exit block should be empty
155850bc6104SKit Barton /// right now).
155950bc6104SKit Barton /// 3. Remove the guard branch for FC1
156050bc6104SKit Barton /// 4. Remove the preheader for FC1.
156150bc6104SKit Barton /// The exit block successor for the latch of FC0 is updated to be the header
156250bc6104SKit Barton /// of FC1 and the non-exit block successor of the latch of FC1 is updated to
156350bc6104SKit Barton /// be the header of FC0, thus creating the fused loop.
fuseGuardedLoops__anon08ec0c300111::LoopFuser156450bc6104SKit Barton Loop *fuseGuardedLoops(const FusionCandidate &FC0,
156550bc6104SKit Barton const FusionCandidate &FC1) {
156650bc6104SKit Barton assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops");
156750bc6104SKit Barton
156850bc6104SKit Barton BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();
156950bc6104SKit Barton BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
157050bc6104SKit Barton BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
157150bc6104SKit Barton BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
157238a82179SSidharth Baveja BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor();
157350bc6104SKit Barton
1574e44f4a8aSWhitney Tsang // Move instructions from the exit block of FC0 to the beginning of the exit
157538a82179SSidharth Baveja // block of FC1, in the case that the FC0 loop has not been peeled. In the
157638a82179SSidharth Baveja // case that FC0 loop is peeled, then move the instructions of the successor
157738a82179SSidharth Baveja // of the FC0 Exit block to the beginning of the exit block of FC1.
157838a82179SSidharth Baveja moveInstructionsToTheBeginning(
157938a82179SSidharth Baveja (FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock,
158038a82179SSidharth Baveja DT, PDT, DI);
1581e44f4a8aSWhitney Tsang
1582e44f4a8aSWhitney Tsang // Move instructions from the guard block of FC1 to the end of the guard
1583e44f4a8aSWhitney Tsang // block of FC0.
1584e44f4a8aSWhitney Tsang moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI);
1585e44f4a8aSWhitney Tsang
158650bc6104SKit Barton assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");
158750bc6104SKit Barton
158850bc6104SKit Barton SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
158950bc6104SKit Barton
159050bc6104SKit Barton ////////////////////////////////////////////////////////////////////////////
159150bc6104SKit Barton // Update the Loop Guard
159250bc6104SKit Barton ////////////////////////////////////////////////////////////////////////////
159350bc6104SKit Barton // The guard for FC0 is updated to guard both FC0 and FC1. This is done by
159450bc6104SKit Barton // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.
159550bc6104SKit Barton // Thus, one path from the guard goes to the preheader for FC0 (and thus
159650bc6104SKit Barton // executes the new fused loop) and the other path goes to the NonLoopBlock
159750bc6104SKit Barton // for FC1 (where FC1 guard would have gone if FC1 was not executed).
159801e64c97SWhitney Tsang FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock);
159950bc6104SKit Barton FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
160038a82179SSidharth Baveja
160138a82179SSidharth Baveja BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock;
160238a82179SSidharth Baveja BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header);
160350bc6104SKit Barton
160450bc6104SKit Barton // The guard of FC1 is not necessary anymore.
160550bc6104SKit Barton FC1.GuardBranch->eraseFromParent();
160650bc6104SKit Barton new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock);
160750bc6104SKit Barton
160850bc6104SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(
160950bc6104SKit Barton DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));
161050bc6104SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(
161150bc6104SKit Barton DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));
161250bc6104SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(
161350bc6104SKit Barton DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));
161450bc6104SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(
161550bc6104SKit Barton DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
161650bc6104SKit Barton
161738a82179SSidharth Baveja if (FC0.Peeled) {
161838a82179SSidharth Baveja // Remove the Block after the ExitBlock of FC0
161938a82179SSidharth Baveja TreeUpdates.emplace_back(DominatorTree::UpdateType(
162038a82179SSidharth Baveja DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock));
162138a82179SSidharth Baveja FC0ExitBlockSuccessor->getTerminator()->eraseFromParent();
162238a82179SSidharth Baveja new UnreachableInst(FC0ExitBlockSuccessor->getContext(),
162338a82179SSidharth Baveja FC0ExitBlockSuccessor);
162438a82179SSidharth Baveja }
162538a82179SSidharth Baveja
16260888eaf3SKazu Hirata assert(pred_empty(FC1GuardBlock) &&
162750bc6104SKit Barton "Expecting guard block to have no predecessors");
16280888eaf3SKazu Hirata assert(succ_empty(FC1GuardBlock) &&
162950bc6104SKit Barton "Expecting guard block to have no successors");
163050bc6104SKit Barton
163150bc6104SKit Barton // Remember the phi nodes originally in the header of FC0 in order to rewire
163250bc6104SKit Barton // them later. However, this is only necessary if the new loop carried
163350bc6104SKit Barton // values might not dominate the exiting branch. While we do not generally
163450bc6104SKit Barton // test if this is the case but simply insert intermediate phi nodes, we
163550bc6104SKit Barton // need to make sure these intermediate phi nodes have different
163650bc6104SKit Barton // predecessors. To this end, we filter the special case where the exiting
163750bc6104SKit Barton // block is the latch block of the first loop. Nothing needs to be done
163850bc6104SKit Barton // anyway as all loop carried values dominate the latch and thereby also the
163950bc6104SKit Barton // exiting branch.
164050bc6104SKit Barton // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch
164150bc6104SKit Barton // (because the loops are rotated. Thus, nothing will ever be added to
164250bc6104SKit Barton // OriginalFC0PHIs.
164350bc6104SKit Barton SmallVector<PHINode *, 8> OriginalFC0PHIs;
164450bc6104SKit Barton if (FC0.ExitingBlock != FC0.Latch)
164550bc6104SKit Barton for (PHINode &PHI : FC0.Header->phis())
164650bc6104SKit Barton OriginalFC0PHIs.push_back(&PHI);
164750bc6104SKit Barton
164850bc6104SKit Barton assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!");
164950bc6104SKit Barton
165050bc6104SKit Barton // Replace incoming blocks for header PHIs first.
165150bc6104SKit Barton FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
165250bc6104SKit Barton FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
165350bc6104SKit Barton
165450bc6104SKit Barton // The old exiting block of the first loop (FC0) has to jump to the header
165550bc6104SKit Barton // of the second as we need to execute the code in the second header block
165650bc6104SKit Barton // regardless of the trip count. That is, if the trip count is 0, so the
165750bc6104SKit Barton // back edge is never taken, we still have to execute both loop headers,
165850bc6104SKit Barton // especially (but not only!) if the second is a do-while style loop.
165950bc6104SKit Barton // However, doing so might invalidate the phi nodes of the first loop as
166050bc6104SKit Barton // the new values do only need to dominate their latch and not the exiting
166150bc6104SKit Barton // predicate. To remedy this potential problem we always introduce phi
166250bc6104SKit Barton // nodes in the header of the second loop later that select the loop carried
166350bc6104SKit Barton // value, if the second header was reached through an old latch of the
166450bc6104SKit Barton // first, or undef otherwise. This is sound as exiting the first implies the
166550bc6104SKit Barton // second will exit too, __without__ taking the back-edge (their
166650bc6104SKit Barton // trip-counts are equal after all).
166750bc6104SKit Barton FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
166850bc6104SKit Barton FC1.Header);
166950bc6104SKit Barton
167050bc6104SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(
167150bc6104SKit Barton DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
167250bc6104SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(
167350bc6104SKit Barton DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
167450bc6104SKit Barton
167550bc6104SKit Barton // Remove FC0 Exit Block
167650bc6104SKit Barton // The exit block for FC0 is no longer needed since control will flow
167750bc6104SKit Barton // directly to the header of FC1. Since it is an empty block, it can be
167850bc6104SKit Barton // removed at this point.
167950bc6104SKit Barton // TODO: In the future, we can handle non-empty exit blocks my merging any
168050bc6104SKit Barton // instructions from FC0 exit block into FC1 exit block prior to removing
168150bc6104SKit Barton // the block.
16820888eaf3SKazu Hirata assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty");
168350bc6104SKit Barton FC0.ExitBlock->getTerminator()->eraseFromParent();
168450bc6104SKit Barton new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
168550bc6104SKit Barton
168650bc6104SKit Barton // Remove FC1 Preheader
168750bc6104SKit Barton // The pre-header of L1 is not necessary anymore.
16880888eaf3SKazu Hirata assert(pred_empty(FC1.Preheader));
168950bc6104SKit Barton FC1.Preheader->getTerminator()->eraseFromParent();
169050bc6104SKit Barton new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
169150bc6104SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(
169250bc6104SKit Barton DominatorTree::Delete, FC1.Preheader, FC1.Header));
169350bc6104SKit Barton
169450bc6104SKit Barton // Moves the phi nodes from the second to the first loops header block.
169550bc6104SKit Barton while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
169650bc6104SKit Barton if (SE.isSCEVable(PHI->getType()))
169750bc6104SKit Barton SE.forgetValue(PHI);
169850bc6104SKit Barton if (PHI->hasNUsesOrMore(1))
169950bc6104SKit Barton PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
170050bc6104SKit Barton else
170150bc6104SKit Barton PHI->eraseFromParent();
170250bc6104SKit Barton }
170350bc6104SKit Barton
170450bc6104SKit Barton // Introduce new phi nodes in the second loop header to ensure
170550bc6104SKit Barton // exiting the first and jumping to the header of the second does not break
170650bc6104SKit Barton // the SSA property of the phis originally in the first loop. See also the
170750bc6104SKit Barton // comment above.
170850bc6104SKit Barton Instruction *L1HeaderIP = &FC1.Header->front();
170950bc6104SKit Barton for (PHINode *LCPHI : OriginalFC0PHIs) {
171050bc6104SKit Barton int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
171150bc6104SKit Barton assert(L1LatchBBIdx >= 0 &&
171250bc6104SKit Barton "Expected loop carried value to be rewired at this point!");
171350bc6104SKit Barton
171450bc6104SKit Barton Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
171550bc6104SKit Barton
171650bc6104SKit Barton PHINode *L1HeaderPHI = PHINode::Create(
171750bc6104SKit Barton LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
171850bc6104SKit Barton L1HeaderPHI->addIncoming(LCV, FC0.Latch);
171950bc6104SKit Barton L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
172050bc6104SKit Barton FC0.ExitingBlock);
172150bc6104SKit Barton
172250bc6104SKit Barton LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
172350bc6104SKit Barton }
172450bc6104SKit Barton
172550bc6104SKit Barton // Update the latches
172650bc6104SKit Barton
172750bc6104SKit Barton // Replace latch terminator destinations.
172850bc6104SKit Barton FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
172950bc6104SKit Barton FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
173050bc6104SKit Barton
173138a82179SSidharth Baveja // Modify the latch branch of FC0 to be unconditional as both successors of
173236bdc3dcSWhitney Tsang // the branch are the same.
173336bdc3dcSWhitney Tsang simplifyLatchBranch(FC0);
173436bdc3dcSWhitney Tsang
173550bc6104SKit Barton // If FC0.Latch and FC0.ExitingBlock are the same then we have already
173650bc6104SKit Barton // performed the updates above.
173750bc6104SKit Barton if (FC0.Latch != FC0.ExitingBlock)
173850bc6104SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(
173950bc6104SKit Barton DominatorTree::Insert, FC0.Latch, FC1.Header));
174050bc6104SKit Barton
174150bc6104SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
174250bc6104SKit Barton FC0.Latch, FC0.Header));
174350bc6104SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
174450bc6104SKit Barton FC1.Latch, FC0.Header));
174550bc6104SKit Barton TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
174650bc6104SKit Barton FC1.Latch, FC1.Header));
174750bc6104SKit Barton
174850bc6104SKit Barton // All done
174950bc6104SKit Barton // Apply the updates to the Dominator Tree and cleanup.
175050bc6104SKit Barton
17510888eaf3SKazu Hirata assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!");
17520888eaf3SKazu Hirata assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!");
175350bc6104SKit Barton
175450bc6104SKit Barton // Update DT/PDT
175550bc6104SKit Barton DTU.applyUpdates(TreeUpdates);
175650bc6104SKit Barton
1757f5224d43SDiego Caballero LI.removeBlock(FC1GuardBlock);
175850bc6104SKit Barton LI.removeBlock(FC1.Preheader);
1759f5224d43SDiego Caballero LI.removeBlock(FC0.ExitBlock);
176038a82179SSidharth Baveja if (FC0.Peeled) {
176138a82179SSidharth Baveja LI.removeBlock(FC0ExitBlockSuccessor);
176238a82179SSidharth Baveja DTU.deleteBB(FC0ExitBlockSuccessor);
176338a82179SSidharth Baveja }
1764f5224d43SDiego Caballero DTU.deleteBB(FC1GuardBlock);
176550bc6104SKit Barton DTU.deleteBB(FC1.Preheader);
176650bc6104SKit Barton DTU.deleteBB(FC0.ExitBlock);
176750bc6104SKit Barton DTU.flush();
176850bc6104SKit Barton
176950bc6104SKit Barton // Is there a way to keep SE up-to-date so we don't need to forget the loops
177050bc6104SKit Barton // and rebuild the information in subsequent passes of fusion?
177136bdc3dcSWhitney Tsang // Note: Need to forget the loops before merging the loop latches, as
177236bdc3dcSWhitney Tsang // mergeLatch may remove the only block in FC1.
177350bc6104SKit Barton SE.forgetLoop(FC1.L);
177450bc6104SKit Barton SE.forgetLoop(FC0.L);
177550bc6104SKit Barton
177636bdc3dcSWhitney Tsang // Move instructions from FC0.Latch to FC1.Latch.
177736bdc3dcSWhitney Tsang // Note: mergeLatch requires an updated DT.
177836bdc3dcSWhitney Tsang mergeLatch(FC0, FC1);
177936bdc3dcSWhitney Tsang
178050bc6104SKit Barton // Merge the loops.
1781530c5af6SKazu Hirata SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
178250bc6104SKit Barton for (BasicBlock *BB : Blocks) {
178350bc6104SKit Barton FC0.L->addBlockEntry(BB);
178450bc6104SKit Barton FC1.L->removeBlockFromLoop(BB);
178550bc6104SKit Barton if (LI.getLoopFor(BB) != FC1.L)
178650bc6104SKit Barton continue;
178750bc6104SKit Barton LI.changeLoopFor(BB, FC0.L);
178850bc6104SKit Barton }
178989c1e35fSStefanos Baziotis while (!FC1.L->isInnermost()) {
179050bc6104SKit Barton const auto &ChildLoopIt = FC1.L->begin();
179150bc6104SKit Barton Loop *ChildLoop = *ChildLoopIt;
179250bc6104SKit Barton FC1.L->removeChildLoop(ChildLoopIt);
179350bc6104SKit Barton FC0.L->addChildLoop(ChildLoop);
179450bc6104SKit Barton }
179550bc6104SKit Barton
179650bc6104SKit Barton // Delete the now empty loop L1.
179750bc6104SKit Barton LI.erase(FC1.L);
179850bc6104SKit Barton
179950bc6104SKit Barton #ifndef NDEBUG
180050bc6104SKit Barton assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
180150bc6104SKit Barton assert(DT.verify(DominatorTree::VerificationLevel::Fast));
180250bc6104SKit Barton assert(PDT.verify());
180350bc6104SKit Barton LI.verify(DT);
180450bc6104SKit Barton SE.verify();
180550bc6104SKit Barton #endif
180650bc6104SKit Barton
180750bc6104SKit Barton LLVM_DEBUG(dbgs() << "Fusion done:\n");
180850bc6104SKit Barton
180950bc6104SKit Barton return FC0.L;
181050bc6104SKit Barton }
18113cdf8794SKit Barton };
18123cdf8794SKit Barton
18133cdf8794SKit Barton struct LoopFuseLegacy : public FunctionPass {
18143cdf8794SKit Barton
18153cdf8794SKit Barton static char ID;
18163cdf8794SKit Barton
LoopFuseLegacy__anon08ec0c300111::LoopFuseLegacy18173cdf8794SKit Barton LoopFuseLegacy() : FunctionPass(ID) {
18183cdf8794SKit Barton initializeLoopFuseLegacyPass(*PassRegistry::getPassRegistry());
18193cdf8794SKit Barton }
18203cdf8794SKit Barton
getAnalysisUsage__anon08ec0c300111::LoopFuseLegacy18213cdf8794SKit Barton void getAnalysisUsage(AnalysisUsage &AU) const override {
18223cdf8794SKit Barton AU.addRequiredID(LoopSimplifyID);
18233cdf8794SKit Barton AU.addRequired<ScalarEvolutionWrapperPass>();
18243cdf8794SKit Barton AU.addRequired<LoopInfoWrapperPass>();
18253cdf8794SKit Barton AU.addRequired<DominatorTreeWrapperPass>();
18263cdf8794SKit Barton AU.addRequired<PostDominatorTreeWrapperPass>();
18273cdf8794SKit Barton AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
18283cdf8794SKit Barton AU.addRequired<DependenceAnalysisWrapperPass>();
182938a82179SSidharth Baveja AU.addRequired<AssumptionCacheTracker>();
183038a82179SSidharth Baveja AU.addRequired<TargetTransformInfoWrapperPass>();
18313cdf8794SKit Barton
18323cdf8794SKit Barton AU.addPreserved<ScalarEvolutionWrapperPass>();
18333cdf8794SKit Barton AU.addPreserved<LoopInfoWrapperPass>();
18343cdf8794SKit Barton AU.addPreserved<DominatorTreeWrapperPass>();
18353cdf8794SKit Barton AU.addPreserved<PostDominatorTreeWrapperPass>();
18363cdf8794SKit Barton }
18373cdf8794SKit Barton
runOnFunction__anon08ec0c300111::LoopFuseLegacy18383cdf8794SKit Barton bool runOnFunction(Function &F) override {
18393cdf8794SKit Barton if (skipFunction(F))
18403cdf8794SKit Barton return false;
18413cdf8794SKit Barton auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
18423cdf8794SKit Barton auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
18433cdf8794SKit Barton auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
18443cdf8794SKit Barton auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
18453cdf8794SKit Barton auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
18463cdf8794SKit Barton auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
184738a82179SSidharth Baveja auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
184838a82179SSidharth Baveja const TargetTransformInfo &TTI =
184938a82179SSidharth Baveja getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
18508a268becSFangrui Song const DataLayout &DL = F.getParent()->getDataLayout();
185138a82179SSidharth Baveja
185238a82179SSidharth Baveja LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
18533cdf8794SKit Barton return LF.fuseLoops(F);
18543cdf8794SKit Barton }
18553cdf8794SKit Barton };
1856dc5f805dSBenjamin Kramer } // namespace
18573cdf8794SKit Barton
run(Function & F,FunctionAnalysisManager & AM)18583cdf8794SKit Barton PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {
18593cdf8794SKit Barton auto &LI = AM.getResult<LoopAnalysis>(F);
18603cdf8794SKit Barton auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
18613cdf8794SKit Barton auto &DI = AM.getResult<DependenceAnalysis>(F);
18623cdf8794SKit Barton auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
18633cdf8794SKit Barton auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
18643cdf8794SKit Barton auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
186538a82179SSidharth Baveja auto &AC = AM.getResult<AssumptionAnalysis>(F);
186638a82179SSidharth Baveja const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
18678a268becSFangrui Song const DataLayout &DL = F.getParent()->getDataLayout();
186838a82179SSidharth Baveja
186938a82179SSidharth Baveja LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
18703cdf8794SKit Barton bool Changed = LF.fuseLoops(F);
18713cdf8794SKit Barton if (!Changed)
18723cdf8794SKit Barton return PreservedAnalyses::all();
18733cdf8794SKit Barton
18743cdf8794SKit Barton PreservedAnalyses PA;
18753cdf8794SKit Barton PA.preserve<DominatorTreeAnalysis>();
18763cdf8794SKit Barton PA.preserve<PostDominatorTreeAnalysis>();
18773cdf8794SKit Barton PA.preserve<ScalarEvolutionAnalysis>();
18783cdf8794SKit Barton PA.preserve<LoopAnalysis>();
18793cdf8794SKit Barton return PA;
18803cdf8794SKit Barton }
18813cdf8794SKit Barton
18823cdf8794SKit Barton char LoopFuseLegacy::ID = 0;
18833cdf8794SKit Barton
18843cdf8794SKit Barton INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false,
18853cdf8794SKit Barton false)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)18863cdf8794SKit Barton INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
18873cdf8794SKit Barton INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
18883cdf8794SKit Barton INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
18893cdf8794SKit Barton INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
18903cdf8794SKit Barton INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
18913cdf8794SKit Barton INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
189238a82179SSidharth Baveja INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
189338a82179SSidharth Baveja INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
18943cdf8794SKit Barton INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false)
18953cdf8794SKit Barton
18963cdf8794SKit Barton FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); }
1897