1a6fd919cSSam Parker //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
2a6fd919cSSam Parker //
3a6fd919cSSam Parker // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4a6fd919cSSam Parker // See https://llvm.org/LICENSE.txt for license information.
5a6fd919cSSam Parker // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6a6fd919cSSam Parker //
7a6fd919cSSam Parker //===----------------------------------------------------------------------===//
8a6fd919cSSam Parker /// \file
9a6fd919cSSam Parker /// Finalize v8.1-m low-overhead loops by converting the associated pseudo
10a6fd919cSSam Parker /// instructions into machine operations.
11a6fd919cSSam Parker /// The expectation is that the loop contains three pseudo instructions:
12a6fd919cSSam Parker /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
13a6fd919cSSam Parker ///   form should be in the preheader, whereas the while form should be in the
14173de037SSam Parker ///   preheaders only predecessor.
15a6fd919cSSam Parker /// - t2LoopDec - placed within in the loop body.
16a6fd919cSSam Parker /// - t2LoopEnd - the loop latch terminator.
17a6fd919cSSam Parker ///
180efc9e5aSSjoerd Meijer /// In addition to this, we also look for the presence of the VCTP instruction,
190efc9e5aSSjoerd Meijer /// which determines whether we can generated the tail-predicated low-overhead
200efc9e5aSSjoerd Meijer /// loop form.
210efc9e5aSSjoerd Meijer ///
229c91d79dSSam Parker /// Assumptions and Dependencies:
239c91d79dSSam Parker /// Low-overhead loops are constructed and executed using a setup instruction:
249c91d79dSSam Parker /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
259c91d79dSSam Parker /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
269c91d79dSSam Parker /// but fixed polarity: WLS can only branch forwards and LE can only branch
279c91d79dSSam Parker /// backwards. These restrictions mean that this pass is dependent upon block
289c91d79dSSam Parker /// layout and block sizes, which is why it's the last pass to run. The same is
299c91d79dSSam Parker /// true for ConstantIslands, but this pass does not increase the size of the
309c91d79dSSam Parker /// basic blocks, nor does it change the CFG. Instructions are mainly removed
319c91d79dSSam Parker /// during the transform and pseudo instructions are replaced by real ones. In
329c91d79dSSam Parker /// some cases, when we have to revert to a 'normal' loop, we have to introduce
339c91d79dSSam Parker /// multiple instructions for a single pseudo (see RevertWhile and
34fad70c30SDavid Green /// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd
359c91d79dSSam Parker /// are defined to be as large as this maximum sequence of replacement
369c91d79dSSam Parker /// instructions.
379c91d79dSSam Parker ///
38835251f7SPierre-vh /// A note on VPR.P0 (the lane mask):
39835251f7SPierre-vh /// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a
40835251f7SPierre-vh /// "VPT Active" context (which includes low-overhead loops and vpt blocks).
41835251f7SPierre-vh /// They will simply "and" the result of their calculation with the current
42835251f7SPierre-vh /// value of VPR.P0. You can think of it like this:
43835251f7SPierre-vh /// \verbatim
44835251f7SPierre-vh /// if VPT active:    ; Between a DLSTP/LETP, or for predicated instrs
45835251f7SPierre-vh ///   VPR.P0 &= Value
46835251f7SPierre-vh /// else
47835251f7SPierre-vh ///   VPR.P0 = Value
48835251f7SPierre-vh /// \endverbatim
49835251f7SPierre-vh /// When we're inside the low-overhead loop (between DLSTP and LETP), we always
50835251f7SPierre-vh /// fall in the "VPT active" case, so we can consider that all VPR writes by
51835251f7SPierre-vh /// one of those instruction is actually a "and".
52a6fd919cSSam Parker //===----------------------------------------------------------------------===//
53a6fd919cSSam Parker 
54a6fd919cSSam Parker #include "ARM.h"
55a6fd919cSSam Parker #include "ARMBaseInstrInfo.h"
56a6fd919cSSam Parker #include "ARMBaseRegisterInfo.h"
57a6fd919cSSam Parker #include "ARMBasicBlockInfo.h"
58a6fd919cSSam Parker #include "ARMSubtarget.h"
59d9bf6245SDavid Green #include "MVETailPredUtils.h"
60bad6032bSSam Parker #include "Thumb2InstrInfo.h"
6140425183SSam Parker #include "llvm/ADT/SetOperations.h"
62ed98c1b3Sserge-sans-paille #include "llvm/ADT/SetVector.h"
63acbc9aedSSam Parker #include "llvm/ADT/SmallSet.h"
6442350cd8SSam Parker #include "llvm/CodeGen/LivePhysRegs.h"
65989f1c72Sserge-sans-paille #include "llvm/CodeGen/MachineFrameInfo.h"
66a6fd919cSSam Parker #include "llvm/CodeGen/MachineFunctionPass.h"
67a6fd919cSSam Parker #include "llvm/CodeGen/MachineLoopInfo.h"
68d97cf1f8SSjoerd Meijer #include "llvm/CodeGen/MachineLoopUtils.h"
69a6fd919cSSam Parker #include "llvm/CodeGen/MachineRegisterInfo.h"
70cced971fSSam Parker #include "llvm/CodeGen/Passes.h"
71cced971fSSam Parker #include "llvm/CodeGen/ReachingDefAnalysis.h"
728978c12bSSam Parker #include "llvm/MC/MCInstrDesc.h"
73a6fd919cSSam Parker 
74a6fd919cSSam Parker using namespace llvm;
75a6fd919cSSam Parker 
76a6fd919cSSam Parker #define DEBUG_TYPE "arm-low-overhead-loops"
77a6fd919cSSam Parker #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
78a6fd919cSSam Parker 
792fc690acSSjoerd Meijer static cl::opt<bool>
802fc690acSSjoerd Meijer DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden,
812fc690acSSjoerd Meijer     cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"),
822fc690acSSjoerd Meijer     cl::init(false));
832fc690acSSjoerd Meijer 
isVectorPredicated(MachineInstr * MI)84a0c1dcc3SSam Parker static bool isVectorPredicated(MachineInstr *MI) {
85a0c1dcc3SSam Parker   int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
86a0c1dcc3SSam Parker   return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR;
87a0c1dcc3SSam Parker }
88a0c1dcc3SSam Parker 
isVectorPredicate(MachineInstr * MI)89a0c1dcc3SSam Parker static bool isVectorPredicate(MachineInstr *MI) {
90a0c1dcc3SSam Parker   return MI->findRegisterDefOperandIdx(ARM::VPR) != -1;
91a0c1dcc3SSam Parker }
92a0c1dcc3SSam Parker 
hasVPRUse(MachineInstr & MI)93da2e4728SSam Tebbs static bool hasVPRUse(MachineInstr &MI) {
94da2e4728SSam Tebbs   return MI.findRegisterUseOperandIdx(ARM::VPR) != -1;
95a0c1dcc3SSam Parker }
96a0c1dcc3SSam Parker 
isDomainMVE(MachineInstr * MI)97a0c1dcc3SSam Parker static bool isDomainMVE(MachineInstr *MI) {
98a0c1dcc3SSam Parker   uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
99a0c1dcc3SSam Parker   return Domain == ARMII::DomainMVE;
100a0c1dcc3SSam Parker }
101a0c1dcc3SSam Parker 
getVecSize(const MachineInstr & MI)10202cd8a6bSDavid Green static int getVecSize(const MachineInstr &MI) {
10302cd8a6bSDavid Green   const MCInstrDesc &MCID = MI.getDesc();
10402cd8a6bSDavid Green   uint64_t Flags = MCID.TSFlags;
10502cd8a6bSDavid Green   return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift;
10602cd8a6bSDavid Green }
10702cd8a6bSDavid Green 
shouldInspect(MachineInstr & MI)108a0c1dcc3SSam Parker static bool shouldInspect(MachineInstr &MI) {
10902cd8a6bSDavid Green   if (MI.isDebugInstr())
11002cd8a6bSDavid Green     return false;
111da2e4728SSam Tebbs   return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
112a0c1dcc3SSam Parker }
113a0c1dcc3SSam Parker 
114a6fd919cSSam Parker namespace {
115a6fd919cSSam Parker 
11694cacebcSSam Parker   using InstSet = SmallPtrSetImpl<MachineInstr *>;
11794cacebcSSam Parker 
118760b1751SSam Parker   class PostOrderLoopTraversal {
119760b1751SSam Parker     MachineLoop &ML;
120760b1751SSam Parker     MachineLoopInfo &MLI;
121760b1751SSam Parker     SmallPtrSet<MachineBasicBlock*, 4> Visited;
122760b1751SSam Parker     SmallVector<MachineBasicBlock*, 4> Order;
123760b1751SSam Parker 
124760b1751SSam Parker   public:
PostOrderLoopTraversal(MachineLoop & ML,MachineLoopInfo & MLI)125760b1751SSam Parker     PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI)
126760b1751SSam Parker       : ML(ML), MLI(MLI) { }
127760b1751SSam Parker 
getOrder() const128760b1751SSam Parker     const SmallVectorImpl<MachineBasicBlock*> &getOrder() const {
129760b1751SSam Parker       return Order;
130760b1751SSam Parker     }
131760b1751SSam Parker 
132760b1751SSam Parker     // Visit all the blocks within the loop, as well as exit blocks and any
133760b1751SSam Parker     // blocks properly dominating the header.
ProcessLoop()134760b1751SSam Parker     void ProcessLoop() {
135760b1751SSam Parker       std::function<void(MachineBasicBlock*)> Search = [this, &Search]
136760b1751SSam Parker         (MachineBasicBlock *MBB) -> void {
137760b1751SSam Parker         if (Visited.count(MBB))
138760b1751SSam Parker           return;
139760b1751SSam Parker 
140760b1751SSam Parker         Visited.insert(MBB);
141760b1751SSam Parker         for (auto *Succ : MBB->successors()) {
142760b1751SSam Parker           if (!ML.contains(Succ))
143760b1751SSam Parker             continue;
144760b1751SSam Parker           Search(Succ);
145760b1751SSam Parker         }
146760b1751SSam Parker         Order.push_back(MBB);
147760b1751SSam Parker       };
148760b1751SSam Parker 
149760b1751SSam Parker       // Insert exit blocks.
150760b1751SSam Parker       SmallVector<MachineBasicBlock*, 2> ExitBlocks;
151760b1751SSam Parker       ML.getExitBlocks(ExitBlocks);
15205444417SKazu Hirata       append_range(Order, ExitBlocks);
153760b1751SSam Parker 
154760b1751SSam Parker       // Then add the loop body.
155760b1751SSam Parker       Search(ML.getHeader());
156760b1751SSam Parker 
157760b1751SSam Parker       // Then try the preheader and its predecessors.
158760b1751SSam Parker       std::function<void(MachineBasicBlock*)> GetPredecessor =
159760b1751SSam Parker         [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void {
160760b1751SSam Parker         Order.push_back(MBB);
161760b1751SSam Parker         if (MBB->pred_size() == 1)
162760b1751SSam Parker           GetPredecessor(*MBB->pred_begin());
163760b1751SSam Parker       };
164760b1751SSam Parker 
165760b1751SSam Parker       if (auto *Preheader = ML.getLoopPreheader())
166760b1751SSam Parker         GetPredecessor(Preheader);
167543406a6SDavid Green       else if (auto *Preheader = MLI.findLoopPreheader(&ML, true, true))
168760b1751SSam Parker         GetPredecessor(Preheader);
169760b1751SSam Parker     }
170760b1751SSam Parker   };
171760b1751SSam Parker 
17240425183SSam Parker   struct PredicatedMI {
17340425183SSam Parker     MachineInstr *MI = nullptr;
17440425183SSam Parker     SetVector<MachineInstr*> Predicates;
17540425183SSam Parker 
17640425183SSam Parker   public:
PredicatedMI__anon59c8a0920111::PredicatedMI177835251f7SPierre-vh     PredicatedMI(MachineInstr *I, SetVector<MachineInstr *> &Preds) : MI(I) {
178835251f7SPierre-vh       assert(I && "Instruction must not be null!");
179835251f7SPierre-vh       Predicates.insert(Preds.begin(), Preds.end());
180835251f7SPierre-vh     }
18140425183SSam Parker   };
18240425183SSam Parker 
183b4fa884aSSam Parker   // Represent the current state of the VPR and hold all instances which
184b4fa884aSSam Parker   // represent a VPT block, which is a list of instructions that begins with a
185b4fa884aSSam Parker   // VPT/VPST and has a maximum of four proceeding instructions. All
186b4fa884aSSam Parker   // instructions within the block are predicated upon the vpr and we allow
187b4fa884aSSam Parker   // instructions to define the vpr within in the block too.
188b4fa884aSSam Parker   class VPTState {
189b4fa884aSSam Parker     friend struct LowOverheadLoop;
190b4fa884aSSam Parker 
191b4fa884aSSam Parker     SmallVector<MachineInstr *, 4> Insts;
192b4fa884aSSam Parker 
193b4fa884aSSam Parker     static SmallVector<VPTState, 4> Blocks;
194b4fa884aSSam Parker     static SetVector<MachineInstr *> CurrentPredicates;
195b4fa884aSSam Parker     static std::map<MachineInstr *,
196b4fa884aSSam Parker       std::unique_ptr<PredicatedMI>> PredicatedInsts;
197b4fa884aSSam Parker 
CreateVPTBlock(MachineInstr * MI)198b4fa884aSSam Parker     static void CreateVPTBlock(MachineInstr *MI) {
199195c22f2SSam Parker       assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR))
200195c22f2SSam Parker              && "Can't begin VPT without predicate");
201b4fa884aSSam Parker       Blocks.emplace_back(MI);
202b4fa884aSSam Parker       // The execution of MI is predicated upon the current set of instructions
203b4fa884aSSam Parker       // that are AND'ed together to form the VPR predicate value. In the case
204b4fa884aSSam Parker       // that MI is a VPT, CurrentPredicates will also just be MI.
205b4fa884aSSam Parker       PredicatedInsts.emplace(
206b4fa884aSSam Parker         MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
207b4fa884aSSam Parker     }
208b4fa884aSSam Parker 
reset()209b4fa884aSSam Parker     static void reset() {
210b4fa884aSSam Parker       Blocks.clear();
211b4fa884aSSam Parker       PredicatedInsts.clear();
212b4fa884aSSam Parker       CurrentPredicates.clear();
213b4fa884aSSam Parker     }
214b4fa884aSSam Parker 
addInst(MachineInstr * MI)215b4fa884aSSam Parker     static void addInst(MachineInstr *MI) {
216b4fa884aSSam Parker       Blocks.back().insert(MI);
217b4fa884aSSam Parker       PredicatedInsts.emplace(
218b4fa884aSSam Parker         MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
219b4fa884aSSam Parker     }
220b4fa884aSSam Parker 
addPredicate(MachineInstr * MI)221b4fa884aSSam Parker     static void addPredicate(MachineInstr *MI) {
222b4fa884aSSam Parker       LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
223b4fa884aSSam Parker       CurrentPredicates.insert(MI);
224b4fa884aSSam Parker     }
225b4fa884aSSam Parker 
resetPredicate(MachineInstr * MI)226b4fa884aSSam Parker     static void resetPredicate(MachineInstr *MI) {
227b4fa884aSSam Parker       LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
228b4fa884aSSam Parker       CurrentPredicates.clear();
229b4fa884aSSam Parker       CurrentPredicates.insert(MI);
230b4fa884aSSam Parker     }
23140425183SSam Parker 
23240425183SSam Parker   public:
23340425183SSam Parker     // Have we found an instruction within the block which defines the vpr? If
23440425183SSam Parker     // so, not all the instructions in the block will have the same predicate.
hasUniformPredicate(VPTState & Block)235b4fa884aSSam Parker     static bool hasUniformPredicate(VPTState &Block) {
236b4fa884aSSam Parker       return getDivergent(Block) == nullptr;
23740425183SSam Parker     }
23840425183SSam Parker 
239b4fa884aSSam Parker     // If it exists, return the first internal instruction which modifies the
240b4fa884aSSam Parker     // VPR.
getDivergent(VPTState & Block)241b4fa884aSSam Parker     static MachineInstr *getDivergent(VPTState &Block) {
242b4fa884aSSam Parker       SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
243b4fa884aSSam Parker       for (unsigned i = 1; i < Insts.size(); ++i) {
244b4fa884aSSam Parker         MachineInstr *Next = Insts[i];
245b4fa884aSSam Parker         if (isVectorPredicate(Next))
246b4fa884aSSam Parker           return Next; // Found an instruction altering the vpr.
247b4fa884aSSam Parker       }
248b4fa884aSSam Parker       return nullptr;
249835251f7SPierre-vh     }
250835251f7SPierre-vh 
251b4fa884aSSam Parker     // Return whether the given instruction is predicated upon a VCTP.
isPredicatedOnVCTP(MachineInstr * MI,bool Exclusive=false)252b4fa884aSSam Parker     static bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {
253b4fa884aSSam Parker       SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]->Predicates;
254b4fa884aSSam Parker       if (Exclusive && Predicates.size() != 1)
255b4fa884aSSam Parker         return false;
256c2bb9637SKazu Hirata       return llvm::any_of(Predicates, isVCTP);
25740425183SSam Parker     }
25840425183SSam Parker 
259b4fa884aSSam Parker     // Is the VPST, controlling the block entry, predicated upon a VCTP.
isEntryPredicatedOnVCTP(VPTState & Block,bool Exclusive=false)260b4fa884aSSam Parker     static bool isEntryPredicatedOnVCTP(VPTState &Block,
261b4fa884aSSam Parker                                         bool Exclusive = false) {
262b4fa884aSSam Parker       SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
263b4fa884aSSam Parker       return isPredicatedOnVCTP(Insts.front(), Exclusive);
264b4fa884aSSam Parker     }
265b4fa884aSSam Parker 
266a399d188SSam Parker     // If this block begins with a VPT, we can check whether it's using
267a399d188SSam Parker     // at least one predicated input(s), as well as possible loop invariant
268a399d188SSam Parker     // which would result in it being implicitly predicated.
hasImplicitlyValidVPT(VPTState & Block,ReachingDefAnalysis & RDA)269a399d188SSam Parker     static bool hasImplicitlyValidVPT(VPTState &Block,
270a399d188SSam Parker                                       ReachingDefAnalysis &RDA) {
271a399d188SSam Parker       SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
272a399d188SSam Parker       MachineInstr *VPT = Insts.front();
273a399d188SSam Parker       assert(isVPTOpcode(VPT->getOpcode()) &&
274a399d188SSam Parker              "Expected VPT block to begin with VPT/VPST");
275a399d188SSam Parker 
276a399d188SSam Parker       if (VPT->getOpcode() == ARM::MVE_VPST)
277a399d188SSam Parker         return false;
278a399d188SSam Parker 
279a399d188SSam Parker       auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
280a399d188SSam Parker         MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx));
281a399d188SSam Parker         return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);
282a399d188SSam Parker       };
283a399d188SSam Parker 
284a399d188SSam Parker       auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) {
285a399d188SSam Parker         MachineOperand &MO = MI->getOperand(Idx);
286a399d188SSam Parker         if (!MO.isReg() || !MO.getReg())
287a399d188SSam Parker           return true;
288a399d188SSam Parker 
289a399d188SSam Parker         SmallPtrSet<MachineInstr *, 2> Defs;
290a399d188SSam Parker         RDA.getGlobalReachingDefs(MI, MO.getReg(), Defs);
291a399d188SSam Parker         if (Defs.empty())
292a399d188SSam Parker           return true;
293a399d188SSam Parker 
294a399d188SSam Parker         for (auto *Def : Defs)
295a399d188SSam Parker           if (Def->getParent() == VPT->getParent())
296a399d188SSam Parker             return false;
297a399d188SSam Parker         return true;
298a399d188SSam Parker       };
299a399d188SSam Parker 
300a399d188SSam Parker       // Check that at least one of the operands is directly predicated on a
301a399d188SSam Parker       // vctp and allow an invariant value too.
302a399d188SSam Parker       return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) &&
303a399d188SSam Parker              (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) &&
304a399d188SSam Parker              (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2));
305a399d188SSam Parker     }
306a399d188SSam Parker 
isValid(ReachingDefAnalysis & RDA)307a399d188SSam Parker     static bool isValid(ReachingDefAnalysis &RDA) {
308b4fa884aSSam Parker       // All predication within the loop should be based on vctp. If the block
309b4fa884aSSam Parker       // isn't predicated on entry, check whether the vctp is within the block
310b4fa884aSSam Parker       // and that all other instructions are then predicated on it.
311b4fa884aSSam Parker       for (auto &Block : Blocks) {
312a399d188SSam Parker         if (isEntryPredicatedOnVCTP(Block, false) ||
313a399d188SSam Parker             hasImplicitlyValidVPT(Block, RDA))
314b4fa884aSSam Parker           continue;
315b4fa884aSSam Parker 
316b4fa884aSSam Parker         SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
3171de3e7fdSDavid Green         // We don't know how to convert a block with just a VPT;VCTP into
3181de3e7fdSDavid Green         // anything valid once we remove the VCTP. For now just bail out.
3191de3e7fdSDavid Green         assert(isVPTOpcode(Insts.front()->getOpcode()) &&
3201de3e7fdSDavid Green                "Expected VPT block to start with a VPST or VPT!");
3211de3e7fdSDavid Green         if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST &&
3221de3e7fdSDavid Green             isVCTP(Insts.back()))
3231de3e7fdSDavid Green           return false;
3241de3e7fdSDavid Green 
325b4fa884aSSam Parker         for (auto *MI : Insts) {
326b4fa884aSSam Parker           // Check that any internal VCTPs are 'Then' predicated.
327b4fa884aSSam Parker           if (isVCTP(MI) && getVPTInstrPredicate(*MI) != ARMVCC::Then)
328b4fa884aSSam Parker             return false;
329b4fa884aSSam Parker           // Skip other instructions that build up the predicate.
330b4fa884aSSam Parker           if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))
331b4fa884aSSam Parker             continue;
332b4fa884aSSam Parker           // Check that any other instructions are predicated upon a vctp.
333b4fa884aSSam Parker           // TODO: We could infer when VPTs are implicitly predicated on the
334b4fa884aSSam Parker           // vctp (when the operands are predicated).
335b4fa884aSSam Parker           if (!isPredicatedOnVCTP(MI)) {
336b4fa884aSSam Parker             LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
337b4fa884aSSam Parker             return false;
338b4fa884aSSam Parker           }
339b4fa884aSSam Parker         }
340b4fa884aSSam Parker       }
341b4fa884aSSam Parker       return true;
342b4fa884aSSam Parker     }
343b4fa884aSSam Parker 
VPTState(MachineInstr * MI)344b4fa884aSSam Parker     VPTState(MachineInstr *MI) { Insts.push_back(MI); }
345b4fa884aSSam Parker 
insert(MachineInstr * MI)346b4fa884aSSam Parker     void insert(MachineInstr *MI) {
347b4fa884aSSam Parker       Insts.push_back(MI);
348b4fa884aSSam Parker       // VPT/VPST + 4 predicated instructions.
349b4fa884aSSam Parker       assert(Insts.size() <= 5 && "Too many instructions in VPT block!");
350b4fa884aSSam Parker     }
351b4fa884aSSam Parker 
containsVCTP() const352b4fa884aSSam Parker     bool containsVCTP() const {
353c2bb9637SKazu Hirata       return llvm::any_of(Insts, isVCTP);
35440425183SSam Parker     }
35540425183SSam Parker 
size() const35640425183SSam Parker     unsigned size() const { return Insts.size(); }
getInsts()357b4fa884aSSam Parker     SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; }
35840425183SSam Parker   };
35940425183SSam Parker 
3608978c12bSSam Parker   struct LowOverheadLoop {
3618978c12bSSam Parker 
362fd01b2f4SSam Parker     MachineLoop &ML;
3633ee580d0SSam Parker     MachineBasicBlock *Preheader = nullptr;
364fd01b2f4SSam Parker     MachineLoopInfo &MLI;
365fd01b2f4SSam Parker     ReachingDefAnalysis &RDA;
366de3e65e6SSam Parker     const TargetRegisterInfo &TRI;
3673ee580d0SSam Parker     const ARMBaseInstrInfo &TII;
3688978c12bSSam Parker     MachineFunction *MF = nullptr;
369dfa2c14bSSam Parker     MachineBasicBlock::iterator StartInsertPt;
370dfa2c14bSSam Parker     MachineBasicBlock *StartInsertBB = nullptr;
3718978c12bSSam Parker     MachineInstr *Start = nullptr;
3728978c12bSSam Parker     MachineInstr *Dec = nullptr;
3738978c12bSSam Parker     MachineInstr *End = nullptr;
37431f02ac6SSam Tebbs     MachineOperand TPNumElements;
375b4fa884aSSam Parker     SmallVector<MachineInstr *, 4> VCTPs;
37642350cd8SSam Parker     SmallPtrSet<MachineInstr *, 4> ToRemove;
37724bf8063SPierre-vh     SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute;
37802cd8a6bSDavid Green     SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs;
37973346f58SDavid Green     SmallPtrSet<MachineInstr *, 4> VMOVCopies;
3808978c12bSSam Parker     bool Revert = false;
3818978c12bSSam Parker     bool CannotTailPredicate = false;
3828978c12bSSam Parker 
LowOverheadLoop__anon59c8a0920111::LowOverheadLoop383fd01b2f4SSam Parker     LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
3843ee580d0SSam Parker                     ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI,
3853ee580d0SSam Parker                     const ARMBaseInstrInfo &TII)
38631f02ac6SSam Tebbs         : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII),
38731f02ac6SSam Tebbs           TPNumElements(MachineOperand::CreateImm(0)) {
388fd01b2f4SSam Parker       MF = ML.getHeader()->getParent();
3893ee580d0SSam Parker       if (auto *MBB = ML.getLoopPreheader())
3903ee580d0SSam Parker         Preheader = MBB;
391543406a6SDavid Green       else if (auto *MBB = MLI.findLoopPreheader(&ML, true, true))
3923ee580d0SSam Parker         Preheader = MBB;
393b4fa884aSSam Parker       VPTState::reset();
3948978c12bSSam Parker     }
3958978c12bSSam Parker 
3968978c12bSSam Parker     // If this is an MVE instruction, check that we know how to use tail
397e27632c3SSam Parker     // predication with it. Record VPT blocks and return whether the
398e27632c3SSam Parker     // instruction is valid for tail predication.
399e27632c3SSam Parker     bool ValidateMVEInst(MachineInstr *MI);
400e27632c3SSam Parker 
AnalyseMVEInst__anon59c8a0920111::LowOverheadLoop4018f6a6763SSam Parker     void AnalyseMVEInst(MachineInstr *MI) {
402e27632c3SSam Parker       CannotTailPredicate = !ValidateMVEInst(MI);
4038978c12bSSam Parker     }
4048978c12bSSam Parker 
IsTailPredicationLegal__anon59c8a0920111::LowOverheadLoop4058978c12bSSam Parker     bool IsTailPredicationLegal() const {
4068978c12bSSam Parker       // For now, let's keep things really simple and only support a single
4078978c12bSSam Parker       // block for tail predication.
408b4fa884aSSam Parker       return !Revert && FoundAllComponents() && !VCTPs.empty() &&
409fd01b2f4SSam Parker              !CannotTailPredicate && ML.getNumBlocks() == 1;
4108978c12bSSam Parker     }
4118978c12bSSam Parker 
412b4fa884aSSam Parker     // Given that MI is a VCTP, check that is equivalent to any other VCTPs
413b4fa884aSSam Parker     // found.
414b4fa884aSSam Parker     bool AddVCTP(MachineInstr *MI);
415b4fa884aSSam Parker 
416de3e65e6SSam Parker     // Check that the predication in the loop will be equivalent once we
417de3e65e6SSam Parker     // perform the conversion. Also ensure that we can provide the number
418de3e65e6SSam Parker     // of elements to the loop start instruction.
419dfa2c14bSSam Parker     bool ValidateTailPredicate();
4208f6a6763SSam Parker 
421de3e65e6SSam Parker     // Check that any values available outside of the loop will be the same
422de3e65e6SSam Parker     // after tail predication conversion.
4233ee580d0SSam Parker     bool ValidateLiveOuts();
424de3e65e6SSam Parker 
4258978c12bSSam Parker     // Is it safe to define LR with DLS/WLS?
4268978c12bSSam Parker     // LR can be defined if it is the operand to start, because it's the same
4278978c12bSSam Parker     // value, or if it's going to be equivalent to the operand to Start.
428ac30ea2fSSam Parker     MachineInstr *isSafeToDefineLR();
4298978c12bSSam Parker 
430cced971fSSam Parker     // Check the branch targets are within range and we satisfy our
431cced971fSSam Parker     // restrictions.
432e82a0084SSam Parker     void Validate(ARMBasicBlockUtils *BBUtils);
4338978c12bSSam Parker 
FoundAllComponents__anon59c8a0920111::LowOverheadLoop4348978c12bSSam Parker     bool FoundAllComponents() const {
4358978c12bSSam Parker       return Start && Dec && End;
4368978c12bSSam Parker     }
4378978c12bSSam Parker 
getVPTBlocks__anon59c8a0920111::LowOverheadLoop438b4fa884aSSam Parker     SmallVectorImpl<VPTState> &getVPTBlocks() {
439b4fa884aSSam Parker       return VPTState::Blocks;
440b4fa884aSSam Parker     }
44140425183SSam Parker 
44231f02ac6SSam Tebbs     // Return the operand for the loop start instruction. This will be the loop
44331f02ac6SSam Tebbs     // iteration count, or the number of elements if we're tail predicating.
getLoopStartOperand__anon59c8a0920111::LowOverheadLoop44431f02ac6SSam Tebbs     MachineOperand &getLoopStartOperand() {
445b2ac9681SDavid Green       if (IsTailPredicationLegal())
446b2ac9681SDavid Green         return TPNumElements;
447fad70c30SDavid Green       return Start->getOperand(1);
44828166816SSam Parker     }
44928166816SSam Parker 
getStartOpcode__anon59c8a0920111::LowOverheadLoop45028166816SSam Parker     unsigned getStartOpcode() const {
451bee2f618SDavid Green       bool IsDo = isDoLoopStart(*Start);
45228166816SSam Parker       if (!IsTailPredicationLegal())
45328166816SSam Parker         return IsDo ? ARM::t2DLS : ARM::t2WLS;
45428166816SSam Parker 
455b4fa884aSSam Parker       return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo);
45628166816SSam Parker     }
45728166816SSam Parker 
dump__anon59c8a0920111::LowOverheadLoop4588978c12bSSam Parker     void dump() const {
4598978c12bSSam Parker       if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
4608978c12bSSam Parker       if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
4618978c12bSSam Parker       if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
462b4fa884aSSam Parker       if (!VCTPs.empty()) {
463b4fa884aSSam Parker         dbgs() << "ARM Loops: Found VCTP(s):\n";
464b4fa884aSSam Parker         for (auto *MI : VCTPs)
465b4fa884aSSam Parker           dbgs() << " - " << *MI;
466b4fa884aSSam Parker       }
4678978c12bSSam Parker       if (!FoundAllComponents())
4688978c12bSSam Parker         dbgs() << "ARM Loops: Not a low-overhead loop.\n";
4698978c12bSSam Parker       else if (!(Start && Dec && End))
4708978c12bSSam Parker         dbgs() << "ARM Loops: Failed to find all loop components.\n";
4718978c12bSSam Parker     }
4728978c12bSSam Parker   };
4738978c12bSSam Parker 
474a6fd919cSSam Parker   class ARMLowOverheadLoops : public MachineFunctionPass {
47536c92227SSam Parker     MachineFunction           *MF = nullptr;
47628166816SSam Parker     MachineLoopInfo           *MLI = nullptr;
477cced971fSSam Parker     ReachingDefAnalysis       *RDA = nullptr;
478a6fd919cSSam Parker     const ARMBaseInstrInfo    *TII = nullptr;
479a6fd919cSSam Parker     MachineRegisterInfo       *MRI = nullptr;
480d97cf1f8SSjoerd Meijer     const TargetRegisterInfo  *TRI = nullptr;
481a6fd919cSSam Parker     std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
482a6fd919cSSam Parker 
483a6fd919cSSam Parker   public:
484a6fd919cSSam Parker     static char ID;
485a6fd919cSSam Parker 
ARMLowOverheadLoops()486a6fd919cSSam Parker     ARMLowOverheadLoops() : MachineFunctionPass(ID) { }
487a6fd919cSSam Parker 
getAnalysisUsage(AnalysisUsage & AU) const488a6fd919cSSam Parker     void getAnalysisUsage(AnalysisUsage &AU) const override {
489a6fd919cSSam Parker       AU.setPreservesCFG();
490a6fd919cSSam Parker       AU.addRequired<MachineLoopInfo>();
491cced971fSSam Parker       AU.addRequired<ReachingDefAnalysis>();
492a6fd919cSSam Parker       MachineFunctionPass::getAnalysisUsage(AU);
493a6fd919cSSam Parker     }
494a6fd919cSSam Parker 
495a6fd919cSSam Parker     bool runOnMachineFunction(MachineFunction &MF) override;
496a6fd919cSSam Parker 
getRequiredProperties() const497a6fd919cSSam Parker     MachineFunctionProperties getRequiredProperties() const override {
498a6fd919cSSam Parker       return MachineFunctionProperties().set(
499cced971fSSam Parker           MachineFunctionProperties::Property::NoVRegs).set(
500cced971fSSam Parker           MachineFunctionProperties::Property::TracksLiveness);
501a6fd919cSSam Parker     }
502a6fd919cSSam Parker 
getPassName() const503a6fd919cSSam Parker     StringRef getPassName() const override {
504a6fd919cSSam Parker       return ARM_LOW_OVERHEAD_LOOPS_NAME;
505a6fd919cSSam Parker     }
50636c92227SSam Parker 
50736c92227SSam Parker   private:
50836c92227SSam Parker     bool ProcessLoop(MachineLoop *ML);
50936c92227SSam Parker 
51036c92227SSam Parker     bool RevertNonLoops();
51136c92227SSam Parker 
51236c92227SSam Parker     void RevertWhile(MachineInstr *MI) const;
513b2ac9681SDavid Green     void RevertDo(MachineInstr *MI) const;
51436c92227SSam Parker 
515ac30ea2fSSam Parker     bool RevertLoopDec(MachineInstr *MI) const;
51636c92227SSam Parker 
5174ba6d0deSSam Parker     void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
51836c92227SSam Parker 
5190447f350SDavid Green     void RevertLoopEndDec(MachineInstr *MI) const;
5200447f350SDavid Green 
52140425183SSam Parker     void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
5228978c12bSSam Parker 
5238978c12bSSam Parker     MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
5248978c12bSSam Parker 
5258978c12bSSam Parker     void Expand(LowOverheadLoop &LoLoop);
52636c92227SSam Parker 
52701022af5SSjoerd Meijer     void IterationCountDCE(LowOverheadLoop &LoLoop);
528a6fd919cSSam Parker   };
529a6fd919cSSam Parker }
530a6fd919cSSam Parker 
531a6fd919cSSam Parker char ARMLowOverheadLoops::ID = 0;
532a6fd919cSSam Parker 
533b4fa884aSSam Parker SmallVector<VPTState, 4> VPTState::Blocks;
534b4fa884aSSam Parker SetVector<MachineInstr *> VPTState::CurrentPredicates;
535b4fa884aSSam Parker std::map<MachineInstr *,
536b4fa884aSSam Parker          std::unique_ptr<PredicatedMI>> VPTState::PredicatedInsts;
537b4fa884aSSam Parker 
INITIALIZE_PASS(ARMLowOverheadLoops,DEBUG_TYPE,ARM_LOW_OVERHEAD_LOOPS_NAME,false,false)538a6fd919cSSam Parker INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
539a6fd919cSSam Parker                 false, false)
540a6fd919cSSam Parker 
541779a8a02SSam Parker static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA,
542779a8a02SSam Parker                       InstSet &ToRemove, InstSet &Ignore) {
543779a8a02SSam Parker 
544779a8a02SSam Parker   // Check that we can remove all of Killed without having to modify any IT
545779a8a02SSam Parker   // blocks.
546779a8a02SSam Parker   auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) {
547779a8a02SSam Parker     // Collect the dead code and the MBBs in which they reside.
548779a8a02SSam Parker     SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks;
549779a8a02SSam Parker     for (auto *Dead : Killed)
550779a8a02SSam Parker       BasicBlocks.insert(Dead->getParent());
551779a8a02SSam Parker 
552779a8a02SSam Parker     // Collect IT blocks in all affected basic blocks.
553779a8a02SSam Parker     std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;
554779a8a02SSam Parker     for (auto *MBB : BasicBlocks) {
555779a8a02SSam Parker       for (auto &IT : *MBB) {
556779a8a02SSam Parker         if (IT.getOpcode() != ARM::t2IT)
557779a8a02SSam Parker           continue;
558e24537d4SMircea Trofin         RDA.getReachingLocalUses(&IT, MCRegister::from(ARM::ITSTATE),
559e24537d4SMircea Trofin                                  ITBlocks[&IT]);
560779a8a02SSam Parker       }
561779a8a02SSam Parker     }
562779a8a02SSam Parker 
563779a8a02SSam Parker     // If we're removing all of the instructions within an IT block, then
564779a8a02SSam Parker     // also remove the IT instruction.
565779a8a02SSam Parker     SmallPtrSet<MachineInstr *, 2> ModifiedITs;
566779a8a02SSam Parker     SmallPtrSet<MachineInstr *, 2> RemoveITs;
567779a8a02SSam Parker     for (auto *Dead : Killed) {
568779a8a02SSam Parker       if (MachineOperand *MO = Dead->findRegisterUseOperand(ARM::ITSTATE)) {
569779a8a02SSam Parker         MachineInstr *IT = RDA.getMIOperand(Dead, *MO);
570779a8a02SSam Parker         RemoveITs.insert(IT);
571779a8a02SSam Parker         auto &CurrentBlock = ITBlocks[IT];
572779a8a02SSam Parker         CurrentBlock.erase(Dead);
573779a8a02SSam Parker         if (CurrentBlock.empty())
574779a8a02SSam Parker           ModifiedITs.erase(IT);
575779a8a02SSam Parker         else
576779a8a02SSam Parker           ModifiedITs.insert(IT);
577779a8a02SSam Parker       }
578779a8a02SSam Parker     }
579779a8a02SSam Parker     if (!ModifiedITs.empty())
580779a8a02SSam Parker       return false;
581779a8a02SSam Parker     Killed.insert(RemoveITs.begin(), RemoveITs.end());
582779a8a02SSam Parker     return true;
583779a8a02SSam Parker   };
584779a8a02SSam Parker 
585779a8a02SSam Parker   SmallPtrSet<MachineInstr *, 2> Uses;
586779a8a02SSam Parker   if (!RDA.isSafeToRemove(MI, Uses, Ignore))
587779a8a02SSam Parker     return false;
588779a8a02SSam Parker 
589779a8a02SSam Parker   if (WontCorruptITs(Uses, RDA)) {
590779a8a02SSam Parker     ToRemove.insert(Uses.begin(), Uses.end());
591779a8a02SSam Parker     LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI
592779a8a02SSam Parker                << " - can also remove:\n";
593779a8a02SSam Parker                for (auto *Use : Uses)
594779a8a02SSam Parker                  dbgs() << "   - " << *Use);
595779a8a02SSam Parker 
596779a8a02SSam Parker     SmallPtrSet<MachineInstr*, 4> Killed;
597779a8a02SSam Parker     RDA.collectKilledOperands(MI, Killed);
598779a8a02SSam Parker     if (WontCorruptITs(Killed, RDA)) {
599779a8a02SSam Parker       ToRemove.insert(Killed.begin(), Killed.end());
600779a8a02SSam Parker       LLVM_DEBUG(for (auto *Dead : Killed)
601779a8a02SSam Parker                    dbgs() << "   - " << *Dead);
602779a8a02SSam Parker     }
603779a8a02SSam Parker     return true;
604779a8a02SSam Parker   }
605779a8a02SSam Parker   return false;
606779a8a02SSam Parker }
607779a8a02SSam Parker 
ValidateTailPredicate()608dfa2c14bSSam Parker bool LowOverheadLoop::ValidateTailPredicate() {
609e82a0084SSam Parker   if (!IsTailPredicationLegal()) {
610e82a0084SSam Parker     LLVM_DEBUG(if (VCTPs.empty())
611e82a0084SSam Parker                  dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
612e82a0084SSam Parker                dbgs() << "ARM Loops: Tail-predication is not valid.\n");
613e82a0084SSam Parker     return false;
61442350cd8SSam Parker   }
61542350cd8SSam Parker 
616b4fa884aSSam Parker   assert(!VCTPs.empty() && "VCTP instruction expected but is not set");
617e82a0084SSam Parker   assert(ML.getBlocks().size() == 1 &&
618e82a0084SSam Parker          "Shouldn't be processing a loop with more than one block");
619b4fa884aSSam Parker 
6202fc690acSSjoerd Meijer   if (DisableTailPredication) {
6212fc690acSSjoerd Meijer     LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n");
6222fc690acSSjoerd Meijer     return false;
6232fc690acSSjoerd Meijer   }
6242fc690acSSjoerd Meijer 
6257e02bc81SSam Parker   if (!VPTState::isValid(RDA)) {
6267e02bc81SSam Parker     LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n");
6278f6a6763SSam Parker     return false;
6287e02bc81SSam Parker   }
6298f6a6763SSam Parker 
630b30adfb5SSam Parker   if (!ValidateLiveOuts()) {
631b30adfb5SSam Parker     LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
632de3e65e6SSam Parker     return false;
633b30adfb5SSam Parker   }
634de3e65e6SSam Parker 
6358f6a6763SSam Parker   // For tail predication, we need to provide the number of elements, instead
6368f6a6763SSam Parker   // of the iteration count, to the loop start instruction. The number of
6378f6a6763SSam Parker   // elements is provided to the vctp instruction, so we need to check that
6388f6a6763SSam Parker   // we can use this register at InsertPt.
639b4fa884aSSam Parker   MachineInstr *VCTP = VCTPs.back();
640bee2f618SDavid Green   if (Start->getOpcode() == ARM::t2DoLoopStartTP ||
641bee2f618SDavid Green       Start->getOpcode() == ARM::t2WhileLoopStartTP) {
64208d1c2d4SDavid Green     TPNumElements = Start->getOperand(2);
64308d1c2d4SDavid Green     StartInsertPt = Start;
64408d1c2d4SDavid Green     StartInsertBB = Start->getParent();
64508d1c2d4SDavid Green   } else {
64631f02ac6SSam Tebbs     TPNumElements = VCTP->getOperand(1);
647e24537d4SMircea Trofin     MCRegister NumElements = TPNumElements.getReg().asMCReg();
6488f6a6763SSam Parker 
6498f6a6763SSam Parker     // If the register is defined within loop, then we can't perform TP.
6508f6a6763SSam Parker     // TODO: Check whether this is just a mov of a register that would be
6518f6a6763SSam Parker     // available.
652fd01b2f4SSam Parker     if (RDA.hasLocalDefBefore(VCTP, NumElements)) {
6538f6a6763SSam Parker       LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
6548f6a6763SSam Parker       return false;
6558f6a6763SSam Parker     }
6568f6a6763SSam Parker 
6576dcbc323SDavid Green     // The element count register maybe defined after InsertPt, in which case we
6586dcbc323SDavid Green     // need to try to move either InsertPt or the def so that the [w|d]lstp can
6596dcbc323SDavid Green     // use the value.
6606dcbc323SDavid Green 
6616dcbc323SDavid Green     if (StartInsertPt != StartInsertBB->end() &&
6626dcbc323SDavid Green         !RDA.isReachingDefLiveOut(&*StartInsertPt, NumElements)) {
66308d1c2d4SDavid Green       if (auto *ElemDef =
66408d1c2d4SDavid Green               RDA.getLocalLiveOutMIDef(StartInsertBB, NumElements)) {
6656dcbc323SDavid Green         if (RDA.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) {
6666dcbc323SDavid Green           ElemDef->removeFromParent();
6676dcbc323SDavid Green           StartInsertBB->insert(StartInsertPt, ElemDef);
66808d1c2d4SDavid Green           LLVM_DEBUG(dbgs()
66908d1c2d4SDavid Green                      << "ARM Loops: Moved element count def: " << *ElemDef);
6706dcbc323SDavid Green         } else if (RDA.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) {
6716dcbc323SDavid Green           StartInsertPt->removeFromParent();
6726dcbc323SDavid Green           StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef),
6736dcbc323SDavid Green                                      &*StartInsertPt);
6746dcbc323SDavid Green           LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
6756dcbc323SDavid Green         } else {
6766dcbc323SDavid Green           // If we fail to move an instruction and the element count is provided
6776dcbc323SDavid Green           // by a mov, use the mov operand if it will have the same value at the
6786dcbc323SDavid Green           // insertion point
6796dcbc323SDavid Green           MachineOperand Operand = ElemDef->getOperand(1);
6806dcbc323SDavid Green           if (isMovRegOpcode(ElemDef->getOpcode()) &&
681e24537d4SMircea Trofin               RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg().asMCReg()) ==
682e24537d4SMircea Trofin                   RDA.getUniqueReachingMIDef(&*StartInsertPt,
683e24537d4SMircea Trofin                                              Operand.getReg().asMCReg())) {
6846dcbc323SDavid Green             TPNumElements = Operand;
6856dcbc323SDavid Green             NumElements = TPNumElements.getReg();
6866dcbc323SDavid Green           } else {
6876dcbc323SDavid Green             LLVM_DEBUG(dbgs()
6886dcbc323SDavid Green                        << "ARM Loops: Unable to move element count to loop "
6896dcbc323SDavid Green                        << "start instruction.\n");
6906dcbc323SDavid Green             return false;
6916dcbc323SDavid Green           }
6926dcbc323SDavid Green         }
6936dcbc323SDavid Green       }
6946dcbc323SDavid Green     }
6956dcbc323SDavid Green 
6968f6a6763SSam Parker     // Especially in the case of while loops, InsertBB may not be the
6978f6a6763SSam Parker     // preheader, so we need to check that the register isn't redefined
6988f6a6763SSam Parker     // before entering the loop.
699ddbc0778SSam Parker     auto CannotProvideElements = [this](MachineBasicBlock *MBB,
700e24537d4SMircea Trofin                                         MCRegister NumElements) {
701cb27006aSDavid Green       if (MBB->empty())
702cb27006aSDavid Green         return false;
7038f6a6763SSam Parker       // NumElements is redefined in this block.
704fd01b2f4SSam Parker       if (RDA.hasLocalDefBefore(&MBB->back(), NumElements))
7058f6a6763SSam Parker         return true;
7068f6a6763SSam Parker 
7078f6a6763SSam Parker       // Don't continue searching up through multiple predecessors.
7088f6a6763SSam Parker       if (MBB->pred_size() > 1)
7098f6a6763SSam Parker         return true;
7108f6a6763SSam Parker 
7118f6a6763SSam Parker       return false;
7128f6a6763SSam Parker     };
7138f6a6763SSam Parker 
714e82a0084SSam Parker     // Search backwards for a def, until we get to InsertBB.
7153ee580d0SSam Parker     MachineBasicBlock *MBB = Preheader;
716dfa2c14bSSam Parker     while (MBB && MBB != StartInsertBB) {
717bad6032bSSam Parker       if (CannotProvideElements(MBB, NumElements)) {
718bad6032bSSam Parker         LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");
7198f6a6763SSam Parker         return false;
720bad6032bSSam Parker       }
7218f6a6763SSam Parker       MBB = *MBB->pred_begin();
7228f6a6763SSam Parker     }
72308d1c2d4SDavid Green   }
72408d1c2d4SDavid Green 
72508d1c2d4SDavid Green   // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect
72608d1c2d4SDavid Green   // world the [w|d]lstp instruction would be last instruction in the preheader
72708d1c2d4SDavid Green   // and so it would only affect instructions within the loop body. But due to
72808d1c2d4SDavid Green   // scheduling, and/or the logic in this pass (above), the insertion point can
72908d1c2d4SDavid Green   // be moved earlier. So if the Loop Start isn't the last instruction in the
73008d1c2d4SDavid Green   // preheader, and if the initial element count is smaller than the vector
73108d1c2d4SDavid Green   // width, the Loop Start instruction will immediately generate one or more
73208d1c2d4SDavid Green   // false lane mask which can, incorrectly, affect the proceeding MVE
73308d1c2d4SDavid Green   // instructions in the preheader.
734898a81dfSSam Parker   if (std::any_of(StartInsertPt, StartInsertBB->end(), shouldInspect)) {
735898a81dfSSam Parker     LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n");
73608d1c2d4SDavid Green     return false;
737898a81dfSSam Parker   }
7388f6a6763SSam Parker 
73902cd8a6bSDavid Green   // For any DoubleWidthResultInstrs we found whilst scanning instructions, they
74002cd8a6bSDavid Green   // need to compute an output size that is smaller than the VCTP mask operates
74102cd8a6bSDavid Green   // on. The VecSize of the DoubleWidthResult is the larger vector size - the
74202cd8a6bSDavid Green   // size it extends into, so any VCTP VecSize <= is valid.
74302cd8a6bSDavid Green   unsigned VCTPVecSize = getVecSize(*VCTP);
74402cd8a6bSDavid Green   for (MachineInstr *MI : DoubleWidthResultInstrs) {
74502cd8a6bSDavid Green     unsigned InstrVecSize = getVecSize(*MI);
74602cd8a6bSDavid Green     if (InstrVecSize > VCTPVecSize) {
74702cd8a6bSDavid Green       LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP "
74802cd8a6bSDavid Green                         << "VecSize:\n" << *MI);
74902cd8a6bSDavid Green       return false;
75002cd8a6bSDavid Green     }
75102cd8a6bSDavid Green   }
75202cd8a6bSDavid Green 
75342350cd8SSam Parker   // Check that the value change of the element count is what we expect and
75442350cd8SSam Parker   // that the predication will be equivalent. For this we need:
75542350cd8SSam Parker   // NumElements = NumElements - VectorWidth. The sub will be a sub immediate
75642350cd8SSam Parker   // and we can also allow register copies within the chain too.
757892af45cSDavid Green   auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) {
758892af45cSDavid Green     return -getAddSubImmediate(*MI) == ExpectedVecWidth;
75942350cd8SSam Parker   };
76042350cd8SSam Parker 
76108d1c2d4SDavid Green   MachineBasicBlock *MBB = VCTP->getParent();
7627aabb6adSSam Tebbs   // Remove modifications to the element count since they have no purpose in a
7637aabb6adSSam Tebbs   // tail predicated loop. Explicitly refer to the vctp operand no matter which
7647aabb6adSSam Tebbs   // register NumElements has been assigned to, since that is what the
7657aabb6adSSam Tebbs   // modifications will be using
766e24537d4SMircea Trofin   if (auto *Def = RDA.getUniqueReachingMIDef(
767e24537d4SMircea Trofin           &MBB->back(), VCTP->getOperand(1).getReg().asMCReg())) {
76842350cd8SSam Parker     SmallPtrSet<MachineInstr*, 2> ElementChain;
769b4fa884aSSam Parker     SmallPtrSet<MachineInstr*, 2> Ignore;
77042350cd8SSam Parker     unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());
77142350cd8SSam Parker 
772b4fa884aSSam Parker     Ignore.insert(VCTPs.begin(), VCTPs.end());
773835251f7SPierre-vh 
774779a8a02SSam Parker     if (TryRemove(Def, RDA, ElementChain, Ignore)) {
77542350cd8SSam Parker       bool FoundSub = false;
77642350cd8SSam Parker 
77742350cd8SSam Parker       for (auto *MI : ElementChain) {
77842350cd8SSam Parker         if (isMovRegOpcode(MI->getOpcode()))
77942350cd8SSam Parker           continue;
78042350cd8SSam Parker 
78142350cd8SSam Parker         if (isSubImmOpcode(MI->getOpcode())) {
7827e02bc81SSam Parker           if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) {
7837e02bc81SSam Parker             LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
7847e02bc81SSam Parker                        " count: " << *MI);
78542350cd8SSam Parker             return false;
7867e02bc81SSam Parker           }
78742350cd8SSam Parker           FoundSub = true;
7887e02bc81SSam Parker         } else {
7897e02bc81SSam Parker           LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
7907e02bc81SSam Parker                      " count: " << *MI);
79142350cd8SSam Parker           return false;
79242350cd8SSam Parker         }
7937e02bc81SSam Parker       }
79442350cd8SSam Parker       ToRemove.insert(ElementChain.begin(), ElementChain.end());
79542350cd8SSam Parker     }
79642350cd8SSam Parker   }
7977786ac83SDavid Green 
798bee2f618SDavid Green   // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we
799bee2f618SDavid Green   // can also remove any extra instructions in the preheader, which often
800bee2f618SDavid Green   // includes a now unused MOV.
801bee2f618SDavid Green   if ((Start->getOpcode() == ARM::t2DoLoopStartTP ||
802bee2f618SDavid Green        Start->getOpcode() == ARM::t2WhileLoopStartTP) &&
803bee2f618SDavid Green       Preheader && !Preheader->empty() &&
8047786ac83SDavid Green       !RDA.hasLocalDefBefore(VCTP, VCTP->getOperand(1).getReg())) {
8057786ac83SDavid Green     if (auto *Def = RDA.getUniqueReachingMIDef(
8067786ac83SDavid Green             &Preheader->back(), VCTP->getOperand(1).getReg().asMCReg())) {
8077786ac83SDavid Green       SmallPtrSet<MachineInstr*, 2> Ignore;
8087786ac83SDavid Green       Ignore.insert(VCTPs.begin(), VCTPs.end());
8097786ac83SDavid Green       TryRemove(Def, RDA, ToRemove, Ignore);
8107786ac83SDavid Green     }
8117786ac83SDavid Green   }
8127786ac83SDavid Green 
8138f6a6763SSam Parker   return true;
8148f6a6763SSam Parker }
8158f6a6763SSam Parker 
isRegInClass(const MachineOperand & MO,const TargetRegisterClass * Class)816ff9ac33eSSam Parker static bool isRegInClass(const MachineOperand &MO,
817ff9ac33eSSam Parker                          const TargetRegisterClass *Class) {
818ff9ac33eSSam Parker   return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());
819ff9ac33eSSam Parker }
820ff9ac33eSSam Parker 
82194cacebcSSam Parker // MVE 'narrowing' operate on half a lane, reading from half and writing
82294cacebcSSam Parker // to half, which are referred to has the top and bottom half. The other
82394cacebcSSam Parker // half retains its previous value.
retainsPreviousHalfElement(const MachineInstr & MI)82494cacebcSSam Parker static bool retainsPreviousHalfElement(const MachineInstr &MI) {
82594cacebcSSam Parker   const MCInstrDesc &MCID = MI.getDesc();
82694cacebcSSam Parker   uint64_t Flags = MCID.TSFlags;
82794cacebcSSam Parker   return (Flags & ARMII::RetainsPreviousHalfElement) != 0;
82894cacebcSSam Parker }
82994cacebcSSam Parker 
830d7084fa3SSam Parker // Some MVE instructions read from the top/bottom halves of their operand(s)
831d7084fa3SSam Parker // and generate a vector result with result elements that are double the
832d7084fa3SSam Parker // width of the input.
producesDoubleWidthResult(const MachineInstr & MI)833d7084fa3SSam Parker static bool producesDoubleWidthResult(const MachineInstr &MI) {
834d7084fa3SSam Parker   const MCInstrDesc &MCID = MI.getDesc();
835d7084fa3SSam Parker   uint64_t Flags = MCID.TSFlags;
836d7084fa3SSam Parker   return (Flags & ARMII::DoubleWidthResult) != 0;
837d7084fa3SSam Parker }
838d7084fa3SSam Parker 
isHorizontalReduction(const MachineInstr & MI)83994b195ffSSam Parker static bool isHorizontalReduction(const MachineInstr &MI) {
84094b195ffSSam Parker   const MCInstrDesc &MCID = MI.getDesc();
84194b195ffSSam Parker   uint64_t Flags = MCID.TSFlags;
84294b195ffSSam Parker   return (Flags & ARMII::HorizontalReduction) != 0;
84394b195ffSSam Parker }
84494b195ffSSam Parker 
845d7084fa3SSam Parker // Can this instruction generate a non-zero result when given only zeroed
846d7084fa3SSam Parker // operands? This allows us to know that, given operands with false bytes
847d7084fa3SSam Parker // zeroed by masked loads, that the result will also contain zeros in those
848d7084fa3SSam Parker // bytes.
canGenerateNonZeros(const MachineInstr & MI)849d7084fa3SSam Parker static bool canGenerateNonZeros(const MachineInstr &MI) {
850d7084fa3SSam Parker 
851d7084fa3SSam Parker   // Check for instructions which can write into a larger element size,
852d7084fa3SSam Parker   // possibly writing into a previous zero'd lane.
853d7084fa3SSam Parker   if (producesDoubleWidthResult(MI))
854d7084fa3SSam Parker     return true;
855d7084fa3SSam Parker 
856d7084fa3SSam Parker   switch (MI.getOpcode()) {
857d7084fa3SSam Parker   default:
858d7084fa3SSam Parker     break;
859d7084fa3SSam Parker   // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow
860d7084fa3SSam Parker   // fp16 -> fp32 vector conversions.
861d7084fa3SSam Parker   // Instructions that perform a NOT will generate 1s from 0s.
862d7084fa3SSam Parker   case ARM::MVE_VMVN:
863d7084fa3SSam Parker   case ARM::MVE_VORN:
864d7084fa3SSam Parker   // Count leading zeros will do just that!
865d7084fa3SSam Parker   case ARM::MVE_VCLZs8:
866d7084fa3SSam Parker   case ARM::MVE_VCLZs16:
867d7084fa3SSam Parker   case ARM::MVE_VCLZs32:
868d7084fa3SSam Parker     return true;
869d7084fa3SSam Parker   }
870d7084fa3SSam Parker   return false;
871d7084fa3SSam Parker }
872d7084fa3SSam Parker 
87394cacebcSSam Parker // Look at its register uses to see if it only can only receive zeros
87494cacebcSSam Parker // into its false lanes which would then produce zeros. Also check that
87594b195ffSSam Parker // the output register is also defined by an FalseLanesZero instruction
87694cacebcSSam Parker // so that if tail-predication happens, the lanes that aren't updated will
87794cacebcSSam Parker // still be zeros.
producesFalseLanesZero(MachineInstr & MI,const TargetRegisterClass * QPRs,const ReachingDefAnalysis & RDA,InstSet & FalseLanesZero)87894b195ffSSam Parker static bool producesFalseLanesZero(MachineInstr &MI,
87994cacebcSSam Parker                                    const TargetRegisterClass *QPRs,
88094cacebcSSam Parker                                    const ReachingDefAnalysis &RDA,
88194b195ffSSam Parker                                    InstSet &FalseLanesZero) {
88294cacebcSSam Parker   if (canGenerateNonZeros(MI))
88394cacebcSSam Parker     return false;
88494b195ffSSam Parker 
885b30adfb5SSam Parker   bool isPredicated = isVectorPredicated(&MI);
886b30adfb5SSam Parker   // Predicated loads will write zeros to the falsely predicated bytes of the
887b30adfb5SSam Parker   // destination register.
888b30adfb5SSam Parker   if (MI.mayLoad())
889b30adfb5SSam Parker     return isPredicated;
890b30adfb5SSam Parker 
891b30adfb5SSam Parker   auto IsZeroInit = [](MachineInstr *Def) {
892b30adfb5SSam Parker     return !isVectorPredicated(Def) &&
893b30adfb5SSam Parker            Def->getOpcode() == ARM::MVE_VMOVimmi32 &&
894b30adfb5SSam Parker            Def->getOperand(1).getImm() == 0;
895b30adfb5SSam Parker   };
896b30adfb5SSam Parker 
89794b195ffSSam Parker   bool AllowScalars = isHorizontalReduction(MI);
89894cacebcSSam Parker   for (auto &MO : MI.operands()) {
89994cacebcSSam Parker     if (!MO.isReg() || !MO.getReg())
90094cacebcSSam Parker       continue;
90194b195ffSSam Parker     if (!isRegInClass(MO, QPRs) && AllowScalars)
90294b195ffSSam Parker       continue;
9039cb8f4d1SDavid Green     // Skip the lr predicate reg
9049cb8f4d1SDavid Green     int PIdx = llvm::findFirstVPTPredOperandIdx(MI);
9059cb8f4d1SDavid Green     if (PIdx != -1 && (int)MI.getOperandNo(&MO) == PIdx + 2)
9069cb8f4d1SDavid Green       continue;
907b30adfb5SSam Parker 
908b30adfb5SSam Parker     // Check that this instruction will produce zeros in its false lanes:
909b30adfb5SSam Parker     // - If it only consumes false lanes zero or constant 0 (vmov #0)
910b30adfb5SSam Parker     // - If it's predicated, it only matters that it's def register already has
911b30adfb5SSam Parker     //   false lane zeros, so we can ignore the uses.
912b30adfb5SSam Parker     SmallPtrSet<MachineInstr *, 2> Defs;
913b30adfb5SSam Parker     RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs);
914b30adfb5SSam Parker     for (auto *Def : Defs) {
915b30adfb5SSam Parker       if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def))
916b30adfb5SSam Parker         continue;
917b30adfb5SSam Parker       if (MO.isUse() && isPredicated)
91894cacebcSSam Parker         continue;
91994cacebcSSam Parker       return false;
92094cacebcSSam Parker     }
921b30adfb5SSam Parker   }
92294cacebcSSam Parker   LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
92394cacebcSSam Parker   return true;
92494cacebcSSam Parker }
92594cacebcSSam Parker 
ValidateLiveOuts()9263ee580d0SSam Parker bool LowOverheadLoop::ValidateLiveOuts() {
927ff9ac33eSSam Parker   // We want to find out if the tail-predicated version of this loop will
928ff9ac33eSSam Parker   // produce the same values as the loop in its original form. For this to
929ff9ac33eSSam Parker   // be true, the newly inserted implicit predication must not change the
930ff9ac33eSSam Parker   // the (observable) results.
931ff9ac33eSSam Parker   // We're doing this because many instructions in the loop will not be
932ff9ac33eSSam Parker   // predicated and so the conversion from VPT predication to tail-predication
933ff9ac33eSSam Parker   // can result in different values being produced; due to the tail-predication
934ff9ac33eSSam Parker   // preventing many instructions from updating their falsely predicated
935ff9ac33eSSam Parker   // lanes. This analysis assumes that all the instructions perform lane-wise
936ff9ac33eSSam Parker   // operations and don't perform any exchanges.
937ff9ac33eSSam Parker   // A masked load, whether through VPT or tail predication, will write zeros
938ff9ac33eSSam Parker   // to any of the falsely predicated bytes. So, from the loads, we know that
939ff9ac33eSSam Parker   // the false lanes are zeroed and here we're trying to track that those false
940ff9ac33eSSam Parker   // lanes remain zero, or where they change, the differences are masked away
941ff9ac33eSSam Parker   // by their user(s).
9423471520bSDavid Green   // All MVE stores have to be predicated, so we know that any predicate load
943ff9ac33eSSam Parker   // operands, or stored results are equivalent already. Other explicitly
944ff9ac33eSSam Parker   // predicated instructions will perform the same operation in the original
945ff9ac33eSSam Parker   // loop and the tail-predicated form too. Because of this, we can insert
94694cacebcSSam Parker   // loads, stores and other predicated instructions into our Predicated
947ff9ac33eSSam Parker   // set and build from there.
948d941df36SSam Parker   const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID);
94994b195ffSSam Parker   SetVector<MachineInstr *> FalseLanesUnknown;
95094b195ffSSam Parker   SmallPtrSet<MachineInstr *, 4> FalseLanesZero;
95194cacebcSSam Parker   SmallPtrSet<MachineInstr *, 4> Predicated;
9523ee580d0SSam Parker   MachineBasicBlock *Header = ML.getHeader();
95394cacebcSSam Parker 
9548c50b5fbSDavid Green   LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n");
9558c50b5fbSDavid Green 
9563ee580d0SSam Parker   for (auto &MI : *Header) {
957a0c1dcc3SSam Parker     if (!shouldInspect(MI))
958ff9ac33eSSam Parker       continue;
959ff9ac33eSSam Parker 
960835251f7SPierre-vh     if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode()))
96194b195ffSSam Parker       continue;
96294b195ffSSam Parker 
963b30adfb5SSam Parker     bool isPredicated = isVectorPredicated(&MI);
964b30adfb5SSam Parker     bool retainsOrReduces =
965b30adfb5SSam Parker       retainsPreviousHalfElement(MI) || isHorizontalReduction(MI);
966b30adfb5SSam Parker 
967b30adfb5SSam Parker     if (isPredicated)
96894cacebcSSam Parker       Predicated.insert(&MI);
969b30adfb5SSam Parker     if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero))
97094b195ffSSam Parker       FalseLanesZero.insert(&MI);
971b30adfb5SSam Parker     else if (MI.getNumDefs() == 0)
972b30adfb5SSam Parker       continue;
9738c50b5fbSDavid Green     else if (!isPredicated && retainsOrReduces) {
9748c50b5fbSDavid Green       LLVM_DEBUG(dbgs() << "  Unpredicated instruction that retainsOrReduces: " << MI);
975b30adfb5SSam Parker       return false;
97673346f58SDavid Green     } else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy)
977b30adfb5SSam Parker       FalseLanesUnknown.insert(&MI);
978ff9ac33eSSam Parker   }
979ff9ac33eSSam Parker 
9808c50b5fbSDavid Green   LLVM_DEBUG({
9818c50b5fbSDavid Green     dbgs() << "  Predicated:\n";
9828c50b5fbSDavid Green     for (auto *I : Predicated)
9838c50b5fbSDavid Green       dbgs() << "  " << *I;
9848c50b5fbSDavid Green     dbgs() << "  FalseLanesZero:\n";
9858c50b5fbSDavid Green     for (auto *I : FalseLanesZero)
9868c50b5fbSDavid Green       dbgs() << "  " << *I;
9878c50b5fbSDavid Green     dbgs() << "  FalseLanesUnknown:\n";
9888c50b5fbSDavid Green     for (auto *I : FalseLanesUnknown)
9898c50b5fbSDavid Green       dbgs() << "  " << *I;
9908c50b5fbSDavid Green   });
9918c50b5fbSDavid Green 
99294cacebcSSam Parker   auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
99394cacebcSSam Parker                               SmallPtrSetImpl<MachineInstr *> &Predicated) {
994ff9ac33eSSam Parker     SmallPtrSet<MachineInstr *, 2> Uses;
995e24537d4SMircea Trofin     RDA.getGlobalUses(MI, MO.getReg().asMCReg(), Uses);
996ff9ac33eSSam Parker     for (auto *Use : Uses) {
99794cacebcSSam Parker       if (Use != MI && !Predicated.count(Use))
998ff9ac33eSSam Parker         return false;
999ff9ac33eSSam Parker     }
1000ff9ac33eSSam Parker     return true;
1001ff9ac33eSSam Parker   };
1002ff9ac33eSSam Parker 
100394cacebcSSam Parker   // Visit the unknowns in reverse so that we can start at the values being
1004ff9ac33eSSam Parker   // stored and then we can work towards the leaves, hopefully adding more
100594b195ffSSam Parker   // instructions to Predicated. Successfully terminating the loop means that
100694b195ffSSam Parker   // all the unknown values have to found to be masked by predicated user(s).
10073ee580d0SSam Parker   // For any unpredicated values, we store them in NonPredicated so that we
10083ee580d0SSam Parker   // can later check whether these form a reduction.
10093ee580d0SSam Parker   SmallPtrSet<MachineInstr*, 2> NonPredicated;
101094b195ffSSam Parker   for (auto *MI : reverse(FalseLanesUnknown)) {
1011ff9ac33eSSam Parker     for (auto &MO : MI->operands()) {
1012ff9ac33eSSam Parker       if (!isRegInClass(MO, QPRs) || !MO.isDef())
1013ff9ac33eSSam Parker         continue;
101494cacebcSSam Parker       if (!HasPredicatedUsers(MI, MO, Predicated)) {
10158c50b5fbSDavid Green         LLVM_DEBUG(dbgs() << "  Found an unknown def of : "
1016ff9ac33eSSam Parker                           << TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
10173ee580d0SSam Parker         NonPredicated.insert(MI);
1018b30adfb5SSam Parker         break;
1019ff9ac33eSSam Parker       }
1020ff9ac33eSSam Parker     }
1021ff9ac33eSSam Parker     // Any unknown false lanes have been masked away by the user(s).
1022b30adfb5SSam Parker     if (!NonPredicated.contains(MI))
102394cacebcSSam Parker       Predicated.insert(MI);
1024ff9ac33eSSam Parker   }
1025d941df36SSam Parker 
10263ee580d0SSam Parker   SmallPtrSet<MachineInstr *, 2> LiveOutMIs;
1027d941df36SSam Parker   SmallVector<MachineBasicBlock *, 2> ExitBlocks;
1028d941df36SSam Parker   ML.getExitBlocks(ExitBlocks);
1029d941df36SSam Parker   assert(ML.getNumBlocks() == 1 && "Expected single block loop!");
10303ee580d0SSam Parker   assert(ExitBlocks.size() == 1 && "Expected a single exit block");
10313ee580d0SSam Parker   MachineBasicBlock *ExitBB = ExitBlocks.front();
10323ee580d0SSam Parker   for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
1033b30adfb5SSam Parker     // TODO: Instead of blocking predication, we could move the vctp to the exit
1034b30adfb5SSam Parker     // block and calculate it's operand there in or the preheader.
10358c50b5fbSDavid Green     if (RegMask.PhysReg == ARM::VPR) {
10368c50b5fbSDavid Green       LLVM_DEBUG(dbgs() << "  VPR is live in to the exit block.");
1037b30adfb5SSam Parker       return false;
10388c50b5fbSDavid Green     }
10393ee580d0SSam Parker     // Check Q-regs that are live in the exit blocks. We don't collect scalars
10403ee580d0SSam Parker     // because they won't be affected by lane predication.
1041b30adfb5SSam Parker     if (QPRs->contains(RegMask.PhysReg))
10423ee580d0SSam Parker       if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg))
10433ee580d0SSam Parker         LiveOutMIs.insert(MI);
10443ee580d0SSam Parker   }
10453ee580d0SSam Parker 
1046d941df36SSam Parker   // We've already validated that any VPT predication within the loop will be
1047d941df36SSam Parker   // equivalent when we perform the predication transformation; so we know that
1048d941df36SSam Parker   // any VPT predicated instruction is predicated upon VCTP. Any live-out
10493ee580d0SSam Parker   // instruction needs to be predicated, so check this here. The instructions
10503ee580d0SSam Parker   // in NonPredicated have been found to be a reduction that we can ensure its
105173346f58SDavid Green   // legality. Any MQPRCopy found will need to validate its input as if it was
105273346f58SDavid Green   // live out.
105373346f58SDavid Green   SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end());
105473346f58SDavid Green   while (!Worklist.empty()) {
105573346f58SDavid Green     MachineInstr *MI = Worklist.pop_back_val();
105673346f58SDavid Green     if (MI->getOpcode() == ARM::MQPRCopy) {
105773346f58SDavid Green       VMOVCopies.insert(MI);
105873346f58SDavid Green       MachineInstr *CopySrc =
105973346f58SDavid Green           RDA.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg());
106073346f58SDavid Green       if (CopySrc)
106173346f58SDavid Green         Worklist.push_back(CopySrc);
106273346f58SDavid Green     } else if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) {
10638c50b5fbSDavid Green       LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI);
106473346f58SDavid Green       VMOVCopies.clear();
1065d941df36SSam Parker       return false;
1066b30adfb5SSam Parker     }
1067b30adfb5SSam Parker   }
1068d941df36SSam Parker 
1069de3e65e6SSam Parker   return true;
1070de3e65e6SSam Parker }
1071de3e65e6SSam Parker 
Validate(ARMBasicBlockUtils * BBUtils)1072e82a0084SSam Parker void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
10738978c12bSSam Parker   if (Revert)
10748978c12bSSam Parker     return;
10758978c12bSSam Parker 
10764c19b89bSSam Parker   // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP]
10774c19b89bSSam Parker   // can only jump back.
10784c19b89bSSam Parker   auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End,
10794c19b89bSSam Parker                            ARMBasicBlockUtils *BBUtils, MachineLoop &ML) {
10800447f350SDavid Green     MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd
10810447f350SDavid Green                                    ? End->getOperand(1).getMBB()
10820447f350SDavid Green                                    : End->getOperand(2).getMBB();
10838978c12bSSam Parker     // TODO Maybe there's cases where the target doesn't have to be the header,
10848978c12bSSam Parker     // but for now be safe and revert.
10850447f350SDavid Green     if (TgtBB != ML.getHeader()) {
10864c19b89bSSam Parker       LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n");
1087e82a0084SSam Parker       return false;
10888978c12bSSam Parker     }
10898978c12bSSam Parker 
10908978c12bSSam Parker     // The WLS and LE instructions have 12-bits for the label offset. WLS
10918978c12bSSam Parker     // requires a positive offset, while LE uses negative.
1092fd01b2f4SSam Parker     if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) ||
1093fd01b2f4SSam Parker         !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) {
10948978c12bSSam Parker       LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
1095e82a0084SSam Parker       return false;
10968978c12bSSam Parker     }
10978978c12bSSam Parker 
1098bee2f618SDavid Green     if (isWhileLoopStart(*Start)) {
1099bee2f618SDavid Green       MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(*Start);
1100bee2f618SDavid Green       if (BBUtils->getOffsetOf(Start) > BBUtils->getOffsetOf(TargetBB) ||
1101bee2f618SDavid Green           !BBUtils->isBBInRange(Start, TargetBB, 4094)) {
11028978c12bSSam Parker         LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
1103e82a0084SSam Parker         return false;
11048978c12bSSam Parker       }
1105bee2f618SDavid Green     }
1106e82a0084SSam Parker     return true;
1107e82a0084SSam Parker   };
11088978c12bSSam Parker 
1109fad70c30SDavid Green   StartInsertPt = MachineBasicBlock::iterator(Start);
1110fad70c30SDavid Green   StartInsertBB = Start->getParent();
1111fad70c30SDavid Green   LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at "
1112fad70c30SDavid Green                     << *StartInsertPt);
11137e02bc81SSam Parker 
1114dfa2c14bSSam Parker   Revert = !ValidateRanges(Start, End, BBUtils, ML);
1115dfa2c14bSSam Parker   CannotTailPredicate = !ValidateTailPredicate();
111640425183SSam Parker }
111740425183SSam Parker 
AddVCTP(MachineInstr * MI)1118b4fa884aSSam Parker bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
1119b4fa884aSSam Parker   LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);
1120b4fa884aSSam Parker   if (VCTPs.empty()) {
1121b4fa884aSSam Parker     VCTPs.push_back(MI);
1122b4fa884aSSam Parker     return true;
1123b4fa884aSSam Parker   }
1124b4fa884aSSam Parker 
1125b4fa884aSSam Parker   // If we find another VCTP, check whether it uses the same value as the main VCTP.
1126b4fa884aSSam Parker   // If it does, store it in the VCTPs set, else refuse it.
1127b4fa884aSSam Parker   MachineInstr *Prev = VCTPs.back();
1128b4fa884aSSam Parker   if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
1129e24537d4SMircea Trofin       !RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg().asMCReg())) {
1130b4fa884aSSam Parker     LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
1131b4fa884aSSam Parker                          "definition from the main VCTP");
1132b4fa884aSSam Parker     return false;
1133b4fa884aSSam Parker   }
1134b4fa884aSSam Parker   VCTPs.push_back(MI);
1135b4fa884aSSam Parker   return true;
1136b4fa884aSSam Parker }
1137b4fa884aSSam Parker 
ValidateMVEStore(MachineInstr * MI,MachineLoop * ML)1138ff0ef6a5SSam Tebbs static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) {
1139ff0ef6a5SSam Tebbs 
1140ff0ef6a5SSam Tebbs   auto GetFrameIndex = [](MachineMemOperand *Operand) {
1141ff0ef6a5SSam Tebbs     const PseudoSourceValue *PseudoValue = Operand->getPseudoValue();
1142ff0ef6a5SSam Tebbs     if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) {
1143ff0ef6a5SSam Tebbs       if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(PseudoValue)) {
1144ff0ef6a5SSam Tebbs         return FS->getFrameIndex();
1145ff0ef6a5SSam Tebbs       }
1146ff0ef6a5SSam Tebbs     }
1147ff0ef6a5SSam Tebbs     return -1;
1148ff0ef6a5SSam Tebbs   };
1149ff0ef6a5SSam Tebbs 
1150ff0ef6a5SSam Tebbs   auto IsStackOp = [GetFrameIndex](MachineInstr *I) {
1151ff0ef6a5SSam Tebbs     switch (I->getOpcode()) {
1152ff0ef6a5SSam Tebbs     case ARM::MVE_VSTRWU32:
1153ff0ef6a5SSam Tebbs     case ARM::MVE_VLDRWU32: {
1154ff0ef6a5SSam Tebbs       return I->getOperand(1).getReg() == ARM::SP &&
1155ff0ef6a5SSam Tebbs              I->memoperands().size() == 1 &&
1156ff0ef6a5SSam Tebbs              GetFrameIndex(I->memoperands().front()) >= 0;
1157ff0ef6a5SSam Tebbs     }
1158ff0ef6a5SSam Tebbs     default:
1159ff0ef6a5SSam Tebbs       return false;
1160ff0ef6a5SSam Tebbs     }
1161ff0ef6a5SSam Tebbs   };
1162ff0ef6a5SSam Tebbs 
1163ff0ef6a5SSam Tebbs   // An unpredicated vector register spill is allowed if all of the uses of the
1164ff0ef6a5SSam Tebbs   // stack slot are within the loop
1165ff0ef6a5SSam Tebbs   if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI))
1166ff0ef6a5SSam Tebbs     return false;
1167ff0ef6a5SSam Tebbs 
1168ff0ef6a5SSam Tebbs   // Search all blocks after the loop for accesses to the same stack slot.
1169ff0ef6a5SSam Tebbs   // ReachingDefAnalysis doesn't work for sp as it relies on registers being
1170ff0ef6a5SSam Tebbs   // live-out (which sp never is) to know what blocks to look in
1171ff0ef6a5SSam Tebbs   if (MI->memoperands().size() == 0)
1172ff0ef6a5SSam Tebbs     return false;
1173ff0ef6a5SSam Tebbs   int FI = GetFrameIndex(MI->memoperands().front());
1174ff0ef6a5SSam Tebbs 
11754fee756cSAmara Emerson   auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo();
1176ff0ef6a5SSam Tebbs   if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(FI))
1177ff0ef6a5SSam Tebbs     return false;
1178ff0ef6a5SSam Tebbs 
1179ff0ef6a5SSam Tebbs   SmallVector<MachineBasicBlock *> Frontier;
1180ff0ef6a5SSam Tebbs   ML->getExitBlocks(Frontier);
1181ff0ef6a5SSam Tebbs   SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()};
1182ff0ef6a5SSam Tebbs   unsigned Idx = 0;
1183ff0ef6a5SSam Tebbs   while (Idx < Frontier.size()) {
1184ff0ef6a5SSam Tebbs     MachineBasicBlock *BB = Frontier[Idx];
1185ff0ef6a5SSam Tebbs     bool LookAtSuccessors = true;
1186ff0ef6a5SSam Tebbs     for (auto &I : *BB) {
1187ff0ef6a5SSam Tebbs       if (!IsStackOp(&I) || I.memoperands().size() == 0)
1188ff0ef6a5SSam Tebbs         continue;
1189ff0ef6a5SSam Tebbs       if (GetFrameIndex(I.memoperands().front()) != FI)
1190ff0ef6a5SSam Tebbs         continue;
1191ff0ef6a5SSam Tebbs       // If this block has a store to the stack slot before any loads then we
1192ff0ef6a5SSam Tebbs       // can ignore the block
1193ff0ef6a5SSam Tebbs       if (I.getOpcode() == ARM::MVE_VSTRWU32) {
1194ff0ef6a5SSam Tebbs         LookAtSuccessors = false;
1195ff0ef6a5SSam Tebbs         break;
1196ff0ef6a5SSam Tebbs       }
1197ff0ef6a5SSam Tebbs       // If the store and the load are using the same stack slot then the
1198ff0ef6a5SSam Tebbs       // store isn't valid for tail predication
1199ff0ef6a5SSam Tebbs       if (I.getOpcode() == ARM::MVE_VLDRWU32)
1200ff0ef6a5SSam Tebbs         return false;
1201ff0ef6a5SSam Tebbs     }
1202ff0ef6a5SSam Tebbs 
1203ff0ef6a5SSam Tebbs     if (LookAtSuccessors) {
1204ff0ef6a5SSam Tebbs       for (auto Succ : BB->successors()) {
1205ff0ef6a5SSam Tebbs         if (!Visited.contains(Succ) && !is_contained(Frontier, Succ))
1206ff0ef6a5SSam Tebbs           Frontier.push_back(Succ);
1207ff0ef6a5SSam Tebbs       }
1208ff0ef6a5SSam Tebbs     }
1209ff0ef6a5SSam Tebbs     Visited.insert(BB);
1210ff0ef6a5SSam Tebbs     Idx++;
1211ff0ef6a5SSam Tebbs   }
1212ff0ef6a5SSam Tebbs 
1213ff0ef6a5SSam Tebbs   return true;
1214ff0ef6a5SSam Tebbs }
1215ff0ef6a5SSam Tebbs 
ValidateMVEInst(MachineInstr * MI)1216e27632c3SSam Parker bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {
1217e73b20c5SSam Parker   if (CannotTailPredicate)
1218e73b20c5SSam Parker     return false;
1219e73b20c5SSam Parker 
1220a0c1dcc3SSam Parker   if (!shouldInspect(*MI))
12213ce9ec0cSSam Parker     return true;
12223ce9ec0cSSam Parker 
12233ce9ec0cSSam Parker   if (MI->getOpcode() == ARM::MVE_VPSEL ||
12243ce9ec0cSSam Parker       MI->getOpcode() == ARM::MVE_VPNOT) {
12253ce9ec0cSSam Parker     // TODO: Allow VPSEL and VPNOT, we currently cannot because:
12263ce9ec0cSSam Parker     // 1) It will use the VPR as a predicate operand, but doesn't have to be
12273ce9ec0cSSam Parker     //    instead a VPT block, which means we can assert while building up
12283ce9ec0cSSam Parker     //    the VPT block because we don't find another VPT or VPST to being a new
12293ce9ec0cSSam Parker     //    one.
12303ce9ec0cSSam Parker     // 2) VPSEL still requires a VPR operand even after tail predicating,
12313ce9ec0cSSam Parker     //    which means we can't remove it unless there is another
12323ce9ec0cSSam Parker     //    instruction, such as vcmp, that can provide the VPR def.
12333ce9ec0cSSam Parker     return false;
12343ce9ec0cSSam Parker   }
12353ce9ec0cSSam Parker 
1236b4fa884aSSam Parker   // Record all VCTPs and check that they're equivalent to one another.
1237b4fa884aSSam Parker   if (isVCTP(MI) && !AddVCTP(MI))
123840425183SSam Parker     return false;
123940425183SSam Parker 
1240a0c1dcc3SSam Parker   // Inspect uses first so that any instructions that alter the VPR don't
1241a0c1dcc3SSam Parker   // alter the predicate upon themselves.
1242a0c1dcc3SSam Parker   const MCInstrDesc &MCID = MI->getDesc();
124340425183SSam Parker   bool IsUse = false;
124494c799feSSam Parker   unsigned LastOpIdx = MI->getNumOperands() - 1;
124594c799feSSam Parker   for (auto &Op : enumerate(reverse(MCID.operands()))) {
124694c799feSSam Parker     const MachineOperand &MO = MI->getOperand(LastOpIdx - Op.index());
1247b4fa884aSSam Parker     if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)
124840425183SSam Parker       continue;
124940425183SSam Parker 
125094c799feSSam Parker     if (ARM::isVpred(Op.value().OperandType)) {
1251b4fa884aSSam Parker       VPTState::addInst(MI);
1252bad6032bSSam Parker       IsUse = true;
1253b4fa884aSSam Parker     } else if (MI->getOpcode() != ARM::MVE_VPST) {
125440425183SSam Parker       LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
125540425183SSam Parker       return false;
125640425183SSam Parker     }
125740425183SSam Parker   }
125840425183SSam Parker 
1259e27632c3SSam Parker   // If we find an instruction that has been marked as not valid for tail
1260e27632c3SSam Parker   // predication, only allow the instruction if it's contained within a valid
1261e27632c3SSam Parker   // VPT block.
1262a0c1dcc3SSam Parker   bool RequiresExplicitPredication =
1263a0c1dcc3SSam Parker     (MCID.TSFlags & ARMII::ValidForTailPredication) == 0;
1264a0c1dcc3SSam Parker   if (isDomainMVE(MI) && RequiresExplicitPredication) {
126573346f58SDavid Green     if (MI->getOpcode() == ARM::MQPRCopy)
126673346f58SDavid Green       return true;
126702cd8a6bSDavid Green     if (!IsUse && producesDoubleWidthResult(*MI)) {
126802cd8a6bSDavid Green       DoubleWidthResultInstrs.insert(MI);
126902cd8a6bSDavid Green       return true;
127002cd8a6bSDavid Green     }
127102cd8a6bSDavid Green 
127202cd8a6bSDavid Green     LLVM_DEBUG(if (!IsUse) dbgs()
127302cd8a6bSDavid Green                << "ARM Loops: Can't tail predicate: " << *MI);
1274a63b2a46SSam Parker     return IsUse;
1275e27632c3SSam Parker   }
1276e27632c3SSam Parker 
1277de3e65e6SSam Parker   // If the instruction is already explicitly predicated, then the conversion
12783471520bSDavid Green   // will be fine, but ensure that all store operations are predicated.
1279ff0ef6a5SSam Tebbs   if (MI->mayStore() && !ValidateMVEStore(MI, &ML))
1280b4fa884aSSam Parker     return IsUse;
1281b4fa884aSSam Parker 
1282b4fa884aSSam Parker   // If this instruction defines the VPR, update the predicate for the
1283b4fa884aSSam Parker   // proceeding instructions.
1284b4fa884aSSam Parker   if (isVectorPredicate(MI)) {
1285b4fa884aSSam Parker     // Clear the existing predicate when we're not in VPT Active state,
1286b4fa884aSSam Parker     // otherwise we add to it.
1287b4fa884aSSam Parker     if (!isVectorPredicated(MI))
1288b4fa884aSSam Parker       VPTState::resetPredicate(MI);
1289b4fa884aSSam Parker     else
1290b4fa884aSSam Parker       VPTState::addPredicate(MI);
1291b4fa884aSSam Parker   }
1292b4fa884aSSam Parker 
1293b4fa884aSSam Parker   // Finally once the predicate has been modified, we can start a new VPT
1294b4fa884aSSam Parker   // block if necessary.
1295b4fa884aSSam Parker   if (isVPTOpcode(MI->getOpcode()))
1296b4fa884aSSam Parker     VPTState::CreateVPTBlock(MI);
1297b4fa884aSSam Parker 
1298b4fa884aSSam Parker   return true;
12998978c12bSSam Parker }
13008978c12bSSam Parker 
runOnMachineFunction(MachineFunction & mf)13018978c12bSSam Parker bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
1302*ad73ce31SZongwei Lan   const ARMSubtarget &ST = mf.getSubtarget<ARMSubtarget>();
13038978c12bSSam Parker   if (!ST.hasLOB())
13048978c12bSSam Parker     return false;
13058978c12bSSam Parker 
13068978c12bSSam Parker   MF = &mf;
13078978c12bSSam Parker   LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
13088978c12bSSam Parker 
130928166816SSam Parker   MLI = &getAnalysis<MachineLoopInfo>();
1310cced971fSSam Parker   RDA = &getAnalysis<ReachingDefAnalysis>();
13118978c12bSSam Parker   MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
13128978c12bSSam Parker   MRI = &MF->getRegInfo();
13138978c12bSSam Parker   TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
1314d97cf1f8SSjoerd Meijer   TRI = ST.getRegisterInfo();
13158978c12bSSam Parker   BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF));
13168978c12bSSam Parker   BBUtils->computeAllBlockSizes();
13178978c12bSSam Parker   BBUtils->adjustBBOffsetsAfter(&MF->front());
13188978c12bSSam Parker 
13198978c12bSSam Parker   bool Changed = false;
132028166816SSam Parker   for (auto ML : *MLI) {
132189c1e35fSStefanos Baziotis     if (ML->isOutermost())
13228978c12bSSam Parker       Changed |= ProcessLoop(ML);
13238978c12bSSam Parker   }
13248978c12bSSam Parker   Changed |= RevertNonLoops();
13258978c12bSSam Parker   return Changed;
13268978c12bSSam Parker }
13278978c12bSSam Parker 
ProcessLoop(MachineLoop * ML)1328a6fd919cSSam Parker bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
1329a6fd919cSSam Parker 
1330a6fd919cSSam Parker   bool Changed = false;
1331a6fd919cSSam Parker 
1332a6fd919cSSam Parker   // Process inner loops first.
1333c5cf7d91SKazu Hirata   for (MachineLoop *L : *ML)
1334c5cf7d91SKazu Hirata     Changed |= ProcessLoop(L);
1335a6fd919cSSam Parker 
1336543406a6SDavid Green   LLVM_DEBUG({
1337543406a6SDavid Green     dbgs() << "ARM Loops: Processing loop containing:\n";
133828166816SSam Parker     if (auto *Preheader = ML->getLoopPreheader())
1339543406a6SDavid Green       dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
1340543406a6SDavid Green     else if (auto *Preheader = MLI->findLoopPreheader(ML, true, true))
1341543406a6SDavid Green       dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
134228166816SSam Parker     for (auto *MBB : ML->getBlocks())
1343543406a6SDavid Green       dbgs() << " - Block: " << printMBBReference(*MBB) << "\n";
1344543406a6SDavid Green   });
1345a6fd919cSSam Parker 
134698722691SSam Parker   // Search the given block for a loop start instruction. If one isn't found,
134798722691SSam Parker   // and there's only one predecessor block, search that one too.
134898722691SSam Parker   std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
13494379a400SSam Parker     [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
1350a6fd919cSSam Parker     for (auto &MI : *MBB) {
1351049f9672SSjoerd Meijer       if (isLoopStart(MI))
1352a6fd919cSSam Parker         return &MI;
1353a6fd919cSSam Parker     }
135498722691SSam Parker     if (MBB->pred_size() == 1)
135598722691SSam Parker       return SearchForStart(*MBB->pred_begin());
1356a6fd919cSSam Parker     return nullptr;
1357a6fd919cSSam Parker   };
1358a6fd919cSSam Parker 
13593ee580d0SSam Parker   LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII);
136028166816SSam Parker   // Search the preheader for the start intrinsic.
136198722691SSam Parker   // FIXME: I don't see why we shouldn't be supporting multiple predecessors
136298722691SSam Parker   // with potentially multiple set.loop.iterations, so we need to enable this.
13633ee580d0SSam Parker   if (LoLoop.Preheader)
13643ee580d0SSam Parker     LoLoop.Start = SearchForStart(LoLoop.Preheader);
136528166816SSam Parker   else
136600d19c67SMichael Benfield     return Changed;
1367a6fd919cSSam Parker 
1368a6fd919cSSam Parker   // Find the low-overhead loop components and decide whether or not to fall
13698978c12bSSam Parker   // back to a normal loop. Also look for a vctp instructions and decide
13708978c12bSSam Parker   // whether we can convert that predicate using tail predication.
1371a6fd919cSSam Parker   for (auto *MBB : reverse(ML->getBlocks())) {
1372a6fd919cSSam Parker     for (auto &MI : *MBB) {
137306e12893SSam Parker       if (MI.isDebugValue())
137406e12893SSam Parker         continue;
137506e12893SSam Parker       else if (MI.getOpcode() == ARM::t2LoopDec)
13768978c12bSSam Parker         LoLoop.Dec = &MI;
1377a6fd919cSSam Parker       else if (MI.getOpcode() == ARM::t2LoopEnd)
13788978c12bSSam Parker         LoLoop.End = &MI;
13790447f350SDavid Green       else if (MI.getOpcode() == ARM::t2LoopEndDec)
13800447f350SDavid Green         LoLoop.End = LoLoop.Dec = &MI;
1381049f9672SSjoerd Meijer       else if (isLoopStart(MI))
13828978c12bSSam Parker         LoLoop.Start = &MI;
138336c92227SSam Parker       else if (MI.getDesc().isCall()) {
1384bcf0eb7aSSam Parker         // TODO: Though the call will require LE to execute again, does this
1385bcf0eb7aSSam Parker         // mean we should revert? Always executing LE hopefully should be
1386bcf0eb7aSSam Parker         // faster than performing a sub,cmp,br or even subs,br.
13878978c12bSSam Parker         LoLoop.Revert = true;
138836c92227SSam Parker         LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
13898978c12bSSam Parker       } else {
139040425183SSam Parker         // Record VPR defs and build up their corresponding vpt blocks.
13918978c12bSSam Parker         // Check we know how to tail predicate any mve instructions.
13928f6a6763SSam Parker         LoLoop.AnalyseMVEInst(&MI);
139336c92227SSam Parker       }
1394a6fd919cSSam Parker     }
1395a6fd919cSSam Parker   }
1396a6fd919cSSam Parker 
13978978c12bSSam Parker   LLVM_DEBUG(LoLoop.dump());
13984569f63aSSjoerd Meijer   if (!LoLoop.FoundAllComponents()) {
13994569f63aSSjoerd Meijer     LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");
140000d19c67SMichael Benfield     return Changed;
14014569f63aSSjoerd Meijer   }
1402a6fd919cSSam Parker 
1403fad70c30SDavid Green   assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart &&
1404fad70c30SDavid Green          "Expected t2WhileLoopStart to be removed before regalloc!");
1405fad70c30SDavid Green 
14060447f350SDavid Green   // Check that the only instruction using LoopDec is LoopEnd. This can only
14070447f350SDavid Green   // happen when the Dec and End are separate, not a single t2LoopEndDec.
140856427528SSam Parker   // TODO: Check for copy chains that really have no effect.
14090447f350SDavid Green   if (LoLoop.Dec != LoLoop.End) {
141056427528SSam Parker     SmallPtrSet<MachineInstr *, 2> Uses;
1411e24537d4SMircea Trofin     RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses);
141256427528SSam Parker     if (Uses.size() > 1 || !Uses.count(LoLoop.End)) {
141356427528SSam Parker       LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");
141442350cd8SSam Parker       LoLoop.Revert = true;
141542350cd8SSam Parker     }
14160447f350SDavid Green   }
1417e82a0084SSam Parker   LoLoop.Validate(BBUtils.get());
14188978c12bSSam Parker   Expand(LoLoop);
1419a6fd919cSSam Parker   return true;
1420a6fd919cSSam Parker }
1421a6fd919cSSam Parker 
1422775b2f59SSam Parker // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
1423775b2f59SSam Parker // beq that branches to the exit branch.
1424566127e3SSam Parker // TODO: We could also try to generate a cbz if the value in LR is also in
1425775b2f59SSam Parker // another low register.
RevertWhile(MachineInstr * MI) const1426775b2f59SSam Parker void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
1427775b2f59SSam Parker   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
1428bee2f618SDavid Green   MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(*MI);
1429566127e3SSam Parker   unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
1430566127e3SSam Parker     ARM::tBcc : ARM::t2Bcc;
1431566127e3SSam Parker 
1432fad70c30SDavid Green   RevertWhileLoopStartLR(MI, TII, BrOpc);
1433775b2f59SSam Parker }
1434775b2f59SSam Parker 
RevertDo(MachineInstr * MI) const1435b2ac9681SDavid Green void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const {
1436b2ac9681SDavid Green   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI);
1437d9bf6245SDavid Green   RevertDoLoopStart(MI, TII);
1438b2ac9681SDavid Green }
1439b2ac9681SDavid Green 
RevertLoopDec(MachineInstr * MI) const1440ac30ea2fSSam Parker bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
1441775b2f59SSam Parker   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
1442775b2f59SSam Parker   MachineBasicBlock *MBB = MI->getParent();
1443ac30ea2fSSam Parker   SmallPtrSet<MachineInstr*, 1> Ignore;
1444bf61421aSSam Parker   for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) {
1445bf61421aSSam Parker     if (I->getOpcode() == ARM::t2LoopEnd) {
1446bf61421aSSam Parker       Ignore.insert(&*I);
1447bf61421aSSam Parker       break;
1448bf61421aSSam Parker     }
1449bf61421aSSam Parker   }
14504ba6d0deSSam Parker 
1451cced971fSSam Parker   // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
1452e24537d4SMircea Trofin   bool SetFlags =
1453e24537d4SMircea Trofin       RDA->isSafeToDefRegAt(MI, MCRegister::from(ARM::CPSR), Ignore);
14544ba6d0deSSam Parker 
1455d9bf6245SDavid Green   llvm::RevertLoopDec(MI, TII, SetFlags);
14564ba6d0deSSam Parker   return SetFlags;
1457775b2f59SSam Parker }
1458775b2f59SSam Parker 
1459775b2f59SSam Parker // Generate a subs, or sub and cmp, and a branch instead of an LE.
RevertLoopEnd(MachineInstr * MI,bool SkipCmp) const14604ba6d0deSSam Parker void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
1461775b2f59SSam Parker   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
1462775b2f59SSam Parker 
1463566127e3SSam Parker   MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
1464566127e3SSam Parker   unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
1465566127e3SSam Parker     ARM::tBcc : ARM::t2Bcc;
1466566127e3SSam Parker 
1467d9bf6245SDavid Green   llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp);
1468775b2f59SSam Parker }
1469775b2f59SSam Parker 
14700447f350SDavid Green // Generate a subs, or sub and cmp, and a branch instead of an LE.
RevertLoopEndDec(MachineInstr * MI) const14710447f350SDavid Green void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const {
14720447f350SDavid Green   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI);
14730447f350SDavid Green   assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!");
14740447f350SDavid Green   MachineBasicBlock *MBB = MI->getParent();
14750447f350SDavid Green 
14760447f350SDavid Green   MachineInstrBuilder MIB =
14770447f350SDavid Green       BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
14780447f350SDavid Green   MIB.addDef(ARM::LR);
14790447f350SDavid Green   MIB.add(MI->getOperand(1));
14800447f350SDavid Green   MIB.addImm(1);
14810447f350SDavid Green   MIB.addImm(ARMCC::AL);
14820447f350SDavid Green   MIB.addReg(ARM::NoRegister);
14830447f350SDavid Green   MIB.addReg(ARM::CPSR);
14840447f350SDavid Green   MIB->getOperand(5).setIsDef(true);
14850447f350SDavid Green 
14860447f350SDavid Green   MachineBasicBlock *DestBB = MI->getOperand(2).getMBB();
14870447f350SDavid Green   unsigned BrOpc =
14880447f350SDavid Green       BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc;
14890447f350SDavid Green 
14900447f350SDavid Green   // Create bne
14910447f350SDavid Green   MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
14920447f350SDavid Green   MIB.add(MI->getOperand(2)); // branch target
14930447f350SDavid Green   MIB.addImm(ARMCC::NE);      // condition code
14940447f350SDavid Green   MIB.addReg(ARM::CPSR);
14950447f350SDavid Green 
14960447f350SDavid Green   MI->eraseFromParent();
14970447f350SDavid Green }
14980447f350SDavid Green 
149901022af5SSjoerd Meijer // Perform dead code elimation on the loop iteration count setup expression.
150001022af5SSjoerd Meijer // If we are tail-predicating, the number of elements to be processed is the
150101022af5SSjoerd Meijer // operand of the VCTP instruction in the vector body, see getCount(), which is
150201022af5SSjoerd Meijer // register $r3 in this example:
150301022af5SSjoerd Meijer //
150401022af5SSjoerd Meijer //   $lr = big-itercount-expression
150501022af5SSjoerd Meijer //   ..
1506b2ac9681SDavid Green //   $lr = t2DoLoopStart renamable $lr
150701022af5SSjoerd Meijer //   vector.body:
150801022af5SSjoerd Meijer //     ..
150901022af5SSjoerd Meijer //     $vpr = MVE_VCTP32 renamable $r3
151001022af5SSjoerd Meijer //     renamable $lr = t2LoopDec killed renamable $lr, 1
151101022af5SSjoerd Meijer //     t2LoopEnd renamable $lr, %vector.body
151201022af5SSjoerd Meijer //     tB %end
151301022af5SSjoerd Meijer //
151401022af5SSjoerd Meijer // What we would like achieve here is to replace the do-loop start pseudo
151501022af5SSjoerd Meijer // instruction t2DoLoopStart with:
151601022af5SSjoerd Meijer //
151701022af5SSjoerd Meijer //    $lr = MVE_DLSTP_32 killed renamable $r3
151801022af5SSjoerd Meijer //
151901022af5SSjoerd Meijer // Thus, $r3 which defines the number of elements, is written to $lr,
152001022af5SSjoerd Meijer // and then we want to delete the whole chain that used to define $lr,
152101022af5SSjoerd Meijer // see the comment below how this chain could look like.
152201022af5SSjoerd Meijer //
IterationCountDCE(LowOverheadLoop & LoLoop)152301022af5SSjoerd Meijer void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {
152401022af5SSjoerd Meijer   if (!LoLoop.IsTailPredicationLegal())
152501022af5SSjoerd Meijer     return;
152601022af5SSjoerd Meijer 
15275618e9beSSam Parker   LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");
1528a67eb221SSam Parker 
1529e7a6df68SDavid Green   MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 1);
15305618e9beSSam Parker   if (!Def) {
15315618e9beSSam Parker     LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
15325618e9beSSam Parker     return;
15335618e9beSSam Parker   }
15345618e9beSSam Parker 
15355618e9beSSam Parker   // Collect and remove the users of iteration count.
15365618e9beSSam Parker   SmallPtrSet<MachineInstr*, 4> Killed  = { LoLoop.Start, LoLoop.Dec,
1537dfa2c14bSSam Parker                                             LoLoop.End };
1538779a8a02SSam Parker   if (!TryRemove(Def, *RDA, LoLoop.ToRemove, Killed))
15393d1d0891SSam Parker     LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n");
154042350cd8SSam Parker }
154142350cd8SSam Parker 
ExpandLoopStart(LowOverheadLoop & LoLoop)154201022af5SSjoerd Meijer MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
154301022af5SSjoerd Meijer   LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n");
154401022af5SSjoerd Meijer   // When using tail-predication, try to delete the dead code that was used to
154501022af5SSjoerd Meijer   // calculate the number of loop iterations.
154601022af5SSjoerd Meijer   IterationCountDCE(LoLoop);
154701022af5SSjoerd Meijer 
1548dfa2c14bSSam Parker   MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt;
15498978c12bSSam Parker   MachineInstr *Start = LoLoop.Start;
1550dfa2c14bSSam Parker   MachineBasicBlock *MBB = LoLoop.StartInsertBB;
155128166816SSam Parker   unsigned Opc = LoLoop.getStartOpcode();
1552c466c5faSFangrui Song   MachineOperand &Count = LoLoop.getLoopStartOperand();
15538978c12bSSam Parker 
155448230355SDavid Green   // A DLS lr, lr we needn't emit
155548230355SDavid Green   MachineInstr* NewStart;
155648230355SDavid Green   if (Opc == ARM::t2DLS && Count.isReg() && Count.getReg() == ARM::LR) {
155748230355SDavid Green     LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr");
155848230355SDavid Green     NewStart = nullptr;
155948230355SDavid Green   } else {
1560a6fd919cSSam Parker     MachineInstrBuilder MIB =
1561dfa2c14bSSam Parker       BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc));
1562a6fd919cSSam Parker 
1563a6fd919cSSam Parker     MIB.addDef(ARM::LR);
156428166816SSam Parker     MIB.add(Count);
1565bee2f618SDavid Green     if (isWhileLoopStart(*Start))
1566bee2f618SDavid Green       MIB.addMBB(getWhileLoopStartTargetBB(*Start));
156798722691SSam Parker 
156898722691SSam Parker     LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
156948230355SDavid Green     NewStart = &*MIB;
157048230355SDavid Green   }
157148230355SDavid Green 
157248230355SDavid Green   LoLoop.ToRemove.insert(Start);
157348230355SDavid Green   return NewStart;
15748978c12bSSam Parker }
15758978c12bSSam Parker 
ConvertVPTBlocks(LowOverheadLoop & LoLoop)157640425183SSam Parker void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
157740425183SSam Parker   auto RemovePredicate = [](MachineInstr *MI) {
1578f22b4c71SVictor Campos     if (MI->isDebugInstr())
1579f22b4c71SVictor Campos       return;
158040425183SSam Parker     LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
1581f22b4c71SVictor Campos     int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
1582f22b4c71SVictor Campos     assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction");
1583bad6032bSSam Parker     assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
158440425183SSam Parker            "Expected Then predicate!");
1585bad6032bSSam Parker     MI->getOperand(PIdx).setImm(ARMVCC::None);
1586bad6032bSSam Parker     MI->getOperand(PIdx + 1).setReg(0);
158740425183SSam Parker   };
158840425183SSam Parker 
158940425183SSam Parker   for (auto &Block : LoLoop.getVPTBlocks()) {
1590b4fa884aSSam Parker     SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
1591b4fa884aSSam Parker 
1592da2e4728SSam Tebbs     auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) {
1593da2e4728SSam Tebbs       assert(TheVCMP && "Replacing a removed or non-existent VCMP");
1594da2e4728SSam Tebbs       // Replace the VCMP with a VPT
1595da2e4728SSam Tebbs       MachineInstrBuilder MIB =
1596da2e4728SSam Tebbs           BuildMI(*At->getParent(), At, At->getDebugLoc(),
1597da2e4728SSam Tebbs                   TII->get(VCMPOpcodeToVPT(TheVCMP->getOpcode())));
1598da2e4728SSam Tebbs       MIB.addImm(ARMVCC::Then);
1599da2e4728SSam Tebbs       // Register one
1600da2e4728SSam Tebbs       MIB.add(TheVCMP->getOperand(1));
1601da2e4728SSam Tebbs       // Register two
1602da2e4728SSam Tebbs       MIB.add(TheVCMP->getOperand(2));
1603da2e4728SSam Tebbs       // The comparison code, e.g. ge, eq, lt
1604da2e4728SSam Tebbs       MIB.add(TheVCMP->getOperand(3));
1605da2e4728SSam Tebbs       LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
1606da2e4728SSam Tebbs       LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
1607da2e4728SSam Tebbs       LoLoop.ToRemove.insert(TheVCMP);
1608da2e4728SSam Tebbs       TheVCMP = nullptr;
1609da2e4728SSam Tebbs     };
1610da2e4728SSam Tebbs 
1611b4fa884aSSam Parker     if (VPTState::isEntryPredicatedOnVCTP(Block, /*exclusive*/ true)) {
1612da2e4728SSam Tebbs       MachineInstr *VPST = Insts.front();
1613b4fa884aSSam Parker       if (VPTState::hasUniformPredicate(Block)) {
1614b4fa884aSSam Parker         // A vpt block starting with VPST, is only predicated upon vctp and has no
1615b4fa884aSSam Parker         // internal vpr defs:
1616b4fa884aSSam Parker         // - Remove vpst.
1617b4fa884aSSam Parker         // - Unpredicate the remaining instructions.
1618da2e4728SSam Tebbs         LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1619b4fa884aSSam Parker         for (unsigned i = 1; i < Insts.size(); ++i)
1620b4fa884aSSam Parker           RemovePredicate(Insts[i]);
1621b4fa884aSSam Parker       } else {
1622835251f7SPierre-vh         // The VPT block has a non-uniform predicate but it uses a vpst and its
1623835251f7SPierre-vh         // entry is guarded only by a vctp, which means we:
162440425183SSam Parker         // - Need to remove the original vpst.
162540425183SSam Parker         // - Then need to unpredicate any following instructions, until
162640425183SSam Parker         //   we come across the divergent vpr def.
162740425183SSam Parker         // - Insert a new vpst to predicate the instruction(s) that following
162840425183SSam Parker         //   the divergent vpr def.
1629b4fa884aSSam Parker         MachineInstr *Divergent = VPTState::getDivergent(Block);
1630f22b4c71SVictor Campos         MachineBasicBlock *MBB = Divergent->getParent();
163140a3f7e4SSam Tebbs         auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);
1632f22b4c71SVictor Campos         while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr())
1633f22b4c71SVictor Campos           ++DivergentNext;
1634f22b4c71SVictor Campos 
163540a3f7e4SSam Tebbs         bool DivergentNextIsPredicated =
1636f22b4c71SVictor Campos             DivergentNext != MBB->end() &&
163740a3f7e4SSam Tebbs             getVPTInstrPredicate(*DivergentNext) != ARMVCC::None;
163840a3f7e4SSam Tebbs 
163940a3f7e4SSam Tebbs         for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;
164040a3f7e4SSam Tebbs              I != E; ++I)
164140425183SSam Parker           RemovePredicate(&*I);
164240425183SSam Parker 
1643ef0b9f33SSam Tebbs         // Check if the instruction defining vpr is a vcmp so it can be combined
1644ef0b9f33SSam Tebbs         // with the VPST This should be the divergent instruction
164540a3f7e4SSam Tebbs         MachineInstr *VCMP =
164640a3f7e4SSam Tebbs             VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? Divergent : nullptr;
1647ef0b9f33SSam Tebbs 
164840a3f7e4SSam Tebbs         if (DivergentNextIsPredicated) {
164940a3f7e4SSam Tebbs           // Insert a VPST at the divergent only if the next instruction
165040a3f7e4SSam Tebbs           // would actually use it. A VCMP following a VPST can be
165140a3f7e4SSam Tebbs           // merged into a VPT so do that instead if the VCMP exists.
165240a3f7e4SSam Tebbs           if (!VCMP) {
165340a3f7e4SSam Tebbs             // Create a VPST (with a null mask for now, we'll recompute it
165440a3f7e4SSam Tebbs             // later)
165540a3f7e4SSam Tebbs             MachineInstrBuilder MIB =
165640a3f7e4SSam Tebbs                 BuildMI(*Divergent->getParent(), Divergent,
165700ee52aeSSam Parker                         Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST));
165824bf8063SPierre-vh             MIB.addImm(0);
165940425183SSam Parker             LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
166024bf8063SPierre-vh             LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
166140a3f7e4SSam Tebbs           } else {
166240a3f7e4SSam Tebbs             // No RDA checks are necessary here since the VPST would have been
1663da2e4728SSam Tebbs             // directly after the VCMP
1664da2e4728SSam Tebbs             ReplaceVCMPWithVPT(VCMP, VCMP);
1665da2e4728SSam Tebbs           }
166640a3f7e4SSam Tebbs         }
166740a3f7e4SSam Tebbs       }
166840a3f7e4SSam Tebbs       LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
166940a3f7e4SSam Tebbs       LoLoop.ToRemove.insert(VPST);
1670b4fa884aSSam Parker     } else if (Block.containsVCTP()) {
16711de3e7fdSDavid Green       // The vctp will be removed, so either the entire block will be dead or
16721de3e7fdSDavid Green       // the block mask of the vp(s)t will need to be recomputed.
16731de3e7fdSDavid Green       MachineInstr *VPST = Insts.front();
16741de3e7fdSDavid Green       if (Block.size() == 2) {
16751de3e7fdSDavid Green         assert(VPST->getOpcode() == ARM::MVE_VPST &&
16761de3e7fdSDavid Green                "Found a VPST in an otherwise empty vpt block");
16771de3e7fdSDavid Green         LoLoop.ToRemove.insert(VPST);
16781de3e7fdSDavid Green       } else
16791de3e7fdSDavid Green         LoLoop.BlockMasksToRecompute.insert(VPST);
1680da2e4728SSam Tebbs     } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) {
1681da2e4728SSam Tebbs       // If this block starts with a VPST then attempt to merge it with the
1682da2e4728SSam Tebbs       // preceeding un-merged VCMP into a VPT. This VCMP comes from a VPT
1683da2e4728SSam Tebbs       // block that no longer exists
1684da2e4728SSam Tebbs       MachineInstr *VPST = Insts.front();
1685da2e4728SSam Tebbs       auto Next = ++MachineBasicBlock::iterator(VPST);
1686da2e4728SSam Tebbs       assert(getVPTInstrPredicate(*Next) != ARMVCC::None &&
1687da2e4728SSam Tebbs              "The instruction after a VPST must be predicated");
1688f45c052cSMikhail Goncharov       (void)Next;
1689da2e4728SSam Tebbs       MachineInstr *VprDef = RDA->getUniqueReachingMIDef(VPST, ARM::VPR);
1690da2e4728SSam Tebbs       if (VprDef && VCMPOpcodeToVPT(VprDef->getOpcode()) &&
1691da2e4728SSam Tebbs           !LoLoop.ToRemove.contains(VprDef)) {
1692da2e4728SSam Tebbs         MachineInstr *VCMP = VprDef;
1693da2e4728SSam Tebbs         // The VCMP and VPST can only be merged if the VCMP's operands will have
16948ecb015eSSam Tebbs         // the same values at the VPST.
16958ecb015eSSam Tebbs         // If any of the instructions between the VCMP and VPST are predicated
16968ecb015eSSam Tebbs         // then a different code path is expected to have merged the VCMP and
16978ecb015eSSam Tebbs         // VPST already.
169826bd534aSKazu Hirata         if (std::none_of(++MachineBasicBlock::iterator(VCMP),
1699f45c052cSMikhail Goncharov                          MachineBasicBlock::iterator(VPST), hasVPRUse) &&
17008ecb015eSSam Tebbs             RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(1).getReg()) &&
17018ecb015eSSam Tebbs             RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(2).getReg())) {
1702da2e4728SSam Tebbs           ReplaceVCMPWithVPT(VCMP, VPST);
1703da2e4728SSam Tebbs           LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1704da2e4728SSam Tebbs           LoLoop.ToRemove.insert(VPST);
1705da2e4728SSam Tebbs         }
1706da2e4728SSam Tebbs       }
1707835251f7SPierre-vh     }
1708835251f7SPierre-vh   }
1709b4fa884aSSam Parker 
1710b4fa884aSSam Parker   LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end());
17118978c12bSSam Parker }
17128978c12bSSam Parker 
Expand(LowOverheadLoop & LoLoop)17138978c12bSSam Parker void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
1714a6fd919cSSam Parker 
1715a6fd919cSSam Parker   // Combine the LoopDec and LoopEnd instructions into LE(TP).
17168978c12bSSam Parker   auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
17178978c12bSSam Parker     MachineInstr *End = LoLoop.End;
1718a6fd919cSSam Parker     MachineBasicBlock *MBB = End->getParent();
17198978c12bSSam Parker     unsigned Opc = LoLoop.IsTailPredicationLegal() ?
17208978c12bSSam Parker       ARM::MVE_LETP : ARM::t2LEUpdate;
1721a6fd919cSSam Parker     MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
17228978c12bSSam Parker                                       TII->get(Opc));
1723a6fd919cSSam Parker     MIB.addDef(ARM::LR);
17240447f350SDavid Green     unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0;
17250447f350SDavid Green     MIB.add(End->getOperand(Off + 0));
17260447f350SDavid Green     MIB.add(End->getOperand(Off + 1));
1727a6fd919cSSam Parker     LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
17285618e9beSSam Parker     LoLoop.ToRemove.insert(LoLoop.Dec);
17295618e9beSSam Parker     LoLoop.ToRemove.insert(End);
173098722691SSam Parker     return &*MIB;
1731a6fd919cSSam Parker   };
1732a6fd919cSSam Parker 
173398722691SSam Parker   // TODO: We should be able to automatically remove these branches before we
173498722691SSam Parker   // get here - probably by teaching analyzeBranch about the pseudo
173598722691SSam Parker   // instructions.
173698722691SSam Parker   // If there is an unconditional branch, after I, that just branches to the
173798722691SSam Parker   // next block, remove it.
173898722691SSam Parker   auto RemoveDeadBranch = [](MachineInstr *I) {
173998722691SSam Parker     MachineBasicBlock *BB = I->getParent();
174098722691SSam Parker     MachineInstr *Terminator = &BB->instr_back();
174198722691SSam Parker     if (Terminator->isUnconditionalBranch() && I != Terminator) {
174298722691SSam Parker       MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
174398722691SSam Parker       if (BB->isLayoutSuccessor(Succ)) {
174498722691SSam Parker         LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
174598722691SSam Parker         Terminator->eraseFromParent();
174698722691SSam Parker       }
174798722691SSam Parker     }
174898722691SSam Parker   };
174998722691SSam Parker 
175073346f58SDavid Green   // And VMOVCopies need to become 2xVMOVD for tail predication to be valid.
175173346f58SDavid Green   // Anything other MQPRCopy can be converted to MVE_VORR later on.
175273346f58SDavid Green   auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, 4> &VMOVCopies) {
175373346f58SDavid Green     for (auto *MI : VMOVCopies) {
175473346f58SDavid Green       LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI);
175573346f58SDavid Green       assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
175673346f58SDavid Green       MachineBasicBlock *MBB = MI->getParent();
175773346f58SDavid Green       Register Dst = MI->getOperand(0).getReg();
175873346f58SDavid Green       Register Src = MI->getOperand(1).getReg();
175973346f58SDavid Green       auto MIB1 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
176073346f58SDavid Green                           ARM::D0 + (Dst - ARM::Q0) * 2)
176173346f58SDavid Green                       .addReg(ARM::D0 + (Src - ARM::Q0) * 2)
176273346f58SDavid Green                       .add(predOps(ARMCC::AL));
176353801a59SMichael Forster       (void)MIB1;
176473346f58SDavid Green       LLVM_DEBUG(dbgs() << " into " << *MIB1);
176573346f58SDavid Green       auto MIB2 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
176673346f58SDavid Green                           ARM::D0 + (Dst - ARM::Q0) * 2 + 1)
176773346f58SDavid Green                       .addReg(ARM::D0 + (Src - ARM::Q0) * 2 + 1)
176873346f58SDavid Green                       .add(predOps(ARMCC::AL));
176973346f58SDavid Green       LLVM_DEBUG(dbgs() << " and  " << *MIB2);
177053801a59SMichael Forster       (void)MIB2;
177173346f58SDavid Green       MI->eraseFromParent();
177273346f58SDavid Green     }
177373346f58SDavid Green   };
177473346f58SDavid Green 
17758978c12bSSam Parker   if (LoLoop.Revert) {
1776bee2f618SDavid Green     if (isWhileLoopStart(*LoLoop.Start))
17778978c12bSSam Parker       RevertWhile(LoLoop.Start);
1778775b2f59SSam Parker     else
1779b2ac9681SDavid Green       RevertDo(LoLoop.Start);
17800447f350SDavid Green     if (LoLoop.Dec == LoLoop.End)
17810447f350SDavid Green       RevertLoopEndDec(LoLoop.End);
17820447f350SDavid Green     else
17830447f350SDavid Green       RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec));
1784a6fd919cSSam Parker   } else {
178573346f58SDavid Green     ExpandVMOVCopies(LoLoop.VMOVCopies);
17868978c12bSSam Parker     LoLoop.Start = ExpandLoopStart(LoLoop);
178748230355SDavid Green     if (LoLoop.Start)
17888978c12bSSam Parker       RemoveDeadBranch(LoLoop.Start);
17898978c12bSSam Parker     LoLoop.End = ExpandLoopEnd(LoLoop);
17908978c12bSSam Parker     RemoveDeadBranch(LoLoop.End);
1791b30adfb5SSam Parker     if (LoLoop.IsTailPredicationLegal())
179240425183SSam Parker       ConvertVPTBlocks(LoLoop);
179342350cd8SSam Parker     for (auto *I : LoLoop.ToRemove) {
179442350cd8SSam Parker       LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
179542350cd8SSam Parker       I->eraseFromParent();
1796a6fd919cSSam Parker     }
179724bf8063SPierre-vh     for (auto *I : LoLoop.BlockMasksToRecompute) {
179824bf8063SPierre-vh       LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I);
179924bf8063SPierre-vh       recomputeVPTBlockMask(*I);
180024bf8063SPierre-vh       LLVM_DEBUG(dbgs() << "           ... done: " << *I);
180124bf8063SPierre-vh     }
1802a6fd919cSSam Parker   }
1803760b1751SSam Parker 
1804fd01b2f4SSam Parker   PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
1805760b1751SSam Parker   DFS.ProcessLoop();
1806760b1751SSam Parker   const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
1807760b1751SSam Parker   for (auto *MBB : PostOrder) {
1808760b1751SSam Parker     recomputeLiveIns(*MBB);
1809760b1751SSam Parker     // FIXME: For some reason, the live-in print order is non-deterministic for
1810760b1751SSam Parker     // our tests and I can't out why... So just sort them.
1811760b1751SSam Parker     MBB->sortUniqueLiveIns();
1812760b1751SSam Parker   }
1813760b1751SSam Parker 
1814760b1751SSam Parker   for (auto *MBB : reverse(PostOrder))
1815760b1751SSam Parker     recomputeLivenessFlags(*MBB);
18161d06e75dSSam Parker 
18171d06e75dSSam Parker   // We've moved, removed and inserted new instructions, so update RDA.
18181d06e75dSSam Parker   RDA->reset();
1819d97cf1f8SSjoerd Meijer }
1820a6fd919cSSam Parker 
RevertNonLoops()182136c92227SSam Parker bool ARMLowOverheadLoops::RevertNonLoops() {
18224379a400SSam Parker   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
18234379a400SSam Parker   bool Changed = false;
18244379a400SSam Parker 
182536c92227SSam Parker   for (auto &MBB : *MF) {
18264379a400SSam Parker     SmallVector<MachineInstr*, 4> Starts;
18274379a400SSam Parker     SmallVector<MachineInstr*, 4> Decs;
18284379a400SSam Parker     SmallVector<MachineInstr*, 4> Ends;
18290447f350SDavid Green     SmallVector<MachineInstr *, 4> EndDecs;
183073346f58SDavid Green     SmallVector<MachineInstr *, 4> MQPRCopies;
18314379a400SSam Parker 
18324379a400SSam Parker     for (auto &I : MBB) {
1833049f9672SSjoerd Meijer       if (isLoopStart(I))
18344379a400SSam Parker         Starts.push_back(&I);
18354379a400SSam Parker       else if (I.getOpcode() == ARM::t2LoopDec)
18364379a400SSam Parker         Decs.push_back(&I);
18374379a400SSam Parker       else if (I.getOpcode() == ARM::t2LoopEnd)
18384379a400SSam Parker         Ends.push_back(&I);
18390447f350SDavid Green       else if (I.getOpcode() == ARM::t2LoopEndDec)
18400447f350SDavid Green         EndDecs.push_back(&I);
184173346f58SDavid Green       else if (I.getOpcode() == ARM::MQPRCopy)
184273346f58SDavid Green         MQPRCopies.push_back(&I);
18434379a400SSam Parker     }
18444379a400SSam Parker 
184573346f58SDavid Green     if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() &&
184673346f58SDavid Green         MQPRCopies.empty())
18474379a400SSam Parker       continue;
18484379a400SSam Parker 
18494379a400SSam Parker     Changed = true;
18504379a400SSam Parker 
18514379a400SSam Parker     for (auto *Start : Starts) {
1852bee2f618SDavid Green       if (isWhileLoopStart(*Start))
18534379a400SSam Parker         RevertWhile(Start);
18544379a400SSam Parker       else
1855b2ac9681SDavid Green         RevertDo(Start);
18564379a400SSam Parker     }
18574379a400SSam Parker     for (auto *Dec : Decs)
18584379a400SSam Parker       RevertLoopDec(Dec);
18594379a400SSam Parker 
18604379a400SSam Parker     for (auto *End : Ends)
18614379a400SSam Parker       RevertLoopEnd(End);
18620447f350SDavid Green     for (auto *End : EndDecs)
18630447f350SDavid Green       RevertLoopEndDec(End);
186473346f58SDavid Green     for (auto *MI : MQPRCopies) {
186573346f58SDavid Green       LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI);
186673346f58SDavid Green       assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
186773346f58SDavid Green       MachineBasicBlock *MBB = MI->getParent();
186873346f58SDavid Green       auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::MVE_VORR),
186973346f58SDavid Green                          MI->getOperand(0).getReg())
187073346f58SDavid Green                      .add(MI->getOperand(1))
187173346f58SDavid Green                      .add(MI->getOperand(1));
187273346f58SDavid Green       addUnpredicatedMveVpredROp(MIB, MI->getOperand(0).getReg());
187373346f58SDavid Green       MI->eraseFromParent();
187473346f58SDavid Green     }
18754379a400SSam Parker   }
18764379a400SSam Parker   return Changed;
18774379a400SSam Parker }
18784379a400SSam Parker 
createARMLowOverheadLoopsPass()1879a6fd919cSSam Parker FunctionPass *llvm::createARMLowOverheadLoopsPass() {
1880a6fd919cSSam Parker   return new ARMLowOverheadLoops();
1881a6fd919cSSam Parker }
1882