1a6fd919cSSam Parker //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
2a6fd919cSSam Parker //
3a6fd919cSSam Parker // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4a6fd919cSSam Parker // See https://llvm.org/LICENSE.txt for license information.
5a6fd919cSSam Parker // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6a6fd919cSSam Parker //
7a6fd919cSSam Parker //===----------------------------------------------------------------------===//
8a6fd919cSSam Parker /// \file
9a6fd919cSSam Parker /// Finalize v8.1-m low-overhead loops by converting the associated pseudo
10a6fd919cSSam Parker /// instructions into machine operations.
11a6fd919cSSam Parker /// The expectation is that the loop contains three pseudo instructions:
12a6fd919cSSam Parker /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
13a6fd919cSSam Parker /// form should be in the preheader, whereas the while form should be in the
14173de037SSam Parker /// preheaders only predecessor.
15a6fd919cSSam Parker /// - t2LoopDec - placed within in the loop body.
16a6fd919cSSam Parker /// - t2LoopEnd - the loop latch terminator.
17a6fd919cSSam Parker ///
180efc9e5aSSjoerd Meijer /// In addition to this, we also look for the presence of the VCTP instruction,
190efc9e5aSSjoerd Meijer /// which determines whether we can generated the tail-predicated low-overhead
200efc9e5aSSjoerd Meijer /// loop form.
210efc9e5aSSjoerd Meijer ///
229c91d79dSSam Parker /// Assumptions and Dependencies:
239c91d79dSSam Parker /// Low-overhead loops are constructed and executed using a setup instruction:
249c91d79dSSam Parker /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
259c91d79dSSam Parker /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
269c91d79dSSam Parker /// but fixed polarity: WLS can only branch forwards and LE can only branch
279c91d79dSSam Parker /// backwards. These restrictions mean that this pass is dependent upon block
289c91d79dSSam Parker /// layout and block sizes, which is why it's the last pass to run. The same is
299c91d79dSSam Parker /// true for ConstantIslands, but this pass does not increase the size of the
309c91d79dSSam Parker /// basic blocks, nor does it change the CFG. Instructions are mainly removed
319c91d79dSSam Parker /// during the transform and pseudo instructions are replaced by real ones. In
329c91d79dSSam Parker /// some cases, when we have to revert to a 'normal' loop, we have to introduce
339c91d79dSSam Parker /// multiple instructions for a single pseudo (see RevertWhile and
34fad70c30SDavid Green /// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd
359c91d79dSSam Parker /// are defined to be as large as this maximum sequence of replacement
369c91d79dSSam Parker /// instructions.
379c91d79dSSam Parker ///
38835251f7SPierre-vh /// A note on VPR.P0 (the lane mask):
39835251f7SPierre-vh /// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a
40835251f7SPierre-vh /// "VPT Active" context (which includes low-overhead loops and vpt blocks).
41835251f7SPierre-vh /// They will simply "and" the result of their calculation with the current
42835251f7SPierre-vh /// value of VPR.P0. You can think of it like this:
43835251f7SPierre-vh /// \verbatim
44835251f7SPierre-vh /// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs
45835251f7SPierre-vh /// VPR.P0 &= Value
46835251f7SPierre-vh /// else
47835251f7SPierre-vh /// VPR.P0 = Value
48835251f7SPierre-vh /// \endverbatim
49835251f7SPierre-vh /// When we're inside the low-overhead loop (between DLSTP and LETP), we always
50835251f7SPierre-vh /// fall in the "VPT active" case, so we can consider that all VPR writes by
51835251f7SPierre-vh /// one of those instruction is actually a "and".
52a6fd919cSSam Parker //===----------------------------------------------------------------------===//
53a6fd919cSSam Parker
54a6fd919cSSam Parker #include "ARM.h"
55a6fd919cSSam Parker #include "ARMBaseInstrInfo.h"
56a6fd919cSSam Parker #include "ARMBaseRegisterInfo.h"
57a6fd919cSSam Parker #include "ARMBasicBlockInfo.h"
58a6fd919cSSam Parker #include "ARMSubtarget.h"
59d9bf6245SDavid Green #include "MVETailPredUtils.h"
60bad6032bSSam Parker #include "Thumb2InstrInfo.h"
6140425183SSam Parker #include "llvm/ADT/SetOperations.h"
62ed98c1b3Sserge-sans-paille #include "llvm/ADT/SetVector.h"
63acbc9aedSSam Parker #include "llvm/ADT/SmallSet.h"
6442350cd8SSam Parker #include "llvm/CodeGen/LivePhysRegs.h"
65989f1c72Sserge-sans-paille #include "llvm/CodeGen/MachineFrameInfo.h"
66a6fd919cSSam Parker #include "llvm/CodeGen/MachineFunctionPass.h"
67a6fd919cSSam Parker #include "llvm/CodeGen/MachineLoopInfo.h"
68d97cf1f8SSjoerd Meijer #include "llvm/CodeGen/MachineLoopUtils.h"
69a6fd919cSSam Parker #include "llvm/CodeGen/MachineRegisterInfo.h"
70cced971fSSam Parker #include "llvm/CodeGen/Passes.h"
71cced971fSSam Parker #include "llvm/CodeGen/ReachingDefAnalysis.h"
728978c12bSSam Parker #include "llvm/MC/MCInstrDesc.h"
73a6fd919cSSam Parker
74a6fd919cSSam Parker using namespace llvm;
75a6fd919cSSam Parker
76a6fd919cSSam Parker #define DEBUG_TYPE "arm-low-overhead-loops"
77a6fd919cSSam Parker #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
78a6fd919cSSam Parker
792fc690acSSjoerd Meijer static cl::opt<bool>
802fc690acSSjoerd Meijer DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden,
812fc690acSSjoerd Meijer cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass"),
822fc690acSSjoerd Meijer cl::init(false));
832fc690acSSjoerd Meijer
isVectorPredicated(MachineInstr * MI)84a0c1dcc3SSam Parker static bool isVectorPredicated(MachineInstr *MI) {
85a0c1dcc3SSam Parker int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
86a0c1dcc3SSam Parker return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR;
87a0c1dcc3SSam Parker }
88a0c1dcc3SSam Parker
isVectorPredicate(MachineInstr * MI)89a0c1dcc3SSam Parker static bool isVectorPredicate(MachineInstr *MI) {
90a0c1dcc3SSam Parker return MI->findRegisterDefOperandIdx(ARM::VPR) != -1;
91a0c1dcc3SSam Parker }
92a0c1dcc3SSam Parker
hasVPRUse(MachineInstr & MI)93da2e4728SSam Tebbs static bool hasVPRUse(MachineInstr &MI) {
94da2e4728SSam Tebbs return MI.findRegisterUseOperandIdx(ARM::VPR) != -1;
95a0c1dcc3SSam Parker }
96a0c1dcc3SSam Parker
isDomainMVE(MachineInstr * MI)97a0c1dcc3SSam Parker static bool isDomainMVE(MachineInstr *MI) {
98a0c1dcc3SSam Parker uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
99a0c1dcc3SSam Parker return Domain == ARMII::DomainMVE;
100a0c1dcc3SSam Parker }
101a0c1dcc3SSam Parker
getVecSize(const MachineInstr & MI)10202cd8a6bSDavid Green static int getVecSize(const MachineInstr &MI) {
10302cd8a6bSDavid Green const MCInstrDesc &MCID = MI.getDesc();
10402cd8a6bSDavid Green uint64_t Flags = MCID.TSFlags;
10502cd8a6bSDavid Green return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift;
10602cd8a6bSDavid Green }
10702cd8a6bSDavid Green
shouldInspect(MachineInstr & MI)108a0c1dcc3SSam Parker static bool shouldInspect(MachineInstr &MI) {
10902cd8a6bSDavid Green if (MI.isDebugInstr())
11002cd8a6bSDavid Green return false;
111da2e4728SSam Tebbs return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI);
112a0c1dcc3SSam Parker }
113a0c1dcc3SSam Parker
114a6fd919cSSam Parker namespace {
115a6fd919cSSam Parker
11694cacebcSSam Parker using InstSet = SmallPtrSetImpl<MachineInstr *>;
11794cacebcSSam Parker
118760b1751SSam Parker class PostOrderLoopTraversal {
119760b1751SSam Parker MachineLoop &ML;
120760b1751SSam Parker MachineLoopInfo &MLI;
121760b1751SSam Parker SmallPtrSet<MachineBasicBlock*, 4> Visited;
122760b1751SSam Parker SmallVector<MachineBasicBlock*, 4> Order;
123760b1751SSam Parker
124760b1751SSam Parker public:
PostOrderLoopTraversal(MachineLoop & ML,MachineLoopInfo & MLI)125760b1751SSam Parker PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI)
126760b1751SSam Parker : ML(ML), MLI(MLI) { }
127760b1751SSam Parker
getOrder() const128760b1751SSam Parker const SmallVectorImpl<MachineBasicBlock*> &getOrder() const {
129760b1751SSam Parker return Order;
130760b1751SSam Parker }
131760b1751SSam Parker
132760b1751SSam Parker // Visit all the blocks within the loop, as well as exit blocks and any
133760b1751SSam Parker // blocks properly dominating the header.
ProcessLoop()134760b1751SSam Parker void ProcessLoop() {
135760b1751SSam Parker std::function<void(MachineBasicBlock*)> Search = [this, &Search]
136760b1751SSam Parker (MachineBasicBlock *MBB) -> void {
137760b1751SSam Parker if (Visited.count(MBB))
138760b1751SSam Parker return;
139760b1751SSam Parker
140760b1751SSam Parker Visited.insert(MBB);
141760b1751SSam Parker for (auto *Succ : MBB->successors()) {
142760b1751SSam Parker if (!ML.contains(Succ))
143760b1751SSam Parker continue;
144760b1751SSam Parker Search(Succ);
145760b1751SSam Parker }
146760b1751SSam Parker Order.push_back(MBB);
147760b1751SSam Parker };
148760b1751SSam Parker
149760b1751SSam Parker // Insert exit blocks.
150760b1751SSam Parker SmallVector<MachineBasicBlock*, 2> ExitBlocks;
151760b1751SSam Parker ML.getExitBlocks(ExitBlocks);
15205444417SKazu Hirata append_range(Order, ExitBlocks);
153760b1751SSam Parker
154760b1751SSam Parker // Then add the loop body.
155760b1751SSam Parker Search(ML.getHeader());
156760b1751SSam Parker
157760b1751SSam Parker // Then try the preheader and its predecessors.
158760b1751SSam Parker std::function<void(MachineBasicBlock*)> GetPredecessor =
159760b1751SSam Parker [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void {
160760b1751SSam Parker Order.push_back(MBB);
161760b1751SSam Parker if (MBB->pred_size() == 1)
162760b1751SSam Parker GetPredecessor(*MBB->pred_begin());
163760b1751SSam Parker };
164760b1751SSam Parker
165760b1751SSam Parker if (auto *Preheader = ML.getLoopPreheader())
166760b1751SSam Parker GetPredecessor(Preheader);
167543406a6SDavid Green else if (auto *Preheader = MLI.findLoopPreheader(&ML, true, true))
168760b1751SSam Parker GetPredecessor(Preheader);
169760b1751SSam Parker }
170760b1751SSam Parker };
171760b1751SSam Parker
17240425183SSam Parker struct PredicatedMI {
17340425183SSam Parker MachineInstr *MI = nullptr;
17440425183SSam Parker SetVector<MachineInstr*> Predicates;
17540425183SSam Parker
17640425183SSam Parker public:
PredicatedMI__anon59c8a0920111::PredicatedMI177835251f7SPierre-vh PredicatedMI(MachineInstr *I, SetVector<MachineInstr *> &Preds) : MI(I) {
178835251f7SPierre-vh assert(I && "Instruction must not be null!");
179835251f7SPierre-vh Predicates.insert(Preds.begin(), Preds.end());
180835251f7SPierre-vh }
18140425183SSam Parker };
18240425183SSam Parker
183b4fa884aSSam Parker // Represent the current state of the VPR and hold all instances which
184b4fa884aSSam Parker // represent a VPT block, which is a list of instructions that begins with a
185b4fa884aSSam Parker // VPT/VPST and has a maximum of four proceeding instructions. All
186b4fa884aSSam Parker // instructions within the block are predicated upon the vpr and we allow
187b4fa884aSSam Parker // instructions to define the vpr within in the block too.
188b4fa884aSSam Parker class VPTState {
189b4fa884aSSam Parker friend struct LowOverheadLoop;
190b4fa884aSSam Parker
191b4fa884aSSam Parker SmallVector<MachineInstr *, 4> Insts;
192b4fa884aSSam Parker
193b4fa884aSSam Parker static SmallVector<VPTState, 4> Blocks;
194b4fa884aSSam Parker static SetVector<MachineInstr *> CurrentPredicates;
195b4fa884aSSam Parker static std::map<MachineInstr *,
196b4fa884aSSam Parker std::unique_ptr<PredicatedMI>> PredicatedInsts;
197b4fa884aSSam Parker
CreateVPTBlock(MachineInstr * MI)198b4fa884aSSam Parker static void CreateVPTBlock(MachineInstr *MI) {
199195c22f2SSam Parker assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR))
200195c22f2SSam Parker && "Can't begin VPT without predicate");
201b4fa884aSSam Parker Blocks.emplace_back(MI);
202b4fa884aSSam Parker // The execution of MI is predicated upon the current set of instructions
203b4fa884aSSam Parker // that are AND'ed together to form the VPR predicate value. In the case
204b4fa884aSSam Parker // that MI is a VPT, CurrentPredicates will also just be MI.
205b4fa884aSSam Parker PredicatedInsts.emplace(
206b4fa884aSSam Parker MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
207b4fa884aSSam Parker }
208b4fa884aSSam Parker
reset()209b4fa884aSSam Parker static void reset() {
210b4fa884aSSam Parker Blocks.clear();
211b4fa884aSSam Parker PredicatedInsts.clear();
212b4fa884aSSam Parker CurrentPredicates.clear();
213b4fa884aSSam Parker }
214b4fa884aSSam Parker
addInst(MachineInstr * MI)215b4fa884aSSam Parker static void addInst(MachineInstr *MI) {
216b4fa884aSSam Parker Blocks.back().insert(MI);
217b4fa884aSSam Parker PredicatedInsts.emplace(
218b4fa884aSSam Parker MI, std::make_unique<PredicatedMI>(MI, CurrentPredicates));
219b4fa884aSSam Parker }
220b4fa884aSSam Parker
addPredicate(MachineInstr * MI)221b4fa884aSSam Parker static void addPredicate(MachineInstr *MI) {
222b4fa884aSSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
223b4fa884aSSam Parker CurrentPredicates.insert(MI);
224b4fa884aSSam Parker }
225b4fa884aSSam Parker
resetPredicate(MachineInstr * MI)226b4fa884aSSam Parker static void resetPredicate(MachineInstr *MI) {
227b4fa884aSSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
228b4fa884aSSam Parker CurrentPredicates.clear();
229b4fa884aSSam Parker CurrentPredicates.insert(MI);
230b4fa884aSSam Parker }
23140425183SSam Parker
23240425183SSam Parker public:
23340425183SSam Parker // Have we found an instruction within the block which defines the vpr? If
23440425183SSam Parker // so, not all the instructions in the block will have the same predicate.
hasUniformPredicate(VPTState & Block)235b4fa884aSSam Parker static bool hasUniformPredicate(VPTState &Block) {
236b4fa884aSSam Parker return getDivergent(Block) == nullptr;
23740425183SSam Parker }
23840425183SSam Parker
239b4fa884aSSam Parker // If it exists, return the first internal instruction which modifies the
240b4fa884aSSam Parker // VPR.
getDivergent(VPTState & Block)241b4fa884aSSam Parker static MachineInstr *getDivergent(VPTState &Block) {
242b4fa884aSSam Parker SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
243b4fa884aSSam Parker for (unsigned i = 1; i < Insts.size(); ++i) {
244b4fa884aSSam Parker MachineInstr *Next = Insts[i];
245b4fa884aSSam Parker if (isVectorPredicate(Next))
246b4fa884aSSam Parker return Next; // Found an instruction altering the vpr.
247b4fa884aSSam Parker }
248b4fa884aSSam Parker return nullptr;
249835251f7SPierre-vh }
250835251f7SPierre-vh
251b4fa884aSSam Parker // Return whether the given instruction is predicated upon a VCTP.
isPredicatedOnVCTP(MachineInstr * MI,bool Exclusive=false)252b4fa884aSSam Parker static bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) {
253b4fa884aSSam Parker SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]->Predicates;
254b4fa884aSSam Parker if (Exclusive && Predicates.size() != 1)
255b4fa884aSSam Parker return false;
256c2bb9637SKazu Hirata return llvm::any_of(Predicates, isVCTP);
25740425183SSam Parker }
25840425183SSam Parker
259b4fa884aSSam Parker // Is the VPST, controlling the block entry, predicated upon a VCTP.
isEntryPredicatedOnVCTP(VPTState & Block,bool Exclusive=false)260b4fa884aSSam Parker static bool isEntryPredicatedOnVCTP(VPTState &Block,
261b4fa884aSSam Parker bool Exclusive = false) {
262b4fa884aSSam Parker SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
263b4fa884aSSam Parker return isPredicatedOnVCTP(Insts.front(), Exclusive);
264b4fa884aSSam Parker }
265b4fa884aSSam Parker
266a399d188SSam Parker // If this block begins with a VPT, we can check whether it's using
267a399d188SSam Parker // at least one predicated input(s), as well as possible loop invariant
268a399d188SSam Parker // which would result in it being implicitly predicated.
hasImplicitlyValidVPT(VPTState & Block,ReachingDefAnalysis & RDA)269a399d188SSam Parker static bool hasImplicitlyValidVPT(VPTState &Block,
270a399d188SSam Parker ReachingDefAnalysis &RDA) {
271a399d188SSam Parker SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
272a399d188SSam Parker MachineInstr *VPT = Insts.front();
273a399d188SSam Parker assert(isVPTOpcode(VPT->getOpcode()) &&
274a399d188SSam Parker "Expected VPT block to begin with VPT/VPST");
275a399d188SSam Parker
276a399d188SSam Parker if (VPT->getOpcode() == ARM::MVE_VPST)
277a399d188SSam Parker return false;
278a399d188SSam Parker
279a399d188SSam Parker auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) {
280a399d188SSam Parker MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx));
281a399d188SSam Parker return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op);
282a399d188SSam Parker };
283a399d188SSam Parker
284a399d188SSam Parker auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) {
285a399d188SSam Parker MachineOperand &MO = MI->getOperand(Idx);
286a399d188SSam Parker if (!MO.isReg() || !MO.getReg())
287a399d188SSam Parker return true;
288a399d188SSam Parker
289a399d188SSam Parker SmallPtrSet<MachineInstr *, 2> Defs;
290a399d188SSam Parker RDA.getGlobalReachingDefs(MI, MO.getReg(), Defs);
291a399d188SSam Parker if (Defs.empty())
292a399d188SSam Parker return true;
293a399d188SSam Parker
294a399d188SSam Parker for (auto *Def : Defs)
295a399d188SSam Parker if (Def->getParent() == VPT->getParent())
296a399d188SSam Parker return false;
297a399d188SSam Parker return true;
298a399d188SSam Parker };
299a399d188SSam Parker
300a399d188SSam Parker // Check that at least one of the operands is directly predicated on a
301a399d188SSam Parker // vctp and allow an invariant value too.
302a399d188SSam Parker return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) &&
303a399d188SSam Parker (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) &&
304a399d188SSam Parker (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2));
305a399d188SSam Parker }
306a399d188SSam Parker
isValid(ReachingDefAnalysis & RDA)307a399d188SSam Parker static bool isValid(ReachingDefAnalysis &RDA) {
308b4fa884aSSam Parker // All predication within the loop should be based on vctp. If the block
309b4fa884aSSam Parker // isn't predicated on entry, check whether the vctp is within the block
310b4fa884aSSam Parker // and that all other instructions are then predicated on it.
311b4fa884aSSam Parker for (auto &Block : Blocks) {
312a399d188SSam Parker if (isEntryPredicatedOnVCTP(Block, false) ||
313a399d188SSam Parker hasImplicitlyValidVPT(Block, RDA))
314b4fa884aSSam Parker continue;
315b4fa884aSSam Parker
316b4fa884aSSam Parker SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
3171de3e7fdSDavid Green // We don't know how to convert a block with just a VPT;VCTP into
3181de3e7fdSDavid Green // anything valid once we remove the VCTP. For now just bail out.
3191de3e7fdSDavid Green assert(isVPTOpcode(Insts.front()->getOpcode()) &&
3201de3e7fdSDavid Green "Expected VPT block to start with a VPST or VPT!");
3211de3e7fdSDavid Green if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST &&
3221de3e7fdSDavid Green isVCTP(Insts.back()))
3231de3e7fdSDavid Green return false;
3241de3e7fdSDavid Green
325b4fa884aSSam Parker for (auto *MI : Insts) {
326b4fa884aSSam Parker // Check that any internal VCTPs are 'Then' predicated.
327b4fa884aSSam Parker if (isVCTP(MI) && getVPTInstrPredicate(*MI) != ARMVCC::Then)
328b4fa884aSSam Parker return false;
329b4fa884aSSam Parker // Skip other instructions that build up the predicate.
330b4fa884aSSam Parker if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI))
331b4fa884aSSam Parker continue;
332b4fa884aSSam Parker // Check that any other instructions are predicated upon a vctp.
333b4fa884aSSam Parker // TODO: We could infer when VPTs are implicitly predicated on the
334b4fa884aSSam Parker // vctp (when the operands are predicated).
335b4fa884aSSam Parker if (!isPredicatedOnVCTP(MI)) {
336b4fa884aSSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
337b4fa884aSSam Parker return false;
338b4fa884aSSam Parker }
339b4fa884aSSam Parker }
340b4fa884aSSam Parker }
341b4fa884aSSam Parker return true;
342b4fa884aSSam Parker }
343b4fa884aSSam Parker
VPTState(MachineInstr * MI)344b4fa884aSSam Parker VPTState(MachineInstr *MI) { Insts.push_back(MI); }
345b4fa884aSSam Parker
insert(MachineInstr * MI)346b4fa884aSSam Parker void insert(MachineInstr *MI) {
347b4fa884aSSam Parker Insts.push_back(MI);
348b4fa884aSSam Parker // VPT/VPST + 4 predicated instructions.
349b4fa884aSSam Parker assert(Insts.size() <= 5 && "Too many instructions in VPT block!");
350b4fa884aSSam Parker }
351b4fa884aSSam Parker
containsVCTP() const352b4fa884aSSam Parker bool containsVCTP() const {
353c2bb9637SKazu Hirata return llvm::any_of(Insts, isVCTP);
35440425183SSam Parker }
35540425183SSam Parker
size() const35640425183SSam Parker unsigned size() const { return Insts.size(); }
getInsts()357b4fa884aSSam Parker SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; }
35840425183SSam Parker };
35940425183SSam Parker
3608978c12bSSam Parker struct LowOverheadLoop {
3618978c12bSSam Parker
362fd01b2f4SSam Parker MachineLoop &ML;
3633ee580d0SSam Parker MachineBasicBlock *Preheader = nullptr;
364fd01b2f4SSam Parker MachineLoopInfo &MLI;
365fd01b2f4SSam Parker ReachingDefAnalysis &RDA;
366de3e65e6SSam Parker const TargetRegisterInfo &TRI;
3673ee580d0SSam Parker const ARMBaseInstrInfo &TII;
3688978c12bSSam Parker MachineFunction *MF = nullptr;
369dfa2c14bSSam Parker MachineBasicBlock::iterator StartInsertPt;
370dfa2c14bSSam Parker MachineBasicBlock *StartInsertBB = nullptr;
3718978c12bSSam Parker MachineInstr *Start = nullptr;
3728978c12bSSam Parker MachineInstr *Dec = nullptr;
3738978c12bSSam Parker MachineInstr *End = nullptr;
37431f02ac6SSam Tebbs MachineOperand TPNumElements;
375b4fa884aSSam Parker SmallVector<MachineInstr *, 4> VCTPs;
37642350cd8SSam Parker SmallPtrSet<MachineInstr *, 4> ToRemove;
37724bf8063SPierre-vh SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute;
37802cd8a6bSDavid Green SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs;
37973346f58SDavid Green SmallPtrSet<MachineInstr *, 4> VMOVCopies;
3808978c12bSSam Parker bool Revert = false;
3818978c12bSSam Parker bool CannotTailPredicate = false;
3828978c12bSSam Parker
LowOverheadLoop__anon59c8a0920111::LowOverheadLoop383fd01b2f4SSam Parker LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
3843ee580d0SSam Parker ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI,
3853ee580d0SSam Parker const ARMBaseInstrInfo &TII)
38631f02ac6SSam Tebbs : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII),
38731f02ac6SSam Tebbs TPNumElements(MachineOperand::CreateImm(0)) {
388fd01b2f4SSam Parker MF = ML.getHeader()->getParent();
3893ee580d0SSam Parker if (auto *MBB = ML.getLoopPreheader())
3903ee580d0SSam Parker Preheader = MBB;
391543406a6SDavid Green else if (auto *MBB = MLI.findLoopPreheader(&ML, true, true))
3923ee580d0SSam Parker Preheader = MBB;
393b4fa884aSSam Parker VPTState::reset();
3948978c12bSSam Parker }
3958978c12bSSam Parker
3968978c12bSSam Parker // If this is an MVE instruction, check that we know how to use tail
397e27632c3SSam Parker // predication with it. Record VPT blocks and return whether the
398e27632c3SSam Parker // instruction is valid for tail predication.
399e27632c3SSam Parker bool ValidateMVEInst(MachineInstr *MI);
400e27632c3SSam Parker
AnalyseMVEInst__anon59c8a0920111::LowOverheadLoop4018f6a6763SSam Parker void AnalyseMVEInst(MachineInstr *MI) {
402e27632c3SSam Parker CannotTailPredicate = !ValidateMVEInst(MI);
4038978c12bSSam Parker }
4048978c12bSSam Parker
IsTailPredicationLegal__anon59c8a0920111::LowOverheadLoop4058978c12bSSam Parker bool IsTailPredicationLegal() const {
4068978c12bSSam Parker // For now, let's keep things really simple and only support a single
4078978c12bSSam Parker // block for tail predication.
408b4fa884aSSam Parker return !Revert && FoundAllComponents() && !VCTPs.empty() &&
409fd01b2f4SSam Parker !CannotTailPredicate && ML.getNumBlocks() == 1;
4108978c12bSSam Parker }
4118978c12bSSam Parker
412b4fa884aSSam Parker // Given that MI is a VCTP, check that is equivalent to any other VCTPs
413b4fa884aSSam Parker // found.
414b4fa884aSSam Parker bool AddVCTP(MachineInstr *MI);
415b4fa884aSSam Parker
416de3e65e6SSam Parker // Check that the predication in the loop will be equivalent once we
417de3e65e6SSam Parker // perform the conversion. Also ensure that we can provide the number
418de3e65e6SSam Parker // of elements to the loop start instruction.
419dfa2c14bSSam Parker bool ValidateTailPredicate();
4208f6a6763SSam Parker
421de3e65e6SSam Parker // Check that any values available outside of the loop will be the same
422de3e65e6SSam Parker // after tail predication conversion.
4233ee580d0SSam Parker bool ValidateLiveOuts();
424de3e65e6SSam Parker
4258978c12bSSam Parker // Is it safe to define LR with DLS/WLS?
4268978c12bSSam Parker // LR can be defined if it is the operand to start, because it's the same
4278978c12bSSam Parker // value, or if it's going to be equivalent to the operand to Start.
428ac30ea2fSSam Parker MachineInstr *isSafeToDefineLR();
4298978c12bSSam Parker
430cced971fSSam Parker // Check the branch targets are within range and we satisfy our
431cced971fSSam Parker // restrictions.
432e82a0084SSam Parker void Validate(ARMBasicBlockUtils *BBUtils);
4338978c12bSSam Parker
FoundAllComponents__anon59c8a0920111::LowOverheadLoop4348978c12bSSam Parker bool FoundAllComponents() const {
4358978c12bSSam Parker return Start && Dec && End;
4368978c12bSSam Parker }
4378978c12bSSam Parker
getVPTBlocks__anon59c8a0920111::LowOverheadLoop438b4fa884aSSam Parker SmallVectorImpl<VPTState> &getVPTBlocks() {
439b4fa884aSSam Parker return VPTState::Blocks;
440b4fa884aSSam Parker }
44140425183SSam Parker
44231f02ac6SSam Tebbs // Return the operand for the loop start instruction. This will be the loop
44331f02ac6SSam Tebbs // iteration count, or the number of elements if we're tail predicating.
getLoopStartOperand__anon59c8a0920111::LowOverheadLoop44431f02ac6SSam Tebbs MachineOperand &getLoopStartOperand() {
445b2ac9681SDavid Green if (IsTailPredicationLegal())
446b2ac9681SDavid Green return TPNumElements;
447fad70c30SDavid Green return Start->getOperand(1);
44828166816SSam Parker }
44928166816SSam Parker
getStartOpcode__anon59c8a0920111::LowOverheadLoop45028166816SSam Parker unsigned getStartOpcode() const {
451bee2f618SDavid Green bool IsDo = isDoLoopStart(*Start);
45228166816SSam Parker if (!IsTailPredicationLegal())
45328166816SSam Parker return IsDo ? ARM::t2DLS : ARM::t2WLS;
45428166816SSam Parker
455b4fa884aSSam Parker return VCTPOpcodeToLSTP(VCTPs.back()->getOpcode(), IsDo);
45628166816SSam Parker }
45728166816SSam Parker
dump__anon59c8a0920111::LowOverheadLoop4588978c12bSSam Parker void dump() const {
4598978c12bSSam Parker if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
4608978c12bSSam Parker if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
4618978c12bSSam Parker if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
462b4fa884aSSam Parker if (!VCTPs.empty()) {
463b4fa884aSSam Parker dbgs() << "ARM Loops: Found VCTP(s):\n";
464b4fa884aSSam Parker for (auto *MI : VCTPs)
465b4fa884aSSam Parker dbgs() << " - " << *MI;
466b4fa884aSSam Parker }
4678978c12bSSam Parker if (!FoundAllComponents())
4688978c12bSSam Parker dbgs() << "ARM Loops: Not a low-overhead loop.\n";
4698978c12bSSam Parker else if (!(Start && Dec && End))
4708978c12bSSam Parker dbgs() << "ARM Loops: Failed to find all loop components.\n";
4718978c12bSSam Parker }
4728978c12bSSam Parker };
4738978c12bSSam Parker
474a6fd919cSSam Parker class ARMLowOverheadLoops : public MachineFunctionPass {
47536c92227SSam Parker MachineFunction *MF = nullptr;
47628166816SSam Parker MachineLoopInfo *MLI = nullptr;
477cced971fSSam Parker ReachingDefAnalysis *RDA = nullptr;
478a6fd919cSSam Parker const ARMBaseInstrInfo *TII = nullptr;
479a6fd919cSSam Parker MachineRegisterInfo *MRI = nullptr;
480d97cf1f8SSjoerd Meijer const TargetRegisterInfo *TRI = nullptr;
481a6fd919cSSam Parker std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
482a6fd919cSSam Parker
483a6fd919cSSam Parker public:
484a6fd919cSSam Parker static char ID;
485a6fd919cSSam Parker
ARMLowOverheadLoops()486a6fd919cSSam Parker ARMLowOverheadLoops() : MachineFunctionPass(ID) { }
487a6fd919cSSam Parker
getAnalysisUsage(AnalysisUsage & AU) const488a6fd919cSSam Parker void getAnalysisUsage(AnalysisUsage &AU) const override {
489a6fd919cSSam Parker AU.setPreservesCFG();
490a6fd919cSSam Parker AU.addRequired<MachineLoopInfo>();
491cced971fSSam Parker AU.addRequired<ReachingDefAnalysis>();
492a6fd919cSSam Parker MachineFunctionPass::getAnalysisUsage(AU);
493a6fd919cSSam Parker }
494a6fd919cSSam Parker
495a6fd919cSSam Parker bool runOnMachineFunction(MachineFunction &MF) override;
496a6fd919cSSam Parker
getRequiredProperties() const497a6fd919cSSam Parker MachineFunctionProperties getRequiredProperties() const override {
498a6fd919cSSam Parker return MachineFunctionProperties().set(
499cced971fSSam Parker MachineFunctionProperties::Property::NoVRegs).set(
500cced971fSSam Parker MachineFunctionProperties::Property::TracksLiveness);
501a6fd919cSSam Parker }
502a6fd919cSSam Parker
getPassName() const503a6fd919cSSam Parker StringRef getPassName() const override {
504a6fd919cSSam Parker return ARM_LOW_OVERHEAD_LOOPS_NAME;
505a6fd919cSSam Parker }
50636c92227SSam Parker
50736c92227SSam Parker private:
50836c92227SSam Parker bool ProcessLoop(MachineLoop *ML);
50936c92227SSam Parker
51036c92227SSam Parker bool RevertNonLoops();
51136c92227SSam Parker
51236c92227SSam Parker void RevertWhile(MachineInstr *MI) const;
513b2ac9681SDavid Green void RevertDo(MachineInstr *MI) const;
51436c92227SSam Parker
515ac30ea2fSSam Parker bool RevertLoopDec(MachineInstr *MI) const;
51636c92227SSam Parker
5174ba6d0deSSam Parker void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
51836c92227SSam Parker
5190447f350SDavid Green void RevertLoopEndDec(MachineInstr *MI) const;
5200447f350SDavid Green
52140425183SSam Parker void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
5228978c12bSSam Parker
5238978c12bSSam Parker MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
5248978c12bSSam Parker
5258978c12bSSam Parker void Expand(LowOverheadLoop &LoLoop);
52636c92227SSam Parker
52701022af5SSjoerd Meijer void IterationCountDCE(LowOverheadLoop &LoLoop);
528a6fd919cSSam Parker };
529a6fd919cSSam Parker }
530a6fd919cSSam Parker
531a6fd919cSSam Parker char ARMLowOverheadLoops::ID = 0;
532a6fd919cSSam Parker
533b4fa884aSSam Parker SmallVector<VPTState, 4> VPTState::Blocks;
534b4fa884aSSam Parker SetVector<MachineInstr *> VPTState::CurrentPredicates;
535b4fa884aSSam Parker std::map<MachineInstr *,
536b4fa884aSSam Parker std::unique_ptr<PredicatedMI>> VPTState::PredicatedInsts;
537b4fa884aSSam Parker
INITIALIZE_PASS(ARMLowOverheadLoops,DEBUG_TYPE,ARM_LOW_OVERHEAD_LOOPS_NAME,false,false)538a6fd919cSSam Parker INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
539a6fd919cSSam Parker false, false)
540a6fd919cSSam Parker
541779a8a02SSam Parker static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA,
542779a8a02SSam Parker InstSet &ToRemove, InstSet &Ignore) {
543779a8a02SSam Parker
544779a8a02SSam Parker // Check that we can remove all of Killed without having to modify any IT
545779a8a02SSam Parker // blocks.
546779a8a02SSam Parker auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) {
547779a8a02SSam Parker // Collect the dead code and the MBBs in which they reside.
548779a8a02SSam Parker SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks;
549779a8a02SSam Parker for (auto *Dead : Killed)
550779a8a02SSam Parker BasicBlocks.insert(Dead->getParent());
551779a8a02SSam Parker
552779a8a02SSam Parker // Collect IT blocks in all affected basic blocks.
553779a8a02SSam Parker std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;
554779a8a02SSam Parker for (auto *MBB : BasicBlocks) {
555779a8a02SSam Parker for (auto &IT : *MBB) {
556779a8a02SSam Parker if (IT.getOpcode() != ARM::t2IT)
557779a8a02SSam Parker continue;
558e24537d4SMircea Trofin RDA.getReachingLocalUses(&IT, MCRegister::from(ARM::ITSTATE),
559e24537d4SMircea Trofin ITBlocks[&IT]);
560779a8a02SSam Parker }
561779a8a02SSam Parker }
562779a8a02SSam Parker
563779a8a02SSam Parker // If we're removing all of the instructions within an IT block, then
564779a8a02SSam Parker // also remove the IT instruction.
565779a8a02SSam Parker SmallPtrSet<MachineInstr *, 2> ModifiedITs;
566779a8a02SSam Parker SmallPtrSet<MachineInstr *, 2> RemoveITs;
567779a8a02SSam Parker for (auto *Dead : Killed) {
568779a8a02SSam Parker if (MachineOperand *MO = Dead->findRegisterUseOperand(ARM::ITSTATE)) {
569779a8a02SSam Parker MachineInstr *IT = RDA.getMIOperand(Dead, *MO);
570779a8a02SSam Parker RemoveITs.insert(IT);
571779a8a02SSam Parker auto &CurrentBlock = ITBlocks[IT];
572779a8a02SSam Parker CurrentBlock.erase(Dead);
573779a8a02SSam Parker if (CurrentBlock.empty())
574779a8a02SSam Parker ModifiedITs.erase(IT);
575779a8a02SSam Parker else
576779a8a02SSam Parker ModifiedITs.insert(IT);
577779a8a02SSam Parker }
578779a8a02SSam Parker }
579779a8a02SSam Parker if (!ModifiedITs.empty())
580779a8a02SSam Parker return false;
581779a8a02SSam Parker Killed.insert(RemoveITs.begin(), RemoveITs.end());
582779a8a02SSam Parker return true;
583779a8a02SSam Parker };
584779a8a02SSam Parker
585779a8a02SSam Parker SmallPtrSet<MachineInstr *, 2> Uses;
586779a8a02SSam Parker if (!RDA.isSafeToRemove(MI, Uses, Ignore))
587779a8a02SSam Parker return false;
588779a8a02SSam Parker
589779a8a02SSam Parker if (WontCorruptITs(Uses, RDA)) {
590779a8a02SSam Parker ToRemove.insert(Uses.begin(), Uses.end());
591779a8a02SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI
592779a8a02SSam Parker << " - can also remove:\n";
593779a8a02SSam Parker for (auto *Use : Uses)
594779a8a02SSam Parker dbgs() << " - " << *Use);
595779a8a02SSam Parker
596779a8a02SSam Parker SmallPtrSet<MachineInstr*, 4> Killed;
597779a8a02SSam Parker RDA.collectKilledOperands(MI, Killed);
598779a8a02SSam Parker if (WontCorruptITs(Killed, RDA)) {
599779a8a02SSam Parker ToRemove.insert(Killed.begin(), Killed.end());
600779a8a02SSam Parker LLVM_DEBUG(for (auto *Dead : Killed)
601779a8a02SSam Parker dbgs() << " - " << *Dead);
602779a8a02SSam Parker }
603779a8a02SSam Parker return true;
604779a8a02SSam Parker }
605779a8a02SSam Parker return false;
606779a8a02SSam Parker }
607779a8a02SSam Parker
ValidateTailPredicate()608dfa2c14bSSam Parker bool LowOverheadLoop::ValidateTailPredicate() {
609e82a0084SSam Parker if (!IsTailPredicationLegal()) {
610e82a0084SSam Parker LLVM_DEBUG(if (VCTPs.empty())
611e82a0084SSam Parker dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
612e82a0084SSam Parker dbgs() << "ARM Loops: Tail-predication is not valid.\n");
613e82a0084SSam Parker return false;
61442350cd8SSam Parker }
61542350cd8SSam Parker
616b4fa884aSSam Parker assert(!VCTPs.empty() && "VCTP instruction expected but is not set");
617e82a0084SSam Parker assert(ML.getBlocks().size() == 1 &&
618e82a0084SSam Parker "Shouldn't be processing a loop with more than one block");
619b4fa884aSSam Parker
6202fc690acSSjoerd Meijer if (DisableTailPredication) {
6212fc690acSSjoerd Meijer LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n");
6222fc690acSSjoerd Meijer return false;
6232fc690acSSjoerd Meijer }
6242fc690acSSjoerd Meijer
6257e02bc81SSam Parker if (!VPTState::isValid(RDA)) {
6267e02bc81SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n");
6278f6a6763SSam Parker return false;
6287e02bc81SSam Parker }
6298f6a6763SSam Parker
630b30adfb5SSam Parker if (!ValidateLiveOuts()) {
631b30adfb5SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
632de3e65e6SSam Parker return false;
633b30adfb5SSam Parker }
634de3e65e6SSam Parker
6358f6a6763SSam Parker // For tail predication, we need to provide the number of elements, instead
6368f6a6763SSam Parker // of the iteration count, to the loop start instruction. The number of
6378f6a6763SSam Parker // elements is provided to the vctp instruction, so we need to check that
6388f6a6763SSam Parker // we can use this register at InsertPt.
639b4fa884aSSam Parker MachineInstr *VCTP = VCTPs.back();
640bee2f618SDavid Green if (Start->getOpcode() == ARM::t2DoLoopStartTP ||
641bee2f618SDavid Green Start->getOpcode() == ARM::t2WhileLoopStartTP) {
64208d1c2d4SDavid Green TPNumElements = Start->getOperand(2);
64308d1c2d4SDavid Green StartInsertPt = Start;
64408d1c2d4SDavid Green StartInsertBB = Start->getParent();
64508d1c2d4SDavid Green } else {
64631f02ac6SSam Tebbs TPNumElements = VCTP->getOperand(1);
647e24537d4SMircea Trofin MCRegister NumElements = TPNumElements.getReg().asMCReg();
6488f6a6763SSam Parker
6498f6a6763SSam Parker // If the register is defined within loop, then we can't perform TP.
6508f6a6763SSam Parker // TODO: Check whether this is just a mov of a register that would be
6518f6a6763SSam Parker // available.
652fd01b2f4SSam Parker if (RDA.hasLocalDefBefore(VCTP, NumElements)) {
6538f6a6763SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
6548f6a6763SSam Parker return false;
6558f6a6763SSam Parker }
6568f6a6763SSam Parker
6576dcbc323SDavid Green // The element count register maybe defined after InsertPt, in which case we
6586dcbc323SDavid Green // need to try to move either InsertPt or the def so that the [w|d]lstp can
6596dcbc323SDavid Green // use the value.
6606dcbc323SDavid Green
6616dcbc323SDavid Green if (StartInsertPt != StartInsertBB->end() &&
6626dcbc323SDavid Green !RDA.isReachingDefLiveOut(&*StartInsertPt, NumElements)) {
66308d1c2d4SDavid Green if (auto *ElemDef =
66408d1c2d4SDavid Green RDA.getLocalLiveOutMIDef(StartInsertBB, NumElements)) {
6656dcbc323SDavid Green if (RDA.isSafeToMoveForwards(ElemDef, &*StartInsertPt)) {
6666dcbc323SDavid Green ElemDef->removeFromParent();
6676dcbc323SDavid Green StartInsertBB->insert(StartInsertPt, ElemDef);
66808d1c2d4SDavid Green LLVM_DEBUG(dbgs()
66908d1c2d4SDavid Green << "ARM Loops: Moved element count def: " << *ElemDef);
6706dcbc323SDavid Green } else if (RDA.isSafeToMoveBackwards(&*StartInsertPt, ElemDef)) {
6716dcbc323SDavid Green StartInsertPt->removeFromParent();
6726dcbc323SDavid Green StartInsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef),
6736dcbc323SDavid Green &*StartInsertPt);
6746dcbc323SDavid Green LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
6756dcbc323SDavid Green } else {
6766dcbc323SDavid Green // If we fail to move an instruction and the element count is provided
6776dcbc323SDavid Green // by a mov, use the mov operand if it will have the same value at the
6786dcbc323SDavid Green // insertion point
6796dcbc323SDavid Green MachineOperand Operand = ElemDef->getOperand(1);
6806dcbc323SDavid Green if (isMovRegOpcode(ElemDef->getOpcode()) &&
681e24537d4SMircea Trofin RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg().asMCReg()) ==
682e24537d4SMircea Trofin RDA.getUniqueReachingMIDef(&*StartInsertPt,
683e24537d4SMircea Trofin Operand.getReg().asMCReg())) {
6846dcbc323SDavid Green TPNumElements = Operand;
6856dcbc323SDavid Green NumElements = TPNumElements.getReg();
6866dcbc323SDavid Green } else {
6876dcbc323SDavid Green LLVM_DEBUG(dbgs()
6886dcbc323SDavid Green << "ARM Loops: Unable to move element count to loop "
6896dcbc323SDavid Green << "start instruction.\n");
6906dcbc323SDavid Green return false;
6916dcbc323SDavid Green }
6926dcbc323SDavid Green }
6936dcbc323SDavid Green }
6946dcbc323SDavid Green }
6956dcbc323SDavid Green
6968f6a6763SSam Parker // Especially in the case of while loops, InsertBB may not be the
6978f6a6763SSam Parker // preheader, so we need to check that the register isn't redefined
6988f6a6763SSam Parker // before entering the loop.
699ddbc0778SSam Parker auto CannotProvideElements = [this](MachineBasicBlock *MBB,
700e24537d4SMircea Trofin MCRegister NumElements) {
701cb27006aSDavid Green if (MBB->empty())
702cb27006aSDavid Green return false;
7038f6a6763SSam Parker // NumElements is redefined in this block.
704fd01b2f4SSam Parker if (RDA.hasLocalDefBefore(&MBB->back(), NumElements))
7058f6a6763SSam Parker return true;
7068f6a6763SSam Parker
7078f6a6763SSam Parker // Don't continue searching up through multiple predecessors.
7088f6a6763SSam Parker if (MBB->pred_size() > 1)
7098f6a6763SSam Parker return true;
7108f6a6763SSam Parker
7118f6a6763SSam Parker return false;
7128f6a6763SSam Parker };
7138f6a6763SSam Parker
714e82a0084SSam Parker // Search backwards for a def, until we get to InsertBB.
7153ee580d0SSam Parker MachineBasicBlock *MBB = Preheader;
716dfa2c14bSSam Parker while (MBB && MBB != StartInsertBB) {
717bad6032bSSam Parker if (CannotProvideElements(MBB, NumElements)) {
718bad6032bSSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");
7198f6a6763SSam Parker return false;
720bad6032bSSam Parker }
7218f6a6763SSam Parker MBB = *MBB->pred_begin();
7228f6a6763SSam Parker }
72308d1c2d4SDavid Green }
72408d1c2d4SDavid Green
72508d1c2d4SDavid Green // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect
72608d1c2d4SDavid Green // world the [w|d]lstp instruction would be last instruction in the preheader
72708d1c2d4SDavid Green // and so it would only affect instructions within the loop body. But due to
72808d1c2d4SDavid Green // scheduling, and/or the logic in this pass (above), the insertion point can
72908d1c2d4SDavid Green // be moved earlier. So if the Loop Start isn't the last instruction in the
73008d1c2d4SDavid Green // preheader, and if the initial element count is smaller than the vector
73108d1c2d4SDavid Green // width, the Loop Start instruction will immediately generate one or more
73208d1c2d4SDavid Green // false lane mask which can, incorrectly, affect the proceeding MVE
73308d1c2d4SDavid Green // instructions in the preheader.
734898a81dfSSam Parker if (std::any_of(StartInsertPt, StartInsertBB->end(), shouldInspect)) {
735898a81dfSSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n");
73608d1c2d4SDavid Green return false;
737898a81dfSSam Parker }
7388f6a6763SSam Parker
73902cd8a6bSDavid Green // For any DoubleWidthResultInstrs we found whilst scanning instructions, they
74002cd8a6bSDavid Green // need to compute an output size that is smaller than the VCTP mask operates
74102cd8a6bSDavid Green // on. The VecSize of the DoubleWidthResult is the larger vector size - the
74202cd8a6bSDavid Green // size it extends into, so any VCTP VecSize <= is valid.
74302cd8a6bSDavid Green unsigned VCTPVecSize = getVecSize(*VCTP);
74402cd8a6bSDavid Green for (MachineInstr *MI : DoubleWidthResultInstrs) {
74502cd8a6bSDavid Green unsigned InstrVecSize = getVecSize(*MI);
74602cd8a6bSDavid Green if (InstrVecSize > VCTPVecSize) {
74702cd8a6bSDavid Green LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP "
74802cd8a6bSDavid Green << "VecSize:\n" << *MI);
74902cd8a6bSDavid Green return false;
75002cd8a6bSDavid Green }
75102cd8a6bSDavid Green }
75202cd8a6bSDavid Green
75342350cd8SSam Parker // Check that the value change of the element count is what we expect and
75442350cd8SSam Parker // that the predication will be equivalent. For this we need:
75542350cd8SSam Parker // NumElements = NumElements - VectorWidth. The sub will be a sub immediate
75642350cd8SSam Parker // and we can also allow register copies within the chain too.
757892af45cSDavid Green auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) {
758892af45cSDavid Green return -getAddSubImmediate(*MI) == ExpectedVecWidth;
75942350cd8SSam Parker };
76042350cd8SSam Parker
76108d1c2d4SDavid Green MachineBasicBlock *MBB = VCTP->getParent();
7627aabb6adSSam Tebbs // Remove modifications to the element count since they have no purpose in a
7637aabb6adSSam Tebbs // tail predicated loop. Explicitly refer to the vctp operand no matter which
7647aabb6adSSam Tebbs // register NumElements has been assigned to, since that is what the
7657aabb6adSSam Tebbs // modifications will be using
766e24537d4SMircea Trofin if (auto *Def = RDA.getUniqueReachingMIDef(
767e24537d4SMircea Trofin &MBB->back(), VCTP->getOperand(1).getReg().asMCReg())) {
76842350cd8SSam Parker SmallPtrSet<MachineInstr*, 2> ElementChain;
769b4fa884aSSam Parker SmallPtrSet<MachineInstr*, 2> Ignore;
77042350cd8SSam Parker unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());
77142350cd8SSam Parker
772b4fa884aSSam Parker Ignore.insert(VCTPs.begin(), VCTPs.end());
773835251f7SPierre-vh
774779a8a02SSam Parker if (TryRemove(Def, RDA, ElementChain, Ignore)) {
77542350cd8SSam Parker bool FoundSub = false;
77642350cd8SSam Parker
77742350cd8SSam Parker for (auto *MI : ElementChain) {
77842350cd8SSam Parker if (isMovRegOpcode(MI->getOpcode()))
77942350cd8SSam Parker continue;
78042350cd8SSam Parker
78142350cd8SSam Parker if (isSubImmOpcode(MI->getOpcode())) {
7827e02bc81SSam Parker if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) {
7837e02bc81SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
7847e02bc81SSam Parker " count: " << *MI);
78542350cd8SSam Parker return false;
7867e02bc81SSam Parker }
78742350cd8SSam Parker FoundSub = true;
7887e02bc81SSam Parker } else {
7897e02bc81SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
7907e02bc81SSam Parker " count: " << *MI);
79142350cd8SSam Parker return false;
79242350cd8SSam Parker }
7937e02bc81SSam Parker }
79442350cd8SSam Parker ToRemove.insert(ElementChain.begin(), ElementChain.end());
79542350cd8SSam Parker }
79642350cd8SSam Parker }
7977786ac83SDavid Green
798bee2f618SDavid Green // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we
799bee2f618SDavid Green // can also remove any extra instructions in the preheader, which often
800bee2f618SDavid Green // includes a now unused MOV.
801bee2f618SDavid Green if ((Start->getOpcode() == ARM::t2DoLoopStartTP ||
802bee2f618SDavid Green Start->getOpcode() == ARM::t2WhileLoopStartTP) &&
803bee2f618SDavid Green Preheader && !Preheader->empty() &&
8047786ac83SDavid Green !RDA.hasLocalDefBefore(VCTP, VCTP->getOperand(1).getReg())) {
8057786ac83SDavid Green if (auto *Def = RDA.getUniqueReachingMIDef(
8067786ac83SDavid Green &Preheader->back(), VCTP->getOperand(1).getReg().asMCReg())) {
8077786ac83SDavid Green SmallPtrSet<MachineInstr*, 2> Ignore;
8087786ac83SDavid Green Ignore.insert(VCTPs.begin(), VCTPs.end());
8097786ac83SDavid Green TryRemove(Def, RDA, ToRemove, Ignore);
8107786ac83SDavid Green }
8117786ac83SDavid Green }
8127786ac83SDavid Green
8138f6a6763SSam Parker return true;
8148f6a6763SSam Parker }
8158f6a6763SSam Parker
isRegInClass(const MachineOperand & MO,const TargetRegisterClass * Class)816ff9ac33eSSam Parker static bool isRegInClass(const MachineOperand &MO,
817ff9ac33eSSam Parker const TargetRegisterClass *Class) {
818ff9ac33eSSam Parker return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());
819ff9ac33eSSam Parker }
820ff9ac33eSSam Parker
82194cacebcSSam Parker // MVE 'narrowing' operate on half a lane, reading from half and writing
82294cacebcSSam Parker // to half, which are referred to has the top and bottom half. The other
82394cacebcSSam Parker // half retains its previous value.
retainsPreviousHalfElement(const MachineInstr & MI)82494cacebcSSam Parker static bool retainsPreviousHalfElement(const MachineInstr &MI) {
82594cacebcSSam Parker const MCInstrDesc &MCID = MI.getDesc();
82694cacebcSSam Parker uint64_t Flags = MCID.TSFlags;
82794cacebcSSam Parker return (Flags & ARMII::RetainsPreviousHalfElement) != 0;
82894cacebcSSam Parker }
82994cacebcSSam Parker
830d7084fa3SSam Parker // Some MVE instructions read from the top/bottom halves of their operand(s)
831d7084fa3SSam Parker // and generate a vector result with result elements that are double the
832d7084fa3SSam Parker // width of the input.
producesDoubleWidthResult(const MachineInstr & MI)833d7084fa3SSam Parker static bool producesDoubleWidthResult(const MachineInstr &MI) {
834d7084fa3SSam Parker const MCInstrDesc &MCID = MI.getDesc();
835d7084fa3SSam Parker uint64_t Flags = MCID.TSFlags;
836d7084fa3SSam Parker return (Flags & ARMII::DoubleWidthResult) != 0;
837d7084fa3SSam Parker }
838d7084fa3SSam Parker
isHorizontalReduction(const MachineInstr & MI)83994b195ffSSam Parker static bool isHorizontalReduction(const MachineInstr &MI) {
84094b195ffSSam Parker const MCInstrDesc &MCID = MI.getDesc();
84194b195ffSSam Parker uint64_t Flags = MCID.TSFlags;
84294b195ffSSam Parker return (Flags & ARMII::HorizontalReduction) != 0;
84394b195ffSSam Parker }
84494b195ffSSam Parker
845d7084fa3SSam Parker // Can this instruction generate a non-zero result when given only zeroed
846d7084fa3SSam Parker // operands? This allows us to know that, given operands with false bytes
847d7084fa3SSam Parker // zeroed by masked loads, that the result will also contain zeros in those
848d7084fa3SSam Parker // bytes.
canGenerateNonZeros(const MachineInstr & MI)849d7084fa3SSam Parker static bool canGenerateNonZeros(const MachineInstr &MI) {
850d7084fa3SSam Parker
851d7084fa3SSam Parker // Check for instructions which can write into a larger element size,
852d7084fa3SSam Parker // possibly writing into a previous zero'd lane.
853d7084fa3SSam Parker if (producesDoubleWidthResult(MI))
854d7084fa3SSam Parker return true;
855d7084fa3SSam Parker
856d7084fa3SSam Parker switch (MI.getOpcode()) {
857d7084fa3SSam Parker default:
858d7084fa3SSam Parker break;
859d7084fa3SSam Parker // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow
860d7084fa3SSam Parker // fp16 -> fp32 vector conversions.
861d7084fa3SSam Parker // Instructions that perform a NOT will generate 1s from 0s.
862d7084fa3SSam Parker case ARM::MVE_VMVN:
863d7084fa3SSam Parker case ARM::MVE_VORN:
864d7084fa3SSam Parker // Count leading zeros will do just that!
865d7084fa3SSam Parker case ARM::MVE_VCLZs8:
866d7084fa3SSam Parker case ARM::MVE_VCLZs16:
867d7084fa3SSam Parker case ARM::MVE_VCLZs32:
868d7084fa3SSam Parker return true;
869d7084fa3SSam Parker }
870d7084fa3SSam Parker return false;
871d7084fa3SSam Parker }
872d7084fa3SSam Parker
87394cacebcSSam Parker // Look at its register uses to see if it only can only receive zeros
87494cacebcSSam Parker // into its false lanes which would then produce zeros. Also check that
87594b195ffSSam Parker // the output register is also defined by an FalseLanesZero instruction
87694cacebcSSam Parker // so that if tail-predication happens, the lanes that aren't updated will
87794cacebcSSam Parker // still be zeros.
producesFalseLanesZero(MachineInstr & MI,const TargetRegisterClass * QPRs,const ReachingDefAnalysis & RDA,InstSet & FalseLanesZero)87894b195ffSSam Parker static bool producesFalseLanesZero(MachineInstr &MI,
87994cacebcSSam Parker const TargetRegisterClass *QPRs,
88094cacebcSSam Parker const ReachingDefAnalysis &RDA,
88194b195ffSSam Parker InstSet &FalseLanesZero) {
88294cacebcSSam Parker if (canGenerateNonZeros(MI))
88394cacebcSSam Parker return false;
88494b195ffSSam Parker
885b30adfb5SSam Parker bool isPredicated = isVectorPredicated(&MI);
886b30adfb5SSam Parker // Predicated loads will write zeros to the falsely predicated bytes of the
887b30adfb5SSam Parker // destination register.
888b30adfb5SSam Parker if (MI.mayLoad())
889b30adfb5SSam Parker return isPredicated;
890b30adfb5SSam Parker
891b30adfb5SSam Parker auto IsZeroInit = [](MachineInstr *Def) {
892b30adfb5SSam Parker return !isVectorPredicated(Def) &&
893b30adfb5SSam Parker Def->getOpcode() == ARM::MVE_VMOVimmi32 &&
894b30adfb5SSam Parker Def->getOperand(1).getImm() == 0;
895b30adfb5SSam Parker };
896b30adfb5SSam Parker
89794b195ffSSam Parker bool AllowScalars = isHorizontalReduction(MI);
89894cacebcSSam Parker for (auto &MO : MI.operands()) {
89994cacebcSSam Parker if (!MO.isReg() || !MO.getReg())
90094cacebcSSam Parker continue;
90194b195ffSSam Parker if (!isRegInClass(MO, QPRs) && AllowScalars)
90294b195ffSSam Parker continue;
9039cb8f4d1SDavid Green // Skip the lr predicate reg
9049cb8f4d1SDavid Green int PIdx = llvm::findFirstVPTPredOperandIdx(MI);
9059cb8f4d1SDavid Green if (PIdx != -1 && (int)MI.getOperandNo(&MO) == PIdx + 2)
9069cb8f4d1SDavid Green continue;
907b30adfb5SSam Parker
908b30adfb5SSam Parker // Check that this instruction will produce zeros in its false lanes:
909b30adfb5SSam Parker // - If it only consumes false lanes zero or constant 0 (vmov #0)
910b30adfb5SSam Parker // - If it's predicated, it only matters that it's def register already has
911b30adfb5SSam Parker // false lane zeros, so we can ignore the uses.
912b30adfb5SSam Parker SmallPtrSet<MachineInstr *, 2> Defs;
913b30adfb5SSam Parker RDA.getGlobalReachingDefs(&MI, MO.getReg(), Defs);
914b30adfb5SSam Parker for (auto *Def : Defs) {
915b30adfb5SSam Parker if (Def == &MI || FalseLanesZero.count(Def) || IsZeroInit(Def))
916b30adfb5SSam Parker continue;
917b30adfb5SSam Parker if (MO.isUse() && isPredicated)
91894cacebcSSam Parker continue;
91994cacebcSSam Parker return false;
92094cacebcSSam Parker }
921b30adfb5SSam Parker }
92294cacebcSSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
92394cacebcSSam Parker return true;
92494cacebcSSam Parker }
92594cacebcSSam Parker
ValidateLiveOuts()9263ee580d0SSam Parker bool LowOverheadLoop::ValidateLiveOuts() {
927ff9ac33eSSam Parker // We want to find out if the tail-predicated version of this loop will
928ff9ac33eSSam Parker // produce the same values as the loop in its original form. For this to
929ff9ac33eSSam Parker // be true, the newly inserted implicit predication must not change the
930ff9ac33eSSam Parker // the (observable) results.
931ff9ac33eSSam Parker // We're doing this because many instructions in the loop will not be
932ff9ac33eSSam Parker // predicated and so the conversion from VPT predication to tail-predication
933ff9ac33eSSam Parker // can result in different values being produced; due to the tail-predication
934ff9ac33eSSam Parker // preventing many instructions from updating their falsely predicated
935ff9ac33eSSam Parker // lanes. This analysis assumes that all the instructions perform lane-wise
936ff9ac33eSSam Parker // operations and don't perform any exchanges.
937ff9ac33eSSam Parker // A masked load, whether through VPT or tail predication, will write zeros
938ff9ac33eSSam Parker // to any of the falsely predicated bytes. So, from the loads, we know that
939ff9ac33eSSam Parker // the false lanes are zeroed and here we're trying to track that those false
940ff9ac33eSSam Parker // lanes remain zero, or where they change, the differences are masked away
941ff9ac33eSSam Parker // by their user(s).
9423471520bSDavid Green // All MVE stores have to be predicated, so we know that any predicate load
943ff9ac33eSSam Parker // operands, or stored results are equivalent already. Other explicitly
944ff9ac33eSSam Parker // predicated instructions will perform the same operation in the original
945ff9ac33eSSam Parker // loop and the tail-predicated form too. Because of this, we can insert
94694cacebcSSam Parker // loads, stores and other predicated instructions into our Predicated
947ff9ac33eSSam Parker // set and build from there.
948d941df36SSam Parker const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID);
94994b195ffSSam Parker SetVector<MachineInstr *> FalseLanesUnknown;
95094b195ffSSam Parker SmallPtrSet<MachineInstr *, 4> FalseLanesZero;
95194cacebcSSam Parker SmallPtrSet<MachineInstr *, 4> Predicated;
9523ee580d0SSam Parker MachineBasicBlock *Header = ML.getHeader();
95394cacebcSSam Parker
9548c50b5fbSDavid Green LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n");
9558c50b5fbSDavid Green
9563ee580d0SSam Parker for (auto &MI : *Header) {
957a0c1dcc3SSam Parker if (!shouldInspect(MI))
958ff9ac33eSSam Parker continue;
959ff9ac33eSSam Parker
960835251f7SPierre-vh if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode()))
96194b195ffSSam Parker continue;
96294b195ffSSam Parker
963b30adfb5SSam Parker bool isPredicated = isVectorPredicated(&MI);
964b30adfb5SSam Parker bool retainsOrReduces =
965b30adfb5SSam Parker retainsPreviousHalfElement(MI) || isHorizontalReduction(MI);
966b30adfb5SSam Parker
967b30adfb5SSam Parker if (isPredicated)
96894cacebcSSam Parker Predicated.insert(&MI);
969b30adfb5SSam Parker if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero))
97094b195ffSSam Parker FalseLanesZero.insert(&MI);
971b30adfb5SSam Parker else if (MI.getNumDefs() == 0)
972b30adfb5SSam Parker continue;
9738c50b5fbSDavid Green else if (!isPredicated && retainsOrReduces) {
9748c50b5fbSDavid Green LLVM_DEBUG(dbgs() << " Unpredicated instruction that retainsOrReduces: " << MI);
975b30adfb5SSam Parker return false;
97673346f58SDavid Green } else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy)
977b30adfb5SSam Parker FalseLanesUnknown.insert(&MI);
978ff9ac33eSSam Parker }
979ff9ac33eSSam Parker
9808c50b5fbSDavid Green LLVM_DEBUG({
9818c50b5fbSDavid Green dbgs() << " Predicated:\n";
9828c50b5fbSDavid Green for (auto *I : Predicated)
9838c50b5fbSDavid Green dbgs() << " " << *I;
9848c50b5fbSDavid Green dbgs() << " FalseLanesZero:\n";
9858c50b5fbSDavid Green for (auto *I : FalseLanesZero)
9868c50b5fbSDavid Green dbgs() << " " << *I;
9878c50b5fbSDavid Green dbgs() << " FalseLanesUnknown:\n";
9888c50b5fbSDavid Green for (auto *I : FalseLanesUnknown)
9898c50b5fbSDavid Green dbgs() << " " << *I;
9908c50b5fbSDavid Green });
9918c50b5fbSDavid Green
99294cacebcSSam Parker auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
99394cacebcSSam Parker SmallPtrSetImpl<MachineInstr *> &Predicated) {
994ff9ac33eSSam Parker SmallPtrSet<MachineInstr *, 2> Uses;
995e24537d4SMircea Trofin RDA.getGlobalUses(MI, MO.getReg().asMCReg(), Uses);
996ff9ac33eSSam Parker for (auto *Use : Uses) {
99794cacebcSSam Parker if (Use != MI && !Predicated.count(Use))
998ff9ac33eSSam Parker return false;
999ff9ac33eSSam Parker }
1000ff9ac33eSSam Parker return true;
1001ff9ac33eSSam Parker };
1002ff9ac33eSSam Parker
100394cacebcSSam Parker // Visit the unknowns in reverse so that we can start at the values being
1004ff9ac33eSSam Parker // stored and then we can work towards the leaves, hopefully adding more
100594b195ffSSam Parker // instructions to Predicated. Successfully terminating the loop means that
100694b195ffSSam Parker // all the unknown values have to found to be masked by predicated user(s).
10073ee580d0SSam Parker // For any unpredicated values, we store them in NonPredicated so that we
10083ee580d0SSam Parker // can later check whether these form a reduction.
10093ee580d0SSam Parker SmallPtrSet<MachineInstr*, 2> NonPredicated;
101094b195ffSSam Parker for (auto *MI : reverse(FalseLanesUnknown)) {
1011ff9ac33eSSam Parker for (auto &MO : MI->operands()) {
1012ff9ac33eSSam Parker if (!isRegInClass(MO, QPRs) || !MO.isDef())
1013ff9ac33eSSam Parker continue;
101494cacebcSSam Parker if (!HasPredicatedUsers(MI, MO, Predicated)) {
10158c50b5fbSDavid Green LLVM_DEBUG(dbgs() << " Found an unknown def of : "
1016ff9ac33eSSam Parker << TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
10173ee580d0SSam Parker NonPredicated.insert(MI);
1018b30adfb5SSam Parker break;
1019ff9ac33eSSam Parker }
1020ff9ac33eSSam Parker }
1021ff9ac33eSSam Parker // Any unknown false lanes have been masked away by the user(s).
1022b30adfb5SSam Parker if (!NonPredicated.contains(MI))
102394cacebcSSam Parker Predicated.insert(MI);
1024ff9ac33eSSam Parker }
1025d941df36SSam Parker
10263ee580d0SSam Parker SmallPtrSet<MachineInstr *, 2> LiveOutMIs;
1027d941df36SSam Parker SmallVector<MachineBasicBlock *, 2> ExitBlocks;
1028d941df36SSam Parker ML.getExitBlocks(ExitBlocks);
1029d941df36SSam Parker assert(ML.getNumBlocks() == 1 && "Expected single block loop!");
10303ee580d0SSam Parker assert(ExitBlocks.size() == 1 && "Expected a single exit block");
10313ee580d0SSam Parker MachineBasicBlock *ExitBB = ExitBlocks.front();
10323ee580d0SSam Parker for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
1033b30adfb5SSam Parker // TODO: Instead of blocking predication, we could move the vctp to the exit
1034b30adfb5SSam Parker // block and calculate it's operand there in or the preheader.
10358c50b5fbSDavid Green if (RegMask.PhysReg == ARM::VPR) {
10368c50b5fbSDavid Green LLVM_DEBUG(dbgs() << " VPR is live in to the exit block.");
1037b30adfb5SSam Parker return false;
10388c50b5fbSDavid Green }
10393ee580d0SSam Parker // Check Q-regs that are live in the exit blocks. We don't collect scalars
10403ee580d0SSam Parker // because they won't be affected by lane predication.
1041b30adfb5SSam Parker if (QPRs->contains(RegMask.PhysReg))
10423ee580d0SSam Parker if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg))
10433ee580d0SSam Parker LiveOutMIs.insert(MI);
10443ee580d0SSam Parker }
10453ee580d0SSam Parker
1046d941df36SSam Parker // We've already validated that any VPT predication within the loop will be
1047d941df36SSam Parker // equivalent when we perform the predication transformation; so we know that
1048d941df36SSam Parker // any VPT predicated instruction is predicated upon VCTP. Any live-out
10493ee580d0SSam Parker // instruction needs to be predicated, so check this here. The instructions
10503ee580d0SSam Parker // in NonPredicated have been found to be a reduction that we can ensure its
105173346f58SDavid Green // legality. Any MQPRCopy found will need to validate its input as if it was
105273346f58SDavid Green // live out.
105373346f58SDavid Green SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end());
105473346f58SDavid Green while (!Worklist.empty()) {
105573346f58SDavid Green MachineInstr *MI = Worklist.pop_back_val();
105673346f58SDavid Green if (MI->getOpcode() == ARM::MQPRCopy) {
105773346f58SDavid Green VMOVCopies.insert(MI);
105873346f58SDavid Green MachineInstr *CopySrc =
105973346f58SDavid Green RDA.getUniqueReachingMIDef(MI, MI->getOperand(1).getReg());
106073346f58SDavid Green if (CopySrc)
106173346f58SDavid Green Worklist.push_back(CopySrc);
106273346f58SDavid Green } else if (NonPredicated.count(MI) && FalseLanesUnknown.contains(MI)) {
10638c50b5fbSDavid Green LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI);
106473346f58SDavid Green VMOVCopies.clear();
1065d941df36SSam Parker return false;
1066b30adfb5SSam Parker }
1067b30adfb5SSam Parker }
1068d941df36SSam Parker
1069de3e65e6SSam Parker return true;
1070de3e65e6SSam Parker }
1071de3e65e6SSam Parker
Validate(ARMBasicBlockUtils * BBUtils)1072e82a0084SSam Parker void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
10738978c12bSSam Parker if (Revert)
10748978c12bSSam Parker return;
10758978c12bSSam Parker
10764c19b89bSSam Parker // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP]
10774c19b89bSSam Parker // can only jump back.
10784c19b89bSSam Parker auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End,
10794c19b89bSSam Parker ARMBasicBlockUtils *BBUtils, MachineLoop &ML) {
10800447f350SDavid Green MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd
10810447f350SDavid Green ? End->getOperand(1).getMBB()
10820447f350SDavid Green : End->getOperand(2).getMBB();
10838978c12bSSam Parker // TODO Maybe there's cases where the target doesn't have to be the header,
10848978c12bSSam Parker // but for now be safe and revert.
10850447f350SDavid Green if (TgtBB != ML.getHeader()) {
10864c19b89bSSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n");
1087e82a0084SSam Parker return false;
10888978c12bSSam Parker }
10898978c12bSSam Parker
10908978c12bSSam Parker // The WLS and LE instructions have 12-bits for the label offset. WLS
10918978c12bSSam Parker // requires a positive offset, while LE uses negative.
1092fd01b2f4SSam Parker if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) ||
1093fd01b2f4SSam Parker !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) {
10948978c12bSSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
1095e82a0084SSam Parker return false;
10968978c12bSSam Parker }
10978978c12bSSam Parker
1098bee2f618SDavid Green if (isWhileLoopStart(*Start)) {
1099bee2f618SDavid Green MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(*Start);
1100bee2f618SDavid Green if (BBUtils->getOffsetOf(Start) > BBUtils->getOffsetOf(TargetBB) ||
1101bee2f618SDavid Green !BBUtils->isBBInRange(Start, TargetBB, 4094)) {
11028978c12bSSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
1103e82a0084SSam Parker return false;
11048978c12bSSam Parker }
1105bee2f618SDavid Green }
1106e82a0084SSam Parker return true;
1107e82a0084SSam Parker };
11088978c12bSSam Parker
1109fad70c30SDavid Green StartInsertPt = MachineBasicBlock::iterator(Start);
1110fad70c30SDavid Green StartInsertBB = Start->getParent();
1111fad70c30SDavid Green LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at "
1112fad70c30SDavid Green << *StartInsertPt);
11137e02bc81SSam Parker
1114dfa2c14bSSam Parker Revert = !ValidateRanges(Start, End, BBUtils, ML);
1115dfa2c14bSSam Parker CannotTailPredicate = !ValidateTailPredicate();
111640425183SSam Parker }
111740425183SSam Parker
AddVCTP(MachineInstr * MI)1118b4fa884aSSam Parker bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
1119b4fa884aSSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);
1120b4fa884aSSam Parker if (VCTPs.empty()) {
1121b4fa884aSSam Parker VCTPs.push_back(MI);
1122b4fa884aSSam Parker return true;
1123b4fa884aSSam Parker }
1124b4fa884aSSam Parker
1125b4fa884aSSam Parker // If we find another VCTP, check whether it uses the same value as the main VCTP.
1126b4fa884aSSam Parker // If it does, store it in the VCTPs set, else refuse it.
1127b4fa884aSSam Parker MachineInstr *Prev = VCTPs.back();
1128b4fa884aSSam Parker if (!Prev->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
1129e24537d4SMircea Trofin !RDA.hasSameReachingDef(Prev, MI, MI->getOperand(1).getReg().asMCReg())) {
1130b4fa884aSSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
1131b4fa884aSSam Parker "definition from the main VCTP");
1132b4fa884aSSam Parker return false;
1133b4fa884aSSam Parker }
1134b4fa884aSSam Parker VCTPs.push_back(MI);
1135b4fa884aSSam Parker return true;
1136b4fa884aSSam Parker }
1137b4fa884aSSam Parker
ValidateMVEStore(MachineInstr * MI,MachineLoop * ML)1138ff0ef6a5SSam Tebbs static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) {
1139ff0ef6a5SSam Tebbs
1140ff0ef6a5SSam Tebbs auto GetFrameIndex = [](MachineMemOperand *Operand) {
1141ff0ef6a5SSam Tebbs const PseudoSourceValue *PseudoValue = Operand->getPseudoValue();
1142ff0ef6a5SSam Tebbs if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) {
1143ff0ef6a5SSam Tebbs if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(PseudoValue)) {
1144ff0ef6a5SSam Tebbs return FS->getFrameIndex();
1145ff0ef6a5SSam Tebbs }
1146ff0ef6a5SSam Tebbs }
1147ff0ef6a5SSam Tebbs return -1;
1148ff0ef6a5SSam Tebbs };
1149ff0ef6a5SSam Tebbs
1150ff0ef6a5SSam Tebbs auto IsStackOp = [GetFrameIndex](MachineInstr *I) {
1151ff0ef6a5SSam Tebbs switch (I->getOpcode()) {
1152ff0ef6a5SSam Tebbs case ARM::MVE_VSTRWU32:
1153ff0ef6a5SSam Tebbs case ARM::MVE_VLDRWU32: {
1154ff0ef6a5SSam Tebbs return I->getOperand(1).getReg() == ARM::SP &&
1155ff0ef6a5SSam Tebbs I->memoperands().size() == 1 &&
1156ff0ef6a5SSam Tebbs GetFrameIndex(I->memoperands().front()) >= 0;
1157ff0ef6a5SSam Tebbs }
1158ff0ef6a5SSam Tebbs default:
1159ff0ef6a5SSam Tebbs return false;
1160ff0ef6a5SSam Tebbs }
1161ff0ef6a5SSam Tebbs };
1162ff0ef6a5SSam Tebbs
1163ff0ef6a5SSam Tebbs // An unpredicated vector register spill is allowed if all of the uses of the
1164ff0ef6a5SSam Tebbs // stack slot are within the loop
1165ff0ef6a5SSam Tebbs if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI))
1166ff0ef6a5SSam Tebbs return false;
1167ff0ef6a5SSam Tebbs
1168ff0ef6a5SSam Tebbs // Search all blocks after the loop for accesses to the same stack slot.
1169ff0ef6a5SSam Tebbs // ReachingDefAnalysis doesn't work for sp as it relies on registers being
1170ff0ef6a5SSam Tebbs // live-out (which sp never is) to know what blocks to look in
1171ff0ef6a5SSam Tebbs if (MI->memoperands().size() == 0)
1172ff0ef6a5SSam Tebbs return false;
1173ff0ef6a5SSam Tebbs int FI = GetFrameIndex(MI->memoperands().front());
1174ff0ef6a5SSam Tebbs
11754fee756cSAmara Emerson auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo();
1176ff0ef6a5SSam Tebbs if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(FI))
1177ff0ef6a5SSam Tebbs return false;
1178ff0ef6a5SSam Tebbs
1179ff0ef6a5SSam Tebbs SmallVector<MachineBasicBlock *> Frontier;
1180ff0ef6a5SSam Tebbs ML->getExitBlocks(Frontier);
1181ff0ef6a5SSam Tebbs SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()};
1182ff0ef6a5SSam Tebbs unsigned Idx = 0;
1183ff0ef6a5SSam Tebbs while (Idx < Frontier.size()) {
1184ff0ef6a5SSam Tebbs MachineBasicBlock *BB = Frontier[Idx];
1185ff0ef6a5SSam Tebbs bool LookAtSuccessors = true;
1186ff0ef6a5SSam Tebbs for (auto &I : *BB) {
1187ff0ef6a5SSam Tebbs if (!IsStackOp(&I) || I.memoperands().size() == 0)
1188ff0ef6a5SSam Tebbs continue;
1189ff0ef6a5SSam Tebbs if (GetFrameIndex(I.memoperands().front()) != FI)
1190ff0ef6a5SSam Tebbs continue;
1191ff0ef6a5SSam Tebbs // If this block has a store to the stack slot before any loads then we
1192ff0ef6a5SSam Tebbs // can ignore the block
1193ff0ef6a5SSam Tebbs if (I.getOpcode() == ARM::MVE_VSTRWU32) {
1194ff0ef6a5SSam Tebbs LookAtSuccessors = false;
1195ff0ef6a5SSam Tebbs break;
1196ff0ef6a5SSam Tebbs }
1197ff0ef6a5SSam Tebbs // If the store and the load are using the same stack slot then the
1198ff0ef6a5SSam Tebbs // store isn't valid for tail predication
1199ff0ef6a5SSam Tebbs if (I.getOpcode() == ARM::MVE_VLDRWU32)
1200ff0ef6a5SSam Tebbs return false;
1201ff0ef6a5SSam Tebbs }
1202ff0ef6a5SSam Tebbs
1203ff0ef6a5SSam Tebbs if (LookAtSuccessors) {
1204ff0ef6a5SSam Tebbs for (auto Succ : BB->successors()) {
1205ff0ef6a5SSam Tebbs if (!Visited.contains(Succ) && !is_contained(Frontier, Succ))
1206ff0ef6a5SSam Tebbs Frontier.push_back(Succ);
1207ff0ef6a5SSam Tebbs }
1208ff0ef6a5SSam Tebbs }
1209ff0ef6a5SSam Tebbs Visited.insert(BB);
1210ff0ef6a5SSam Tebbs Idx++;
1211ff0ef6a5SSam Tebbs }
1212ff0ef6a5SSam Tebbs
1213ff0ef6a5SSam Tebbs return true;
1214ff0ef6a5SSam Tebbs }
1215ff0ef6a5SSam Tebbs
ValidateMVEInst(MachineInstr * MI)1216e27632c3SSam Parker bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {
1217e73b20c5SSam Parker if (CannotTailPredicate)
1218e73b20c5SSam Parker return false;
1219e73b20c5SSam Parker
1220a0c1dcc3SSam Parker if (!shouldInspect(*MI))
12213ce9ec0cSSam Parker return true;
12223ce9ec0cSSam Parker
12233ce9ec0cSSam Parker if (MI->getOpcode() == ARM::MVE_VPSEL ||
12243ce9ec0cSSam Parker MI->getOpcode() == ARM::MVE_VPNOT) {
12253ce9ec0cSSam Parker // TODO: Allow VPSEL and VPNOT, we currently cannot because:
12263ce9ec0cSSam Parker // 1) It will use the VPR as a predicate operand, but doesn't have to be
12273ce9ec0cSSam Parker // instead a VPT block, which means we can assert while building up
12283ce9ec0cSSam Parker // the VPT block because we don't find another VPT or VPST to being a new
12293ce9ec0cSSam Parker // one.
12303ce9ec0cSSam Parker // 2) VPSEL still requires a VPR operand even after tail predicating,
12313ce9ec0cSSam Parker // which means we can't remove it unless there is another
12323ce9ec0cSSam Parker // instruction, such as vcmp, that can provide the VPR def.
12333ce9ec0cSSam Parker return false;
12343ce9ec0cSSam Parker }
12353ce9ec0cSSam Parker
1236b4fa884aSSam Parker // Record all VCTPs and check that they're equivalent to one another.
1237b4fa884aSSam Parker if (isVCTP(MI) && !AddVCTP(MI))
123840425183SSam Parker return false;
123940425183SSam Parker
1240a0c1dcc3SSam Parker // Inspect uses first so that any instructions that alter the VPR don't
1241a0c1dcc3SSam Parker // alter the predicate upon themselves.
1242a0c1dcc3SSam Parker const MCInstrDesc &MCID = MI->getDesc();
124340425183SSam Parker bool IsUse = false;
124494c799feSSam Parker unsigned LastOpIdx = MI->getNumOperands() - 1;
124594c799feSSam Parker for (auto &Op : enumerate(reverse(MCID.operands()))) {
124694c799feSSam Parker const MachineOperand &MO = MI->getOperand(LastOpIdx - Op.index());
1247b4fa884aSSam Parker if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR)
124840425183SSam Parker continue;
124940425183SSam Parker
125094c799feSSam Parker if (ARM::isVpred(Op.value().OperandType)) {
1251b4fa884aSSam Parker VPTState::addInst(MI);
1252bad6032bSSam Parker IsUse = true;
1253b4fa884aSSam Parker } else if (MI->getOpcode() != ARM::MVE_VPST) {
125440425183SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
125540425183SSam Parker return false;
125640425183SSam Parker }
125740425183SSam Parker }
125840425183SSam Parker
1259e27632c3SSam Parker // If we find an instruction that has been marked as not valid for tail
1260e27632c3SSam Parker // predication, only allow the instruction if it's contained within a valid
1261e27632c3SSam Parker // VPT block.
1262a0c1dcc3SSam Parker bool RequiresExplicitPredication =
1263a0c1dcc3SSam Parker (MCID.TSFlags & ARMII::ValidForTailPredication) == 0;
1264a0c1dcc3SSam Parker if (isDomainMVE(MI) && RequiresExplicitPredication) {
126573346f58SDavid Green if (MI->getOpcode() == ARM::MQPRCopy)
126673346f58SDavid Green return true;
126702cd8a6bSDavid Green if (!IsUse && producesDoubleWidthResult(*MI)) {
126802cd8a6bSDavid Green DoubleWidthResultInstrs.insert(MI);
126902cd8a6bSDavid Green return true;
127002cd8a6bSDavid Green }
127102cd8a6bSDavid Green
127202cd8a6bSDavid Green LLVM_DEBUG(if (!IsUse) dbgs()
127302cd8a6bSDavid Green << "ARM Loops: Can't tail predicate: " << *MI);
1274a63b2a46SSam Parker return IsUse;
1275e27632c3SSam Parker }
1276e27632c3SSam Parker
1277de3e65e6SSam Parker // If the instruction is already explicitly predicated, then the conversion
12783471520bSDavid Green // will be fine, but ensure that all store operations are predicated.
1279ff0ef6a5SSam Tebbs if (MI->mayStore() && !ValidateMVEStore(MI, &ML))
1280b4fa884aSSam Parker return IsUse;
1281b4fa884aSSam Parker
1282b4fa884aSSam Parker // If this instruction defines the VPR, update the predicate for the
1283b4fa884aSSam Parker // proceeding instructions.
1284b4fa884aSSam Parker if (isVectorPredicate(MI)) {
1285b4fa884aSSam Parker // Clear the existing predicate when we're not in VPT Active state,
1286b4fa884aSSam Parker // otherwise we add to it.
1287b4fa884aSSam Parker if (!isVectorPredicated(MI))
1288b4fa884aSSam Parker VPTState::resetPredicate(MI);
1289b4fa884aSSam Parker else
1290b4fa884aSSam Parker VPTState::addPredicate(MI);
1291b4fa884aSSam Parker }
1292b4fa884aSSam Parker
1293b4fa884aSSam Parker // Finally once the predicate has been modified, we can start a new VPT
1294b4fa884aSSam Parker // block if necessary.
1295b4fa884aSSam Parker if (isVPTOpcode(MI->getOpcode()))
1296b4fa884aSSam Parker VPTState::CreateVPTBlock(MI);
1297b4fa884aSSam Parker
1298b4fa884aSSam Parker return true;
12998978c12bSSam Parker }
13008978c12bSSam Parker
runOnMachineFunction(MachineFunction & mf)13018978c12bSSam Parker bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
1302*ad73ce31SZongwei Lan const ARMSubtarget &ST = mf.getSubtarget<ARMSubtarget>();
13038978c12bSSam Parker if (!ST.hasLOB())
13048978c12bSSam Parker return false;
13058978c12bSSam Parker
13068978c12bSSam Parker MF = &mf;
13078978c12bSSam Parker LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
13088978c12bSSam Parker
130928166816SSam Parker MLI = &getAnalysis<MachineLoopInfo>();
1310cced971fSSam Parker RDA = &getAnalysis<ReachingDefAnalysis>();
13118978c12bSSam Parker MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
13128978c12bSSam Parker MRI = &MF->getRegInfo();
13138978c12bSSam Parker TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
1314d97cf1f8SSjoerd Meijer TRI = ST.getRegisterInfo();
13158978c12bSSam Parker BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF));
13168978c12bSSam Parker BBUtils->computeAllBlockSizes();
13178978c12bSSam Parker BBUtils->adjustBBOffsetsAfter(&MF->front());
13188978c12bSSam Parker
13198978c12bSSam Parker bool Changed = false;
132028166816SSam Parker for (auto ML : *MLI) {
132189c1e35fSStefanos Baziotis if (ML->isOutermost())
13228978c12bSSam Parker Changed |= ProcessLoop(ML);
13238978c12bSSam Parker }
13248978c12bSSam Parker Changed |= RevertNonLoops();
13258978c12bSSam Parker return Changed;
13268978c12bSSam Parker }
13278978c12bSSam Parker
ProcessLoop(MachineLoop * ML)1328a6fd919cSSam Parker bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
1329a6fd919cSSam Parker
1330a6fd919cSSam Parker bool Changed = false;
1331a6fd919cSSam Parker
1332a6fd919cSSam Parker // Process inner loops first.
1333c5cf7d91SKazu Hirata for (MachineLoop *L : *ML)
1334c5cf7d91SKazu Hirata Changed |= ProcessLoop(L);
1335a6fd919cSSam Parker
1336543406a6SDavid Green LLVM_DEBUG({
1337543406a6SDavid Green dbgs() << "ARM Loops: Processing loop containing:\n";
133828166816SSam Parker if (auto *Preheader = ML->getLoopPreheader())
1339543406a6SDavid Green dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
1340543406a6SDavid Green else if (auto *Preheader = MLI->findLoopPreheader(ML, true, true))
1341543406a6SDavid Green dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
134228166816SSam Parker for (auto *MBB : ML->getBlocks())
1343543406a6SDavid Green dbgs() << " - Block: " << printMBBReference(*MBB) << "\n";
1344543406a6SDavid Green });
1345a6fd919cSSam Parker
134698722691SSam Parker // Search the given block for a loop start instruction. If one isn't found,
134798722691SSam Parker // and there's only one predecessor block, search that one too.
134898722691SSam Parker std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
13494379a400SSam Parker [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
1350a6fd919cSSam Parker for (auto &MI : *MBB) {
1351049f9672SSjoerd Meijer if (isLoopStart(MI))
1352a6fd919cSSam Parker return &MI;
1353a6fd919cSSam Parker }
135498722691SSam Parker if (MBB->pred_size() == 1)
135598722691SSam Parker return SearchForStart(*MBB->pred_begin());
1356a6fd919cSSam Parker return nullptr;
1357a6fd919cSSam Parker };
1358a6fd919cSSam Parker
13593ee580d0SSam Parker LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII);
136028166816SSam Parker // Search the preheader for the start intrinsic.
136198722691SSam Parker // FIXME: I don't see why we shouldn't be supporting multiple predecessors
136298722691SSam Parker // with potentially multiple set.loop.iterations, so we need to enable this.
13633ee580d0SSam Parker if (LoLoop.Preheader)
13643ee580d0SSam Parker LoLoop.Start = SearchForStart(LoLoop.Preheader);
136528166816SSam Parker else
136600d19c67SMichael Benfield return Changed;
1367a6fd919cSSam Parker
1368a6fd919cSSam Parker // Find the low-overhead loop components and decide whether or not to fall
13698978c12bSSam Parker // back to a normal loop. Also look for a vctp instructions and decide
13708978c12bSSam Parker // whether we can convert that predicate using tail predication.
1371a6fd919cSSam Parker for (auto *MBB : reverse(ML->getBlocks())) {
1372a6fd919cSSam Parker for (auto &MI : *MBB) {
137306e12893SSam Parker if (MI.isDebugValue())
137406e12893SSam Parker continue;
137506e12893SSam Parker else if (MI.getOpcode() == ARM::t2LoopDec)
13768978c12bSSam Parker LoLoop.Dec = &MI;
1377a6fd919cSSam Parker else if (MI.getOpcode() == ARM::t2LoopEnd)
13788978c12bSSam Parker LoLoop.End = &MI;
13790447f350SDavid Green else if (MI.getOpcode() == ARM::t2LoopEndDec)
13800447f350SDavid Green LoLoop.End = LoLoop.Dec = &MI;
1381049f9672SSjoerd Meijer else if (isLoopStart(MI))
13828978c12bSSam Parker LoLoop.Start = &MI;
138336c92227SSam Parker else if (MI.getDesc().isCall()) {
1384bcf0eb7aSSam Parker // TODO: Though the call will require LE to execute again, does this
1385bcf0eb7aSSam Parker // mean we should revert? Always executing LE hopefully should be
1386bcf0eb7aSSam Parker // faster than performing a sub,cmp,br or even subs,br.
13878978c12bSSam Parker LoLoop.Revert = true;
138836c92227SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
13898978c12bSSam Parker } else {
139040425183SSam Parker // Record VPR defs and build up their corresponding vpt blocks.
13918978c12bSSam Parker // Check we know how to tail predicate any mve instructions.
13928f6a6763SSam Parker LoLoop.AnalyseMVEInst(&MI);
139336c92227SSam Parker }
1394a6fd919cSSam Parker }
1395a6fd919cSSam Parker }
1396a6fd919cSSam Parker
13978978c12bSSam Parker LLVM_DEBUG(LoLoop.dump());
13984569f63aSSjoerd Meijer if (!LoLoop.FoundAllComponents()) {
13994569f63aSSjoerd Meijer LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");
140000d19c67SMichael Benfield return Changed;
14014569f63aSSjoerd Meijer }
1402a6fd919cSSam Parker
1403fad70c30SDavid Green assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart &&
1404fad70c30SDavid Green "Expected t2WhileLoopStart to be removed before regalloc!");
1405fad70c30SDavid Green
14060447f350SDavid Green // Check that the only instruction using LoopDec is LoopEnd. This can only
14070447f350SDavid Green // happen when the Dec and End are separate, not a single t2LoopEndDec.
140856427528SSam Parker // TODO: Check for copy chains that really have no effect.
14090447f350SDavid Green if (LoLoop.Dec != LoLoop.End) {
141056427528SSam Parker SmallPtrSet<MachineInstr *, 2> Uses;
1411e24537d4SMircea Trofin RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses);
141256427528SSam Parker if (Uses.size() > 1 || !Uses.count(LoLoop.End)) {
141356427528SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");
141442350cd8SSam Parker LoLoop.Revert = true;
141542350cd8SSam Parker }
14160447f350SDavid Green }
1417e82a0084SSam Parker LoLoop.Validate(BBUtils.get());
14188978c12bSSam Parker Expand(LoLoop);
1419a6fd919cSSam Parker return true;
1420a6fd919cSSam Parker }
1421a6fd919cSSam Parker
1422775b2f59SSam Parker // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
1423775b2f59SSam Parker // beq that branches to the exit branch.
1424566127e3SSam Parker // TODO: We could also try to generate a cbz if the value in LR is also in
1425775b2f59SSam Parker // another low register.
RevertWhile(MachineInstr * MI) const1426775b2f59SSam Parker void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
1427775b2f59SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
1428bee2f618SDavid Green MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(*MI);
1429566127e3SSam Parker unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
1430566127e3SSam Parker ARM::tBcc : ARM::t2Bcc;
1431566127e3SSam Parker
1432fad70c30SDavid Green RevertWhileLoopStartLR(MI, TII, BrOpc);
1433775b2f59SSam Parker }
1434775b2f59SSam Parker
RevertDo(MachineInstr * MI) const1435b2ac9681SDavid Green void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const {
1436b2ac9681SDavid Green LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI);
1437d9bf6245SDavid Green RevertDoLoopStart(MI, TII);
1438b2ac9681SDavid Green }
1439b2ac9681SDavid Green
RevertLoopDec(MachineInstr * MI) const1440ac30ea2fSSam Parker bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
1441775b2f59SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
1442775b2f59SSam Parker MachineBasicBlock *MBB = MI->getParent();
1443ac30ea2fSSam Parker SmallPtrSet<MachineInstr*, 1> Ignore;
1444bf61421aSSam Parker for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) {
1445bf61421aSSam Parker if (I->getOpcode() == ARM::t2LoopEnd) {
1446bf61421aSSam Parker Ignore.insert(&*I);
1447bf61421aSSam Parker break;
1448bf61421aSSam Parker }
1449bf61421aSSam Parker }
14504ba6d0deSSam Parker
1451cced971fSSam Parker // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
1452e24537d4SMircea Trofin bool SetFlags =
1453e24537d4SMircea Trofin RDA->isSafeToDefRegAt(MI, MCRegister::from(ARM::CPSR), Ignore);
14544ba6d0deSSam Parker
1455d9bf6245SDavid Green llvm::RevertLoopDec(MI, TII, SetFlags);
14564ba6d0deSSam Parker return SetFlags;
1457775b2f59SSam Parker }
1458775b2f59SSam Parker
1459775b2f59SSam Parker // Generate a subs, or sub and cmp, and a branch instead of an LE.
RevertLoopEnd(MachineInstr * MI,bool SkipCmp) const14604ba6d0deSSam Parker void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
1461775b2f59SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
1462775b2f59SSam Parker
1463566127e3SSam Parker MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
1464566127e3SSam Parker unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
1465566127e3SSam Parker ARM::tBcc : ARM::t2Bcc;
1466566127e3SSam Parker
1467d9bf6245SDavid Green llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp);
1468775b2f59SSam Parker }
1469775b2f59SSam Parker
14700447f350SDavid Green // Generate a subs, or sub and cmp, and a branch instead of an LE.
RevertLoopEndDec(MachineInstr * MI) const14710447f350SDavid Green void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const {
14720447f350SDavid Green LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI);
14730447f350SDavid Green assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!");
14740447f350SDavid Green MachineBasicBlock *MBB = MI->getParent();
14750447f350SDavid Green
14760447f350SDavid Green MachineInstrBuilder MIB =
14770447f350SDavid Green BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
14780447f350SDavid Green MIB.addDef(ARM::LR);
14790447f350SDavid Green MIB.add(MI->getOperand(1));
14800447f350SDavid Green MIB.addImm(1);
14810447f350SDavid Green MIB.addImm(ARMCC::AL);
14820447f350SDavid Green MIB.addReg(ARM::NoRegister);
14830447f350SDavid Green MIB.addReg(ARM::CPSR);
14840447f350SDavid Green MIB->getOperand(5).setIsDef(true);
14850447f350SDavid Green
14860447f350SDavid Green MachineBasicBlock *DestBB = MI->getOperand(2).getMBB();
14870447f350SDavid Green unsigned BrOpc =
14880447f350SDavid Green BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc;
14890447f350SDavid Green
14900447f350SDavid Green // Create bne
14910447f350SDavid Green MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
14920447f350SDavid Green MIB.add(MI->getOperand(2)); // branch target
14930447f350SDavid Green MIB.addImm(ARMCC::NE); // condition code
14940447f350SDavid Green MIB.addReg(ARM::CPSR);
14950447f350SDavid Green
14960447f350SDavid Green MI->eraseFromParent();
14970447f350SDavid Green }
14980447f350SDavid Green
149901022af5SSjoerd Meijer // Perform dead code elimation on the loop iteration count setup expression.
150001022af5SSjoerd Meijer // If we are tail-predicating, the number of elements to be processed is the
150101022af5SSjoerd Meijer // operand of the VCTP instruction in the vector body, see getCount(), which is
150201022af5SSjoerd Meijer // register $r3 in this example:
150301022af5SSjoerd Meijer //
150401022af5SSjoerd Meijer // $lr = big-itercount-expression
150501022af5SSjoerd Meijer // ..
1506b2ac9681SDavid Green // $lr = t2DoLoopStart renamable $lr
150701022af5SSjoerd Meijer // vector.body:
150801022af5SSjoerd Meijer // ..
150901022af5SSjoerd Meijer // $vpr = MVE_VCTP32 renamable $r3
151001022af5SSjoerd Meijer // renamable $lr = t2LoopDec killed renamable $lr, 1
151101022af5SSjoerd Meijer // t2LoopEnd renamable $lr, %vector.body
151201022af5SSjoerd Meijer // tB %end
151301022af5SSjoerd Meijer //
151401022af5SSjoerd Meijer // What we would like achieve here is to replace the do-loop start pseudo
151501022af5SSjoerd Meijer // instruction t2DoLoopStart with:
151601022af5SSjoerd Meijer //
151701022af5SSjoerd Meijer // $lr = MVE_DLSTP_32 killed renamable $r3
151801022af5SSjoerd Meijer //
151901022af5SSjoerd Meijer // Thus, $r3 which defines the number of elements, is written to $lr,
152001022af5SSjoerd Meijer // and then we want to delete the whole chain that used to define $lr,
152101022af5SSjoerd Meijer // see the comment below how this chain could look like.
152201022af5SSjoerd Meijer //
IterationCountDCE(LowOverheadLoop & LoLoop)152301022af5SSjoerd Meijer void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {
152401022af5SSjoerd Meijer if (!LoLoop.IsTailPredicationLegal())
152501022af5SSjoerd Meijer return;
152601022af5SSjoerd Meijer
15275618e9beSSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");
1528a67eb221SSam Parker
1529e7a6df68SDavid Green MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 1);
15305618e9beSSam Parker if (!Def) {
15315618e9beSSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
15325618e9beSSam Parker return;
15335618e9beSSam Parker }
15345618e9beSSam Parker
15355618e9beSSam Parker // Collect and remove the users of iteration count.
15365618e9beSSam Parker SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec,
1537dfa2c14bSSam Parker LoLoop.End };
1538779a8a02SSam Parker if (!TryRemove(Def, *RDA, LoLoop.ToRemove, Killed))
15393d1d0891SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n");
154042350cd8SSam Parker }
154142350cd8SSam Parker
ExpandLoopStart(LowOverheadLoop & LoLoop)154201022af5SSjoerd Meijer MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
154301022af5SSjoerd Meijer LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n");
154401022af5SSjoerd Meijer // When using tail-predication, try to delete the dead code that was used to
154501022af5SSjoerd Meijer // calculate the number of loop iterations.
154601022af5SSjoerd Meijer IterationCountDCE(LoLoop);
154701022af5SSjoerd Meijer
1548dfa2c14bSSam Parker MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt;
15498978c12bSSam Parker MachineInstr *Start = LoLoop.Start;
1550dfa2c14bSSam Parker MachineBasicBlock *MBB = LoLoop.StartInsertBB;
155128166816SSam Parker unsigned Opc = LoLoop.getStartOpcode();
1552c466c5faSFangrui Song MachineOperand &Count = LoLoop.getLoopStartOperand();
15538978c12bSSam Parker
155448230355SDavid Green // A DLS lr, lr we needn't emit
155548230355SDavid Green MachineInstr* NewStart;
155648230355SDavid Green if (Opc == ARM::t2DLS && Count.isReg() && Count.getReg() == ARM::LR) {
155748230355SDavid Green LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr");
155848230355SDavid Green NewStart = nullptr;
155948230355SDavid Green } else {
1560a6fd919cSSam Parker MachineInstrBuilder MIB =
1561dfa2c14bSSam Parker BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc));
1562a6fd919cSSam Parker
1563a6fd919cSSam Parker MIB.addDef(ARM::LR);
156428166816SSam Parker MIB.add(Count);
1565bee2f618SDavid Green if (isWhileLoopStart(*Start))
1566bee2f618SDavid Green MIB.addMBB(getWhileLoopStartTargetBB(*Start));
156798722691SSam Parker
156898722691SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
156948230355SDavid Green NewStart = &*MIB;
157048230355SDavid Green }
157148230355SDavid Green
157248230355SDavid Green LoLoop.ToRemove.insert(Start);
157348230355SDavid Green return NewStart;
15748978c12bSSam Parker }
15758978c12bSSam Parker
ConvertVPTBlocks(LowOverheadLoop & LoLoop)157640425183SSam Parker void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
157740425183SSam Parker auto RemovePredicate = [](MachineInstr *MI) {
1578f22b4c71SVictor Campos if (MI->isDebugInstr())
1579f22b4c71SVictor Campos return;
158040425183SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
1581f22b4c71SVictor Campos int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
1582f22b4c71SVictor Campos assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction");
1583bad6032bSSam Parker assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
158440425183SSam Parker "Expected Then predicate!");
1585bad6032bSSam Parker MI->getOperand(PIdx).setImm(ARMVCC::None);
1586bad6032bSSam Parker MI->getOperand(PIdx + 1).setReg(0);
158740425183SSam Parker };
158840425183SSam Parker
158940425183SSam Parker for (auto &Block : LoLoop.getVPTBlocks()) {
1590b4fa884aSSam Parker SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
1591b4fa884aSSam Parker
1592da2e4728SSam Tebbs auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) {
1593da2e4728SSam Tebbs assert(TheVCMP && "Replacing a removed or non-existent VCMP");
1594da2e4728SSam Tebbs // Replace the VCMP with a VPT
1595da2e4728SSam Tebbs MachineInstrBuilder MIB =
1596da2e4728SSam Tebbs BuildMI(*At->getParent(), At, At->getDebugLoc(),
1597da2e4728SSam Tebbs TII->get(VCMPOpcodeToVPT(TheVCMP->getOpcode())));
1598da2e4728SSam Tebbs MIB.addImm(ARMVCC::Then);
1599da2e4728SSam Tebbs // Register one
1600da2e4728SSam Tebbs MIB.add(TheVCMP->getOperand(1));
1601da2e4728SSam Tebbs // Register two
1602da2e4728SSam Tebbs MIB.add(TheVCMP->getOperand(2));
1603da2e4728SSam Tebbs // The comparison code, e.g. ge, eq, lt
1604da2e4728SSam Tebbs MIB.add(TheVCMP->getOperand(3));
1605da2e4728SSam Tebbs LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
1606da2e4728SSam Tebbs LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
1607da2e4728SSam Tebbs LoLoop.ToRemove.insert(TheVCMP);
1608da2e4728SSam Tebbs TheVCMP = nullptr;
1609da2e4728SSam Tebbs };
1610da2e4728SSam Tebbs
1611b4fa884aSSam Parker if (VPTState::isEntryPredicatedOnVCTP(Block, /*exclusive*/ true)) {
1612da2e4728SSam Tebbs MachineInstr *VPST = Insts.front();
1613b4fa884aSSam Parker if (VPTState::hasUniformPredicate(Block)) {
1614b4fa884aSSam Parker // A vpt block starting with VPST, is only predicated upon vctp and has no
1615b4fa884aSSam Parker // internal vpr defs:
1616b4fa884aSSam Parker // - Remove vpst.
1617b4fa884aSSam Parker // - Unpredicate the remaining instructions.
1618da2e4728SSam Tebbs LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1619b4fa884aSSam Parker for (unsigned i = 1; i < Insts.size(); ++i)
1620b4fa884aSSam Parker RemovePredicate(Insts[i]);
1621b4fa884aSSam Parker } else {
1622835251f7SPierre-vh // The VPT block has a non-uniform predicate but it uses a vpst and its
1623835251f7SPierre-vh // entry is guarded only by a vctp, which means we:
162440425183SSam Parker // - Need to remove the original vpst.
162540425183SSam Parker // - Then need to unpredicate any following instructions, until
162640425183SSam Parker // we come across the divergent vpr def.
162740425183SSam Parker // - Insert a new vpst to predicate the instruction(s) that following
162840425183SSam Parker // the divergent vpr def.
1629b4fa884aSSam Parker MachineInstr *Divergent = VPTState::getDivergent(Block);
1630f22b4c71SVictor Campos MachineBasicBlock *MBB = Divergent->getParent();
163140a3f7e4SSam Tebbs auto DivergentNext = ++MachineBasicBlock::iterator(Divergent);
1632f22b4c71SVictor Campos while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr())
1633f22b4c71SVictor Campos ++DivergentNext;
1634f22b4c71SVictor Campos
163540a3f7e4SSam Tebbs bool DivergentNextIsPredicated =
1636f22b4c71SVictor Campos DivergentNext != MBB->end() &&
163740a3f7e4SSam Tebbs getVPTInstrPredicate(*DivergentNext) != ARMVCC::None;
163840a3f7e4SSam Tebbs
163940a3f7e4SSam Tebbs for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext;
164040a3f7e4SSam Tebbs I != E; ++I)
164140425183SSam Parker RemovePredicate(&*I);
164240425183SSam Parker
1643ef0b9f33SSam Tebbs // Check if the instruction defining vpr is a vcmp so it can be combined
1644ef0b9f33SSam Tebbs // with the VPST This should be the divergent instruction
164540a3f7e4SSam Tebbs MachineInstr *VCMP =
164640a3f7e4SSam Tebbs VCMPOpcodeToVPT(Divergent->getOpcode()) != 0 ? Divergent : nullptr;
1647ef0b9f33SSam Tebbs
164840a3f7e4SSam Tebbs if (DivergentNextIsPredicated) {
164940a3f7e4SSam Tebbs // Insert a VPST at the divergent only if the next instruction
165040a3f7e4SSam Tebbs // would actually use it. A VCMP following a VPST can be
165140a3f7e4SSam Tebbs // merged into a VPT so do that instead if the VCMP exists.
165240a3f7e4SSam Tebbs if (!VCMP) {
165340a3f7e4SSam Tebbs // Create a VPST (with a null mask for now, we'll recompute it
165440a3f7e4SSam Tebbs // later)
165540a3f7e4SSam Tebbs MachineInstrBuilder MIB =
165640a3f7e4SSam Tebbs BuildMI(*Divergent->getParent(), Divergent,
165700ee52aeSSam Parker Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST));
165824bf8063SPierre-vh MIB.addImm(0);
165940425183SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
166024bf8063SPierre-vh LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
166140a3f7e4SSam Tebbs } else {
166240a3f7e4SSam Tebbs // No RDA checks are necessary here since the VPST would have been
1663da2e4728SSam Tebbs // directly after the VCMP
1664da2e4728SSam Tebbs ReplaceVCMPWithVPT(VCMP, VCMP);
1665da2e4728SSam Tebbs }
166640a3f7e4SSam Tebbs }
166740a3f7e4SSam Tebbs }
166840a3f7e4SSam Tebbs LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
166940a3f7e4SSam Tebbs LoLoop.ToRemove.insert(VPST);
1670b4fa884aSSam Parker } else if (Block.containsVCTP()) {
16711de3e7fdSDavid Green // The vctp will be removed, so either the entire block will be dead or
16721de3e7fdSDavid Green // the block mask of the vp(s)t will need to be recomputed.
16731de3e7fdSDavid Green MachineInstr *VPST = Insts.front();
16741de3e7fdSDavid Green if (Block.size() == 2) {
16751de3e7fdSDavid Green assert(VPST->getOpcode() == ARM::MVE_VPST &&
16761de3e7fdSDavid Green "Found a VPST in an otherwise empty vpt block");
16771de3e7fdSDavid Green LoLoop.ToRemove.insert(VPST);
16781de3e7fdSDavid Green } else
16791de3e7fdSDavid Green LoLoop.BlockMasksToRecompute.insert(VPST);
1680da2e4728SSam Tebbs } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) {
1681da2e4728SSam Tebbs // If this block starts with a VPST then attempt to merge it with the
1682da2e4728SSam Tebbs // preceeding un-merged VCMP into a VPT. This VCMP comes from a VPT
1683da2e4728SSam Tebbs // block that no longer exists
1684da2e4728SSam Tebbs MachineInstr *VPST = Insts.front();
1685da2e4728SSam Tebbs auto Next = ++MachineBasicBlock::iterator(VPST);
1686da2e4728SSam Tebbs assert(getVPTInstrPredicate(*Next) != ARMVCC::None &&
1687da2e4728SSam Tebbs "The instruction after a VPST must be predicated");
1688f45c052cSMikhail Goncharov (void)Next;
1689da2e4728SSam Tebbs MachineInstr *VprDef = RDA->getUniqueReachingMIDef(VPST, ARM::VPR);
1690da2e4728SSam Tebbs if (VprDef && VCMPOpcodeToVPT(VprDef->getOpcode()) &&
1691da2e4728SSam Tebbs !LoLoop.ToRemove.contains(VprDef)) {
1692da2e4728SSam Tebbs MachineInstr *VCMP = VprDef;
1693da2e4728SSam Tebbs // The VCMP and VPST can only be merged if the VCMP's operands will have
16948ecb015eSSam Tebbs // the same values at the VPST.
16958ecb015eSSam Tebbs // If any of the instructions between the VCMP and VPST are predicated
16968ecb015eSSam Tebbs // then a different code path is expected to have merged the VCMP and
16978ecb015eSSam Tebbs // VPST already.
169826bd534aSKazu Hirata if (std::none_of(++MachineBasicBlock::iterator(VCMP),
1699f45c052cSMikhail Goncharov MachineBasicBlock::iterator(VPST), hasVPRUse) &&
17008ecb015eSSam Tebbs RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(1).getReg()) &&
17018ecb015eSSam Tebbs RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(2).getReg())) {
1702da2e4728SSam Tebbs ReplaceVCMPWithVPT(VCMP, VPST);
1703da2e4728SSam Tebbs LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1704da2e4728SSam Tebbs LoLoop.ToRemove.insert(VPST);
1705da2e4728SSam Tebbs }
1706da2e4728SSam Tebbs }
1707835251f7SPierre-vh }
1708835251f7SPierre-vh }
1709b4fa884aSSam Parker
1710b4fa884aSSam Parker LoLoop.ToRemove.insert(LoLoop.VCTPs.begin(), LoLoop.VCTPs.end());
17118978c12bSSam Parker }
17128978c12bSSam Parker
Expand(LowOverheadLoop & LoLoop)17138978c12bSSam Parker void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
1714a6fd919cSSam Parker
1715a6fd919cSSam Parker // Combine the LoopDec and LoopEnd instructions into LE(TP).
17168978c12bSSam Parker auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
17178978c12bSSam Parker MachineInstr *End = LoLoop.End;
1718a6fd919cSSam Parker MachineBasicBlock *MBB = End->getParent();
17198978c12bSSam Parker unsigned Opc = LoLoop.IsTailPredicationLegal() ?
17208978c12bSSam Parker ARM::MVE_LETP : ARM::t2LEUpdate;
1721a6fd919cSSam Parker MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
17228978c12bSSam Parker TII->get(Opc));
1723a6fd919cSSam Parker MIB.addDef(ARM::LR);
17240447f350SDavid Green unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0;
17250447f350SDavid Green MIB.add(End->getOperand(Off + 0));
17260447f350SDavid Green MIB.add(End->getOperand(Off + 1));
1727a6fd919cSSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
17285618e9beSSam Parker LoLoop.ToRemove.insert(LoLoop.Dec);
17295618e9beSSam Parker LoLoop.ToRemove.insert(End);
173098722691SSam Parker return &*MIB;
1731a6fd919cSSam Parker };
1732a6fd919cSSam Parker
173398722691SSam Parker // TODO: We should be able to automatically remove these branches before we
173498722691SSam Parker // get here - probably by teaching analyzeBranch about the pseudo
173598722691SSam Parker // instructions.
173698722691SSam Parker // If there is an unconditional branch, after I, that just branches to the
173798722691SSam Parker // next block, remove it.
173898722691SSam Parker auto RemoveDeadBranch = [](MachineInstr *I) {
173998722691SSam Parker MachineBasicBlock *BB = I->getParent();
174098722691SSam Parker MachineInstr *Terminator = &BB->instr_back();
174198722691SSam Parker if (Terminator->isUnconditionalBranch() && I != Terminator) {
174298722691SSam Parker MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
174398722691SSam Parker if (BB->isLayoutSuccessor(Succ)) {
174498722691SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
174598722691SSam Parker Terminator->eraseFromParent();
174698722691SSam Parker }
174798722691SSam Parker }
174898722691SSam Parker };
174998722691SSam Parker
175073346f58SDavid Green // And VMOVCopies need to become 2xVMOVD for tail predication to be valid.
175173346f58SDavid Green // Anything other MQPRCopy can be converted to MVE_VORR later on.
175273346f58SDavid Green auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, 4> &VMOVCopies) {
175373346f58SDavid Green for (auto *MI : VMOVCopies) {
175473346f58SDavid Green LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI);
175573346f58SDavid Green assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
175673346f58SDavid Green MachineBasicBlock *MBB = MI->getParent();
175773346f58SDavid Green Register Dst = MI->getOperand(0).getReg();
175873346f58SDavid Green Register Src = MI->getOperand(1).getReg();
175973346f58SDavid Green auto MIB1 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
176073346f58SDavid Green ARM::D0 + (Dst - ARM::Q0) * 2)
176173346f58SDavid Green .addReg(ARM::D0 + (Src - ARM::Q0) * 2)
176273346f58SDavid Green .add(predOps(ARMCC::AL));
176353801a59SMichael Forster (void)MIB1;
176473346f58SDavid Green LLVM_DEBUG(dbgs() << " into " << *MIB1);
176573346f58SDavid Green auto MIB2 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
176673346f58SDavid Green ARM::D0 + (Dst - ARM::Q0) * 2 + 1)
176773346f58SDavid Green .addReg(ARM::D0 + (Src - ARM::Q0) * 2 + 1)
176873346f58SDavid Green .add(predOps(ARMCC::AL));
176973346f58SDavid Green LLVM_DEBUG(dbgs() << " and " << *MIB2);
177053801a59SMichael Forster (void)MIB2;
177173346f58SDavid Green MI->eraseFromParent();
177273346f58SDavid Green }
177373346f58SDavid Green };
177473346f58SDavid Green
17758978c12bSSam Parker if (LoLoop.Revert) {
1776bee2f618SDavid Green if (isWhileLoopStart(*LoLoop.Start))
17778978c12bSSam Parker RevertWhile(LoLoop.Start);
1778775b2f59SSam Parker else
1779b2ac9681SDavid Green RevertDo(LoLoop.Start);
17800447f350SDavid Green if (LoLoop.Dec == LoLoop.End)
17810447f350SDavid Green RevertLoopEndDec(LoLoop.End);
17820447f350SDavid Green else
17830447f350SDavid Green RevertLoopEnd(LoLoop.End, RevertLoopDec(LoLoop.Dec));
1784a6fd919cSSam Parker } else {
178573346f58SDavid Green ExpandVMOVCopies(LoLoop.VMOVCopies);
17868978c12bSSam Parker LoLoop.Start = ExpandLoopStart(LoLoop);
178748230355SDavid Green if (LoLoop.Start)
17888978c12bSSam Parker RemoveDeadBranch(LoLoop.Start);
17898978c12bSSam Parker LoLoop.End = ExpandLoopEnd(LoLoop);
17908978c12bSSam Parker RemoveDeadBranch(LoLoop.End);
1791b30adfb5SSam Parker if (LoLoop.IsTailPredicationLegal())
179240425183SSam Parker ConvertVPTBlocks(LoLoop);
179342350cd8SSam Parker for (auto *I : LoLoop.ToRemove) {
179442350cd8SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
179542350cd8SSam Parker I->eraseFromParent();
1796a6fd919cSSam Parker }
179724bf8063SPierre-vh for (auto *I : LoLoop.BlockMasksToRecompute) {
179824bf8063SPierre-vh LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I);
179924bf8063SPierre-vh recomputeVPTBlockMask(*I);
180024bf8063SPierre-vh LLVM_DEBUG(dbgs() << " ... done: " << *I);
180124bf8063SPierre-vh }
1802a6fd919cSSam Parker }
1803760b1751SSam Parker
1804fd01b2f4SSam Parker PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
1805760b1751SSam Parker DFS.ProcessLoop();
1806760b1751SSam Parker const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
1807760b1751SSam Parker for (auto *MBB : PostOrder) {
1808760b1751SSam Parker recomputeLiveIns(*MBB);
1809760b1751SSam Parker // FIXME: For some reason, the live-in print order is non-deterministic for
1810760b1751SSam Parker // our tests and I can't out why... So just sort them.
1811760b1751SSam Parker MBB->sortUniqueLiveIns();
1812760b1751SSam Parker }
1813760b1751SSam Parker
1814760b1751SSam Parker for (auto *MBB : reverse(PostOrder))
1815760b1751SSam Parker recomputeLivenessFlags(*MBB);
18161d06e75dSSam Parker
18171d06e75dSSam Parker // We've moved, removed and inserted new instructions, so update RDA.
18181d06e75dSSam Parker RDA->reset();
1819d97cf1f8SSjoerd Meijer }
1820a6fd919cSSam Parker
RevertNonLoops()182136c92227SSam Parker bool ARMLowOverheadLoops::RevertNonLoops() {
18224379a400SSam Parker LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
18234379a400SSam Parker bool Changed = false;
18244379a400SSam Parker
182536c92227SSam Parker for (auto &MBB : *MF) {
18264379a400SSam Parker SmallVector<MachineInstr*, 4> Starts;
18274379a400SSam Parker SmallVector<MachineInstr*, 4> Decs;
18284379a400SSam Parker SmallVector<MachineInstr*, 4> Ends;
18290447f350SDavid Green SmallVector<MachineInstr *, 4> EndDecs;
183073346f58SDavid Green SmallVector<MachineInstr *, 4> MQPRCopies;
18314379a400SSam Parker
18324379a400SSam Parker for (auto &I : MBB) {
1833049f9672SSjoerd Meijer if (isLoopStart(I))
18344379a400SSam Parker Starts.push_back(&I);
18354379a400SSam Parker else if (I.getOpcode() == ARM::t2LoopDec)
18364379a400SSam Parker Decs.push_back(&I);
18374379a400SSam Parker else if (I.getOpcode() == ARM::t2LoopEnd)
18384379a400SSam Parker Ends.push_back(&I);
18390447f350SDavid Green else if (I.getOpcode() == ARM::t2LoopEndDec)
18400447f350SDavid Green EndDecs.push_back(&I);
184173346f58SDavid Green else if (I.getOpcode() == ARM::MQPRCopy)
184273346f58SDavid Green MQPRCopies.push_back(&I);
18434379a400SSam Parker }
18444379a400SSam Parker
184573346f58SDavid Green if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() &&
184673346f58SDavid Green MQPRCopies.empty())
18474379a400SSam Parker continue;
18484379a400SSam Parker
18494379a400SSam Parker Changed = true;
18504379a400SSam Parker
18514379a400SSam Parker for (auto *Start : Starts) {
1852bee2f618SDavid Green if (isWhileLoopStart(*Start))
18534379a400SSam Parker RevertWhile(Start);
18544379a400SSam Parker else
1855b2ac9681SDavid Green RevertDo(Start);
18564379a400SSam Parker }
18574379a400SSam Parker for (auto *Dec : Decs)
18584379a400SSam Parker RevertLoopDec(Dec);
18594379a400SSam Parker
18604379a400SSam Parker for (auto *End : Ends)
18614379a400SSam Parker RevertLoopEnd(End);
18620447f350SDavid Green for (auto *End : EndDecs)
18630447f350SDavid Green RevertLoopEndDec(End);
186473346f58SDavid Green for (auto *MI : MQPRCopies) {
186573346f58SDavid Green LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI);
186673346f58SDavid Green assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
186773346f58SDavid Green MachineBasicBlock *MBB = MI->getParent();
186873346f58SDavid Green auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::MVE_VORR),
186973346f58SDavid Green MI->getOperand(0).getReg())
187073346f58SDavid Green .add(MI->getOperand(1))
187173346f58SDavid Green .add(MI->getOperand(1));
187273346f58SDavid Green addUnpredicatedMveVpredROp(MIB, MI->getOperand(0).getReg());
187373346f58SDavid Green MI->eraseFromParent();
187473346f58SDavid Green }
18754379a400SSam Parker }
18764379a400SSam Parker return Changed;
18774379a400SSam Parker }
18784379a400SSam Parker
createARMLowOverheadLoopsPass()1879a6fd919cSSam Parker FunctionPass *llvm::createARMLowOverheadLoopsPass() {
1880a6fd919cSSam Parker return new ARMLowOverheadLoops();
1881a6fd919cSSam Parker }
1882