1 //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// Finalize v8.1-m low-overhead loops by converting the associated pseudo
10 /// instructions into machine operations.
11 /// The expectation is that the loop contains three pseudo instructions:
12 /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
13 ///   form should be in the preheader, whereas the while form should be in the
14 ///   preheaders only predecessor.
15 /// - t2LoopDec - placed within in the loop body.
16 /// - t2LoopEnd - the loop latch terminator.
17 ///
18 /// In addition to this, we also look for the presence of the VCTP instruction,
19 /// which determines whether we can generated the tail-predicated low-overhead
20 /// loop form.
21 ///
22 /// Assumptions and Dependencies:
23 /// Low-overhead loops are constructed and executed using a setup instruction:
24 /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
25 /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
26 /// but fixed polarity: WLS can only branch forwards and LE can only branch
27 /// backwards. These restrictions mean that this pass is dependent upon block
28 /// layout and block sizes, which is why it's the last pass to run. The same is
29 /// true for ConstantIslands, but this pass does not increase the size of the
30 /// basic blocks, nor does it change the CFG. Instructions are mainly removed
31 /// during the transform and pseudo instructions are replaced by real ones. In
32 /// some cases, when we have to revert to a 'normal' loop, we have to introduce
33 /// multiple instructions for a single pseudo (see RevertWhile and
34 /// RevertLoopEnd). To handle this situation, t2WhileLoopStart and t2LoopEnd
35 /// are defined to be as large as this maximum sequence of replacement
36 /// instructions.
37 ///
38 //===----------------------------------------------------------------------===//
39 
40 #include "ARM.h"
41 #include "ARMBaseInstrInfo.h"
42 #include "ARMBaseRegisterInfo.h"
43 #include "ARMBasicBlockInfo.h"
44 #include "ARMSubtarget.h"
45 #include "Thumb2InstrInfo.h"
46 #include "llvm/ADT/SetOperations.h"
47 #include "llvm/ADT/SmallSet.h"
48 #include "llvm/CodeGen/MachineFunctionPass.h"
49 #include "llvm/CodeGen/MachineLoopInfo.h"
50 #include "llvm/CodeGen/MachineLoopUtils.h"
51 #include "llvm/CodeGen/MachineRegisterInfo.h"
52 #include "llvm/CodeGen/Passes.h"
53 #include "llvm/CodeGen/ReachingDefAnalysis.h"
54 #include "llvm/MC/MCInstrDesc.h"
55 
56 using namespace llvm;
57 
58 #define DEBUG_TYPE "arm-low-overhead-loops"
59 #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
60 
61 namespace {
62 
63   class PostOrderLoopTraversal {
64     MachineLoop &ML;
65     MachineLoopInfo &MLI;
66     SmallPtrSet<MachineBasicBlock*, 4> Visited;
67     SmallVector<MachineBasicBlock*, 4> Order;
68 
69   public:
70     PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI)
71       : ML(ML), MLI(MLI) { }
72 
73     const SmallVectorImpl<MachineBasicBlock*> &getOrder() const {
74       return Order;
75     }
76 
77     // Visit all the blocks within the loop, as well as exit blocks and any
78     // blocks properly dominating the header.
79     void ProcessLoop() {
80       std::function<void(MachineBasicBlock*)> Search = [this, &Search]
81         (MachineBasicBlock *MBB) -> void {
82         if (Visited.count(MBB))
83           return;
84 
85         Visited.insert(MBB);
86         for (auto *Succ : MBB->successors()) {
87           if (!ML.contains(Succ))
88             continue;
89           Search(Succ);
90         }
91         Order.push_back(MBB);
92       };
93 
94       // Insert exit blocks.
95       SmallVector<MachineBasicBlock*, 2> ExitBlocks;
96       ML.getExitBlocks(ExitBlocks);
97       for (auto *MBB : ExitBlocks)
98         Order.push_back(MBB);
99 
100       // Then add the loop body.
101       Search(ML.getHeader());
102 
103       // Then try the preheader and its predecessors.
104       std::function<void(MachineBasicBlock*)> GetPredecessor =
105         [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void {
106         Order.push_back(MBB);
107         if (MBB->pred_size() == 1)
108           GetPredecessor(*MBB->pred_begin());
109       };
110 
111       if (auto *Preheader = ML.getLoopPreheader())
112         GetPredecessor(Preheader);
113       else if (auto *Preheader = MLI.findLoopPreheader(&ML, true))
114         GetPredecessor(Preheader);
115     }
116   };
117 
118   struct PredicatedMI {
119     MachineInstr *MI = nullptr;
120     SetVector<MachineInstr*> Predicates;
121 
122   public:
123     PredicatedMI(MachineInstr *I, SetVector<MachineInstr*> &Preds) :
124     MI(I) {
125       Predicates.insert(Preds.begin(), Preds.end());
126     }
127   };
128 
129   // Represent a VPT block, a list of instructions that begins with a VPST and
130   // has a maximum of four proceeding instructions. All instructions within the
131   // block are predicated upon the vpr and we allow instructions to define the
132   // vpr within in the block too.
133   class VPTBlock {
134     std::unique_ptr<PredicatedMI> VPST;
135     PredicatedMI *Divergent = nullptr;
136     SmallVector<PredicatedMI, 4> Insts;
137 
138   public:
139     VPTBlock(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
140       VPST = std::make_unique<PredicatedMI>(MI, Preds);
141     }
142 
143     void addInst(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
144       LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI);
145       if (!Divergent && !set_difference(Preds, VPST->Predicates).empty()) {
146         Divergent = &Insts.back();
147         LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI);
148       }
149       Insts.emplace_back(MI, Preds);
150       assert(Insts.size() <= 4 && "Too many instructions in VPT block!");
151     }
152 
153     // Have we found an instruction within the block which defines the vpr? If
154     // so, not all the instructions in the block will have the same predicate.
155     bool HasNonUniformPredicate() const {
156       return Divergent != nullptr;
157     }
158 
159     // Is the given instruction part of the predicate set controlling the entry
160     // to the block.
161     bool IsPredicatedOn(MachineInstr *MI) const {
162       return VPST->Predicates.count(MI);
163     }
164 
165     // Is the given instruction the only predicate which controls the entry to
166     // the block.
167     bool IsOnlyPredicatedOn(MachineInstr *MI) const {
168       return IsPredicatedOn(MI) && VPST->Predicates.size() == 1;
169     }
170 
171     unsigned size() const { return Insts.size(); }
172     SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; }
173     MachineInstr *getVPST() const { return VPST->MI; }
174     PredicatedMI *getDivergent() const { return Divergent; }
175   };
176 
177   struct LowOverheadLoop {
178 
179     MachineLoop *ML = nullptr;
180     MachineFunction *MF = nullptr;
181     MachineInstr *InsertPt = nullptr;
182     MachineInstr *Start = nullptr;
183     MachineInstr *Dec = nullptr;
184     MachineInstr *End = nullptr;
185     MachineInstr *VCTP = nullptr;
186     VPTBlock *CurrentBlock = nullptr;
187     SetVector<MachineInstr*> CurrentPredicate;
188     SmallVector<VPTBlock, 4> VPTBlocks;
189     bool Revert = false;
190     bool CannotTailPredicate = false;
191 
192     LowOverheadLoop(MachineLoop *ML) : ML(ML) {
193       MF = ML->getHeader()->getParent();
194     }
195 
196     // If this is an MVE instruction, check that we know how to use tail
197     // predication with it. Record VPT blocks and return whether the
198     // instruction is valid for tail predication.
199     bool ValidateMVEInst(MachineInstr *MI);
200 
201     void AnalyseMVEInst(MachineInstr *MI) {
202       CannotTailPredicate = !ValidateMVEInst(MI);
203     }
204 
205     bool IsTailPredicationLegal() const {
206       // For now, let's keep things really simple and only support a single
207       // block for tail predication.
208       return !Revert && FoundAllComponents() && VCTP &&
209              !CannotTailPredicate && ML->getNumBlocks() == 1;
210     }
211 
212     bool ValidateTailPredicate(MachineInstr *StartInsertPt,
213                                ReachingDefAnalysis *RDA,
214                                MachineLoopInfo *MLI);
215 
216     // Is it safe to define LR with DLS/WLS?
217     // LR can be defined if it is the operand to start, because it's the same
218     // value, or if it's going to be equivalent to the operand to Start.
219     MachineInstr *IsSafeToDefineLR(ReachingDefAnalysis *RDA);
220 
221     // Check the branch targets are within range and we satisfy our
222     // restrictions.
223     void CheckLegality(ARMBasicBlockUtils *BBUtils, ReachingDefAnalysis *RDA,
224                        MachineLoopInfo *MLI);
225 
226     bool FoundAllComponents() const {
227       return Start && Dec && End;
228     }
229 
230     SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTBlocks; }
231 
232     // Return the loop iteration count, or the number of elements if we're tail
233     // predicating.
234     MachineOperand &getCount() {
235       return IsTailPredicationLegal() ?
236         VCTP->getOperand(1) : Start->getOperand(0);
237     }
238 
239     unsigned getStartOpcode() const {
240       bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart;
241       if (!IsTailPredicationLegal())
242         return IsDo ? ARM::t2DLS : ARM::t2WLS;
243 
244       return VCTPOpcodeToLSTP(VCTP->getOpcode(), IsDo);
245     }
246 
247     void dump() const {
248       if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
249       if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
250       if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
251       if (VCTP) dbgs() << "ARM Loops: Found VCTP: " << *VCTP;
252       if (!FoundAllComponents())
253         dbgs() << "ARM Loops: Not a low-overhead loop.\n";
254       else if (!(Start && Dec && End))
255         dbgs() << "ARM Loops: Failed to find all loop components.\n";
256     }
257   };
258 
259   class ARMLowOverheadLoops : public MachineFunctionPass {
260     MachineFunction           *MF = nullptr;
261     MachineLoopInfo           *MLI = nullptr;
262     ReachingDefAnalysis       *RDA = nullptr;
263     const ARMBaseInstrInfo    *TII = nullptr;
264     MachineRegisterInfo       *MRI = nullptr;
265     const TargetRegisterInfo  *TRI = nullptr;
266     std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
267 
268   public:
269     static char ID;
270 
271     ARMLowOverheadLoops() : MachineFunctionPass(ID) { }
272 
273     void getAnalysisUsage(AnalysisUsage &AU) const override {
274       AU.setPreservesCFG();
275       AU.addRequired<MachineLoopInfo>();
276       AU.addRequired<ReachingDefAnalysis>();
277       MachineFunctionPass::getAnalysisUsage(AU);
278     }
279 
280     bool runOnMachineFunction(MachineFunction &MF) override;
281 
282     MachineFunctionProperties getRequiredProperties() const override {
283       return MachineFunctionProperties().set(
284           MachineFunctionProperties::Property::NoVRegs).set(
285           MachineFunctionProperties::Property::TracksLiveness);
286     }
287 
288     StringRef getPassName() const override {
289       return ARM_LOW_OVERHEAD_LOOPS_NAME;
290     }
291 
292   private:
293     bool ProcessLoop(MachineLoop *ML);
294 
295     bool RevertNonLoops();
296 
297     void RevertWhile(MachineInstr *MI) const;
298 
299     bool RevertLoopDec(MachineInstr *MI, bool AllowFlags = false) const;
300 
301     void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
302 
303     void RemoveLoopUpdate(LowOverheadLoop &LoLoop);
304 
305     void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
306 
307     MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
308 
309     void Expand(LowOverheadLoop &LoLoop);
310 
311   };
312 }
313 
314 char ARMLowOverheadLoops::ID = 0;
315 
316 INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
317                 false, false)
318 
319 MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) {
320   // We can define LR because LR already contains the same value.
321   if (Start->getOperand(0).getReg() == ARM::LR)
322     return Start;
323 
324   unsigned CountReg = Start->getOperand(0).getReg();
325   auto IsMoveLR = [&CountReg](MachineInstr *MI) {
326     return MI->getOpcode() == ARM::tMOVr &&
327            MI->getOperand(0).getReg() == ARM::LR &&
328            MI->getOperand(1).getReg() == CountReg &&
329            MI->getOperand(2).getImm() == ARMCC::AL;
330    };
331 
332   MachineBasicBlock *MBB = Start->getParent();
333 
334   // Find an insertion point:
335   // - Is there a (mov lr, Count) before Start? If so, and nothing else writes
336   //   to Count before Start, we can insert at that mov.
337   if (auto *LRDef = RDA->getReachingMIDef(Start, ARM::LR))
338     if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg))
339       return LRDef;
340 
341   // - Is there a (mov lr, Count) after Start? If so, and nothing else writes
342   //   to Count after Start, we can insert at that mov.
343   if (auto *LRDef = RDA->getLocalLiveOutMIDef(MBB, ARM::LR))
344     if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg))
345       return LRDef;
346 
347   // We've found no suitable LR def and Start doesn't use LR directly. Can we
348   // just define LR anyway?
349   if (!RDA->isRegUsedAfter(Start, ARM::LR))
350     return Start;
351 
352   return nullptr;
353 }
354 
355 // Can we safely move 'From' to just before 'To'? To satisfy this, 'From' must
356 // not define a register that is used by any instructions, after and including,
357 // 'To'. These instructions also must not redefine any of Froms operands.
358 template<typename Iterator>
359 static bool IsSafeToMove(MachineInstr *From, MachineInstr *To, ReachingDefAnalysis *RDA) {
360   SmallSet<int, 2> Defs;
361   // First check that From would compute the same value if moved.
362   for (auto &MO : From->operands()) {
363     if (!MO.isReg() || MO.isUndef() || !MO.getReg())
364       continue;
365     if (MO.isDef())
366       Defs.insert(MO.getReg());
367     else if (!RDA->hasSameReachingDef(From, To, MO.getReg()))
368       return false;
369   }
370 
371   // Now walk checking that the rest of the instructions will compute the same
372   // value.
373   for (auto I = ++Iterator(From), E = Iterator(To); I != E; ++I) {
374     for (auto &MO : I->operands())
375       if (MO.isReg() && MO.getReg() && MO.isUse() && Defs.count(MO.getReg()))
376         return false;
377   }
378   return true;
379 }
380 
381 bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
382     ReachingDefAnalysis *RDA, MachineLoopInfo *MLI) {
383   assert(VCTP && "VCTP instruction expected but is not set");
384   // All predication within the loop should be based on vctp. If the block
385   // isn't predicated on entry, check whether the vctp is within the block
386   // and that all other instructions are then predicated on it.
387   for (auto &Block : VPTBlocks) {
388     if (Block.IsPredicatedOn(VCTP))
389       continue;
390     if (!Block.HasNonUniformPredicate() || !isVCTP(Block.getDivergent()->MI)) {
391       LLVM_DEBUG(dbgs() << "ARM Loops: Found unsupported diverging predicate: "
392                  << *Block.getDivergent()->MI);
393       return false;
394     }
395     SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
396     for (auto &PredMI : Insts) {
397       if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI))
398         continue;
399       LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI
400                         << " - which is predicated on:\n";
401                         for (auto *MI : PredMI.Predicates)
402                           dbgs() << "   - " << *MI;
403                  );
404       return false;
405     }
406   }
407 
408   // For tail predication, we need to provide the number of elements, instead
409   // of the iteration count, to the loop start instruction. The number of
410   // elements is provided to the vctp instruction, so we need to check that
411   // we can use this register at InsertPt.
412   Register NumElements = VCTP->getOperand(1).getReg();
413 
414   // If the register is defined within loop, then we can't perform TP.
415   // TODO: Check whether this is just a mov of a register that would be
416   // available.
417   if (RDA->getReachingDef(VCTP, NumElements) >= 0) {
418     LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
419     return false;
420   }
421 
422   // The element count register maybe defined after InsertPt, in which case we
423   // need to try to move either InsertPt or the def so that the [w|d]lstp can
424   // use the value.
425   MachineBasicBlock *InsertBB = InsertPt->getParent();
426   if (!RDA->isReachingDefLiveOut(InsertPt, NumElements)) {
427     if (auto *ElemDef = RDA->getLocalLiveOutMIDef(InsertBB, NumElements)) {
428       if (IsSafeToMove<MachineBasicBlock::reverse_iterator>(ElemDef, InsertPt, RDA)) {
429         ElemDef->removeFromParent();
430         InsertBB->insert(MachineBasicBlock::iterator(InsertPt), ElemDef);
431         LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: "
432                    << *ElemDef);
433       } else if (IsSafeToMove<MachineBasicBlock::iterator>(InsertPt, ElemDef, RDA)) {
434         InsertPt->removeFromParent();
435         InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), InsertPt);
436         LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
437       } else {
438         LLVM_DEBUG(dbgs() << "ARM Loops: Unable to move element count to loop "
439                    << "start instruction.\n");
440         return false;
441       }
442     }
443   }
444 
445   // Especially in the case of while loops, InsertBB may not be the
446   // preheader, so we need to check that the register isn't redefined
447   // before entering the loop.
448   auto CannotProvideElements = [&RDA](MachineBasicBlock *MBB,
449                                       Register NumElements) {
450     // NumElements is redefined in this block.
451     if (RDA->getReachingDef(&MBB->back(), NumElements) >= 0)
452       return true;
453 
454     // Don't continue searching up through multiple predecessors.
455     if (MBB->pred_size() > 1)
456       return true;
457 
458     return false;
459   };
460 
461   // First, find the block that looks like the preheader.
462   MachineBasicBlock *MBB = MLI->findLoopPreheader(ML, true);
463   if (!MBB) {
464     LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find preheader.\n");
465     return false;
466   }
467 
468   // Then search backwards for a def, until we get to InsertBB.
469   while (MBB != InsertBB) {
470     if (CannotProvideElements(MBB, NumElements)) {
471       LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");
472       return false;
473     }
474     MBB = *MBB->pred_begin();
475   }
476 
477   LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication.\n");
478   return true;
479 }
480 
481 void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
482                                     ReachingDefAnalysis *RDA,
483                                     MachineLoopInfo *MLI) {
484   if (Revert)
485     return;
486 
487   if (!End->getOperand(1).isMBB())
488     report_fatal_error("Expected LoopEnd to target basic block");
489 
490   // TODO Maybe there's cases where the target doesn't have to be the header,
491   // but for now be safe and revert.
492   if (End->getOperand(1).getMBB() != ML->getHeader()) {
493     LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n");
494     Revert = true;
495     return;
496   }
497 
498   // The WLS and LE instructions have 12-bits for the label offset. WLS
499   // requires a positive offset, while LE uses negative.
500   if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) ||
501       !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) {
502     LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
503     Revert = true;
504     return;
505   }
506 
507   if (Start->getOpcode() == ARM::t2WhileLoopStart &&
508       (BBUtils->getOffsetOf(Start) >
509        BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) ||
510        !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) {
511     LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
512     Revert = true;
513     return;
514   }
515 
516   InsertPt = Revert ? nullptr : IsSafeToDefineLR(RDA);
517   if (!InsertPt) {
518     LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n");
519     Revert = true;
520     return;
521   } else
522     LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt);
523 
524   if (!IsTailPredicationLegal()) {
525     LLVM_DEBUG(if (!VCTP)
526                  dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
527                dbgs() << "ARM Loops: Tail-predication is not valid.\n");
528     return;
529   }
530 
531   assert(ML->getBlocks().size() == 1 &&
532          "Shouldn't be processing a loop with more than one block");
533   CannotTailPredicate = !ValidateTailPredicate(InsertPt, RDA, MLI);
534   LLVM_DEBUG(if (CannotTailPredicate)
535              dbgs() << "ARM Loops: Couldn't validate tail predicate.\n");
536 }
537 
538 bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
539   if (CannotTailPredicate)
540     return false;
541 
542   // Only support a single vctp.
543   if (isVCTP(MI) && VCTP)
544     return false;
545 
546   // Start a new vpt block when we discover a vpt.
547   if (MI->getOpcode() == ARM::MVE_VPST) {
548     VPTBlocks.emplace_back(MI, CurrentPredicate);
549     CurrentBlock = &VPTBlocks.back();
550     return true;
551   } else if (isVCTP(MI))
552     VCTP = MI;
553   else if (MI->getOpcode() == ARM::MVE_VPSEL ||
554            MI->getOpcode() == ARM::MVE_VPNOT)
555     return false;
556 
557   // TODO: Allow VPSEL and VPNOT, we currently cannot because:
558   // 1) It will use the VPR as a predicate operand, but doesn't have to be
559   //    instead a VPT block, which means we can assert while building up
560   //    the VPT block because we don't find another VPST to being a new
561   //    one.
562   // 2) VPSEL still requires a VPR operand even after tail predicating,
563   //    which means we can't remove it unless there is another
564   //    instruction, such as vcmp, that can provide the VPR def.
565 
566   bool IsUse = false;
567   bool IsDef = false;
568   const MCInstrDesc &MCID = MI->getDesc();
569   for (int i = MI->getNumOperands() - 1; i >= 0; --i) {
570     const MachineOperand &MO = MI->getOperand(i);
571     if (!MO.isReg() || MO.getReg() != ARM::VPR)
572       continue;
573 
574     if (MO.isDef()) {
575       CurrentPredicate.insert(MI);
576       IsDef = true;
577     } else if (ARM::isVpred(MCID.OpInfo[i].OperandType)) {
578       CurrentBlock->addInst(MI, CurrentPredicate);
579       IsUse = true;
580     } else {
581       LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
582       return false;
583     }
584   }
585 
586   // If we find a vpr def that is not already predicated on the vctp, we've
587   // got disjoint predicates that may not be equivalent when we do the
588   // conversion.
589   if (IsDef && !IsUse && VCTP && !isVCTP(MI)) {
590     LLVM_DEBUG(dbgs() << "ARM Loops: Found disjoint vpr def: " << *MI);
591     return false;
592   }
593 
594   uint64_t Flags = MCID.TSFlags;
595   if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE)
596     return true;
597 
598   // If we find an instruction that has been marked as not valid for tail
599   // predication, only allow the instruction if it's contained within a valid
600   // VPT block.
601   if ((Flags & ARMII::ValidForTailPredication) == 0 && !IsUse) {
602     LLVM_DEBUG(dbgs() << "ARM Loops: Can't tail predicate: " << *MI);
603     return false;
604   }
605 
606   return true;
607 }
608 
609 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
610   const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget());
611   if (!ST.hasLOB())
612     return false;
613 
614   MF = &mf;
615   LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
616 
617   MLI = &getAnalysis<MachineLoopInfo>();
618   RDA = &getAnalysis<ReachingDefAnalysis>();
619   MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
620   MRI = &MF->getRegInfo();
621   TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
622   TRI = ST.getRegisterInfo();
623   BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF));
624   BBUtils->computeAllBlockSizes();
625   BBUtils->adjustBBOffsetsAfter(&MF->front());
626 
627   bool Changed = false;
628   for (auto ML : *MLI) {
629     if (!ML->getParentLoop())
630       Changed |= ProcessLoop(ML);
631   }
632   Changed |= RevertNonLoops();
633   return Changed;
634 }
635 
636 bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
637 
638   bool Changed = false;
639 
640   // Process inner loops first.
641   for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
642     Changed |= ProcessLoop(*I);
643 
644   LLVM_DEBUG(dbgs() << "ARM Loops: Processing loop containing:\n";
645              if (auto *Preheader = ML->getLoopPreheader())
646                dbgs() << " - " << Preheader->getName() << "\n";
647              else if (auto *Preheader = MLI->findLoopPreheader(ML))
648                dbgs() << " - " << Preheader->getName() << "\n";
649              for (auto *MBB : ML->getBlocks())
650                dbgs() << " - " << MBB->getName() << "\n";
651             );
652 
653   // Search the given block for a loop start instruction. If one isn't found,
654   // and there's only one predecessor block, search that one too.
655   std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
656     [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
657     for (auto &MI : *MBB) {
658       if (isLoopStart(MI))
659         return &MI;
660     }
661     if (MBB->pred_size() == 1)
662       return SearchForStart(*MBB->pred_begin());
663     return nullptr;
664   };
665 
666   LowOverheadLoop LoLoop(ML);
667   // Search the preheader for the start intrinsic.
668   // FIXME: I don't see why we shouldn't be supporting multiple predecessors
669   // with potentially multiple set.loop.iterations, so we need to enable this.
670   if (auto *Preheader = ML->getLoopPreheader())
671     LoLoop.Start = SearchForStart(Preheader);
672   else if (auto *Preheader = MLI->findLoopPreheader(ML, true))
673     LoLoop.Start = SearchForStart(Preheader);
674   else
675     return false;
676 
677   // Find the low-overhead loop components and decide whether or not to fall
678   // back to a normal loop. Also look for a vctp instructions and decide
679   // whether we can convert that predicate using tail predication.
680   for (auto *MBB : reverse(ML->getBlocks())) {
681     for (auto &MI : *MBB) {
682       if (MI.getOpcode() == ARM::t2LoopDec)
683         LoLoop.Dec = &MI;
684       else if (MI.getOpcode() == ARM::t2LoopEnd)
685         LoLoop.End = &MI;
686       else if (isLoopStart(MI))
687         LoLoop.Start = &MI;
688       else if (MI.getDesc().isCall()) {
689         // TODO: Though the call will require LE to execute again, does this
690         // mean we should revert? Always executing LE hopefully should be
691         // faster than performing a sub,cmp,br or even subs,br.
692         LoLoop.Revert = true;
693         LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
694       } else {
695         // Record VPR defs and build up their corresponding vpt blocks.
696         // Check we know how to tail predicate any mve instructions.
697         LoLoop.AnalyseMVEInst(&MI);
698       }
699 
700       // We need to ensure that LR is not used or defined inbetween LoopDec and
701       // LoopEnd.
702       if (!LoLoop.Dec || LoLoop.End || LoLoop.Revert)
703         continue;
704 
705       // If we find that LR has been written or read between LoopDec and
706       // LoopEnd, expect that the decremented value is being used else where.
707       // Because this value isn't actually going to be produced until the
708       // latch, by LE, we would need to generate a real sub. The value is also
709       // likely to be copied/reloaded for use of LoopEnd - in which in case
710       // we'd need to perform an add because it gets subtracted again by LE!
711       // The other option is to then generate the other form of LE which doesn't
712       // perform the sub.
713       for (auto &MO : MI.operands()) {
714         if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() &&
715             MO.getReg() == ARM::LR) {
716           LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI);
717           LoLoop.Revert = true;
718           break;
719         }
720       }
721     }
722   }
723 
724   LLVM_DEBUG(LoLoop.dump());
725   if (!LoLoop.FoundAllComponents()) {
726     LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");
727     return false;
728   }
729 
730   LoLoop.CheckLegality(BBUtils.get(), RDA, MLI);
731   Expand(LoLoop);
732   return true;
733 }
734 
735 // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
736 // beq that branches to the exit branch.
737 // TODO: We could also try to generate a cbz if the value in LR is also in
738 // another low register.
739 void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
740   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
741   MachineBasicBlock *MBB = MI->getParent();
742   MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
743                                     TII->get(ARM::t2CMPri));
744   MIB.add(MI->getOperand(0));
745   MIB.addImm(0);
746   MIB.addImm(ARMCC::AL);
747   MIB.addReg(ARM::NoRegister);
748 
749   MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
750   unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
751     ARM::tBcc : ARM::t2Bcc;
752 
753   MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
754   MIB.add(MI->getOperand(1));   // branch target
755   MIB.addImm(ARMCC::EQ);        // condition code
756   MIB.addReg(ARM::CPSR);
757   MI->eraseFromParent();
758 }
759 
760 bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI,
761                                         bool SetFlags) const {
762   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
763   MachineBasicBlock *MBB = MI->getParent();
764 
765   // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
766   if (SetFlags &&
767       (RDA->isRegUsedAfter(MI, ARM::CPSR) ||
768        !RDA->hasSameReachingDef(MI, &MBB->back(), ARM::CPSR)))
769       SetFlags = false;
770 
771   MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
772                                     TII->get(ARM::t2SUBri));
773   MIB.addDef(ARM::LR);
774   MIB.add(MI->getOperand(1));
775   MIB.add(MI->getOperand(2));
776   MIB.addImm(ARMCC::AL);
777   MIB.addReg(0);
778 
779   if (SetFlags) {
780     MIB.addReg(ARM::CPSR);
781     MIB->getOperand(5).setIsDef(true);
782   } else
783     MIB.addReg(0);
784 
785   MI->eraseFromParent();
786   return SetFlags;
787 }
788 
789 // Generate a subs, or sub and cmp, and a branch instead of an LE.
790 void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
791   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
792 
793   MachineBasicBlock *MBB = MI->getParent();
794   // Create cmp
795   if (!SkipCmp) {
796     MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
797                                       TII->get(ARM::t2CMPri));
798     MIB.addReg(ARM::LR);
799     MIB.addImm(0);
800     MIB.addImm(ARMCC::AL);
801     MIB.addReg(ARM::NoRegister);
802   }
803 
804   MachineBasicBlock *DestBB = MI->getOperand(1).getMBB();
805   unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ?
806     ARM::tBcc : ARM::t2Bcc;
807 
808   // Create bne
809   MachineInstrBuilder MIB =
810     BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
811   MIB.add(MI->getOperand(1));   // branch target
812   MIB.addImm(ARMCC::NE);        // condition code
813   MIB.addReg(ARM::CPSR);
814   MI->eraseFromParent();
815 }
816 
817 MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
818   MachineInstr *InsertPt = LoLoop.InsertPt;
819   MachineInstr *Start = LoLoop.Start;
820   MachineBasicBlock *MBB = InsertPt->getParent();
821   bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart;
822   unsigned Opc = LoLoop.getStartOpcode();
823   MachineOperand &Count = LoLoop.getCount();
824 
825   MachineInstrBuilder MIB =
826     BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc));
827 
828   MIB.addDef(ARM::LR);
829   MIB.add(Count);
830   if (!IsDo)
831     MIB.add(Start->getOperand(1));
832 
833   // When using tail-predication, try to delete the dead code that was used to
834   // calculate the number of loop iterations.
835   if (LoLoop.IsTailPredicationLegal()) {
836     SmallVector<MachineInstr*, 4> Killed;
837     SmallVector<MachineInstr*, 4> Dead;
838     if (auto *Def = RDA->getReachingMIDef(Start,
839                                           Start->getOperand(0).getReg())) {
840       Killed.push_back(Def);
841 
842       while (!Killed.empty()) {
843         MachineInstr *Def = Killed.back();
844         Killed.pop_back();
845         Dead.push_back(Def);
846         for (auto &MO : Def->operands()) {
847           if (!MO.isReg() || !MO.isKill())
848             continue;
849 
850           MachineInstr *Kill = RDA->getReachingMIDef(Def, MO.getReg());
851           if (Kill && RDA->getNumUses(Kill, MO.getReg()) == 1)
852             Killed.push_back(Kill);
853         }
854       }
855       for (auto *MI : Dead)
856         MI->eraseFromParent();
857     }
858   }
859 
860   // If we're inserting at a mov lr, then remove it as it's redundant.
861   if (InsertPt != Start)
862     InsertPt->eraseFromParent();
863   Start->eraseFromParent();
864   LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
865   return &*MIB;
866 }
867 
868 // Goal is to optimise and clean-up these loops:
869 //
870 //   vector.body:
871 //     renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
872 //     renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3(tied-def 0), 4
873 //     ..
874 //     $lr = MVE_DLSTP_32 renamable $r3
875 //
876 // The SUB is the old update of the loop iteration count expression, which
877 // is no longer needed. This sub is removed when the element count, which is in
878 // r3 in this example, is defined by an instruction in the loop, and it has
879 // no uses.
880 //
881 void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) {
882   Register ElemCount = LoLoop.VCTP->getOperand(1).getReg();
883   MachineInstr *LastInstrInBlock = &LoLoop.VCTP->getParent()->back();
884 
885   LLVM_DEBUG(dbgs() << "ARM Loops: Trying to remove loop update stmt\n");
886 
887   if (LoLoop.ML->getNumBlocks() != 1) {
888     LLVM_DEBUG(dbgs() << "ARM Loops: Single block loop expected\n");
889     return;
890   }
891 
892   LLVM_DEBUG(dbgs() << "ARM Loops: Analyzing elemcount in operand: ";
893              LoLoop.VCTP->getOperand(1).dump());
894 
895   // Find the definition we are interested in removing, if there is one.
896   MachineInstr *Def = RDA->getReachingMIDef(LastInstrInBlock, ElemCount);
897   if (!Def) {
898     LLVM_DEBUG(dbgs() << "ARM Loops: Can't find a def, nothing to do.\n");
899     return;
900   }
901 
902   // Bail if we define CPSR and it is not dead
903   if (!Def->registerDefIsDead(ARM::CPSR, TRI)) {
904     LLVM_DEBUG(dbgs() << "ARM Loops: CPSR is not dead\n");
905     return;
906   }
907 
908   // Bail if elemcount is used in exit blocks, i.e. if it is live-in.
909   if (isRegLiveInExitBlocks(LoLoop.ML, ElemCount)) {
910     LLVM_DEBUG(dbgs() << "ARM Loops: Elemcount is live-out, can't remove stmt\n");
911     return;
912   }
913 
914   // Bail if there are uses after this Def in the block.
915   SmallVector<MachineInstr*, 4> Uses;
916   RDA->getReachingLocalUses(Def, ElemCount, Uses);
917   if (Uses.size()) {
918     LLVM_DEBUG(dbgs() << "ARM Loops: Local uses in block, can't remove stmt\n");
919     return;
920   }
921 
922   Uses.clear();
923   RDA->getAllInstWithUseBefore(Def, ElemCount, Uses);
924 
925   // Remove Def if there are no uses, or if the only use is the VCTP
926   // instruction.
927   if (!Uses.size() || (Uses.size() == 1 && Uses[0] == LoLoop.VCTP)) {
928     LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop update instruction: ";
929                Def->dump());
930     Def->eraseFromParent();
931     return;
932   }
933 
934   LLVM_DEBUG(dbgs() << "ARM Loops: Can't remove loop update, it's used by:\n";
935              for (auto U : Uses) U->dump());
936 }
937 
938 void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
939   auto RemovePredicate = [](MachineInstr *MI) {
940     LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
941     if (int PIdx = llvm::findFirstVPTPredOperandIdx(*MI)) {
942       assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
943              "Expected Then predicate!");
944       MI->getOperand(PIdx).setImm(ARMVCC::None);
945       MI->getOperand(PIdx+1).setReg(0);
946     } else
947       llvm_unreachable("trying to unpredicate a non-predicated instruction");
948   };
949 
950   // There are a few scenarios which we have to fix up:
951   // 1) A VPT block with is only predicated by the vctp and has no internal vpr
952   //    defs.
953   // 2) A VPT block which is only predicated by the vctp but has an internal
954   //    vpr def.
955   // 3) A VPT block which is predicated upon the vctp as well as another vpr
956   //    def.
957   // 4) A VPT block which is not predicated upon a vctp, but contains it and
958   //    all instructions within the block are predicated upon in.
959 
960   for (auto &Block : LoLoop.getVPTBlocks()) {
961     SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
962     if (Block.HasNonUniformPredicate()) {
963       PredicatedMI *Divergent = Block.getDivergent();
964       if (isVCTP(Divergent->MI)) {
965         // The vctp will be removed, so the size of the vpt block needs to be
966         // modified.
967         uint64_t Size = getARMVPTBlockMask(Block.size() - 1);
968         Block.getVPST()->getOperand(0).setImm(Size);
969         LLVM_DEBUG(dbgs() << "ARM Loops: Modified VPT block mask.\n");
970       } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
971         // The VPT block has a non-uniform predicate but it's entry is guarded
972         // only by a vctp, which means we:
973         // - Need to remove the original vpst.
974         // - Then need to unpredicate any following instructions, until
975         //   we come across the divergent vpr def.
976         // - Insert a new vpst to predicate the instruction(s) that following
977         //   the divergent vpr def.
978         // TODO: We could be producing more VPT blocks than necessary and could
979         // fold the newly created one into a proceeding one.
980         for (auto I = ++MachineBasicBlock::iterator(Block.getVPST()),
981              E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I)
982           RemovePredicate(&*I);
983 
984         unsigned Size = 0;
985         auto E = MachineBasicBlock::reverse_iterator(Divergent->MI);
986         auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI);
987         MachineInstr *InsertAt = nullptr;
988         while (I != E) {
989           InsertAt = &*I;
990           ++Size;
991           ++I;
992         }
993         MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt,
994                                           InsertAt->getDebugLoc(),
995                                           TII->get(ARM::MVE_VPST));
996         MIB.addImm(getARMVPTBlockMask(Size));
997         LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
998         LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
999         Block.getVPST()->eraseFromParent();
1000       }
1001     } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
1002       // A vpt block which is only predicated upon vctp and has no internal vpr
1003       // defs:
1004       // - Remove vpst.
1005       // - Unpredicate the remaining instructions.
1006       LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
1007       Block.getVPST()->eraseFromParent();
1008       for (auto &PredMI : Insts)
1009         RemovePredicate(PredMI.MI);
1010     }
1011   }
1012 
1013   LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP);
1014   LoLoop.VCTP->eraseFromParent();
1015 }
1016 
1017 void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
1018 
1019   // Combine the LoopDec and LoopEnd instructions into LE(TP).
1020   auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
1021     MachineInstr *End = LoLoop.End;
1022     MachineBasicBlock *MBB = End->getParent();
1023     unsigned Opc = LoLoop.IsTailPredicationLegal() ?
1024       ARM::MVE_LETP : ARM::t2LEUpdate;
1025     MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
1026                                       TII->get(Opc));
1027     MIB.addDef(ARM::LR);
1028     MIB.add(End->getOperand(0));
1029     MIB.add(End->getOperand(1));
1030     LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
1031 
1032     LoLoop.End->eraseFromParent();
1033     LoLoop.Dec->eraseFromParent();
1034     return &*MIB;
1035   };
1036 
1037   // TODO: We should be able to automatically remove these branches before we
1038   // get here - probably by teaching analyzeBranch about the pseudo
1039   // instructions.
1040   // If there is an unconditional branch, after I, that just branches to the
1041   // next block, remove it.
1042   auto RemoveDeadBranch = [](MachineInstr *I) {
1043     MachineBasicBlock *BB = I->getParent();
1044     MachineInstr *Terminator = &BB->instr_back();
1045     if (Terminator->isUnconditionalBranch() && I != Terminator) {
1046       MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
1047       if (BB->isLayoutSuccessor(Succ)) {
1048         LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
1049         Terminator->eraseFromParent();
1050       }
1051     }
1052   };
1053 
1054   if (LoLoop.Revert) {
1055     if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart)
1056       RevertWhile(LoLoop.Start);
1057     else
1058       LoLoop.Start->eraseFromParent();
1059     bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec, true);
1060     RevertLoopEnd(LoLoop.End, FlagsAlreadySet);
1061   } else {
1062     LoLoop.Start = ExpandLoopStart(LoLoop);
1063     RemoveDeadBranch(LoLoop.Start);
1064     LoLoop.End = ExpandLoopEnd(LoLoop);
1065     RemoveDeadBranch(LoLoop.End);
1066     if (LoLoop.IsTailPredicationLegal()) {
1067       RemoveLoopUpdate(LoLoop);
1068       ConvertVPTBlocks(LoLoop);
1069     }
1070   }
1071 
1072   PostOrderLoopTraversal DFS(*LoLoop.ML, *MLI);
1073   DFS.ProcessLoop();
1074   const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
1075   for (auto *MBB : PostOrder) {
1076     recomputeLiveIns(*MBB);
1077     // FIXME: For some reason, the live-in print order is non-deterministic for
1078     // our tests and I can't out why... So just sort them.
1079     MBB->sortUniqueLiveIns();
1080   }
1081 
1082   for (auto *MBB : reverse(PostOrder))
1083     recomputeLivenessFlags(*MBB);
1084 }
1085 
1086 bool ARMLowOverheadLoops::RevertNonLoops() {
1087   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
1088   bool Changed = false;
1089 
1090   for (auto &MBB : *MF) {
1091     SmallVector<MachineInstr*, 4> Starts;
1092     SmallVector<MachineInstr*, 4> Decs;
1093     SmallVector<MachineInstr*, 4> Ends;
1094 
1095     for (auto &I : MBB) {
1096       if (isLoopStart(I))
1097         Starts.push_back(&I);
1098       else if (I.getOpcode() == ARM::t2LoopDec)
1099         Decs.push_back(&I);
1100       else if (I.getOpcode() == ARM::t2LoopEnd)
1101         Ends.push_back(&I);
1102     }
1103 
1104     if (Starts.empty() && Decs.empty() && Ends.empty())
1105       continue;
1106 
1107     Changed = true;
1108 
1109     for (auto *Start : Starts) {
1110       if (Start->getOpcode() == ARM::t2WhileLoopStart)
1111         RevertWhile(Start);
1112       else
1113         Start->eraseFromParent();
1114     }
1115     for (auto *Dec : Decs)
1116       RevertLoopDec(Dec);
1117 
1118     for (auto *End : Ends)
1119       RevertLoopEnd(End);
1120   }
1121   return Changed;
1122 }
1123 
1124 FunctionPass *llvm::createARMLowOverheadLoopsPass() {
1125   return new ARMLowOverheadLoops();
1126 }
1127