1 //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// Finalize v8.1-m low-overhead loops by converting the associated pseudo
10 /// instructions into machine operations.
11 /// The expectation is that the loop contains three pseudo instructions:
12 /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop
13 ///   form should be in the preheader, whereas the while form should be in the
14 ///   preheaders only predecessor. TODO: Could DoLoopStart get moved into the
15 ///   pre-preheader?
16 /// - t2LoopDec - placed within in the loop body.
17 /// - t2LoopEnd - the loop latch terminator.
18 ///
19 //===----------------------------------------------------------------------===//
20 
21 #include "ARM.h"
22 #include "ARMBaseInstrInfo.h"
23 #include "ARMBaseRegisterInfo.h"
24 #include "ARMBasicBlockInfo.h"
25 #include "ARMSubtarget.h"
26 #include "llvm/CodeGen/MachineFunctionPass.h"
27 #include "llvm/CodeGen/MachineLoopInfo.h"
28 #include "llvm/CodeGen/MachineRegisterInfo.h"
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "arm-low-overhead-loops"
33 #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
34 
35 namespace {
36 
37   class ARMLowOverheadLoops : public MachineFunctionPass {
38     const ARMBaseInstrInfo    *TII = nullptr;
39     MachineRegisterInfo       *MRI = nullptr;
40     std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
41 
42   public:
43     static char ID;
44 
45     ARMLowOverheadLoops() : MachineFunctionPass(ID) { }
46 
47     void getAnalysisUsage(AnalysisUsage &AU) const override {
48       AU.setPreservesCFG();
49       AU.addRequired<MachineLoopInfo>();
50       MachineFunctionPass::getAnalysisUsage(AU);
51     }
52 
53     bool runOnMachineFunction(MachineFunction &MF) override;
54 
55     bool ProcessLoop(MachineLoop *ML);
56 
57     void RevertWhile(MachineInstr *MI) const;
58 
59     void RevertLoopDec(MachineInstr *MI) const;
60 
61     void RevertLoopEnd(MachineInstr *MI) const;
62 
63     void Expand(MachineLoop *ML, MachineInstr *Start,
64                 MachineInstr *Dec, MachineInstr *End, bool Revert);
65 
66     MachineFunctionProperties getRequiredProperties() const override {
67       return MachineFunctionProperties().set(
68           MachineFunctionProperties::Property::NoVRegs);
69     }
70 
71     StringRef getPassName() const override {
72       return ARM_LOW_OVERHEAD_LOOPS_NAME;
73     }
74   };
75 }
76 
77 char ARMLowOverheadLoops::ID = 0;
78 
79 INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
80                 false, false)
81 
82 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &MF) {
83   if (!static_cast<const ARMSubtarget&>(MF.getSubtarget()).hasLOB())
84     return false;
85 
86   LLVM_DEBUG(dbgs() << "ARM Loops on " << MF.getName() << " ------------- \n");
87 
88   auto &MLI = getAnalysis<MachineLoopInfo>();
89   MRI = &MF.getRegInfo();
90   TII = static_cast<const ARMBaseInstrInfo*>(
91     MF.getSubtarget().getInstrInfo());
92   BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(MF));
93   BBUtils->computeAllBlockSizes();
94 
95   bool Changed = false;
96   for (auto ML : MLI) {
97     if (!ML->getParentLoop())
98       Changed |= ProcessLoop(ML);
99   }
100   return Changed;
101 }
102 
103 bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
104 
105   bool Changed = false;
106 
107   // Process inner loops first.
108   for (auto I = ML->begin(), E = ML->end(); I != E; ++I)
109     Changed |= ProcessLoop(*I);
110 
111   LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML);
112 
113   auto IsLoopStart = [](MachineInstr &MI) {
114     return MI.getOpcode() == ARM::t2DoLoopStart ||
115            MI.getOpcode() == ARM::t2WhileLoopStart;
116   };
117 
118   // Search the given block for a loop start instruction. If one isn't found,
119   // and there's only one predecessor block, search that one too.
120   std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
121     [&IsLoopStart, &SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
122     for (auto &MI : *MBB) {
123       if (IsLoopStart(MI))
124         return &MI;
125     }
126     if (MBB->pred_size() == 1)
127       return SearchForStart(*MBB->pred_begin());
128     return nullptr;
129   };
130 
131   MachineInstr *Start = nullptr;
132   MachineInstr *Dec = nullptr;
133   MachineInstr *End = nullptr;
134   bool Revert = false;
135 
136   // Search the preheader for the start intrinsic, or look through the
137   // predecessors of the header to find exactly one set.iterations intrinsic.
138   // FIXME: I don't see why we shouldn't be supporting multiple predecessors
139   // with potentially multiple set.loop.iterations, so we need to enable this.
140   if (auto *Preheader = ML->getLoopPreheader()) {
141     Start = SearchForStart(Preheader);
142   } else {
143     LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n"
144                << " - Performing manual predecessor search.\n");
145     MachineBasicBlock *Pred = nullptr;
146     for (auto *MBB : ML->getHeader()->predecessors()) {
147       if (!ML->contains(MBB)) {
148         if (Pred) {
149           LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n");
150           Start = nullptr;
151           break;
152         }
153         Pred = MBB;
154         Start = SearchForStart(MBB);
155       }
156     }
157   }
158 
159   // Find the low-overhead loop components and decide whether or not to fall
160   // back to a normal loop.
161   for (auto *MBB : reverse(ML->getBlocks())) {
162     for (auto &MI : *MBB) {
163       if (MI.getOpcode() == ARM::t2LoopDec)
164         Dec = &MI;
165       else if (MI.getOpcode() == ARM::t2LoopEnd)
166         End = &MI;
167       else if (MI.getDesc().isCall())
168         // TODO: Though the call will require LE to execute again, does this
169         // mean we should revert? Always executing LE hopefully should be
170         // faster than performing a sub,cmp,br or even subs,br.
171         Revert = true;
172 
173       if (!Dec)
174         continue;
175 
176       // If we find that we load/store LR between LoopDec and LoopEnd, expect
177       // that the decremented value has been spilled to the stack. Because
178       // this value isn't actually going to be produced until the latch, by LE,
179       // we would need to generate a real sub. The value is also likely to be
180       // reloaded for use of LoopEnd - in which in case we'd need to perform
181       // an add because it gets negated again by LE! The other option is to
182       // then generate the other form of LE which doesn't perform the sub.
183       if (MI.mayLoad() || MI.mayStore())
184         Revert =
185           MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == ARM::LR;
186     }
187 
188     if (Dec && End && Revert)
189       break;
190   }
191 
192   if (!Start && !Dec && !End) {
193     LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n");
194     return Changed;
195   } if (!(Start && Dec && End)) {
196     report_fatal_error("Failed to find all loop components");
197   }
198 
199   if (!End->getOperand(1).isMBB() ||
200       End->getOperand(1).getMBB() != ML->getHeader())
201     report_fatal_error("Expected LoopEnd to target Loop Header");
202 
203   // The LE instructions has 12-bits for the label offset.
204   if (!BBUtils->isBBInRange(End, ML->getHeader(), 4096)) {
205     LLVM_DEBUG(dbgs() << "ARM Loops: Too large for a low-overhead loop!\n");
206     Revert = true;
207   }
208 
209   LLVM_DEBUG(dbgs() << "ARM Loops:\n - Found Loop Start: " << *Start
210                     << " - Found Loop Dec: " << *Dec
211                     << " - Found Loop End: " << *End);
212 
213   Expand(ML, Start, Dec, End, Revert);
214   return true;
215 }
216 
217 // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
218 // beq that branches to the exit branch.
219 // FIXME: Need to check that we're not trashing the CPSR when generating the
220 // cmp. We could also try to generate a cbz if the value in LR is also in
221 // another low register.
222 void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
223   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
224   MachineBasicBlock *MBB = MI->getParent();
225   MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
226                                     TII->get(ARM::t2CMPri));
227   MIB.addReg(ARM::LR);
228   MIB.addImm(0);
229   MIB.addImm(ARMCC::AL);
230   MIB.addReg(ARM::CPSR);
231 
232   // TODO: Try to use tBcc instead
233   MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc));
234   MIB.add(MI->getOperand(1));   // branch target
235   MIB.addImm(ARMCC::EQ);        // condition code
236   MIB.addReg(ARM::CPSR);
237   MI->eraseFromParent();
238 }
239 
240 // TODO: Check flags so that we can possibly generate a tSubs or tSub.
241 void ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
242   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
243   MachineBasicBlock *MBB = MI->getParent();
244   MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
245                                     TII->get(ARM::t2SUBri));
246   MIB.addDef(ARM::LR);
247   MIB.add(MI->getOperand(1));
248   MIB.add(MI->getOperand(2));
249   MIB.addImm(ARMCC::AL);
250   MIB.addReg(0);
251   MIB.addReg(0);
252   MI->eraseFromParent();
253 }
254 
255 // Generate a subs, or sub and cmp, and a branch instead of an LE.
256 // FIXME: Need to check that we're not trashing the CPSR when generating
257 // the cmp.
258 void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI) const {
259   LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
260 
261   // Create cmp
262   MachineBasicBlock *MBB = MI->getParent();
263   MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
264                                     TII->get(ARM::t2CMPri));
265   MIB.addReg(ARM::LR);
266   MIB.addImm(0);
267   MIB.addImm(ARMCC::AL);
268   MIB.addReg(ARM::CPSR);
269 
270   // TODO Try to use tBcc instead.
271   // Create bne
272   MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc));
273   MIB.add(MI->getOperand(1));   // branch target
274   MIB.addImm(ARMCC::NE);        // condition code
275   MIB.addReg(ARM::CPSR);
276   MI->eraseFromParent();
277 }
278 
279 void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start,
280                                  MachineInstr *Dec, MachineInstr *End,
281                                  bool Revert) {
282 
283   auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start) {
284     // The trip count should already been held in LR since the instructions
285     // within the loop can only read and write to LR. So, there should be a
286     // mov to setup the count. WLS/DLS perform this move, so find the original
287     // and delete it - inserting WLS/DLS in its place.
288     MachineBasicBlock *MBB = Start->getParent();
289     MachineInstr *InsertPt = Start;
290     for (auto &I : MRI->def_instructions(ARM::LR)) {
291       if (I.getParent() != MBB)
292         continue;
293 
294       // Always execute.
295       if (!I.getOperand(2).isImm() || I.getOperand(2).getImm() != ARMCC::AL)
296         continue;
297 
298       // Only handle move reg, if the trip count it will need moving into a reg
299       // before the setup instruction anyway.
300       if (!I.getDesc().isMoveReg() ||
301           !I.getOperand(1).isIdenticalTo(Start->getOperand(0)))
302         continue;
303       InsertPt = &I;
304       break;
305     }
306 
307     unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ?
308       ARM::t2DLS : ARM::t2WLS;
309     MachineInstrBuilder MIB =
310       BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc));
311 
312     MIB.addDef(ARM::LR);
313     MIB.add(Start->getOperand(0));
314     if (Opc == ARM::t2WLS)
315       MIB.add(Start->getOperand(1));
316 
317     if (InsertPt != Start)
318       InsertPt->eraseFromParent();
319     Start->eraseFromParent();
320     LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
321     return &*MIB;
322   };
323 
324   // Combine the LoopDec and LoopEnd instructions into LE(TP).
325   auto ExpandLoopEnd = [this](MachineLoop *ML, MachineInstr *Dec,
326                               MachineInstr *End) {
327     MachineBasicBlock *MBB = End->getParent();
328     MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
329                                       TII->get(ARM::t2LEUpdate));
330     MIB.addDef(ARM::LR);
331     MIB.add(End->getOperand(0));
332     MIB.add(End->getOperand(1));
333     LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
334 
335     End->eraseFromParent();
336     Dec->eraseFromParent();
337     return &*MIB;
338   };
339 
340   // TODO: We should be able to automatically remove these branches before we
341   // get here - probably by teaching analyzeBranch about the pseudo
342   // instructions.
343   // If there is an unconditional branch, after I, that just branches to the
344   // next block, remove it.
345   auto RemoveDeadBranch = [](MachineInstr *I) {
346     MachineBasicBlock *BB = I->getParent();
347     MachineInstr *Terminator = &BB->instr_back();
348     if (Terminator->isUnconditionalBranch() && I != Terminator) {
349       MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
350       if (BB->isLayoutSuccessor(Succ)) {
351         LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
352         Terminator->eraseFromParent();
353       }
354     }
355   };
356 
357   if (Revert) {
358     if (Start->getOpcode() == ARM::t2WhileLoopStart)
359       RevertWhile(Start);
360     else
361       Start->eraseFromParent();
362     RevertLoopDec(Dec);
363     RevertLoopEnd(End);
364   } else {
365     Start = ExpandLoopStart(ML, Start);
366     RemoveDeadBranch(Start);
367     End = ExpandLoopEnd(ML, Dec, End);
368     RemoveDeadBranch(End);
369   }
370 }
371 
372 FunctionPass *llvm::createARMLowOverheadLoopsPass() {
373   return new ARMLowOverheadLoops();
374 }
375