1 //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// Finalize v8.1-m low-overhead loops by converting the associated pseudo 10 /// instructions into machine operations. 11 /// The expectation is that the loop contains three pseudo instructions: 12 /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop 13 /// form should be in the preheader, whereas the while form should be in the 14 /// preheaders only predecessor. TODO: Could DoLoopStart get moved into the 15 /// pre-preheader? 16 /// - t2LoopDec - placed within in the loop body. 17 /// - t2LoopEnd - the loop latch terminator. 18 /// 19 //===----------------------------------------------------------------------===// 20 21 #include "ARM.h" 22 #include "ARMBaseInstrInfo.h" 23 #include "ARMBaseRegisterInfo.h" 24 #include "ARMBasicBlockInfo.h" 25 #include "ARMSubtarget.h" 26 #include "llvm/CodeGen/MachineFunctionPass.h" 27 #include "llvm/CodeGen/MachineLoopInfo.h" 28 #include "llvm/CodeGen/MachineRegisterInfo.h" 29 30 using namespace llvm; 31 32 #define DEBUG_TYPE "arm-low-overhead-loops" 33 #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass" 34 35 namespace { 36 37 class ARMLowOverheadLoops : public MachineFunctionPass { 38 const ARMBaseInstrInfo *TII = nullptr; 39 MachineRegisterInfo *MRI = nullptr; 40 std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr; 41 42 public: 43 static char ID; 44 45 ARMLowOverheadLoops() : MachineFunctionPass(ID) { } 46 47 void getAnalysisUsage(AnalysisUsage &AU) const override { 48 AU.setPreservesCFG(); 49 AU.addRequired<MachineLoopInfo>(); 50 MachineFunctionPass::getAnalysisUsage(AU); 51 } 52 53 bool runOnMachineFunction(MachineFunction &MF) override; 54 55 bool ProcessLoop(MachineLoop *ML); 56 57 void Expand(MachineLoop *ML, MachineInstr *Start, 58 MachineInstr *Dec, MachineInstr *End, bool Revert); 59 60 MachineFunctionProperties getRequiredProperties() const override { 61 return MachineFunctionProperties().set( 62 MachineFunctionProperties::Property::NoVRegs); 63 } 64 65 StringRef getPassName() const override { 66 return ARM_LOW_OVERHEAD_LOOPS_NAME; 67 } 68 }; 69 } 70 71 char ARMLowOverheadLoops::ID = 0; 72 73 INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, 74 false, false) 75 76 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &MF) { 77 if (!static_cast<const ARMSubtarget&>(MF.getSubtarget()).hasLOB()) 78 return false; 79 80 LLVM_DEBUG(dbgs() << "ARM Loops on " << MF.getName() << " ------------- \n"); 81 82 auto &MLI = getAnalysis<MachineLoopInfo>(); 83 MRI = &MF.getRegInfo(); 84 TII = static_cast<const ARMBaseInstrInfo*>( 85 MF.getSubtarget().getInstrInfo()); 86 BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(MF)); 87 BBUtils->computeAllBlockSizes(); 88 89 bool Changed = false; 90 for (auto ML : MLI) { 91 if (!ML->getParentLoop()) 92 Changed |= ProcessLoop(ML); 93 } 94 return Changed; 95 } 96 97 bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { 98 99 bool Changed = false; 100 101 // Process inner loops first. 102 for (auto I = ML->begin(), E = ML->end(); I != E; ++I) 103 Changed |= ProcessLoop(*I); 104 105 LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML); 106 107 auto IsLoopStart = [](MachineInstr &MI) { 108 return MI.getOpcode() == ARM::t2DoLoopStart || 109 MI.getOpcode() == ARM::t2WhileLoopStart; 110 }; 111 112 // Search the given block for a loop start instruction. If one isn't found, 113 // and there's only one predecessor block, search that one too. 114 std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart = 115 [&IsLoopStart, &SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { 116 for (auto &MI : *MBB) { 117 if (IsLoopStart(MI)) 118 return &MI; 119 } 120 if (MBB->pred_size() == 1) 121 return SearchForStart(*MBB->pred_begin()); 122 return nullptr; 123 }; 124 125 MachineInstr *Start = nullptr; 126 MachineInstr *Dec = nullptr; 127 MachineInstr *End = nullptr; 128 bool Revert = false; 129 130 // Search the preheader for the start intrinsic, or look through the 131 // predecessors of the header to find exactly one set.iterations intrinsic. 132 // FIXME: I don't see why we shouldn't be supporting multiple predecessors 133 // with potentially multiple set.loop.iterations, so we need to enable this. 134 if (auto *Preheader = ML->getLoopPreheader()) { 135 Start = SearchForStart(Preheader); 136 } else { 137 LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n" 138 << " - Performing manual predecessor search.\n"); 139 MachineBasicBlock *Pred = nullptr; 140 for (auto *MBB : ML->getHeader()->predecessors()) { 141 if (!ML->contains(MBB)) { 142 if (Pred) { 143 LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n"); 144 Start = nullptr; 145 break; 146 } 147 Pred = MBB; 148 Start = SearchForStart(MBB); 149 } 150 } 151 } 152 153 // Find the low-overhead loop components and decide whether or not to fall 154 // back to a normal loop. 155 for (auto *MBB : reverse(ML->getBlocks())) { 156 for (auto &MI : *MBB) { 157 if (MI.getOpcode() == ARM::t2LoopDec) 158 Dec = &MI; 159 else if (MI.getOpcode() == ARM::t2LoopEnd) 160 End = &MI; 161 else if (MI.getDesc().isCall()) 162 // TODO: Though the call will require LE to execute again, does this 163 // mean we should revert? Always executing LE hopefully should be 164 // faster than performing a sub,cmp,br or even subs,br. 165 Revert = true; 166 167 if (!Dec) 168 continue; 169 170 // If we find that we load/store LR between LoopDec and LoopEnd, expect 171 // that the decremented value has been spilled to the stack. Because 172 // this value isn't actually going to be produced until the latch, by LE, 173 // we would need to generate a real sub. The value is also likely to be 174 // reloaded for use of LoopEnd - in which in case we'd need to perform 175 // an add because it gets negated again by LE! The other option is to 176 // then generate the other form of LE which doesn't perform the sub. 177 if (MI.mayLoad() || MI.mayStore()) 178 Revert = 179 MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == ARM::LR; 180 } 181 182 if (Dec && End && Revert) 183 break; 184 } 185 186 if (!Start && !Dec && !End) { 187 LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n"); 188 return Changed; 189 } if (!(Start && Dec && End)) { 190 report_fatal_error("Failed to find all loop components"); 191 } 192 193 if (!End->getOperand(1).isMBB() || 194 End->getOperand(1).getMBB() != ML->getHeader()) 195 report_fatal_error("Expected LoopEnd to target Loop Header"); 196 197 // The LE instructions has 12-bits for the label offset. 198 if (!BBUtils->isBBInRange(End, ML->getHeader(), 4096)) { 199 LLVM_DEBUG(dbgs() << "ARM Loops: Too large for a low-overhead loop!\n"); 200 Revert = true; 201 } 202 203 LLVM_DEBUG(dbgs() << "ARM Loops:\n - Found Loop Start: " << *Start 204 << " - Found Loop Dec: " << *Dec 205 << " - Found Loop End: " << *End); 206 207 Expand(ML, Start, Dec, End, Revert); 208 return true; 209 } 210 211 void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start, 212 MachineInstr *Dec, MachineInstr *End, 213 bool Revert) { 214 215 auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start) { 216 // The trip count should already been held in LR since the instructions 217 // within the loop can only read and write to LR. So, there should be a 218 // mov to setup the count. WLS/DLS perform this move, so find the original 219 // and delete it - inserting WLS/DLS in its place. 220 MachineBasicBlock *MBB = Start->getParent(); 221 MachineInstr *InsertPt = Start; 222 for (auto &I : MRI->def_instructions(ARM::LR)) { 223 if (I.getParent() != MBB) 224 continue; 225 226 // Always execute. 227 if (!I.getOperand(2).isImm() || I.getOperand(2).getImm() != ARMCC::AL) 228 continue; 229 230 // Only handle move reg, if the trip count it will need moving into a reg 231 // before the setup instruction anyway. 232 if (!I.getDesc().isMoveReg() || 233 !I.getOperand(1).isIdenticalTo(Start->getOperand(0))) 234 continue; 235 InsertPt = &I; 236 break; 237 } 238 239 unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ? 240 ARM::t2DLS : ARM::t2WLS; 241 MachineInstrBuilder MIB = 242 BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc)); 243 244 MIB.addDef(ARM::LR); 245 MIB.add(Start->getOperand(0)); 246 if (Opc == ARM::t2WLS) 247 MIB.add(Start->getOperand(1)); 248 249 if (InsertPt != Start) 250 InsertPt->eraseFromParent(); 251 Start->eraseFromParent(); 252 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); 253 return &*MIB; 254 }; 255 256 // Combine the LoopDec and LoopEnd instructions into LE(TP). 257 auto ExpandLoopEnd = [this](MachineLoop *ML, MachineInstr *Dec, 258 MachineInstr *End) { 259 MachineBasicBlock *MBB = End->getParent(); 260 MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(), 261 TII->get(ARM::t2LEUpdate)); 262 MIB.addDef(ARM::LR); 263 MIB.add(End->getOperand(0)); 264 MIB.add(End->getOperand(1)); 265 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); 266 267 End->eraseFromParent(); 268 Dec->eraseFromParent(); 269 return &*MIB; 270 }; 271 272 // Generate a subs, or sub and cmp, and a branch instead of an LE. 273 // TODO: Check flags so that we can possibly generate a subs. 274 // FIXME: Need to check that we're not trashing the CPSR when generating 275 // the cmp. 276 auto ExpandBranch = [this](MachineInstr *Dec, MachineInstr *End) { 277 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub, cmp, br.\n"); 278 // Create sub 279 MachineBasicBlock *MBB = Dec->getParent(); 280 MachineInstrBuilder MIB = BuildMI(*MBB, Dec, Dec->getDebugLoc(), 281 TII->get(ARM::t2SUBri)); 282 MIB.addDef(ARM::LR); 283 MIB.add(Dec->getOperand(1)); 284 MIB.add(Dec->getOperand(2)); 285 MIB.addImm(ARMCC::AL); 286 MIB.addReg(0); 287 MIB.addReg(0); 288 289 // Create cmp 290 MBB = End->getParent(); 291 MIB = BuildMI(*MBB, End, End->getDebugLoc(), TII->get(ARM::t2CMPri)); 292 MIB.addReg(ARM::LR); 293 MIB.addImm(0); 294 MIB.addImm(ARMCC::AL); 295 MIB.addReg(ARM::CPSR); 296 297 // Create bne 298 MIB = BuildMI(*MBB, End, End->getDebugLoc(), TII->get(ARM::t2Bcc)); 299 MIB.add(End->getOperand(1)); // branch target 300 MIB.addImm(ARMCC::NE); // condition code 301 MIB.addReg(ARM::CPSR); 302 End->eraseFromParent(); 303 Dec->eraseFromParent(); 304 }; 305 306 // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a 307 // beq that branches to the exit branch. 308 // FIXME: Need to check that we're not trashing the CPSR when generating the 309 // cmp. We could also try to generate a cbz if the value in LR is also in 310 // another low register. 311 auto ExpandStart = [this](MachineInstr *MI) { 312 MachineBasicBlock *MBB = MI->getParent(); 313 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 314 TII->get(ARM::t2CMPri)); 315 MIB.addReg(ARM::LR); 316 MIB.addImm(0); 317 MIB.addImm(ARMCC::AL); 318 MIB.addReg(ARM::CPSR); 319 320 MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc)); 321 MIB.add(MI->getOperand(1)); // branch target 322 MIB.addImm(ARMCC::EQ); // condition code 323 MIB.addReg(ARM::CPSR); 324 }; 325 326 // TODO: We should be able to automatically remove these branches before we 327 // get here - probably by teaching analyzeBranch about the pseudo 328 // instructions. 329 // If there is an unconditional branch, after I, that just branches to the 330 // next block, remove it. 331 auto RemoveDeadBranch = [](MachineInstr *I) { 332 MachineBasicBlock *BB = I->getParent(); 333 MachineInstr *Terminator = &BB->instr_back(); 334 if (Terminator->isUnconditionalBranch() && I != Terminator) { 335 MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB(); 336 if (BB->isLayoutSuccessor(Succ)) { 337 LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator); 338 Terminator->eraseFromParent(); 339 } 340 } 341 }; 342 343 if (Revert) { 344 if (Start->getOpcode() == ARM::t2WhileLoopStart) 345 ExpandStart(Start); 346 ExpandBranch(Dec, End); 347 Start->eraseFromParent(); 348 } else { 349 Start = ExpandLoopStart(ML, Start); 350 RemoveDeadBranch(Start); 351 End = ExpandLoopEnd(ML, Dec, End); 352 RemoveDeadBranch(End); 353 } 354 } 355 356 FunctionPass *llvm::createARMLowOverheadLoopsPass() { 357 return new ARMLowOverheadLoops(); 358 } 359