1 //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// Finalize v8.1-m low-overhead loops by converting the associated pseudo 10 /// instructions into machine operations. 11 /// The expectation is that the loop contains three pseudo instructions: 12 /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop 13 /// form should be in the preheader, whereas the while form should be in the 14 /// preheaders only predecessor. 15 /// - t2LoopDec - placed within in the loop body. 16 /// - t2LoopEnd - the loop latch terminator. 17 /// 18 /// In addition to this, we also look for the presence of the VCTP instruction, 19 /// which determines whether we can generated the tail-predicated low-overhead 20 /// loop form. 21 /// 22 /// Assumptions and Dependencies: 23 /// Low-overhead loops are constructed and executed using a setup instruction: 24 /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP. 25 /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range 26 /// but fixed polarity: WLS can only branch forwards and LE can only branch 27 /// backwards. These restrictions mean that this pass is dependent upon block 28 /// layout and block sizes, which is why it's the last pass to run. The same is 29 /// true for ConstantIslands, but this pass does not increase the size of the 30 /// basic blocks, nor does it change the CFG. Instructions are mainly removed 31 /// during the transform and pseudo instructions are replaced by real ones. In 32 /// some cases, when we have to revert to a 'normal' loop, we have to introduce 33 /// multiple instructions for a single pseudo (see RevertWhile and 34 /// RevertLoopEnd). To handle this situation, t2WhileLoopStart and t2LoopEnd 35 /// are defined to be as large as this maximum sequence of replacement 36 /// instructions. 37 /// 38 //===----------------------------------------------------------------------===// 39 40 #include "ARM.h" 41 #include "ARMBaseInstrInfo.h" 42 #include "ARMBaseRegisterInfo.h" 43 #include "ARMBasicBlockInfo.h" 44 #include "ARMSubtarget.h" 45 #include "Thumb2InstrInfo.h" 46 #include "llvm/ADT/SetOperations.h" 47 #include "llvm/ADT/SmallSet.h" 48 #include "llvm/CodeGen/MachineFunctionPass.h" 49 #include "llvm/CodeGen/MachineLoopInfo.h" 50 #include "llvm/CodeGen/MachineLoopUtils.h" 51 #include "llvm/CodeGen/MachineRegisterInfo.h" 52 #include "llvm/CodeGen/Passes.h" 53 #include "llvm/CodeGen/ReachingDefAnalysis.h" 54 #include "llvm/MC/MCInstrDesc.h" 55 56 using namespace llvm; 57 58 #define DEBUG_TYPE "arm-low-overhead-loops" 59 #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass" 60 61 namespace { 62 63 class PostOrderLoopTraversal { 64 MachineLoop &ML; 65 MachineLoopInfo &MLI; 66 SmallPtrSet<MachineBasicBlock*, 4> Visited; 67 SmallVector<MachineBasicBlock*, 4> Order; 68 69 public: 70 PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI) 71 : ML(ML), MLI(MLI) { } 72 73 const SmallVectorImpl<MachineBasicBlock*> &getOrder() const { 74 return Order; 75 } 76 77 // Visit all the blocks within the loop, as well as exit blocks and any 78 // blocks properly dominating the header. 79 void ProcessLoop() { 80 std::function<void(MachineBasicBlock*)> Search = [this, &Search] 81 (MachineBasicBlock *MBB) -> void { 82 if (Visited.count(MBB)) 83 return; 84 85 Visited.insert(MBB); 86 for (auto *Succ : MBB->successors()) { 87 if (!ML.contains(Succ)) 88 continue; 89 Search(Succ); 90 } 91 Order.push_back(MBB); 92 }; 93 94 // Insert exit blocks. 95 SmallVector<MachineBasicBlock*, 2> ExitBlocks; 96 ML.getExitBlocks(ExitBlocks); 97 for (auto *MBB : ExitBlocks) 98 Order.push_back(MBB); 99 100 // Then add the loop body. 101 Search(ML.getHeader()); 102 103 // Then try the preheader and its predecessors. 104 std::function<void(MachineBasicBlock*)> GetPredecessor = 105 [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void { 106 Order.push_back(MBB); 107 if (MBB->pred_size() == 1) 108 GetPredecessor(*MBB->pred_begin()); 109 }; 110 111 if (auto *Preheader = ML.getLoopPreheader()) 112 GetPredecessor(Preheader); 113 else if (auto *Preheader = MLI.findLoopPreheader(&ML, true)) 114 GetPredecessor(Preheader); 115 } 116 }; 117 118 struct PredicatedMI { 119 MachineInstr *MI = nullptr; 120 SetVector<MachineInstr*> Predicates; 121 122 public: 123 PredicatedMI(MachineInstr *I, SetVector<MachineInstr*> &Preds) : 124 MI(I) { 125 Predicates.insert(Preds.begin(), Preds.end()); 126 } 127 }; 128 129 // Represent a VPT block, a list of instructions that begins with a VPST and 130 // has a maximum of four proceeding instructions. All instructions within the 131 // block are predicated upon the vpr and we allow instructions to define the 132 // vpr within in the block too. 133 class VPTBlock { 134 std::unique_ptr<PredicatedMI> VPST; 135 PredicatedMI *Divergent = nullptr; 136 SmallVector<PredicatedMI, 4> Insts; 137 138 public: 139 VPTBlock(MachineInstr *MI, SetVector<MachineInstr*> &Preds) { 140 VPST = std::make_unique<PredicatedMI>(MI, Preds); 141 } 142 143 void addInst(MachineInstr *MI, SetVector<MachineInstr*> &Preds) { 144 LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI); 145 if (!Divergent && !set_difference(Preds, VPST->Predicates).empty()) { 146 Divergent = &Insts.back(); 147 LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI); 148 } 149 Insts.emplace_back(MI, Preds); 150 assert(Insts.size() <= 4 && "Too many instructions in VPT block!"); 151 } 152 153 // Have we found an instruction within the block which defines the vpr? If 154 // so, not all the instructions in the block will have the same predicate. 155 bool HasNonUniformPredicate() const { 156 return Divergent != nullptr; 157 } 158 159 // Is the given instruction part of the predicate set controlling the entry 160 // to the block. 161 bool IsPredicatedOn(MachineInstr *MI) const { 162 return VPST->Predicates.count(MI); 163 } 164 165 // Is the given instruction the only predicate which controls the entry to 166 // the block. 167 bool IsOnlyPredicatedOn(MachineInstr *MI) const { 168 return IsPredicatedOn(MI) && VPST->Predicates.size() == 1; 169 } 170 171 unsigned size() const { return Insts.size(); } 172 SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; } 173 MachineInstr *getVPST() const { return VPST->MI; } 174 PredicatedMI *getDivergent() const { return Divergent; } 175 }; 176 177 struct LowOverheadLoop { 178 179 MachineLoop *ML = nullptr; 180 MachineFunction *MF = nullptr; 181 MachineInstr *InsertPt = nullptr; 182 MachineInstr *Start = nullptr; 183 MachineInstr *Dec = nullptr; 184 MachineInstr *End = nullptr; 185 MachineInstr *VCTP = nullptr; 186 VPTBlock *CurrentBlock = nullptr; 187 SetVector<MachineInstr*> CurrentPredicate; 188 SmallVector<VPTBlock, 4> VPTBlocks; 189 bool Revert = false; 190 bool CannotTailPredicate = false; 191 192 LowOverheadLoop(MachineLoop *ML) : ML(ML) { 193 MF = ML->getHeader()->getParent(); 194 } 195 196 // If this is an MVE instruction, check that we know how to use tail 197 // predication with it. Record VPT blocks and return whether the 198 // instruction is valid for tail predication. 199 bool ValidateMVEInst(MachineInstr *MI); 200 201 void AnalyseMVEInst(MachineInstr *MI) { 202 CannotTailPredicate = !ValidateMVEInst(MI); 203 } 204 205 bool IsTailPredicationLegal() const { 206 // For now, let's keep things really simple and only support a single 207 // block for tail predication. 208 return !Revert && FoundAllComponents() && VCTP && 209 !CannotTailPredicate && ML->getNumBlocks() == 1; 210 } 211 212 bool ValidateTailPredicate(MachineInstr *StartInsertPt, 213 ReachingDefAnalysis *RDA, 214 MachineLoopInfo *MLI); 215 216 // Is it safe to define LR with DLS/WLS? 217 // LR can be defined if it is the operand to start, because it's the same 218 // value, or if it's going to be equivalent to the operand to Start. 219 MachineInstr *IsSafeToDefineLR(ReachingDefAnalysis *RDA); 220 221 // Check the branch targets are within range and we satisfy our 222 // restrictions. 223 void CheckLegality(ARMBasicBlockUtils *BBUtils, ReachingDefAnalysis *RDA, 224 MachineLoopInfo *MLI); 225 226 bool FoundAllComponents() const { 227 return Start && Dec && End; 228 } 229 230 SmallVectorImpl<VPTBlock> &getVPTBlocks() { return VPTBlocks; } 231 232 // Return the loop iteration count, or the number of elements if we're tail 233 // predicating. 234 MachineOperand &getCount() { 235 return IsTailPredicationLegal() ? 236 VCTP->getOperand(1) : Start->getOperand(0); 237 } 238 239 unsigned getStartOpcode() const { 240 bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart; 241 if (!IsTailPredicationLegal()) 242 return IsDo ? ARM::t2DLS : ARM::t2WLS; 243 244 return VCTPOpcodeToLSTP(VCTP->getOpcode(), IsDo); 245 } 246 247 void dump() const { 248 if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; 249 if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; 250 if (End) dbgs() << "ARM Loops: Found Loop End: " << *End; 251 if (VCTP) dbgs() << "ARM Loops: Found VCTP: " << *VCTP; 252 if (!FoundAllComponents()) 253 dbgs() << "ARM Loops: Not a low-overhead loop.\n"; 254 else if (!(Start && Dec && End)) 255 dbgs() << "ARM Loops: Failed to find all loop components.\n"; 256 } 257 }; 258 259 class ARMLowOverheadLoops : public MachineFunctionPass { 260 MachineFunction *MF = nullptr; 261 MachineLoopInfo *MLI = nullptr; 262 ReachingDefAnalysis *RDA = nullptr; 263 const ARMBaseInstrInfo *TII = nullptr; 264 MachineRegisterInfo *MRI = nullptr; 265 const TargetRegisterInfo *TRI = nullptr; 266 std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr; 267 268 public: 269 static char ID; 270 271 ARMLowOverheadLoops() : MachineFunctionPass(ID) { } 272 273 void getAnalysisUsage(AnalysisUsage &AU) const override { 274 AU.setPreservesCFG(); 275 AU.addRequired<MachineLoopInfo>(); 276 AU.addRequired<ReachingDefAnalysis>(); 277 MachineFunctionPass::getAnalysisUsage(AU); 278 } 279 280 bool runOnMachineFunction(MachineFunction &MF) override; 281 282 MachineFunctionProperties getRequiredProperties() const override { 283 return MachineFunctionProperties().set( 284 MachineFunctionProperties::Property::NoVRegs).set( 285 MachineFunctionProperties::Property::TracksLiveness); 286 } 287 288 StringRef getPassName() const override { 289 return ARM_LOW_OVERHEAD_LOOPS_NAME; 290 } 291 292 private: 293 bool ProcessLoop(MachineLoop *ML); 294 295 bool RevertNonLoops(); 296 297 void RevertWhile(MachineInstr *MI) const; 298 299 bool RevertLoopDec(MachineInstr *MI, bool AllowFlags = false) const; 300 301 void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; 302 303 void RemoveLoopUpdate(LowOverheadLoop &LoLoop); 304 305 void ConvertVPTBlocks(LowOverheadLoop &LoLoop); 306 307 MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop); 308 309 void Expand(LowOverheadLoop &LoLoop); 310 311 }; 312 } 313 314 char ARMLowOverheadLoops::ID = 0; 315 316 INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, 317 false, false) 318 319 MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) { 320 // We can define LR because LR already contains the same value. 321 if (Start->getOperand(0).getReg() == ARM::LR) 322 return Start; 323 324 unsigned CountReg = Start->getOperand(0).getReg(); 325 auto IsMoveLR = [&CountReg](MachineInstr *MI) { 326 return MI->getOpcode() == ARM::tMOVr && 327 MI->getOperand(0).getReg() == ARM::LR && 328 MI->getOperand(1).getReg() == CountReg && 329 MI->getOperand(2).getImm() == ARMCC::AL; 330 }; 331 332 MachineBasicBlock *MBB = Start->getParent(); 333 334 // Find an insertion point: 335 // - Is there a (mov lr, Count) before Start? If so, and nothing else writes 336 // to Count before Start, we can insert at that mov. 337 if (auto *LRDef = RDA->getReachingMIDef(Start, ARM::LR)) 338 if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg)) 339 return LRDef; 340 341 // - Is there a (mov lr, Count) after Start? If so, and nothing else writes 342 // to Count after Start, we can insert at that mov. 343 if (auto *LRDef = RDA->getLocalLiveOutMIDef(MBB, ARM::LR)) 344 if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg)) 345 return LRDef; 346 347 // We've found no suitable LR def and Start doesn't use LR directly. Can we 348 // just define LR anyway? 349 if (!RDA->isRegUsedAfter(Start, ARM::LR)) 350 return Start; 351 352 return nullptr; 353 } 354 355 // Can we safely move 'From' to just before 'To'? To satisfy this, 'From' must 356 // not define a register that is used by any instructions, after and including, 357 // 'To'. These instructions also must not redefine any of Froms operands. 358 template<typename Iterator> 359 static bool IsSafeToMove(MachineInstr *From, MachineInstr *To, ReachingDefAnalysis *RDA) { 360 SmallSet<int, 2> Defs; 361 // First check that From would compute the same value if moved. 362 for (auto &MO : From->operands()) { 363 if (!MO.isReg() || MO.isUndef() || !MO.getReg()) 364 continue; 365 if (MO.isDef()) 366 Defs.insert(MO.getReg()); 367 else if (!RDA->hasSameReachingDef(From, To, MO.getReg())) 368 return false; 369 } 370 371 // Now walk checking that the rest of the instructions will compute the same 372 // value. 373 for (auto I = ++Iterator(From), E = Iterator(To); I != E; ++I) { 374 for (auto &MO : I->operands()) 375 if (MO.isReg() && MO.getReg() && MO.isUse() && Defs.count(MO.getReg())) 376 return false; 377 } 378 return true; 379 } 380 381 bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt, 382 ReachingDefAnalysis *RDA, MachineLoopInfo *MLI) { 383 assert(VCTP && "VCTP instruction expected but is not set"); 384 // All predication within the loop should be based on vctp. If the block 385 // isn't predicated on entry, check whether the vctp is within the block 386 // and that all other instructions are then predicated on it. 387 for (auto &Block : VPTBlocks) { 388 if (Block.IsPredicatedOn(VCTP)) 389 continue; 390 if (!Block.HasNonUniformPredicate() || !isVCTP(Block.getDivergent()->MI)) { 391 LLVM_DEBUG(dbgs() << "ARM Loops: Found unsupported diverging predicate: " 392 << *Block.getDivergent()->MI); 393 return false; 394 } 395 SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts(); 396 for (auto &PredMI : Insts) { 397 if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI)) 398 continue; 399 LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI 400 << " - which is predicated on:\n"; 401 for (auto *MI : PredMI.Predicates) 402 dbgs() << " - " << *MI; 403 ); 404 return false; 405 } 406 } 407 408 // For tail predication, we need to provide the number of elements, instead 409 // of the iteration count, to the loop start instruction. The number of 410 // elements is provided to the vctp instruction, so we need to check that 411 // we can use this register at InsertPt. 412 Register NumElements = VCTP->getOperand(1).getReg(); 413 414 // If the register is defined within loop, then we can't perform TP. 415 // TODO: Check whether this is just a mov of a register that would be 416 // available. 417 if (RDA->getReachingDef(VCTP, NumElements) >= 0) { 418 LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n"); 419 return false; 420 } 421 422 // The element count register maybe defined after InsertPt, in which case we 423 // need to try to move either InsertPt or the def so that the [w|d]lstp can 424 // use the value. 425 MachineBasicBlock *InsertBB = InsertPt->getParent(); 426 if (!RDA->isReachingDefLiveOut(InsertPt, NumElements)) { 427 if (auto *ElemDef = RDA->getLocalLiveOutMIDef(InsertBB, NumElements)) { 428 if (IsSafeToMove<MachineBasicBlock::reverse_iterator>(ElemDef, InsertPt, RDA)) { 429 ElemDef->removeFromParent(); 430 InsertBB->insert(MachineBasicBlock::iterator(InsertPt), ElemDef); 431 LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: " 432 << *ElemDef); 433 } else if (IsSafeToMove<MachineBasicBlock::iterator>(InsertPt, ElemDef, RDA)) { 434 InsertPt->removeFromParent(); 435 InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), InsertPt); 436 LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); 437 } else { 438 LLVM_DEBUG(dbgs() << "ARM Loops: Unable to move element count to loop " 439 << "start instruction.\n"); 440 return false; 441 } 442 } 443 } 444 445 // Especially in the case of while loops, InsertBB may not be the 446 // preheader, so we need to check that the register isn't redefined 447 // before entering the loop. 448 auto CannotProvideElements = [&RDA](MachineBasicBlock *MBB, 449 Register NumElements) { 450 // NumElements is redefined in this block. 451 if (RDA->getReachingDef(&MBB->back(), NumElements) >= 0) 452 return true; 453 454 // Don't continue searching up through multiple predecessors. 455 if (MBB->pred_size() > 1) 456 return true; 457 458 return false; 459 }; 460 461 // First, find the block that looks like the preheader. 462 MachineBasicBlock *MBB = MLI->findLoopPreheader(ML, true); 463 if (!MBB) { 464 LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find preheader.\n"); 465 return false; 466 } 467 468 // Then search backwards for a def, until we get to InsertBB. 469 while (MBB != InsertBB) { 470 if (CannotProvideElements(MBB, NumElements)) { 471 LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n"); 472 return false; 473 } 474 MBB = *MBB->pred_begin(); 475 } 476 477 LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication.\n"); 478 return true; 479 } 480 481 void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils, 482 ReachingDefAnalysis *RDA, 483 MachineLoopInfo *MLI) { 484 if (Revert) 485 return; 486 487 if (!End->getOperand(1).isMBB()) 488 report_fatal_error("Expected LoopEnd to target basic block"); 489 490 // TODO Maybe there's cases where the target doesn't have to be the header, 491 // but for now be safe and revert. 492 if (End->getOperand(1).getMBB() != ML->getHeader()) { 493 LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n"); 494 Revert = true; 495 return; 496 } 497 498 // The WLS and LE instructions have 12-bits for the label offset. WLS 499 // requires a positive offset, while LE uses negative. 500 if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) || 501 !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) { 502 LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n"); 503 Revert = true; 504 return; 505 } 506 507 if (Start->getOpcode() == ARM::t2WhileLoopStart && 508 (BBUtils->getOffsetOf(Start) > 509 BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) || 510 !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) { 511 LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n"); 512 Revert = true; 513 return; 514 } 515 516 InsertPt = Revert ? nullptr : IsSafeToDefineLR(RDA); 517 if (!InsertPt) { 518 LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n"); 519 Revert = true; 520 return; 521 } else 522 LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt); 523 524 if (!IsTailPredicationLegal()) { 525 LLVM_DEBUG(if (!VCTP) 526 dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n"; 527 dbgs() << "ARM Loops: Tail-predication is not valid.\n"); 528 return; 529 } 530 531 assert(ML->getBlocks().size() == 1 && 532 "Shouldn't be processing a loop with more than one block"); 533 CannotTailPredicate = !ValidateTailPredicate(InsertPt, RDA, MLI); 534 LLVM_DEBUG(if (CannotTailPredicate) 535 dbgs() << "ARM Loops: Couldn't validate tail predicate.\n"); 536 } 537 538 bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) { 539 if (CannotTailPredicate) 540 return false; 541 542 // Only support a single vctp. 543 if (isVCTP(MI) && VCTP) 544 return false; 545 546 // Start a new vpt block when we discover a vpt. 547 if (MI->getOpcode() == ARM::MVE_VPST) { 548 VPTBlocks.emplace_back(MI, CurrentPredicate); 549 CurrentBlock = &VPTBlocks.back(); 550 return true; 551 } else if (isVCTP(MI)) 552 VCTP = MI; 553 else if (MI->getOpcode() == ARM::MVE_VPSEL || 554 MI->getOpcode() == ARM::MVE_VPNOT) 555 return false; 556 557 // TODO: Allow VPSEL and VPNOT, we currently cannot because: 558 // 1) It will use the VPR as a predicate operand, but doesn't have to be 559 // instead a VPT block, which means we can assert while building up 560 // the VPT block because we don't find another VPST to being a new 561 // one. 562 // 2) VPSEL still requires a VPR operand even after tail predicating, 563 // which means we can't remove it unless there is another 564 // instruction, such as vcmp, that can provide the VPR def. 565 566 bool IsUse = false; 567 bool IsDef = false; 568 const MCInstrDesc &MCID = MI->getDesc(); 569 for (int i = MI->getNumOperands() - 1; i >= 0; --i) { 570 const MachineOperand &MO = MI->getOperand(i); 571 if (!MO.isReg() || MO.getReg() != ARM::VPR) 572 continue; 573 574 if (MO.isDef()) { 575 CurrentPredicate.insert(MI); 576 IsDef = true; 577 } else if (ARM::isVpred(MCID.OpInfo[i].OperandType)) { 578 CurrentBlock->addInst(MI, CurrentPredicate); 579 IsUse = true; 580 } else { 581 LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI); 582 return false; 583 } 584 } 585 586 // If we find a vpr def that is not already predicated on the vctp, we've 587 // got disjoint predicates that may not be equivalent when we do the 588 // conversion. 589 if (IsDef && !IsUse && VCTP && !isVCTP(MI)) { 590 LLVM_DEBUG(dbgs() << "ARM Loops: Found disjoint vpr def: " << *MI); 591 return false; 592 } 593 594 uint64_t Flags = MCID.TSFlags; 595 if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE) 596 return true; 597 598 // If we find an instruction that has been marked as not valid for tail 599 // predication, only allow the instruction if it's contained within a valid 600 // VPT block. 601 if ((Flags & ARMII::ValidForTailPredication) == 0 && !IsUse) { 602 LLVM_DEBUG(dbgs() << "ARM Loops: Can't tail predicate: " << *MI); 603 return false; 604 } 605 606 return true; 607 } 608 609 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { 610 const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget()); 611 if (!ST.hasLOB()) 612 return false; 613 614 MF = &mf; 615 LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n"); 616 617 MLI = &getAnalysis<MachineLoopInfo>(); 618 RDA = &getAnalysis<ReachingDefAnalysis>(); 619 MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness); 620 MRI = &MF->getRegInfo(); 621 TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo()); 622 TRI = ST.getRegisterInfo(); 623 BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF)); 624 BBUtils->computeAllBlockSizes(); 625 BBUtils->adjustBBOffsetsAfter(&MF->front()); 626 627 bool Changed = false; 628 for (auto ML : *MLI) { 629 if (!ML->getParentLoop()) 630 Changed |= ProcessLoop(ML); 631 } 632 Changed |= RevertNonLoops(); 633 return Changed; 634 } 635 636 bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { 637 638 bool Changed = false; 639 640 // Process inner loops first. 641 for (auto I = ML->begin(), E = ML->end(); I != E; ++I) 642 Changed |= ProcessLoop(*I); 643 644 LLVM_DEBUG(dbgs() << "ARM Loops: Processing loop containing:\n"; 645 if (auto *Preheader = ML->getLoopPreheader()) 646 dbgs() << " - " << Preheader->getName() << "\n"; 647 else if (auto *Preheader = MLI->findLoopPreheader(ML)) 648 dbgs() << " - " << Preheader->getName() << "\n"; 649 for (auto *MBB : ML->getBlocks()) 650 dbgs() << " - " << MBB->getName() << "\n"; 651 ); 652 653 // Search the given block for a loop start instruction. If one isn't found, 654 // and there's only one predecessor block, search that one too. 655 std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart = 656 [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { 657 for (auto &MI : *MBB) { 658 if (isLoopStart(MI)) 659 return &MI; 660 } 661 if (MBB->pred_size() == 1) 662 return SearchForStart(*MBB->pred_begin()); 663 return nullptr; 664 }; 665 666 LowOverheadLoop LoLoop(ML); 667 // Search the preheader for the start intrinsic. 668 // FIXME: I don't see why we shouldn't be supporting multiple predecessors 669 // with potentially multiple set.loop.iterations, so we need to enable this. 670 if (auto *Preheader = ML->getLoopPreheader()) 671 LoLoop.Start = SearchForStart(Preheader); 672 else if (auto *Preheader = MLI->findLoopPreheader(ML, true)) 673 LoLoop.Start = SearchForStart(Preheader); 674 else 675 return false; 676 677 // Find the low-overhead loop components and decide whether or not to fall 678 // back to a normal loop. Also look for a vctp instructions and decide 679 // whether we can convert that predicate using tail predication. 680 for (auto *MBB : reverse(ML->getBlocks())) { 681 for (auto &MI : *MBB) { 682 if (MI.getOpcode() == ARM::t2LoopDec) 683 LoLoop.Dec = &MI; 684 else if (MI.getOpcode() == ARM::t2LoopEnd) 685 LoLoop.End = &MI; 686 else if (isLoopStart(MI)) 687 LoLoop.Start = &MI; 688 else if (MI.getDesc().isCall()) { 689 // TODO: Though the call will require LE to execute again, does this 690 // mean we should revert? Always executing LE hopefully should be 691 // faster than performing a sub,cmp,br or even subs,br. 692 LoLoop.Revert = true; 693 LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n"); 694 } else { 695 // Record VPR defs and build up their corresponding vpt blocks. 696 // Check we know how to tail predicate any mve instructions. 697 LoLoop.AnalyseMVEInst(&MI); 698 } 699 700 // We need to ensure that LR is not used or defined inbetween LoopDec and 701 // LoopEnd. 702 if (!LoLoop.Dec || LoLoop.End || LoLoop.Revert) 703 continue; 704 705 // If we find that LR has been written or read between LoopDec and 706 // LoopEnd, expect that the decremented value is being used else where. 707 // Because this value isn't actually going to be produced until the 708 // latch, by LE, we would need to generate a real sub. The value is also 709 // likely to be copied/reloaded for use of LoopEnd - in which in case 710 // we'd need to perform an add because it gets subtracted again by LE! 711 // The other option is to then generate the other form of LE which doesn't 712 // perform the sub. 713 for (auto &MO : MI.operands()) { 714 if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() && 715 MO.getReg() == ARM::LR) { 716 LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI); 717 LoLoop.Revert = true; 718 break; 719 } 720 } 721 } 722 } 723 724 LLVM_DEBUG(LoLoop.dump()); 725 if (!LoLoop.FoundAllComponents()) { 726 LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n"); 727 return false; 728 } 729 730 LoLoop.CheckLegality(BBUtils.get(), RDA, MLI); 731 Expand(LoLoop); 732 return true; 733 } 734 735 // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a 736 // beq that branches to the exit branch. 737 // TODO: We could also try to generate a cbz if the value in LR is also in 738 // another low register. 739 void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { 740 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); 741 MachineBasicBlock *MBB = MI->getParent(); 742 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 743 TII->get(ARM::t2CMPri)); 744 MIB.add(MI->getOperand(0)); 745 MIB.addImm(0); 746 MIB.addImm(ARMCC::AL); 747 MIB.addReg(ARM::NoRegister); 748 749 MachineBasicBlock *DestBB = MI->getOperand(1).getMBB(); 750 unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? 751 ARM::tBcc : ARM::t2Bcc; 752 753 MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); 754 MIB.add(MI->getOperand(1)); // branch target 755 MIB.addImm(ARMCC::EQ); // condition code 756 MIB.addReg(ARM::CPSR); 757 MI->eraseFromParent(); 758 } 759 760 bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI, 761 bool SetFlags) const { 762 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); 763 MachineBasicBlock *MBB = MI->getParent(); 764 765 // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS. 766 if (SetFlags && 767 (RDA->isRegUsedAfter(MI, ARM::CPSR) || 768 !RDA->hasSameReachingDef(MI, &MBB->back(), ARM::CPSR))) 769 SetFlags = false; 770 771 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 772 TII->get(ARM::t2SUBri)); 773 MIB.addDef(ARM::LR); 774 MIB.add(MI->getOperand(1)); 775 MIB.add(MI->getOperand(2)); 776 MIB.addImm(ARMCC::AL); 777 MIB.addReg(0); 778 779 if (SetFlags) { 780 MIB.addReg(ARM::CPSR); 781 MIB->getOperand(5).setIsDef(true); 782 } else 783 MIB.addReg(0); 784 785 MI->eraseFromParent(); 786 return SetFlags; 787 } 788 789 // Generate a subs, or sub and cmp, and a branch instead of an LE. 790 void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { 791 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI); 792 793 MachineBasicBlock *MBB = MI->getParent(); 794 // Create cmp 795 if (!SkipCmp) { 796 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), 797 TII->get(ARM::t2CMPri)); 798 MIB.addReg(ARM::LR); 799 MIB.addImm(0); 800 MIB.addImm(ARMCC::AL); 801 MIB.addReg(ARM::NoRegister); 802 } 803 804 MachineBasicBlock *DestBB = MI->getOperand(1).getMBB(); 805 unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? 806 ARM::tBcc : ARM::t2Bcc; 807 808 // Create bne 809 MachineInstrBuilder MIB = 810 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); 811 MIB.add(MI->getOperand(1)); // branch target 812 MIB.addImm(ARMCC::NE); // condition code 813 MIB.addReg(ARM::CPSR); 814 MI->eraseFromParent(); 815 } 816 817 MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { 818 MachineInstr *InsertPt = LoLoop.InsertPt; 819 MachineInstr *Start = LoLoop.Start; 820 MachineBasicBlock *MBB = InsertPt->getParent(); 821 bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart; 822 unsigned Opc = LoLoop.getStartOpcode(); 823 MachineOperand &Count = LoLoop.getCount(); 824 825 MachineInstrBuilder MIB = 826 BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc)); 827 828 MIB.addDef(ARM::LR); 829 MIB.add(Count); 830 if (!IsDo) 831 MIB.add(Start->getOperand(1)); 832 833 // When using tail-predication, try to delete the dead code that was used to 834 // calculate the number of loop iterations. 835 if (LoLoop.IsTailPredicationLegal()) { 836 SmallVector<MachineInstr*, 4> Killed; 837 SmallVector<MachineInstr*, 4> Dead; 838 if (auto *Def = RDA->getReachingMIDef(Start, 839 Start->getOperand(0).getReg())) { 840 Killed.push_back(Def); 841 842 while (!Killed.empty()) { 843 MachineInstr *Def = Killed.back(); 844 Killed.pop_back(); 845 Dead.push_back(Def); 846 for (auto &MO : Def->operands()) { 847 if (!MO.isReg() || !MO.isKill()) 848 continue; 849 850 MachineInstr *Kill = RDA->getReachingMIDef(Def, MO.getReg()); 851 if (Kill && RDA->getNumUses(Kill, MO.getReg()) == 1) 852 Killed.push_back(Kill); 853 } 854 } 855 for (auto *MI : Dead) 856 MI->eraseFromParent(); 857 } 858 } 859 860 // If we're inserting at a mov lr, then remove it as it's redundant. 861 if (InsertPt != Start) 862 InsertPt->eraseFromParent(); 863 Start->eraseFromParent(); 864 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); 865 return &*MIB; 866 } 867 868 // Goal is to optimise and clean-up these loops: 869 // 870 // vector.body: 871 // renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg 872 // renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3(tied-def 0), 4 873 // .. 874 // $lr = MVE_DLSTP_32 renamable $r3 875 // 876 // The SUB is the old update of the loop iteration count expression, which 877 // is no longer needed. This sub is removed when the element count, which is in 878 // r3 in this example, is defined by an instruction in the loop, and it has 879 // no uses. 880 // 881 void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) { 882 Register ElemCount = LoLoop.VCTP->getOperand(1).getReg(); 883 MachineInstr *LastInstrInBlock = &LoLoop.VCTP->getParent()->back(); 884 885 LLVM_DEBUG(dbgs() << "ARM Loops: Trying to remove loop update stmt\n"); 886 887 if (LoLoop.ML->getNumBlocks() != 1) { 888 LLVM_DEBUG(dbgs() << "ARM Loops: Single block loop expected\n"); 889 return; 890 } 891 892 LLVM_DEBUG(dbgs() << "ARM Loops: Analyzing elemcount in operand: "; 893 LoLoop.VCTP->getOperand(1).dump()); 894 895 // Find the definition we are interested in removing, if there is one. 896 MachineInstr *Def = RDA->getReachingMIDef(LastInstrInBlock, ElemCount); 897 if (!Def) { 898 LLVM_DEBUG(dbgs() << "ARM Loops: Can't find a def, nothing to do.\n"); 899 return; 900 } 901 902 // Bail if we define CPSR and it is not dead 903 if (!Def->registerDefIsDead(ARM::CPSR, TRI)) { 904 LLVM_DEBUG(dbgs() << "ARM Loops: CPSR is not dead\n"); 905 return; 906 } 907 908 // Bail if elemcount is used in exit blocks, i.e. if it is live-in. 909 if (isRegLiveInExitBlocks(LoLoop.ML, ElemCount)) { 910 LLVM_DEBUG(dbgs() << "ARM Loops: Elemcount is live-out, can't remove stmt\n"); 911 return; 912 } 913 914 // Bail if there are uses after this Def in the block. 915 SmallVector<MachineInstr*, 4> Uses; 916 RDA->getReachingLocalUses(Def, ElemCount, Uses); 917 if (Uses.size()) { 918 LLVM_DEBUG(dbgs() << "ARM Loops: Local uses in block, can't remove stmt\n"); 919 return; 920 } 921 922 Uses.clear(); 923 RDA->getAllInstWithUseBefore(Def, ElemCount, Uses); 924 925 // Remove Def if there are no uses, or if the only use is the VCTP 926 // instruction. 927 if (!Uses.size() || (Uses.size() == 1 && Uses[0] == LoLoop.VCTP)) { 928 LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop update instruction: "; 929 Def->dump()); 930 Def->eraseFromParent(); 931 return; 932 } 933 934 LLVM_DEBUG(dbgs() << "ARM Loops: Can't remove loop update, it's used by:\n"; 935 for (auto U : Uses) U->dump()); 936 } 937 938 void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { 939 auto RemovePredicate = [](MachineInstr *MI) { 940 LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI); 941 if (int PIdx = llvm::findFirstVPTPredOperandIdx(*MI)) { 942 assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then && 943 "Expected Then predicate!"); 944 MI->getOperand(PIdx).setImm(ARMVCC::None); 945 MI->getOperand(PIdx+1).setReg(0); 946 } else 947 llvm_unreachable("trying to unpredicate a non-predicated instruction"); 948 }; 949 950 // There are a few scenarios which we have to fix up: 951 // 1) A VPT block with is only predicated by the vctp and has no internal vpr 952 // defs. 953 // 2) A VPT block which is only predicated by the vctp but has an internal 954 // vpr def. 955 // 3) A VPT block which is predicated upon the vctp as well as another vpr 956 // def. 957 // 4) A VPT block which is not predicated upon a vctp, but contains it and 958 // all instructions within the block are predicated upon in. 959 960 for (auto &Block : LoLoop.getVPTBlocks()) { 961 SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts(); 962 if (Block.HasNonUniformPredicate()) { 963 PredicatedMI *Divergent = Block.getDivergent(); 964 if (isVCTP(Divergent->MI)) { 965 // The vctp will be removed, so the size of the vpt block needs to be 966 // modified. 967 uint64_t Size = getARMVPTBlockMask(Block.size() - 1); 968 Block.getVPST()->getOperand(0).setImm(Size); 969 LLVM_DEBUG(dbgs() << "ARM Loops: Modified VPT block mask.\n"); 970 } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) { 971 // The VPT block has a non-uniform predicate but it's entry is guarded 972 // only by a vctp, which means we: 973 // - Need to remove the original vpst. 974 // - Then need to unpredicate any following instructions, until 975 // we come across the divergent vpr def. 976 // - Insert a new vpst to predicate the instruction(s) that following 977 // the divergent vpr def. 978 // TODO: We could be producing more VPT blocks than necessary and could 979 // fold the newly created one into a proceeding one. 980 for (auto I = ++MachineBasicBlock::iterator(Block.getVPST()), 981 E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I) 982 RemovePredicate(&*I); 983 984 unsigned Size = 0; 985 auto E = MachineBasicBlock::reverse_iterator(Divergent->MI); 986 auto I = MachineBasicBlock::reverse_iterator(Insts.back().MI); 987 MachineInstr *InsertAt = nullptr; 988 while (I != E) { 989 InsertAt = &*I; 990 ++Size; 991 ++I; 992 } 993 MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt, 994 InsertAt->getDebugLoc(), 995 TII->get(ARM::MVE_VPST)); 996 MIB.addImm(getARMVPTBlockMask(Size)); 997 LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST()); 998 LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB); 999 Block.getVPST()->eraseFromParent(); 1000 } 1001 } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) { 1002 // A vpt block which is only predicated upon vctp and has no internal vpr 1003 // defs: 1004 // - Remove vpst. 1005 // - Unpredicate the remaining instructions. 1006 LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST()); 1007 Block.getVPST()->eraseFromParent(); 1008 for (auto &PredMI : Insts) 1009 RemovePredicate(PredMI.MI); 1010 } 1011 } 1012 1013 LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP); 1014 LoLoop.VCTP->eraseFromParent(); 1015 } 1016 1017 void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { 1018 1019 // Combine the LoopDec and LoopEnd instructions into LE(TP). 1020 auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) { 1021 MachineInstr *End = LoLoop.End; 1022 MachineBasicBlock *MBB = End->getParent(); 1023 unsigned Opc = LoLoop.IsTailPredicationLegal() ? 1024 ARM::MVE_LETP : ARM::t2LEUpdate; 1025 MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(), 1026 TII->get(Opc)); 1027 MIB.addDef(ARM::LR); 1028 MIB.add(End->getOperand(0)); 1029 MIB.add(End->getOperand(1)); 1030 LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); 1031 1032 LoLoop.End->eraseFromParent(); 1033 LoLoop.Dec->eraseFromParent(); 1034 return &*MIB; 1035 }; 1036 1037 // TODO: We should be able to automatically remove these branches before we 1038 // get here - probably by teaching analyzeBranch about the pseudo 1039 // instructions. 1040 // If there is an unconditional branch, after I, that just branches to the 1041 // next block, remove it. 1042 auto RemoveDeadBranch = [](MachineInstr *I) { 1043 MachineBasicBlock *BB = I->getParent(); 1044 MachineInstr *Terminator = &BB->instr_back(); 1045 if (Terminator->isUnconditionalBranch() && I != Terminator) { 1046 MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB(); 1047 if (BB->isLayoutSuccessor(Succ)) { 1048 LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator); 1049 Terminator->eraseFromParent(); 1050 } 1051 } 1052 }; 1053 1054 if (LoLoop.Revert) { 1055 if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart) 1056 RevertWhile(LoLoop.Start); 1057 else 1058 LoLoop.Start->eraseFromParent(); 1059 bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec, true); 1060 RevertLoopEnd(LoLoop.End, FlagsAlreadySet); 1061 } else { 1062 LoLoop.Start = ExpandLoopStart(LoLoop); 1063 RemoveDeadBranch(LoLoop.Start); 1064 LoLoop.End = ExpandLoopEnd(LoLoop); 1065 RemoveDeadBranch(LoLoop.End); 1066 if (LoLoop.IsTailPredicationLegal()) { 1067 RemoveLoopUpdate(LoLoop); 1068 ConvertVPTBlocks(LoLoop); 1069 } 1070 } 1071 1072 PostOrderLoopTraversal DFS(*LoLoop.ML, *MLI); 1073 DFS.ProcessLoop(); 1074 const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder(); 1075 for (auto *MBB : PostOrder) { 1076 recomputeLiveIns(*MBB); 1077 // FIXME: For some reason, the live-in print order is non-deterministic for 1078 // our tests and I can't out why... So just sort them. 1079 MBB->sortUniqueLiveIns(); 1080 } 1081 1082 for (auto *MBB : reverse(PostOrder)) 1083 recomputeLivenessFlags(*MBB); 1084 } 1085 1086 bool ARMLowOverheadLoops::RevertNonLoops() { 1087 LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n"); 1088 bool Changed = false; 1089 1090 for (auto &MBB : *MF) { 1091 SmallVector<MachineInstr*, 4> Starts; 1092 SmallVector<MachineInstr*, 4> Decs; 1093 SmallVector<MachineInstr*, 4> Ends; 1094 1095 for (auto &I : MBB) { 1096 if (isLoopStart(I)) 1097 Starts.push_back(&I); 1098 else if (I.getOpcode() == ARM::t2LoopDec) 1099 Decs.push_back(&I); 1100 else if (I.getOpcode() == ARM::t2LoopEnd) 1101 Ends.push_back(&I); 1102 } 1103 1104 if (Starts.empty() && Decs.empty() && Ends.empty()) 1105 continue; 1106 1107 Changed = true; 1108 1109 for (auto *Start : Starts) { 1110 if (Start->getOpcode() == ARM::t2WhileLoopStart) 1111 RevertWhile(Start); 1112 else 1113 Start->eraseFromParent(); 1114 } 1115 for (auto *Dec : Decs) 1116 RevertLoopDec(Dec); 1117 1118 for (auto *End : Ends) 1119 RevertLoopEnd(End); 1120 } 1121 return Changed; 1122 } 1123 1124 FunctionPass *llvm::createARMLowOverheadLoopsPass() { 1125 return new ARMLowOverheadLoops(); 1126 } 1127