//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMISelLowering.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "Thumb1RegisterInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

#define DEBUG_TYPE "arm-ldst-opt"

STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");

/// ARMLoadStoreOpt - Post-register-allocation pass that combines
/// load / store instructions to form ldm / stm instructions.
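///
/// An illustrative example of the transformation (register choices are
/// arbitrary):
///   ldr r0, [r4]
///   ldr r1, [r4, #4]
///   ldr r2, [r4, #8]
/// =>
///   ldmia r4, {r0, r1, r2}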

namespace {
  struct ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMLoadStoreOpt() : MachineFunctionPass(ID) {}

    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    const TargetLowering *TL;
    ARMFunctionInfo *AFI;
    RegScavenger *RS;
    bool isThumb1, isThumb2;

    bool runOnMachineFunction(MachineFunction &Fn) override;

    const char *getPassName() const override {
      return "ARM load / store optimization pass";
    }

  private:
    struct MemOpQueueEntry {
      int Offset;
      unsigned Reg;
      bool isKill;
      unsigned Position;
      MachineBasicBlock::iterator MBBI;
      bool Merged;
      MemOpQueueEntry(int o, unsigned r, bool k, unsigned p,
                      MachineBasicBlock::iterator i)
        : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {}
    };
    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
    typedef MemOpQueue::iterator MemOpQueueIter;

    void findUsesOfImpDef(SmallVectorImpl<MachineOperand *> &UsesOfImpDefs,
                          const MemOpQueue &MemOps, unsigned DefReg,
                          unsigned RangeBegin, unsigned RangeEnd);
    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  int Offset, unsigned Base, bool BaseKill, int Opcode,
                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
                  DebugLoc dl,
                  ArrayRef<std::pair<unsigned, bool> > Regs,
                  ArrayRef<unsigned> ImpDefs);
    void MergeOpsUpdate(MachineBasicBlock &MBB,
                        MemOpQueue &MemOps,
                        unsigned memOpsBegin,
                        unsigned memOpsEnd,
                        unsigned insertAfter,
                        int Offset,
                        unsigned Base,
                        bool BaseKill,
                        int Opcode,
                        ARMCC::CondCodes Pred,
                        unsigned PredReg,
                        unsigned Scratch,
                        DebugLoc dl,
                        SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
                      int Opcode, unsigned Size,
                      ARMCC::CondCodes Pred, unsigned PredReg,
                      unsigned Scratch, MemOpQueue &MemOps,
                      SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
                                  const TargetInstrInfo *TII,
                                  bool &Advance,
                                  MachineBasicBlock::iterator &I);
    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   bool &Advance,
                                   MachineBasicBlock::iterator &I);
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
  };
  char ARMLoadStoreOpt::ID = 0;
}

static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA;
    case ARM_AM::da: return ARM::LDMDA;
    case ARM_AM::db: return ARM::LDMDB;
    case ARM_AM::ib: return ARM::LDMIB;
    }
  case ARM::STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA;
    case ARM_AM::da: return ARM::STMDA;
    case ARM_AM::db: return ARM::STMDB;
    case ARM_AM::ib: return ARM::STMIB;
    }
  case ARM::tLDRi:
    // tLDMIA is writeback-only - unless the base register is in the input
    // reglist.
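    // e.g. "ldm r2, {r1, r2}" leaves r2 unchanged because r2 appears in the
    // reglist, while "ldm r2!, {r0, r1}" writes the new base back
    // (illustrative).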
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::tLDMIA;
    }
  case ARM::tSTRi:
    // There is no non-writeback tSTMIA either.
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::tSTMIA_UPD;
    }
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA;
    case ARM_AM::db: return ARM::t2LDMDB;
    }
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA;
    case ARM_AM::db: return ARM::t2STMDB;
    }
  case ARM::VLDRS:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA;
    case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists.
    }
  case ARM::VSTRS:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA;
    case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists.
    }
  case ARM::VLDRD:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA;
    case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists.
    }
  case ARM::VSTRD:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA;
    case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists.
    }
  }
}

namespace llvm {
namespace ARM_AM {

AMSubMode getLoadStoreMultipleSubMode(int Opcode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA_RET:
  case ARM::LDMIA:
  case ARM::LDMIA_UPD:
  case ARM::STMIA:
  case ARM::STMIA_UPD:
  case ARM::tLDMIA:
  case ARM::tLDMIA_UPD:
  case ARM::tSTMIA_UPD:
  case ARM::t2LDMIA_RET:
  case ARM::t2LDMIA:
  case ARM::t2LDMIA_UPD:
  case ARM::t2STMIA:
  case ARM::t2STMIA_UPD:
  case ARM::VLDMSIA:
  case ARM::VLDMSIA_UPD:
  case ARM::VSTMSIA:
  case ARM::VSTMSIA_UPD:
  case ARM::VLDMDIA:
  case ARM::VLDMDIA_UPD:
  case ARM::VSTMDIA:
  case ARM::VSTMDIA_UPD:
    return ARM_AM::ia;

  case ARM::LDMDA:
  case ARM::LDMDA_UPD:
  case ARM::STMDA:
  case ARM::STMDA_UPD:
    return ARM_AM::da;

  case ARM::LDMDB:
  case ARM::LDMDB_UPD:
  case ARM::STMDB:
  case ARM::STMDB_UPD:
  case ARM::t2LDMDB:
  case ARM::t2LDMDB_UPD:
  case ARM::t2STMDB:
  case ARM::t2STMDB_UPD:
  case ARM::VLDMSDB_UPD:
  case ARM::VSTMSDB_UPD:
  case ARM::VLDMDDB_UPD:
  case ARM::VSTMDDB_UPD:
    return ARM_AM::db;

  case ARM::LDMIB:
  case ARM::LDMIB_UPD:
  case ARM::STMIB:
  case ARM::STMIB_UPD:
    return ARM_AM::ib;
  }
}

} // end namespace ARM_AM
} // end namespace llvm

static bool isT1i32Load(unsigned Opc) {
  return Opc == ARM::tLDRi;
}

static bool isT2i32Load(unsigned Opc) {
  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
}

static bool isi32Load(unsigned Opc) {
  return Opc == ARM::LDRi12 || isT1i32Load(Opc) || isT2i32Load(Opc);
}

static bool isT1i32Store(unsigned Opc) {
  return Opc == ARM::tSTRi;
}

static bool isT2i32Store(unsigned Opc) {
  return Opc == ARM::t2STRi12 ||
         Opc == ARM::t2STRi8;
}

static bool isi32Store(unsigned Opc) {
  return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
}

/// MergeOps - Create and insert a LDM or STM with Base as base register and
/// registers in Regs as the register operands that would be loaded / stored.
/// It returns true if the transformation is done.
bool
ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          int Offset, unsigned Base, bool BaseKill,
                          int Opcode, ARMCC::CondCodes Pred,
                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
                          ArrayRef<std::pair<unsigned, bool> > Regs,
                          ArrayRef<unsigned> ImpDefs) {
  // Only a single register to load / store. Don't bother.
  unsigned NumRegs = Regs.size();
  if (NumRegs <= 1)
    return false;

  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  bool haveIBAndDA = isNotVFP && !isThumb2 && !isThumb1;

  if (Offset == 4 && haveIBAndDA) {
    Mode = ARM_AM::ib;
  } else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) {
    Mode = ARM_AM::da;
  } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) {
    // VLDM/VSTM do not support DB mode without also updating the base reg.
    Mode = ARM_AM::db;
  } else if (Offset != 0) {
    // Check if this is a supported opcode before inserting instructions to
    // calculate a new base register.
    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false;

    // If the starting offset isn't zero, insert a MI to materialize a new
    // base. But only do so if it is cost effective, i.e. merging more than
    // two loads / stores.
    if (NumRegs <= 2)
      return false;

    unsigned NewBase;
    if (isi32Load(Opcode)) {
      // If it is a load, then just use one of the destination registers
      // as the new base.
      NewBase = Regs[NumRegs-1].first;
    } else {
      // Use the scratch register as the new base.
      NewBase = Scratch;
      if (NewBase == 0)
        return false;
    }

    int BaseOpc =
      isThumb2 ? ARM::t2ADDri :
      (isThumb1 && Offset < 8) ? ARM::tADDi3 :
      isThumb1 ? ARM::tADDi8 : ARM::ADDri;

    if (Offset < 0) {
      Offset = -Offset;
      BaseOpc =
        isThumb2 ? ARM::t2SUBri :
        (isThumb1 && Offset < 8) ? ARM::tSUBi3 :
        isThumb1 ? ARM::tSUBi8 : ARM::SUBri;
    }

    if (!TL->isLegalAddImmediate(Offset))
      // FIXME: Try add with register operand?
      return false; // Probably not worth it then.

    if (isThumb1) {
      // Thumb1: depending on immediate size, use either
      //   ADD NewBase, Base, #imm3
      // or
      //   MOV NewBase, Base
      //   ADD NewBase, #imm8.
      if (Base != NewBase && Offset >= 8) {
        // Need to insert a MOV to the new base first.
        BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase)
          .addReg(Base, getKillRegState(BaseKill))
          .addImm(Pred).addReg(PredReg);
        // Set up BaseKill and Base correctly to insert the ADDS/SUBS below.
        Base = NewBase;
        BaseKill = false;
      }
      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase))
        .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
        .addImm(Pred).addReg(PredReg);
    } else {
      BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
        .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
        .addImm(Pred).addReg(PredReg).addReg(0);
    }
    Base = NewBase;
    BaseKill = true; // New base is always killed straight away.
  }

  bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS ||
                Opcode == ARM::VLDRD);

  // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
  // base register writeback.
  Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
  if (!Opcode) return false;

  bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.

  // Exception: If the base register is in the input reglist, Thumb1 LDM is
  // non-writeback. Check for this.
  if (Opcode == ARM::tLDMIA && isThumb1)
    for (unsigned I = 0; I < NumRegs; ++I)
      if (Base == Regs[I].first) {
        Writeback = false;
        break;
      }

  // If the merged instruction has writeback and the base register is not
  // killed, it's not safe to do the merge on Thumb1. This is because resetting
  // the base register writeback by inserting a SUBS sets the condition flags.
  // FIXME: Try something clever here to see if resetting the base register can
  // be avoided, e.g. by updating a later ADD/SUB of the base register with the
  // writeback.
  if (isThumb1 && Writeback && !BaseKill) return false;

  MachineInstrBuilder MIB;

  if (Writeback) {
    if (Opcode == ARM::tLDMIA)
      // Update tLDMIA with writeback if necessary.
      Opcode = ARM::tLDMIA_UPD;

    MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));

    // Thumb1: we might need to set base writeback when building the MI.
    MIB.addReg(Base, getDefRegState(true))
       .addReg(Base, getKillRegState(BaseKill));
  } else {
    // No writeback, simply build the MachineInstr.
    MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
    MIB.addReg(Base, getKillRegState(BaseKill));
  }

  MIB.addImm(Pred).addReg(PredReg);

  for (unsigned i = 0; i != NumRegs; ++i)
    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
                     | getKillRegState(Regs[i].second));

  // Add implicit defs for super-registers.
  for (unsigned i = 0, e = ImpDefs.size(); i != e; ++i)
    MIB.addReg(ImpDefs[i], RegState::ImplicitDefine);

  return true;
}

/// \brief Find all instructions using a given imp-def within a range.
///
/// We are trying to combine a range of instructions, one of which (located at
/// position RangeBegin) implicitly defines a register. The final LDM/STM will
/// be placed at RangeEnd, and so any uses of this definition between
/// RangeBegin and RangeEnd must be modified to use an undefined value.
///
/// The live range continues until we find a second definition or one of the
/// uses we find is a kill. Unfortunately MemOps is not sorted by Position, so
/// we must consider all uses and decide which are relevant in a second pass.
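///
/// For example (illustrative): if the instruction at RangeBegin is
///   vldr s1, [r0, #4]
/// carrying an implicit def of d0, then any read of d0 before RangeEnd refers
/// to that imp-def and must be marked undef once the merge rewrites it.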
void ARMLoadStoreOpt::findUsesOfImpDef(
    SmallVectorImpl<MachineOperand *> &UsesOfImpDefs, const MemOpQueue &MemOps,
    unsigned DefReg, unsigned RangeBegin, unsigned RangeEnd) {
  std::map<unsigned, MachineOperand *> Uses;
  unsigned LastLivePos = RangeEnd;

  // First we find all uses of this register with Position between RangeBegin
  // and RangeEnd, any or all of these could be uses of a definition at
  // RangeBegin. We also record the latest position a definition at RangeBegin
  // would be considered live.
  for (unsigned i = 0; i < MemOps.size(); ++i) {
    MachineInstr &MI = *MemOps[i].MBBI;
    unsigned MIPosition = MemOps[i].Position;
    if (MIPosition <= RangeBegin || MIPosition > RangeEnd)
      continue;

    // If this instruction defines the register, then any later use will be of
    // that definition rather than ours.
    if (MI.definesRegister(DefReg))
      LastLivePos = std::min(LastLivePos, MIPosition);

    MachineOperand *UseOp = MI.findRegisterUseOperand(DefReg);
    if (!UseOp)
      continue;

    // If this instruction kills the register then (assuming liveness is
    // correct when we start) we don't need to think about anything after here.
    if (UseOp->isKill())
      LastLivePos = std::min(LastLivePos, MIPosition);

    Uses[MIPosition] = UseOp;
  }

  // Now we traverse the list of all uses, and append the ones that actually
  // use our definition to the requested list.
  for (std::map<unsigned, MachineOperand *>::iterator I = Uses.begin(),
                                                      E = Uses.end();
       I != E; ++I) {
    // List is sorted by position so once we've found one out of range there
    // will be no more to consider.
    if (I->first > LastLivePos)
      break;
    UsesOfImpDefs.push_back(I->second);
  }
}

// MergeOpsUpdate - call MergeOps and update MemOps and Merges accordingly on
// success.
void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
                                     MemOpQueue &memOps,
                                     unsigned memOpsBegin, unsigned memOpsEnd,
                                     unsigned insertAfter, int Offset,
                                     unsigned Base, bool BaseKill,
                                     int Opcode,
                                     ARMCC::CondCodes Pred, unsigned PredReg,
                                     unsigned Scratch,
                                     DebugLoc dl,
                        SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
  // First calculate which of the registers should be killed by the merged
  // instruction.
  const unsigned insertPos = memOps[insertAfter].Position;
  SmallSet<unsigned, 4> KilledRegs;
  DenseMap<unsigned, unsigned> Killer;
  for (unsigned i = 0, e = memOps.size(); i != e; ++i) {
    if (i == memOpsBegin) {
      i = memOpsEnd;
      if (i == e)
        break;
    }
    if (memOps[i].Position < insertPos && memOps[i].isKill) {
      unsigned Reg = memOps[i].Reg;
      KilledRegs.insert(Reg);
      Killer[Reg] = i;
    }
  }

  SmallVector<std::pair<unsigned, bool>, 8> Regs;
  SmallVector<unsigned, 8> ImpDefs;
  SmallVector<MachineOperand *, 8> UsesOfImpDefs;
  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
    unsigned Reg = memOps[i].Reg;
    // If we are inserting the merged operation after an operation that
    // uses the same register, make sure to transfer any kill flag.
    bool isKill = memOps[i].isKill || KilledRegs.count(Reg);
    Regs.push_back(std::make_pair(Reg, isKill));

    // Collect any implicit defs of super-registers. They must be preserved.
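    // For example, a VLDRS of s0 may also carry an imp-def of the containing
    // d0; dropping it from the merged VLDM would leave d0 without a def
    // (illustrative).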
    for (MIOperands MO(memOps[i].MBBI); MO.isValid(); ++MO) {
      if (!MO->isReg() || !MO->isDef() || !MO->isImplicit() || MO->isDead())
        continue;
      unsigned DefReg = MO->getReg();
      if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) == ImpDefs.end())
        ImpDefs.push_back(DefReg);

      // There may be other uses of the definition between this instruction
      // and the eventual LDM/STM position. These should be marked undef if
      // the merge takes place.
      findUsesOfImpDef(UsesOfImpDefs, memOps, DefReg, memOps[i].Position,
                       insertPos);
    }
  }

  // Try to do the merge.
  MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI;
  ++Loc;
  if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode,
                Pred, PredReg, Scratch, dl, Regs, ImpDefs))
    return;

  // Merge succeeded, update records.
  Merges.push_back(std::prev(Loc));

  // In gathering loads together, we may have moved the imp-def of a register
  // past one of its uses. This is OK, since we know better than the rest of
  // LLVM what's OK with ARM loads and stores; but we still have to adjust the
  // affected uses.
  for (SmallVectorImpl<MachineOperand *>::iterator I = UsesOfImpDefs.begin(),
                                                   E = UsesOfImpDefs.end();
       I != E; ++I)
    (*I)->setIsUndef();

  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
    // Remove kill flags from any memops that come before insertPos.
    if (Regs[i-memOpsBegin].second) {
      unsigned Reg = Regs[i-memOpsBegin].first;
      if (KilledRegs.count(Reg)) {
        unsigned j = Killer[Reg];
        int Idx = memOps[j].MBBI->findRegisterUseOperandIdx(Reg, true);
        assert(Idx >= 0 && "Cannot find killing operand");
        memOps[j].MBBI->getOperand(Idx).setIsKill(false);
        memOps[j].isKill = false;
      }
      memOps[i].isKill = true;
    }
    MBB.erase(memOps[i].MBBI);
    // Update this memop to refer to the merged instruction.
    // We may need to move kill flags again.
    memOps[i].Merged = true;
    memOps[i].MBBI = Merges.back();
    memOps[i].Position = insertPos;
  }
}

/// MergeLDR_STR - Merge a number of load / store instructions into one or more
/// load / store multiple instructions.
void
ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
                              unsigned Base, int Opcode, unsigned Size,
                              ARMCC::CondCodes Pred, unsigned PredReg,
                              unsigned Scratch, MemOpQueue &MemOps,
                        SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  int Offset = MemOps[SIndex].Offset;
  int SOffset = Offset;
  unsigned insertAfter = SIndex;
  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
  DebugLoc dl = Loc->getDebugLoc();
  const MachineOperand &PMO = Loc->getOperand(0);
  unsigned PReg = PMO.getReg();
  unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
  unsigned Count = 1;
  unsigned Limit = ~0U;
  bool BaseKill = false;
  // vldm / vstm limits are 32 for S variants, 16 for D variants.

  switch (Opcode) {
  default: break;
  case ARM::VSTRS:
    Limit = 32;
    break;
  case ARM::VSTRD:
    Limit = 16;
    break;
  case ARM::VLDRD:
    Limit = 16;
    break;
  case ARM::VLDRS:
    Limit = 32;
    break;
  }

  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
    int NewOffset = MemOps[i].Offset;
    const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
    unsigned Reg = MO.getReg();
    unsigned RegNum = MO.isUndef() ?
      UINT_MAX : TRI->getEncodingValue(Reg);
    // Register numbers must be in ascending order. For VFP / NEON load and
    // store multiples, the registers must also be consecutive and within the
    // limit on the number of registers per instruction.
    if (Reg != ARM::SP &&
        NewOffset == Offset + (int)Size &&
        ((isNotVFP && RegNum > PRegNum) ||
         ((Count < Limit) && RegNum == PRegNum+1)) &&
        // On Swift we don't want vldm/vstm to start with an odd register
        // number because Q register unaligned vldm/vstm need more uops.
        (!STI->isSwift() || isNotVFP || Count != 1 || !(PRegNum & 0x1))) {
      Offset += Size;
      PRegNum = RegNum;
      ++Count;
    } else {
      // Can't merge this in. Try merging the earlier ones first.
      // We need to compute BaseKill here because the MemOps may have been
      // reordered.
      BaseKill = Loc->killsRegister(Base);

      MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset, Base,
                     BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
                   MemOps, Merges);
      return;
    }

    if (MemOps[i].Position > MemOps[insertAfter].Position) {
      insertAfter = i;
      Loc = MemOps[i].MBBI;
    }
  }

  BaseKill = Loc->killsRegister(Base);
  MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset,
                 Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
}

static bool definesCPSR(MachineInstr *MI) {
  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
    const MachineOperand &MO = MI->getOperand(i);
    if (!MO.isReg())
      continue;
    if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead())
      // If the instruction has a live CPSR def, then it's not safe to fold it
      // into a load / store.
      return true;
  }

  return false;
}

static bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
                                unsigned Bytes, unsigned Limit,
                                ARMCC::CondCodes Pred, unsigned PredReg) {
  unsigned MyPredReg = 0;
  if (!MI)
    return false;

  bool CheckCPSRDef = false;
  switch (MI->getOpcode()) {
  default: return false;
  case ARM::tSUBi8:
  case ARM::t2SUBri:
  case ARM::SUBri:
    CheckCPSRDef = true;
    // fallthrough
  case ARM::tSUBspi:
    break;
  }

  // Make sure the offset fits in 8 bits.
  if (Bytes == 0 || (Limit && Bytes >= Limit))
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi ||
                    MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME
  if (!(MI->getOperand(0).getReg() == Base &&
        MI->getOperand(1).getReg() == Base &&
        (MI->getOperand(2).getImm() * Scale) == Bytes &&
        getInstrPredicate(MI, MyPredReg) == Pred &&
        MyPredReg == PredReg))
    return false;

  return CheckCPSRDef ? !definesCPSR(MI) : true;
}

static bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
                                unsigned Bytes, unsigned Limit,
                                ARMCC::CondCodes Pred, unsigned PredReg) {
  unsigned MyPredReg = 0;
  if (!MI)
    return false;

  bool CheckCPSRDef = false;
  switch (MI->getOpcode()) {
  default: return false;
  case ARM::tADDi8:
  case ARM::t2ADDri:
  case ARM::ADDri:
    CheckCPSRDef = true;
    // fallthrough
  case ARM::tADDspi:
    break;
  }

  if (Bytes == 0 || (Limit && Bytes >= Limit))
    // Make sure the offset fits in 8 bits.
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tADDspi ||
                    MI->getOpcode() == ARM::tADDi8) ?
                                                      4 : 1; // FIXME
  if (!(MI->getOperand(0).getReg() == Base &&
        MI->getOperand(1).getReg() == Base &&
        (MI->getOperand(2).getImm() * Scale) == Bytes &&
        getInstrPredicate(MI, MyPredReg) == Pred &&
        MyPredReg == PredReg))
    return false;

  return CheckCPSRDef ? !definesCPSR(MI) : true;
}

static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default: return 0;
  case ARM::LDRi12:
  case ARM::STRi12:
  case ARM::tLDRi:
  case ARM::tSTRi:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
  case ARM::VLDRS:
  case ARM::VSTRS:
    return 4;
  case ARM::VLDRD:
  case ARM::VSTRD:
    return 8;
  case ARM::LDMIA:
  case ARM::LDMDA:
  case ARM::LDMDB:
  case ARM::LDMIB:
  case ARM::STMIA:
  case ARM::STMDA:
  case ARM::STMDB:
  case ARM::STMIB:
  case ARM::tLDMIA:
  case ARM::tLDMIA_UPD:
  case ARM::tSTMIA_UPD:
  case ARM::t2LDMIA:
  case ARM::t2LDMDB:
  case ARM::t2STMIA:
  case ARM::t2STMDB:
  case ARM::VLDMSIA:
  case ARM::VSTMSIA:
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
  case ARM::VLDMDIA:
  case ARM::VSTMDIA:
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
  }
}

static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
                                            ARM_AM::AMSubMode Mode) {
  switch (Opc) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA:
  case ARM::LDMDA:
  case ARM::LDMDB:
  case ARM::LDMIB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA_UPD;
    case ARM_AM::ib: return ARM::LDMIB_UPD;
    case ARM_AM::da: return ARM::LDMDA_UPD;
    case ARM_AM::db: return ARM::LDMDB_UPD;
    }
  case ARM::STMIA:
  case ARM::STMDA:
  case ARM::STMDB:
  case ARM::STMIB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA_UPD;
    case ARM_AM::ib: return ARM::STMIB_UPD;
    case ARM_AM::da: return ARM::STMDA_UPD;
    case ARM_AM::db: return ARM::STMDB_UPD;
    }
  case ARM::t2LDMIA:
  case ARM::t2LDMDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA_UPD;
    case ARM_AM::db: return ARM::t2LDMDB_UPD;
    }
  case ARM::t2STMIA:
  case ARM::t2STMDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA_UPD;
    case ARM_AM::db: return ARM::t2STMDB_UPD;
    }
  case ARM::VLDMSIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA_UPD;
    case ARM_AM::db: return ARM::VLDMSDB_UPD;
    }
  case ARM::VLDMDIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA_UPD;
    case ARM_AM::db: return ARM::VLDMDDB_UPD;
    }
  case ARM::VSTMSIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA_UPD;
    case ARM_AM::db: return ARM::VSTMSDB_UPD;
    }
  case ARM::VSTMDIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA_UPD;
    case ARM_AM::db: return ARM::VSTMDDB_UPD;
    }
  }
}

/// MergeBaseUpdateLSMultiple - Fold preceding/trailing inc/dec of base
/// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
/// rn := rn + 4 * 3;
/// =>
/// stmia rn!, <ra, rb, rc>
///
/// rn := rn - 4 * 3;
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                              MachineBasicBlock::iterator MBBI,
                                              bool &Advance,
                                              MachineBasicBlock::iterator &I) {
  // Thumb1 is already using updating loads/stores.
  if (isThumb1) return false;

  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(0).getReg();
  bool BaseKill = MI->getOperand(0).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();

  // Can't use an updating ld/st if the base register is also a dest
  // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
  for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
    if (MI->getOperand(i).getReg() == Base)
      return false;

  bool DoMerge = false;
  ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(Opcode);

  // Try merging with the previous instruction.
  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
  if (MBBI != BeginMBBI) {
    MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
      --PrevMBBI;
    if (Mode == ARM_AM::ia &&
        isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
      Mode = ARM_AM::db;
      DoMerge = true;
    } else if (Mode == ARM_AM::ib &&
               isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
      Mode = ARM_AM::da;
      DoMerge = true;
    }
    if (DoMerge)
      MBB.erase(PrevMBBI);
  }

  // Try merging with the next instruction.
  MachineBasicBlock::iterator EndMBBI = MBB.end();
  if (!DoMerge && MBBI != EndMBBI) {
    MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
      ++NextMBBI;
    if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
        isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
      DoMerge = true;
    } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
               isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
    .addReg(Base, getDefRegState(true)) // WB base register
    .addReg(Base, getKillRegState(BaseKill))
    .addImm(Pred).addReg(PredReg);

  // Transfer the rest of operands.
  for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
    MIB.addOperand(MI->getOperand(OpNum));

  // Transfer memoperands.
  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());

  MBB.erase(MBBI);
  return true;
}

static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
                                             ARM_AM::AddrOpc Mode) {
  switch (Opc) {
  case ARM::LDRi12:
    return ARM::LDR_PRE_IMM;
  case ARM::STRi12:
    return ARM::STR_PRE_IMM;
  case ARM::VLDRS:
    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
  case ARM::VLDRD:
    return Mode == ARM_AM::add ?
                               ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
  case ARM::VSTRS:
    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
  case ARM::VSTRD:
    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_PRE;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_PRE;
  default: llvm_unreachable("Unhandled opcode!");
  }
}

static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
                                              ARM_AM::AddrOpc Mode) {
  switch (Opc) {
  case ARM::LDRi12:
    return ARM::LDR_POST_IMM;
  case ARM::STRi12:
    return ARM::STR_POST_IMM;
  case ARM::VLDRS:
    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
  case ARM::VLDRD:
    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
  case ARM::VSTRS:
    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
  case ARM::VSTRD:
    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_POST;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_POST;
  default: llvm_unreachable("Unhandled opcode!");
  }
}

/// MergeBaseUpdateLoadStore - Fold preceding/trailing inc/dec of base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               const TargetInstrInfo *TII,
                                               bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  // Thumb1 doesn't have updating LDR/STR.
  // FIXME: Use LDM/STM with single register instead.
  if (isThumb1) return false;

  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(1).getReg();
  bool BaseKill = MI->getOperand(1).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();
  bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
                Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
  bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
  if (isi32Load(Opcode) || isi32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;
  if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;

  bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
  // Can't do the merge if the destination register is the same as the
  // would-be writeback register.
  if (MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
  bool DoMerge = false;
  ARM_AM::AddrOpc AddSub = ARM_AM::add;
  unsigned NewOpc = 0;
  // AM2 - 12 bits, thumb2 - 8 bits.
  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);

  // Try merging with the previous instruction.
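  // e.g. (illustrative):
  //   sub r1, r1, #4
  //   ldr r0, [r1]
  // =>
  //   ldr r0, [r1, #-4]!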
  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
  if (MBBI != BeginMBBI) {
    MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
      --PrevMBBI;
    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
    } else if (!isAM5 &&
               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,
                                   Pred, PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub);
      MBB.erase(PrevMBBI);
    }
  }

  // Try merging with the next instruction.
  MachineBasicBlock::iterator EndMBBI = MBB.end();
  if (!DoMerge && MBBI != EndMBBI) {
    MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
      ++NextMBBI;
    if (!isAM5 &&
        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,
                                   Pred, PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub);
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  if (isAM5) {
    // VLDM[SD]_UPD, VSTM[SD]_UPD
    // (There are no base-updating versions of VLDR/VSTR instructions, but the
    // updating load/store-multiple instructions can be used with only one
    // register.)
    MachineOperand &MO = MI->getOperand(0);
    BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
      .addReg(Base, getDefRegState(true)) // WB base register
      .addReg(Base, getKillRegState(isLd ? BaseKill : false))
      .addImm(Pred).addReg(PredReg)
      .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
                            getKillRegState(MO.isKill())));
  } else if (isLd) {
    if (isAM2) {
      // LDR_PRE, LDR_POST
      if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
        int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
        BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
          .addReg(Base, RegState::Define)
          .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
      } else {
        int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
        BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
          .addReg(Base, RegState::Define)
          .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
      }
    } else {
      int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
    }
  } else {
    MachineOperand &MO = MI->getOperand(0);
    // FIXME: post-indexed stores use am2offset_imm, which still encodes
    // the vestigial zero-reg offset register. When that's fixed, this clause
    // can be removed entirely.
    if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
      int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    } else {
      int Offset = AddSub == ARM_AM::sub ?
                                           -Bytes : Bytes;
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
    }
  }
  MBB.erase(MBBI);

  return true;
}

/// isMemoryOp - Returns true if instruction is a memory operation that this
/// pass is capable of operating on.
static bool isMemoryOp(const MachineInstr *MI) {
  // When no memory operands are present, conservatively assume unaligned,
  // volatile, unfoldable.
  if (!MI->hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI->memoperands_begin();

  // Don't touch volatile memory accesses - we may be changing their order.
  if (MMO->isVolatile())
    return false;

  // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
  // not.
  if (MMO->getAlignment() < 4)
    return false;

  // str <undef> could probably be eliminated entirely, but for now we just
  // want to avoid making a mess of it.
  // FIXME: Use str <undef> as a wildcard to enable better stm folding.
  if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() &&
      MI->getOperand(0).isUndef())
    return false;

  // Likewise don't mess with references to undefined addresses.
  if (MI->getNumOperands() > 1 && MI->getOperand(1).isReg() &&
      MI->getOperand(1).isUndef())
    return false;

  int Opcode = MI->getOpcode();
  switch (Opcode) {
  default: break;
  case ARM::VLDRS:
  case ARM::VSTRS:
    return MI->getOperand(1).isReg();
  case ARM::VLDRD:
  case ARM::VSTRD:
    return MI->getOperand(1).isReg();
  case ARM::LDRi12:
  case ARM::STRi12:
  case ARM::tLDRi:
  case ARM::tSTRi:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return MI->getOperand(1).isReg();
  }
  return false;
}

/// AdvanceRS - Advance register scavenger to just before the earliest memory
/// op that is being merged.
void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
  unsigned Position = MemOps[0].Position;
  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
    if (MemOps[i].Position < Position) {
      Position = MemOps[i].Position;
      Loc = MemOps[i].MBBI;
    }
  }

  if (Loc != MBB.begin())
    RS->forward(std::prev(Loc));
}

static int getMemoryOpOffset(const MachineInstr *MI) {
  int Opcode = MI->getOpcode();
  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
  unsigned NumOperands = MI->getDesc().getNumOperands();
  unsigned OffField = MI->getOperand(NumOperands-3).getImm();

  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 ||
      Opcode == ARM::LDRi12   || Opcode == ARM::STRi12)
    return OffField;

  // Thumb1 immediate offsets are scaled by 4.
  if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi)
    return OffField * 4;

  int Offset = isAM3 ?
               ARM_AM::getAM3Offset(OffField)
             : ARM_AM::getAM5Offset(OffField) * 4;
  if (isAM3) {
    if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  } else {
    if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  }
  return Offset;
}

static void InsertLDR_STR(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator &MBBI,
                          int Offset, bool isDef,
                          DebugLoc dl, unsigned NewOpc,
                          unsigned Reg, bool RegDeadKill, bool RegUndef,
                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
                          bool OffKill, bool OffUndef,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          const TargetInstrInfo *TII, bool isT2) {
  if (isDef) {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  }
}

bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
    const MachineOperand &BaseOp = MI->getOperand(2);
    unsigned BaseReg = BaseOp.getReg();
    unsigned EvenReg = MI->getOperand(0).getReg();
    unsigned OddReg  = MI->getOperand(1).getReg();
    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
    unsigned OddRegNum  = TRI->getDwarfRegNum(OddReg, false);
    // ARM errata 602117: LDRD with base in list may result in incorrect base
    // register when interrupted or faulted.
    bool Errata602117 = EvenReg == BaseReg && STI->isCortexM3();
    if (!Errata602117 &&
        ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum))
      return false;

    MachineBasicBlock::iterator NewBBI = MBBI;
    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
    bool EvenDeadKill = isLd ?
      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
    bool EvenUndef = MI->getOperand(0).isUndef();
    bool OddDeadKill  = isLd ?
      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
    bool OddUndef = MI->getOperand(1).isUndef();
    bool BaseKill = BaseOp.isKill();
    bool BaseUndef = BaseOp.isUndef();
    bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
    bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
    int OffImm = getMemoryOpOffset(MI);
    unsigned PredReg = 0;
    ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);

    if (OddRegNum > EvenRegNum && OffImm == 0) {
      // Ascending register numbers and no offset. It's safe to change it to a
      // ldm or stm.
      unsigned NewOpc = (isLd)
        ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA)
        : (isT2 ?
                  ARM::t2STMIA : ARM::STMIA);
      if (isLd) {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(Pred).addReg(PredReg)
          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
          .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill));
        ++NumLDRD2LDM;
      } else {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(Pred).addReg(PredReg)
          .addReg(EvenReg,
                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
          .addReg(OddReg,
                  getKillRegState(OddDeadKill) | getUndefRegState(OddUndef));
        ++NumSTRD2STM;
      }
      NewBBI = std::prev(MBBI);
    } else {
      // Split into two instructions.
      unsigned NewOpc = (isLd)
        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
      // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset,
      // so adjust and use t2LDRi12 here for that.
      unsigned NewOpc2 = (isLd)
        ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
        : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
      DebugLoc dl = MBBI->getDebugLoc();
      // If this is a load and the base register is killed, it may have been
      // re-defed by the load; make sure the first load does not clobber it.
      if (isLd &&
          (BaseKill || OffKill) &&
          (TRI->regsOverlap(EvenReg, BaseReg))) {
        assert(!TRI->regsOverlap(OddReg, BaseReg));
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
                      OddReg, OddDeadKill, false,
                      BaseReg, false, BaseUndef, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        NewBBI = std::prev(MBBI);
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, false,
                      BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      } else {
        if (OddReg == EvenReg && EvenDeadKill) {
          // If the two source operands are the same, the kill marker is
          // probably on the first one. e.g.
          //   t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
          EvenDeadKill = false;
          OddDeadKill = true;
        }
        // Never kill the base register in the first instruction.
        if (EvenReg == BaseReg)
          EvenDeadKill = false;
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, EvenUndef,
                      BaseReg, false, BaseUndef, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        NewBBI = std::prev(MBBI);
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
                      OddReg, OddDeadKill, OddUndef,
                      BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      }
      if (isLd)
        ++NumLDRD2LDR;
      else
        ++NumSTRD2STR;
    }

    MBB.erase(MI);
    MBBI = NewBBI;
    return true;
  }
  return false;
}

/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
/// ops of the same base and incrementing offset into LDM / STM ops.
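///
/// e.g. (ARM mode, illustrative):
///   ldr r0, [r5, #4]
///   ldr r1, [r5, #8]
///   ldr r2, [r5, #12]
/// =>
///   ldmib r5, {r0, r1, r2}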
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  unsigned NumMerges = 0;
  unsigned NumMemOps = 0;
  MemOpQueue MemOps;
  unsigned CurrBase = 0;
  int CurrOpc = -1;
  unsigned CurrSize = 0;
  ARMCC::CondCodes CurrPred = ARMCC::AL;
  unsigned CurrPredReg = 0;
  unsigned Position = 0;
  SmallVector<MachineBasicBlock::iterator,4> Merges;

  RS->enterBasicBlock(&MBB);
  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  while (MBBI != E) {
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;

    bool Advance  = false;
    bool TryMerge = false;
    bool Clobber  = false;

    bool isMemOp = isMemoryOp(MBBI);
    if (isMemOp) {
      int Opcode = MBBI->getOpcode();
      unsigned Size = getLSMultipleTransferSize(MBBI);
      const MachineOperand &MO = MBBI->getOperand(0);
      unsigned Reg = MO.getReg();
      bool isKill = MO.isDef() ? false : MO.isKill();
      unsigned Base = MBBI->getOperand(1).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      // Watch out for:
      //   r4 := ldr [r5]
      //   r5 := ldr [r5, #4]
      //   r6 := ldr [r5, #8]
      //
      // The second ldr has effectively broken the chain even though it
      // looks like the later ldr(s) use the same base register. Try to
      // merge the ldr's so far, including this one. But don't try to
      // combine the following ldr(s).
      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());

      // Watch out for:
      //   r4 := ldr [r0, #8]
      //   r4 := ldr [r0, #4]
      //
      // The optimization may reorder the second ldr in front of the first
      // ldr, which violates write-after-write (WAW) dependence. The same
      // applies to str. Try to merge inst(s) already in MemOps.
      bool Overlap = false;
      for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); I != E; ++I) {
        if (TRI->regsOverlap(Reg, I->MBBI->getOperand(0).getReg())) {
          Overlap = true;
          break;
        }
      }

      if (CurrBase == 0 && !Clobber) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc  = Opcode;
        CurrSize = Size;
        CurrPred = Pred;
        CurrPredReg = PredReg;
        MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI));
        ++NumMemOps;
        Advance = true;
      } else if (!Overlap) {
        if (Clobber) {
          TryMerge = true;
          Advance = true;
        }

        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
          // No need to match PredReg.
          // Continue adding to the queue.
          if (Offset > MemOps.back().Offset) {
            MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill,
                                             Position, MBBI));
            ++NumMemOps;
            Advance = true;
          } else {
            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
                 I != E; ++I) {
              if (Offset < I->Offset) {
                MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill,
                                                 Position, MBBI));
                ++NumMemOps;
                Advance = true;
                break;
              } else if (Offset == I->Offset) {
                // Collision! This can't be merged!
                break;
              }
            }
          }
        }
      }
    }

    if (MBBI->isDebugValue()) {
      ++MBBI;
      if (MBBI == E)
        // Reached the end of the block; try merging the memory instructions.
        TryMerge = true;
    } else if (Advance) {
      ++Position;
      ++MBBI;
      if (MBBI == E)
        // Reached the end of the block; try merging the memory instructions.
        TryMerge = true;
    } else {
      TryMerge = true;
    }

    if (TryMerge) {
      if (NumMemOps > 1) {
        // Try to find a free register to use as a new base in case it's
        // needed. First advance to the instruction just before the start of
        // the chain.
        AdvanceRS(MBB, MemOps);

        // Find a scratch register.
        unsigned Scratch =
          RS->FindUnusedReg(isThumb1 ? &ARM::tGPRRegClass : &ARM::GPRRegClass);

        // Process the load / store instructions.
        RS->forward(std::prev(MBBI));

        // Merge ops.
        Merges.clear();
        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);

        // Try folding preceding/trailing base inc/dec into the generated
        // LDM/STM ops.
        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
            ++NumMerges;
        NumMerges += Merges.size();

        // Try folding preceding/trailing base inc/dec into those load/store
        // ops that were not merged to form LDM/STM ops.
        for (unsigned i = 0; i != NumMemOps; ++i)
          if (!MemOps[i].Merged)
            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
              ++NumMerges;

        // RS may be pointing to an instruction that's deleted.
        RS->skipTo(std::prev(MBBI));
      } else if (NumMemOps == 1) {
        // Try folding preceding/trailing base inc/dec into the single
        // load/store.
        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
          ++NumMerges;
          RS->forward(std::prev(MBBI));
        }
      }

      CurrBase = 0;
      CurrOpc = -1;
      CurrSize = 0;
      CurrPred = ARMCC::AL;
      CurrPredReg = 0;
      if (NumMemOps) {
        MemOps.clear();
        NumMemOps = 0;
      }

      // If iterator hasn't been advanced and this is not a memory op, skip it.
      // It can't start a new chain anyway.
      if (!Advance && !isMemOp && MBBI != E) {
        ++Position;
        ++MBBI;
      }
    }
  }
  return NumMerges > 0;
}

/// MergeReturnIntoLDM - If this is an exit BB, try merging the return ops
/// ("bx lr" and "mov pc, lr") into the preceding stack restore so it
/// directly restores the value of LR into pc.
///   ldmfd sp!, {..., lr}
///   bx lr
/// or
///   ldmfd sp!, {..., lr}
///   mov pc, lr
/// =>
///   ldmfd sp!, {..., pc}
bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
  // Thumb1 LDM doesn't allow high registers.
  if (isThumb1) return false;
  if (MBB.empty()) return false;

  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
  if (MBBI != MBB.begin() &&
      (MBBI->getOpcode() == ARM::BX_RET ||
       MBBI->getOpcode() == ARM::tBX_RET ||
       MBBI->getOpcode() == ARM::MOVPCLR)) {
    MachineInstr *PrevMI = std::prev(MBBI);
    unsigned Opcode = PrevMI->getOpcode();
    if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
        Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
        Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
      if (MO.getReg() != ARM::LR)
        return false;
      unsigned NewOpc = (isThumb2 ?
                         ARM::t2LDMIA_RET : ARM::LDMIA_RET);
      assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
              Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
      PrevMI->setDesc(TII->get(NewOpc));
      MO.setReg(ARM::PC);
      PrevMI->copyImplicitOps(*MBB.getParent(), &*MBBI);
      MBB.erase(MBBI);
      return true;
    }
  }
  return false;
}

bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  const TargetMachine &TM = Fn.getTarget();
  TL = TM.getSubtargetImpl()->getTargetLowering();
  AFI = Fn.getInfo<ARMFunctionInfo>();
  TII = TM.getSubtargetImpl()->getInstrInfo();
  TRI = TM.getSubtargetImpl()->getRegisterInfo();
  STI = &TM.getSubtarget<ARMSubtarget>();
  RS = new RegScavenger();
  isThumb2 = AFI->isThumb2Function();
  isThumb1 = AFI->isThumbFunction() && !isThumb2;

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI) {
    MachineBasicBlock &MBB = *MFI;
    Modified |= LoadStoreMultipleOpti(MBB);
    if (TM.getSubtarget<ARMSubtarget>().hasV5TOps())
      Modified |= MergeReturnIntoLDM(MBB);
  }

  delete RS;
  return Modified;
}


/// ARMPreAllocLoadStoreOpt - Pre-register-allocation pass that moves
/// loads / stores from consecutive locations close together to make it more
/// likely they will be combined later.

namespace {
  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}

    const DataLayout *TD;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    MachineRegisterInfo *MRI;
    MachineFunction *MF;

    bool runOnMachineFunction(MachineFunction &Fn) override;

    const char *getPassName() const override {
      return "ARM pre- register allocation load / store optimization pass";
    }

  private:
    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                          unsigned &NewOpc, unsigned &EvenReg,
                          unsigned &OddReg, unsigned &BaseReg,
                          int &Offset,
                          unsigned &PredReg, ARMCC::CondCodes &Pred,
                          bool &isT2);
    bool RescheduleOps(MachineBasicBlock *MBB,
                       SmallVectorImpl<MachineInstr *> &Ops,
                       unsigned Base, bool isLd,
                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
  };
  char ARMPreAllocLoadStoreOpt::ID = 0;
}

bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  TD = Fn.getSubtarget().getDataLayout();
  TII = Fn.getSubtarget().getInstrInfo();
  TRI = Fn.getSubtarget().getRegisterInfo();
  STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
  MRI = &Fn.getRegInfo();
  MF  = &Fn;

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI)
    Modified |= RescheduleLoadStoreInstrs(MFI);

  return Modified;
}

static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
                                      MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator E,
                                      SmallPtrSetImpl<MachineInstr*> &MemOps,
                                      SmallSet<unsigned, 4> &MemRegs,
                                      const TargetRegisterInfo *TRI) {
  // Are there stores / loads / calls between them?
  // FIXME: This is overly conservative. We should make use of alias
  // information some day.
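  // e.g. without alias information, an intervening store may write the memory
  // a moved load would read, so mayStore() below conservatively blocks the
  // reordering (illustrative).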
static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
                                      MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator E,
                                      SmallPtrSetImpl<MachineInstr*> &MemOps,
                                      SmallSet<unsigned, 4> &MemRegs,
                                      const TargetRegisterInfo *TRI) {
  // Are there stores / loads / calls between them?
  // FIXME: This is overly conservative. We should make use of alias
  // information some day.
  SmallSet<unsigned, 4> AddedRegPressure;
  while (++I != E) {
    if (I->isDebugValue() || MemOps.count(&*I))
      continue;
    if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
      return false;
    if (isLd && I->mayStore())
      return false;
    if (!isLd) {
      if (I->mayLoad())
        return false;
      // It's not safe to move the first 'str' down.
      // str r1, [r0]
      // strh r5, [r0]
      // str r4, [r0, #+4]
      if (I->mayStore())
        return false;
    }
    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
      MachineOperand &MO = I->getOperand(j);
      if (!MO.isReg())
        continue;
      unsigned Reg = MO.getReg();
      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
        return false;
      if (Reg != Base && !MemRegs.count(Reg))
        AddedRegPressure.insert(Reg);
    }
  }

  // Estimate register pressure increase due to the transformation.
  if (MemRegs.size() <= 4)
    // OK if we are moving a small number of instructions.
    return true;
  return AddedRegPressure.size() <= MemRegs.size() * 2;
}


/// Copy Op0 and Op1 operands into a new array assigned to MI.
static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
                                   MachineInstr *Op1) {
  assert(MI->memoperands_empty() && "expected a new machineinstr");
  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin())
    + (Op1->memoperands_end() - Op1->memoperands_begin());

  MachineFunction *MF = MI->getParent()->getParent();
  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
  MachineSDNode::mmo_iterator MemEnd =
    std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
  MemEnd =
    std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
  MI->setMemRefs(MemBegin, MemEnd);
}
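// Illustrative example for CanFormLdStDWord below (hypothetical input): a pair
// such as
//   ldr r0, [r2]
//   ldr r1, [r2, #4]
// may become the single doubleword load
//   ldrd r0, r1, [r2]
// provided the subtarget has v5TE, the access is sufficiently aligned for i64,
// and the offset fits the addressing mode. The even/odd register pairing that
// ARM-mode LDRD requires is encouraged afterwards via register allocation
// hints (see RescheduleOps).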
bool
ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
                                          DebugLoc &dl,
                                          unsigned &NewOpc, unsigned &EvenReg,
                                          unsigned &OddReg, unsigned &BaseReg,
                                          int &Offset, unsigned &PredReg,
                                          ARMCC::CondCodes &Pred,
                                          bool &isT2) {
  // Make sure we're allowed to generate LDRD/STRD.
  if (!STI->hasV5TEOps())
    return false;

  // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
  unsigned Scale = 1;
  unsigned Opcode = Op0->getOpcode();
  if (Opcode == ARM::LDRi12) {
    NewOpc = ARM::LDRD;
  } else if (Opcode == ARM::STRi12) {
    NewOpc = ARM::STRD;
  } else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
    NewOpc = ARM::t2LDRDi8;
    Scale = 4;
    isT2 = true;
  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
    NewOpc = ARM::t2STRDi8;
    Scale = 4;
    isT2 = true;
  } else {
    return false;
  }

  // Make sure the base address satisfies the i64 ld / st alignment
  // requirement. At the moment, we ignore the memory operand's value.
  // If we want to use AliasAnalysis, we should check it accordingly.
  if (!Op0->hasOneMemOperand() ||
      (*Op0->memoperands_begin())->isVolatile())
    return false;

  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
  const Function *Func = MF->getFunction();
  unsigned ReqAlign = STI->hasV6Ops()
    ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext()))
    : 8;  // Pre-v6 needs 8-byte alignment.
  if (Align < ReqAlign)
    return false;

  // Then make sure the immediate offset fits.
  int OffImm = getMemoryOpOffset(Op0);
  if (isT2) {
    // t2LDRDi8 / t2STRDi8 encode an 8-bit immediate scaled by 4, so the byte
    // offset must be a multiple of 4 within (-1024, 1024).
    int Limit = (1 << 8) * Scale;
    if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1)))
      return false;
    Offset = OffImm;
  } else {
    // ARM-mode LDRD / STRD uses addrmode3 with an 8-bit unsigned immediate
    // plus an add/sub bit.
    ARM_AM::AddrOpc AddSub = ARM_AM::add;
    if (OffImm < 0) {
      AddSub = ARM_AM::sub;
      OffImm = -OffImm;
    }
    int Limit = (1 << 8) * Scale;
    if (OffImm >= Limit || (OffImm & (Scale-1)))
      return false;
    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
  }
  EvenReg = Op0->getOperand(0).getReg();
  OddReg = Op1->getOperand(0).getReg();
  if (EvenReg == OddReg)
    return false;
  BaseReg = Op0->getOperand(1).getReg();
  Pred = getInstrPredicate(Op0, PredReg);
  dl = Op0->getDebugLoc();
  return true;
}
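/// RescheduleOps - Given the loads (or stores) in Ops that share the base
/// register Base, find runs of accesses to consecutive offsets with matching
/// size and opcode, and move them next to each other. When exactly two i32
/// ops pair up and CanFormLdStDWord agrees, rewrite them directly as a single
/// LDRD / STRD; otherwise splice the instructions to a common insertion point
/// so a later pass can merge them into an LDM / STM.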
bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                 SmallVectorImpl<MachineInstr *> &Ops,
                                 unsigned Base, bool isLd,
                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
  bool RetVal = false;

  // Sort by offset (in reverse order).
  std::sort(Ops.begin(), Ops.end(),
            [](const MachineInstr *LHS, const MachineInstr *RHS) {
    int LOffset = getMemoryOpOffset(LHS);
    int ROffset = getMemoryOpOffset(RHS);
    assert(LHS == RHS || LOffset != ROffset);
    return LOffset > ROffset;
  });

  // The loads / stores of the same base are in order. Scan them from first to
  // last and check for the following:
  // 1. Any def of base.
  // 2. Any gaps.
  while (Ops.size() > 1) {
    unsigned FirstLoc = ~0U;
    unsigned LastLoc = 0;
    MachineInstr *FirstOp = nullptr;
    MachineInstr *LastOp = nullptr;
    int LastOffset = 0;
    unsigned LastOpcode = 0;
    unsigned LastBytes = 0;
    unsigned NumMove = 0;
    for (int i = Ops.size() - 1; i >= 0; --i) {
      MachineInstr *Op = Ops[i];
      unsigned Loc = MI2LocMap[Op];
      if (Loc <= FirstLoc) {
        FirstLoc = Loc;
        FirstOp = Op;
      }
      if (Loc >= LastLoc) {
        LastLoc = Loc;
        LastOp = Op;
      }

      unsigned LSMOpcode
        = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia);
      if (LastOpcode && LSMOpcode != LastOpcode)
        break;

      int Offset = getMemoryOpOffset(Op);
      unsigned Bytes = getLSMultipleTransferSize(Op);
      if (LastBytes) {
        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
          break;
      }
      LastOffset = Offset;
      LastBytes = Bytes;
      LastOpcode = LSMOpcode;
      if (++NumMove == 8) // FIXME: Tune this limit.
        break;
    }

    if (NumMove <= 1)
      Ops.pop_back();
    else {
      SmallPtrSet<MachineInstr*, 4> MemOps;
      SmallSet<unsigned, 4> MemRegs;
      for (int i = NumMove-1; i >= 0; --i) {
        MemOps.insert(Ops[i]);
        MemRegs.insert(Ops[i]->getOperand(0).getReg());
      }

      // Be conservative, if the instructions are too far apart, don't
      // move them. We want to limit the increase of register pressure.
      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
      if (DoMove)
        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
                                           MemOps, MemRegs, TRI);
      if (!DoMove) {
        for (unsigned i = 0; i != NumMove; ++i)
          Ops.pop_back();
      } else {
        // This is the new location for the loads / stores.
        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
        while (InsertPos != MBB->end()
               && (MemOps.count(InsertPos) || InsertPos->isDebugValue()))
          ++InsertPos;

        // If we are moving a pair of loads / stores, see if it makes sense
        // to try to allocate a pair of registers that can form register pairs.
        MachineInstr *Op0 = Ops.back();
        MachineInstr *Op1 = Ops[Ops.size()-2];
        unsigned EvenReg = 0, OddReg = 0;
        unsigned BaseReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
        bool isT2 = false;
        unsigned NewOpc = 0;
        int Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             EvenReg, OddReg, BaseReg,
                                             Offset, PredReg, Pred, isT2)) {
          Ops.pop_back();
          Ops.pop_back();

          const MCInstrDesc &MCID = TII->get(NewOpc);
          const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
          MRI->constrainRegClass(EvenReg, TRC);
          MRI->constrainRegClass(OddReg, TRC);

          // Form the pair instruction.
          if (isLd) {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
              .addReg(EvenReg, RegState::Define)
              .addReg(OddReg, RegState::Define)
              .addReg(BaseReg);
            // FIXME: We're converting from LDRi12 to an insn that still
            // uses addrmode2, so we need an explicit offset reg. It should
            // always be reg0 since we're transforming LDRi12s.
            if (!isT2)
              MIB.addReg(0);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            concatenateMemOperands(MIB, Op0, Op1);
            DEBUG(dbgs() << "Formed " << *MIB << "\n");
            ++NumLDRDFormed;
          } else {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
              .addReg(EvenReg)
              .addReg(OddReg)
              .addReg(BaseReg);
            // FIXME: We're converting from STRi12 to an insn that still
            // uses addrmode2, so we need an explicit offset reg. It should
            // always be reg0 since we're transforming STRi12s.
            if (!isT2)
              MIB.addReg(0);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            concatenateMemOperands(MIB, Op0, Op1);
            DEBUG(dbgs() << "Formed " << *MIB << "\n");
            ++NumSTRDFormed;
          }
          MBB->erase(Op0);
          MBB->erase(Op1);

          // Add register allocation hints to form register pairs.
          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
          MRI->setRegAllocationHint(OddReg, ARMRI::RegPairOdd, EvenReg);
        } else {
          for (unsigned i = 0; i != NumMove; ++i) {
            MachineInstr *Op = Ops.back();
            Ops.pop_back();
            MBB->splice(InsertPos, MBB, Op);
          }
        }

        NumLdStMoved += NumMove;
        RetVal = true;
      }
    }
  }

  return RetVal;
}
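/// RescheduleLoadStoreInstrs - Scan the block and bucket unpredicated loads
/// and stores by base register, treating calls and terminators as barriers.
/// Each bucket with more than one entry is handed to RescheduleOps. Seeing
/// the same base+offset combination twice ends the current region, which is
/// rescheduled before scanning resumes with fresh maps.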
bool
ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
  bool RetVal = false;

  DenseMap<MachineInstr*, unsigned> MI2LocMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
  SmallVector<unsigned, 4> LdBases;
  SmallVector<unsigned, 4> StBases;

  unsigned Loc = 0;
  MachineBasicBlock::iterator MBBI = MBB->begin();
  MachineBasicBlock::iterator E = MBB->end();
  while (MBBI != E) {
    for (; MBBI != E; ++MBBI) {
      MachineInstr *MI = MBBI;
      if (MI->isCall() || MI->isTerminator()) {
        // Stop at barriers.
        ++MBBI;
        break;
      }

      if (!MI->isDebugValue())
        MI2LocMap[MI] = ++Loc;

      if (!isMemoryOp(MI))
        continue;
      unsigned PredReg = 0;
      if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
        continue;

      int Opc = MI->getOpcode();
      bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
      unsigned Base = MI->getOperand(1).getReg();
      int Offset = getMemoryOpOffset(MI);

      bool StopHere = false;
      if (isLd) {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2LdsMap.find(Base);
        if (BI != Base2LdsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          Base2LdsMap[Base].push_back(MI);
          LdBases.push_back(Base);
        }
      } else {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2StsMap.find(Base);
        if (BI != Base2StsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          Base2StsMap[Base].push_back(MI);
          StBases.push_back(Base);
        }
      }

      if (StopHere) {
        // Found a duplicate (a base+offset combination that was seen earlier).
        // Backtrack.
        --Loc;
        break;
      }
    }

    // Re-schedule loads.
    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
      unsigned Base = LdBases[i];
      SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base];
      if (Lds.size() > 1)
        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
    }

    // Re-schedule stores.
    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
      unsigned Base = StBases[i];
      SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base];
      if (Sts.size() > 1)
        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
    }

    if (MBBI != E) {
      Base2LdsMap.clear();
      Base2StsMap.clear();
      LdBases.clear();
      StBases.clear();
    }
  }

  return RetVal;
}


/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
/// optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
  if (PreAlloc)
    return new ARMPreAllocLoadStoreOpt();
  return new ARMLoadStoreOpt();
}
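// Usage sketch (for orientation only; the exact conditions live in the ARM
// pass configuration, not in this file): the target is expected to schedule
// the pre-allocation variant before register allocation and the
// post-allocation variant afterwards, roughly:
//
//   addPass(createARMLoadStoreOptimizationPass(/*PreAlloc=*/true));  // pre-RA
//   ...
//   addPass(createARMLoadStoreOptimizationPass());                   // post-RA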