//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMISelLowering.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "Thumb1RegisterInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

#define DEBUG_TYPE "arm-ldst-opt"

STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
STATISTIC(NumLDRD2LDM, "Number of ldrd instructions turned back into ldm");
STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's");

/// ARMLoadStoreOpt - Post-register allocation pass that combines
/// load / store instructions to form ldm / stm instructions.
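///
/// For example (illustrative only; registers and offsets are hypothetical):
///   ldr r0, [r4]
///   ldr r1, [r4, #4]
///   ldr r2, [r4, #8]
/// =>
///   ldmia r4, {r0, r1, r2}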

namespace {
  struct ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMLoadStoreOpt() : MachineFunctionPass(ID) {}

    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    const TargetLowering *TL;
    ARMFunctionInfo *AFI;
    RegScavenger *RS;
    bool isThumb1, isThumb2;

    bool runOnMachineFunction(MachineFunction &Fn) override;

    const char *getPassName() const override {
      return "ARM load / store optimization pass";
    }

  private:
    struct MemOpQueueEntry {
      int Offset;
      unsigned Reg;
      bool isKill;
      unsigned Position;
      MachineBasicBlock::iterator MBBI;
      bool Merged;
      MemOpQueueEntry(int o, unsigned r, bool k, unsigned p,
                      MachineBasicBlock::iterator i)
        : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {}
    };
    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
    typedef MemOpQueue::iterator MemOpQueueIter;

    void findUsesOfImpDef(SmallVectorImpl<MachineOperand *> &UsesOfImpDefs,
                          const MemOpQueue &MemOps, unsigned DefReg,
                          unsigned RangeBegin, unsigned RangeEnd);
    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  int Offset, unsigned Base, bool BaseKill, int Opcode,
                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
                  DebugLoc dl,
                  ArrayRef<std::pair<unsigned, bool> > Regs,
                  ArrayRef<unsigned> ImpDefs);
    void MergeOpsUpdate(MachineBasicBlock &MBB,
                        MemOpQueue &MemOps,
                        unsigned memOpsBegin,
                        unsigned memOpsEnd,
                        unsigned insertAfter,
                        int Offset,
                        unsigned Base,
                        bool BaseKill,
                        int Opcode,
                        ARMCC::CondCodes Pred,
                        unsigned PredReg,
                        unsigned Scratch,
                        DebugLoc dl,
                        SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
                      int Opcode, unsigned Size,
                      ARMCC::CondCodes Pred, unsigned PredReg,
                      unsigned Scratch, MemOpQueue &MemOps,
                      SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
                                  const TargetInstrInfo *TII,
                                  bool &Advance,
                                  MachineBasicBlock::iterator &I);
    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   bool &Advance,
                                   MachineBasicBlock::iterator &I);
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
  };
  char ARMLoadStoreOpt::ID = 0;
}

static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA;
    case ARM_AM::da: return ARM::LDMDA;
    case ARM_AM::db: return ARM::LDMDB;
    case ARM_AM::ib: return ARM::LDMIB;
    }
  case ARM::STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA;
    case ARM_AM::da: return ARM::STMDA;
    case ARM_AM::db: return ARM::STMDB;
    case ARM_AM::ib: return ARM::STMIB;
    }
  case ARM::tLDRi:
    // tLDMIA is writeback-only - unless the base register is in the input
    // reglist.
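    // Note: the non-writeback tLDMIA is returned here; MergeOps switches to
    // tLDMIA_UPD later when base register writeback is actually required.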
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::tLDMIA;
    }
  case ARM::tSTRi:
    // There is no non-writeback tSTMIA either.
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::tSTMIA_UPD;
    }
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    ++NumLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA;
    case ARM_AM::db: return ARM::t2LDMDB;
    }
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    ++NumSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA;
    case ARM_AM::db: return ARM::t2STMDB;
    }
  case ARM::VLDRS:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA;
    case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists.
    }
  case ARM::VSTRS:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA;
    case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists.
    }
  case ARM::VLDRD:
    ++NumVLDMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA;
    case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists.
    }
  case ARM::VSTRD:
    ++NumVSTMGened;
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA;
    case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists.
    }
  }
}

namespace llvm {
  namespace ARM_AM {

AMSubMode getLoadStoreMultipleSubMode(int Opcode) {
  switch (Opcode) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA_RET:
  case ARM::LDMIA:
  case ARM::LDMIA_UPD:
  case ARM::STMIA:
  case ARM::STMIA_UPD:
  case ARM::tLDMIA:
  case ARM::tLDMIA_UPD:
  case ARM::tSTMIA_UPD:
  case ARM::t2LDMIA_RET:
  case ARM::t2LDMIA:
  case ARM::t2LDMIA_UPD:
  case ARM::t2STMIA:
  case ARM::t2STMIA_UPD:
  case ARM::VLDMSIA:
  case ARM::VLDMSIA_UPD:
  case ARM::VSTMSIA:
  case ARM::VSTMSIA_UPD:
  case ARM::VLDMDIA:
  case ARM::VLDMDIA_UPD:
  case ARM::VSTMDIA:
  case ARM::VSTMDIA_UPD:
    return ARM_AM::ia;

  case ARM::LDMDA:
  case ARM::LDMDA_UPD:
  case ARM::STMDA:
  case ARM::STMDA_UPD:
    return ARM_AM::da;

  case ARM::LDMDB:
  case ARM::LDMDB_UPD:
  case ARM::STMDB:
  case ARM::STMDB_UPD:
  case ARM::t2LDMDB:
  case ARM::t2LDMDB_UPD:
  case ARM::t2STMDB:
  case ARM::t2STMDB_UPD:
  case ARM::VLDMSDB_UPD:
  case ARM::VSTMSDB_UPD:
  case ARM::VLDMDDB_UPD:
  case ARM::VSTMDDB_UPD:
    return ARM_AM::db;

  case ARM::LDMIB:
  case ARM::LDMIB_UPD:
  case ARM::STMIB:
  case ARM::STMIB_UPD:
    return ARM_AM::ib;
  }
}

  } // end namespace ARM_AM
} // end namespace llvm

static bool isT1i32Load(unsigned Opc) {
  return Opc == ARM::tLDRi;
}

static bool isT2i32Load(unsigned Opc) {
  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
}

static bool isi32Load(unsigned Opc) {
  return Opc == ARM::LDRi12 || isT1i32Load(Opc) || isT2i32Load(Opc);
}

static bool isT1i32Store(unsigned Opc) {
  return Opc == ARM::tSTRi;
}

static bool isT2i32Store(unsigned Opc) {
  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
}

static bool isi32Store(unsigned Opc) {
  return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
}

/// MergeOps - Create and insert an LDM or STM with Base as base register and
/// registers in Regs as the register operands that would be loaded / stored.
/// It returns true if the transformation is done.
bool
ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          int Offset, unsigned Base, bool BaseKill,
                          int Opcode, ARMCC::CondCodes Pred,
                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
                          ArrayRef<std::pair<unsigned, bool> > Regs,
                          ArrayRef<unsigned> ImpDefs) {
  // Only a single register to load / store. Don't bother.
  unsigned NumRegs = Regs.size();
  if (NumRegs <= 1)
    return false;

  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  bool haveIBAndDA = isNotVFP && !isThumb2 && !isThumb1;

  if (Offset == 4 && haveIBAndDA) {
    Mode = ARM_AM::ib;
  } else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) {
    Mode = ARM_AM::da;
  } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) {
    // VLDM/VSTM do not support DB mode without also updating the base reg.
    Mode = ARM_AM::db;
  } else if (Offset != 0) {
    // Check if this is a supported opcode before inserting instructions to
    // calculate a new base register.
    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false;

    // If the starting offset isn't zero, insert a MI to materialize a new
    // base. But only do so if it is cost effective, i.e. merging more than
    // two loads / stores.
    if (NumRegs <= 2)
      return false;

    unsigned NewBase;
    if (isi32Load(Opcode)) {
      // If it is a load, then just use one of the destination registers
      // as the new base.
      NewBase = Regs[NumRegs-1].first;
    } else {
      // Otherwise, use the scratch register as the new base.
      NewBase = Scratch;
      if (NewBase == 0)
        return false;
    }

    int BaseOpc =
      isThumb2 ? ARM::t2ADDri :
      isThumb1 ? ARM::tADDi8  : ARM::ADDri;

    if (Offset < 0) {
      BaseOpc =
        isThumb2 ? ARM::t2SUBri :
        isThumb1 ? ARM::tSUBi8  : ARM::SUBri;
      Offset = - Offset;
    }

    if (!TL->isLegalAddImmediate(Offset))
      // FIXME: Try add with register operand?
      return false; // Probably not worth it then.

    if (isThumb1) {
      if (Base != NewBase) {
        // Need to insert a MOV to the new base first.
        // FIXME: If the immediate fits in 3 bits, use ADD instead.
        BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase)
          .addReg(Base, getKillRegState(BaseKill))
          .addImm(Pred).addReg(PredReg);
      }
      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase))
        .addReg(NewBase, getKillRegState(true)).addImm(Offset)
        .addImm(Pred).addReg(PredReg);
    } else {
      BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
        .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
        .addImm(Pred).addReg(PredReg).addReg(0);
    }

    Base = NewBase;
    BaseKill = true; // New base is always killed straight away.
  }

  bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS ||
                Opcode == ARM::VLDRD);

  // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
  // base register writeback.
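  // A zero return value below means no suitable multiple opcode exists for
  // this addressing submode (e.g. VLDM/VSTM have no non-writeback DB form).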
  Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
  if (!Opcode) return false;

  bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.

  // Exception: If the base register is in the input reglist, Thumb1 LDM is
  // non-writeback. Check for this.
  if (Opcode == ARM::tLDMIA && isThumb1)
    for (unsigned I = 0; I < NumRegs; ++I)
      if (Base == Regs[I].first) {
        Writeback = false;
        break;
      }

  // If the merged instruction has writeback and the base register is not
  // killed, it's not safe to do the merge on Thumb1. This is because resetting
  // the base register writeback by inserting a SUBS sets the condition flags.
  // FIXME: Try something clever here to see if resetting the base register can
  // be avoided, e.g. by updating a later ADD/SUB of the base register with the
  // writeback.
  if (isThumb1 && Writeback && !BaseKill) return false;

  MachineInstrBuilder MIB;

  if (Writeback) {
    if (Opcode == ARM::tLDMIA)
      // Update tLDMIA with writeback if necessary.
      Opcode = ARM::tLDMIA_UPD;

    MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));

    // Thumb1: we might need to set base writeback when building the MI.
    MIB.addReg(Base, getDefRegState(true))
       .addReg(Base, getKillRegState(BaseKill));
  } else {
    // No writeback, simply build the MachineInstr.
    MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
    MIB.addReg(Base, getKillRegState(BaseKill));
  }

  MIB.addImm(Pred).addReg(PredReg);

  for (unsigned i = 0; i != NumRegs; ++i)
    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
                     | getKillRegState(Regs[i].second));

  // Add implicit defs for super-registers.
  for (unsigned i = 0, e = ImpDefs.size(); i != e; ++i)
    MIB.addReg(ImpDefs[i], RegState::ImplicitDefine);

  return true;
}

/// \brief Find all instructions using a given imp-def within a range.
///
/// We are trying to combine a range of instructions, one of which (located at
/// position RangeBegin) implicitly defines a register. The final LDM/STM will
/// be placed at RangeEnd, and so any uses of this definition between RangeBegin
/// and RangeEnd must be modified to use an undefined value.
///
/// The live range continues until we find a second definition or one of the
/// uses we find is a kill. Unfortunately MemOps is not sorted by Position, so
/// we must consider all uses and decide which are relevant in a second pass.
void ARMLoadStoreOpt::findUsesOfImpDef(
    SmallVectorImpl<MachineOperand *> &UsesOfImpDefs, const MemOpQueue &MemOps,
    unsigned DefReg, unsigned RangeBegin, unsigned RangeEnd) {
  std::map<unsigned, MachineOperand *> Uses;
  unsigned LastLivePos = RangeEnd;

  // First we find all uses of this register with Position between RangeBegin
  // and RangeEnd, any or all of these could be uses of a definition at
  // RangeBegin. We also record the latest position a definition at RangeBegin
  // would be considered live.
  for (unsigned i = 0; i < MemOps.size(); ++i) {
    MachineInstr &MI = *MemOps[i].MBBI;
    unsigned MIPosition = MemOps[i].Position;
    if (MIPosition <= RangeBegin || MIPosition > RangeEnd)
      continue;

    // If this instruction defines the register, then any later use will be of
    // that definition rather than ours.
    if (MI.definesRegister(DefReg))
      LastLivePos = std::min(LastLivePos, MIPosition);

    MachineOperand *UseOp = MI.findRegisterUseOperand(DefReg);
    if (!UseOp)
      continue;

    // If this instruction kills the register then (assuming liveness is
    // correct when we start) we don't need to think about anything after here.
    if (UseOp->isKill())
      LastLivePos = std::min(LastLivePos, MIPosition);

    Uses[MIPosition] = UseOp;
  }

  // Now we traverse the list of all uses, and append the ones that actually use
  // our definition to the requested list.
  for (std::map<unsigned, MachineOperand *>::iterator I = Uses.begin(),
                                                      E = Uses.end();
       I != E; ++I) {
    // List is sorted by position so once we've found one out of range there
    // will be no more to consider.
    if (I->first > LastLivePos)
      break;
    UsesOfImpDefs.push_back(I->second);
  }
}

// MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on
// success.
void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
                                     MemOpQueue &memOps,
                                     unsigned memOpsBegin, unsigned memOpsEnd,
                                     unsigned insertAfter, int Offset,
                                     unsigned Base, bool BaseKill,
                                     int Opcode,
                                     ARMCC::CondCodes Pred, unsigned PredReg,
                                     unsigned Scratch,
                                     DebugLoc dl,
                         SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
  // First calculate which of the registers should be killed by the merged
  // instruction.
  const unsigned insertPos = memOps[insertAfter].Position;
  SmallSet<unsigned, 4> KilledRegs;
  DenseMap<unsigned, unsigned> Killer;
  for (unsigned i = 0, e = memOps.size(); i != e; ++i) {
    if (i == memOpsBegin) {
      i = memOpsEnd;
      if (i == e)
        break;
    }
    if (memOps[i].Position < insertPos && memOps[i].isKill) {
      unsigned Reg = memOps[i].Reg;
      KilledRegs.insert(Reg);
      Killer[Reg] = i;
    }
  }

  SmallVector<std::pair<unsigned, bool>, 8> Regs;
  SmallVector<unsigned, 8> ImpDefs;
  SmallVector<MachineOperand *, 8> UsesOfImpDefs;
  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
    unsigned Reg = memOps[i].Reg;
    // If we are inserting the merged operation after an operation that
    // uses the same register, make sure to transfer any kill flag.
    bool isKill = memOps[i].isKill || KilledRegs.count(Reg);
    Regs.push_back(std::make_pair(Reg, isKill));

    // Collect any implicit defs of super-registers. They must be preserved.
    for (MIOperands MO(memOps[i].MBBI); MO.isValid(); ++MO) {
      if (!MO->isReg() || !MO->isDef() || !MO->isImplicit() || MO->isDead())
        continue;
      unsigned DefReg = MO->getReg();
      if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) == ImpDefs.end())
        ImpDefs.push_back(DefReg);

      // There may be other uses of the definition between this instruction and
      // the eventual LDM/STM position. These should be marked undef if the
      // merge takes place.
      findUsesOfImpDef(UsesOfImpDefs, memOps, DefReg, memOps[i].Position,
                       insertPos);
    }
  }

  // Try to do the merge.
  MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI;
  ++Loc;
  if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode,
                Pred, PredReg, Scratch, dl, Regs, ImpDefs))
    return;

  // Merge succeeded, update records.
  Merges.push_back(std::prev(Loc));

  // In gathering loads together, we may have moved the imp-def of a register
  // past one of its uses. This is OK, since we know better than the rest of
  // LLVM what's OK with ARM loads and stores; but we still have to adjust the
  // affected uses.
  for (SmallVectorImpl<MachineOperand *>::iterator I = UsesOfImpDefs.begin(),
                                                   E = UsesOfImpDefs.end();
       I != E; ++I)
    (*I)->setIsUndef();

  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
    // Remove kill flags from any memops that come before insertPos.
    if (Regs[i-memOpsBegin].second) {
      unsigned Reg = Regs[i-memOpsBegin].first;
      if (KilledRegs.count(Reg)) {
        unsigned j = Killer[Reg];
        int Idx = memOps[j].MBBI->findRegisterUseOperandIdx(Reg, true);
        assert(Idx >= 0 && "Cannot find killing operand");
        memOps[j].MBBI->getOperand(Idx).setIsKill(false);
        memOps[j].isKill = false;
      }
      memOps[i].isKill = true;
    }
    MBB.erase(memOps[i].MBBI);
    // Update this memop to refer to the merged instruction.
    // We may need to move kill flags again.
    memOps[i].Merged = true;
    memOps[i].MBBI = Merges.back();
    memOps[i].Position = insertPos;
  }
}

/// MergeLDR_STR - Merge a number of load / store instructions into one or more
/// load / store multiple instructions.
void
ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
                              unsigned Base, int Opcode, unsigned Size,
                              ARMCC::CondCodes Pred, unsigned PredReg,
                              unsigned Scratch, MemOpQueue &MemOps,
                         SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
  int Offset = MemOps[SIndex].Offset;
  int SOffset = Offset;
  unsigned insertAfter = SIndex;
  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
  DebugLoc dl = Loc->getDebugLoc();
  const MachineOperand &PMO = Loc->getOperand(0);
  unsigned PReg = PMO.getReg();
  unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
  unsigned Count = 1;
  unsigned Limit = ~0U;
  bool BaseKill = false;
  // vldm / vstm limits are 32 for S variants, 16 for D variants.

  switch (Opcode) {
  default: break;
  case ARM::VSTRS:
    Limit = 32;
    break;
  case ARM::VSTRD:
    Limit = 16;
    break;
  case ARM::VLDRD:
    Limit = 16;
    break;
  case ARM::VLDRS:
    Limit = 32;
    break;
  }

  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
    int NewOffset = MemOps[i].Offset;
    const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
    unsigned Reg = MO.getReg();
    unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
    // Register numbers must be in ascending order. For VFP / NEON load and
    // store multiples, the registers must also be consecutive and within the
    // limit on the number of registers per instruction.
    if (Reg != ARM::SP &&
        NewOffset == Offset + (int)Size &&
        ((isNotVFP && RegNum > PRegNum) ||
         ((Count < Limit) && RegNum == PRegNum+1)) &&
        // On Swift we don't want vldm/vstm to start with an odd register
        // number because Q register unaligned vldm/vstm need more uops.
        (!STI->isSwift() || isNotVFP || Count != 1 || !(PRegNum & 0x1))) {
      Offset += Size;
      PRegNum = RegNum;
      ++Count;
    } else {
      // Can't merge this in. Try to merge the earlier ones first.
      // We need to compute BaseKill here because the MemOps may have been
      // reordered.
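      // Loc tracks the latest memop seen so far (the insertion point), so
      // querying its kill state reflects the position where the merged
      // instruction will actually be placed.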
      BaseKill = Loc->killsRegister(Base);

      MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset, Base,
                     BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
                   MemOps, Merges);
      return;
    }

    if (MemOps[i].Position > MemOps[insertAfter].Position) {
      insertAfter = i;
      Loc = MemOps[i].MBBI;
    }
  }

  BaseKill = Loc->killsRegister(Base);
  MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset,
                 Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
}

static bool definesCPSR(MachineInstr *MI) {
  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
    const MachineOperand &MO = MI->getOperand(i);
    if (!MO.isReg())
      continue;
    if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead())
      // If the instruction has live CPSR def, then it's not safe to fold it
      // into load / store.
      return true;
  }

  return false;
}

static bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
                                unsigned Bytes, unsigned Limit,
                                ARMCC::CondCodes Pred, unsigned PredReg) {
  unsigned MyPredReg = 0;
  if (!MI)
    return false;

  bool CheckCPSRDef = false;
  switch (MI->getOpcode()) {
  default: return false;
  case ARM::tSUBi8:
  case ARM::t2SUBri:
  case ARM::SUBri:
    CheckCPSRDef = true;
    // fallthrough
  case ARM::tSUBspi:
    break;
  }

  // Make sure the offset fits in 8 bits.
  if (Bytes == 0 || (Limit && Bytes >= Limit))
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi ||
                    MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME
  if (!(MI->getOperand(0).getReg() == Base &&
        MI->getOperand(1).getReg() == Base &&
        (MI->getOperand(2).getImm() * Scale) == Bytes &&
        getInstrPredicate(MI, MyPredReg) == Pred &&
        MyPredReg == PredReg))
    return false;

  return CheckCPSRDef ? !definesCPSR(MI) : true;
}

static bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
                                unsigned Bytes, unsigned Limit,
                                ARMCC::CondCodes Pred, unsigned PredReg) {
  unsigned MyPredReg = 0;
  if (!MI)
    return false;

  bool CheckCPSRDef = false;
  switch (MI->getOpcode()) {
  default: return false;
  case ARM::tADDi8:
  case ARM::t2ADDri:
  case ARM::ADDri:
    CheckCPSRDef = true;
    // fallthrough
  case ARM::tADDspi:
    break;
  }

  if (Bytes == 0 || (Limit && Bytes >= Limit))
    // Make sure the offset fits in 8 bits.
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tADDspi ||
                    MI->getOpcode() == ARM::tADDi8) ? 4 : 1; // FIXME
  if (!(MI->getOperand(0).getReg() == Base &&
        MI->getOperand(1).getReg() == Base &&
        (MI->getOperand(2).getImm() * Scale) == Bytes &&
        getInstrPredicate(MI, MyPredReg) == Pred &&
        MyPredReg == PredReg))
    return false;

  return CheckCPSRDef ? !definesCPSR(MI) : true;
}

static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default: return 0;
  case ARM::LDRi12:
  case ARM::STRi12:
  case ARM::tLDRi:
  case ARM::tSTRi:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
  case ARM::VLDRS:
  case ARM::VSTRS:
    return 4;
  case ARM::VLDRD:
  case ARM::VSTRD:
    return 8;
  case ARM::LDMIA:
  case ARM::LDMDA:
  case ARM::LDMDB:
  case ARM::LDMIB:
  case ARM::STMIA:
  case ARM::STMDA:
  case ARM::STMDB:
  case ARM::STMIB:
  case ARM::tLDMIA:
  case ARM::tLDMIA_UPD:
  case ARM::tSTMIA_UPD:
  case ARM::t2LDMIA:
  case ARM::t2LDMDB:
  case ARM::t2STMIA:
  case ARM::t2STMDB:
  case ARM::VLDMSIA:
  case ARM::VSTMSIA:
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
  case ARM::VLDMDIA:
  case ARM::VSTMDIA:
    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
  }
}

static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
                                            ARM_AM::AMSubMode Mode) {
  switch (Opc) {
  default: llvm_unreachable("Unhandled opcode!");
  case ARM::LDMIA:
  case ARM::LDMDA:
  case ARM::LDMDB:
  case ARM::LDMIB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::LDMIA_UPD;
    case ARM_AM::ib: return ARM::LDMIB_UPD;
    case ARM_AM::da: return ARM::LDMDA_UPD;
    case ARM_AM::db: return ARM::LDMDB_UPD;
    }
  case ARM::STMIA:
  case ARM::STMDA:
  case ARM::STMDB:
  case ARM::STMIB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::STMIA_UPD;
    case ARM_AM::ib: return ARM::STMIB_UPD;
    case ARM_AM::da: return ARM::STMDA_UPD;
    case ARM_AM::db: return ARM::STMDB_UPD;
    }
  case ARM::t2LDMIA:
  case ARM::t2LDMDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2LDMIA_UPD;
    case ARM_AM::db: return ARM::t2LDMDB_UPD;
    }
  case ARM::t2STMIA:
  case ARM::t2STMDB:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::t2STMIA_UPD;
    case ARM_AM::db: return ARM::t2STMDB_UPD;
    }
  case ARM::VLDMSIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMSIA_UPD;
    case ARM_AM::db: return ARM::VLDMSDB_UPD;
    }
  case ARM::VLDMDIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VLDMDIA_UPD;
    case ARM_AM::db: return ARM::VLDMDDB_UPD;
    }
  case ARM::VSTMSIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMSIA_UPD;
    case ARM_AM::db: return ARM::VSTMSDB_UPD;
    }
  case ARM::VSTMDIA:
    switch (Mode) {
    default: llvm_unreachable("Unhandled submode!");
    case ARM_AM::ia: return ARM::VSTMDIA_UPD;
    case ARM_AM::db: return ARM::VSTMDDB_UPD;
    }
  }
}

/// MergeBaseUpdateLSMultiple - Fold preceding/trailing inc/dec of base
/// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
/// rn := rn + 4 * 3;
/// =>
/// stmia rn!, <ra, rb, rc>
///
/// rn := rn - 4 * 3;
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  // Thumb1 is already using updating loads/stores.
  if (isThumb1) return false;

  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(0).getReg();
  bool BaseKill = MI->getOperand(0).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();

  // Can't use an updating ld/st if the base register is also a dest
  // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
  for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
    if (MI->getOperand(i).getReg() == Base)
      return false;

  bool DoMerge = false;
  ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(Opcode);

  // Try merging with the previous instruction.
  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
  if (MBBI != BeginMBBI) {
    MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
      --PrevMBBI;
    if (Mode == ARM_AM::ia &&
        isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
      Mode = ARM_AM::db;
      DoMerge = true;
    } else if (Mode == ARM_AM::ib &&
               isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
      Mode = ARM_AM::da;
      DoMerge = true;
    }
    if (DoMerge)
      MBB.erase(PrevMBBI);
  }

  // Try merging with the next instruction.
  MachineBasicBlock::iterator EndMBBI = MBB.end();
  if (!DoMerge && MBBI != EndMBBI) {
    MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
      ++NextMBBI;
    if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
        isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
      DoMerge = true;
    } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
               isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
    .addReg(Base, getDefRegState(true)) // WB base register
    .addReg(Base, getKillRegState(BaseKill))
    .addImm(Pred).addReg(PredReg);

  // Transfer the rest of operands.
  for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
    MIB.addOperand(MI->getOperand(OpNum));

  // Transfer memoperands.
  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());

  MBB.erase(MBBI);
  return true;
}

static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
                                             ARM_AM::AddrOpc Mode) {
  switch (Opc) {
  case ARM::LDRi12:
    return ARM::LDR_PRE_IMM;
  case ARM::STRi12:
    return ARM::STR_PRE_IMM;
  case ARM::VLDRS:
    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
  case ARM::VLDRD:
    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
  case ARM::VSTRS:
    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
  case ARM::VSTRD:
    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_PRE;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_PRE;
  default: llvm_unreachable("Unhandled opcode!");
  }
}

static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
                                              ARM_AM::AddrOpc Mode) {
  switch (Opc) {
  case ARM::LDRi12:
    return ARM::LDR_POST_IMM;
  case ARM::STRi12:
    return ARM::STR_POST_IMM;
  case ARM::VLDRS:
    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
  case ARM::VLDRD:
    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
  case ARM::VSTRS:
    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
  case ARM::VSTRD:
    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_POST;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_POST;
  default: llvm_unreachable("Unhandled opcode!");
  }
}

/// MergeBaseUpdateLoadStore - Fold preceding/trailing inc/dec of base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const TargetInstrInfo *TII,
                                     bool &Advance,
                                     MachineBasicBlock::iterator &I) {
  // Thumb1 doesn't have updating LDR/STR.
  // FIXME: Use LDM/STM with single register instead.
  if (isThumb1) return false;

  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(1).getReg();
  bool BaseKill = MI->getOperand(1).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();
  bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
                Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
  bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
  if (isi32Load(Opcode) || isi32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;
  if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;

  bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
  if (MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
  bool DoMerge = false;
  ARM_AM::AddrOpc AddSub = ARM_AM::add;
  unsigned NewOpc = 0;
  // AM2 - 12 bits, thumb2 - 8 bits.
  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);

  // Try merging with the previous instruction.
  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
  if (MBBI != BeginMBBI) {
    MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI);
    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
      --PrevMBBI;
    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
    } else if (!isAM5 &&
               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub);
      MBB.erase(PrevMBBI);
    }
  }

  // Try merging with the next instruction.
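  // e.g. (illustrative registers)
  //   ldr r0, [r1]
  //   add r1, r1, #4
  // =>
  //   ldr r0, [r1], #4   (post-indexed)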
  MachineBasicBlock::iterator EndMBBI = MBB.end();
  if (!DoMerge && MBBI != EndMBBI) {
    MachineBasicBlock::iterator NextMBBI = std::next(MBBI);
    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
      ++NextMBBI;
    if (!isAM5 &&
        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
    }
    if (DoMerge) {
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub);
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  if (isAM5) {
    // VLDM[SD]_UPD, VSTM[SD]_UPD
    // (There are no base-updating versions of VLDR/VSTR instructions, but the
    // updating load/store-multiple instructions can be used with only one
    // register.)
    MachineOperand &MO = MI->getOperand(0);
    BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
      .addReg(Base, getDefRegState(true)) // WB base register
      .addReg(Base, getKillRegState(isLd ? BaseKill : false))
      .addImm(Pred).addReg(PredReg)
      .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
                            getKillRegState(MO.isKill())));
  } else if (isLd) {
    if (isAM2) {
      // LDR_PRE, LDR_POST
      if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
        int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
        BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
          .addReg(Base, RegState::Define)
          .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
      } else {
        int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
        BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
          .addReg(Base, RegState::Define)
          .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
      }
    } else {
      int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
    }
  } else {
    MachineOperand &MO = MI->getOperand(0);
    // FIXME: post-indexed stores use am2offset_imm, which still encodes
    // the vestigial zero-reg offset register. When that's fixed, this clause
    // can be removed entirely.
    if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
      int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    } else {
      int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
    }
  }
  MBB.erase(MBBI);

  return true;
}

/// isMemoryOp - Returns true if instruction is a memory operation that this
/// pass is capable of operating on.
static bool isMemoryOp(const MachineInstr *MI) {
  // When no memory operands are present, conservatively assume unaligned,
  // volatile, unfoldable.
  if (!MI->hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI->memoperands_begin();

  // Don't touch volatile memory accesses - we may be changing their order.
  if (MMO->isVolatile())
    return false;

  // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
  // not.
  if (MMO->getAlignment() < 4)
    return false;

  // str <undef> could probably be eliminated entirely, but for now we just
  // want to avoid making a mess of it.
  // FIXME: Use str <undef> as a wildcard to enable better stm folding.
  if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() &&
      MI->getOperand(0).isUndef())
    return false;

  // Likewise don't mess with references to undefined addresses.
  if (MI->getNumOperands() > 1 && MI->getOperand(1).isReg() &&
      MI->getOperand(1).isUndef())
    return false;

  int Opcode = MI->getOpcode();
  switch (Opcode) {
  default: break;
  case ARM::VLDRS:
  case ARM::VSTRS:
    return MI->getOperand(1).isReg();
  case ARM::VLDRD:
  case ARM::VSTRD:
    return MI->getOperand(1).isReg();
  case ARM::LDRi12:
  case ARM::STRi12:
  case ARM::tLDRi:
  case ARM::tSTRi:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return MI->getOperand(1).isReg();
  }
  return false;
}

/// AdvanceRS - Advance register scavenger to just before the earliest memory
/// op that is being merged.
void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
  unsigned Position = MemOps[0].Position;
  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
    if (MemOps[i].Position < Position) {
      Position = MemOps[i].Position;
      Loc = MemOps[i].MBBI;
    }
  }

  if (Loc != MBB.begin())
    RS->forward(std::prev(Loc));
}

static int getMemoryOpOffset(const MachineInstr *MI) {
  int Opcode = MI->getOpcode();
  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
  unsigned NumOperands = MI->getDesc().getNumOperands();
  unsigned OffField = MI->getOperand(NumOperands-3).getImm();

  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 ||
      Opcode == ARM::LDRi12   || Opcode == ARM::STRi12)
    return OffField;

  // Thumb1 immediate offsets are scaled by 4
  if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi)
    return OffField * 4;

  int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
                     : ARM_AM::getAM5Offset(OffField) * 4;
  if (isAM3) {
    if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  } else {
    if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  }
  return Offset;
}

static void InsertLDR_STR(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator &MBBI,
                          int Offset, bool isDef,
                          DebugLoc dl, unsigned NewOpc,
                          unsigned Reg, bool RegDeadKill, bool RegUndef,
                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
                          bool OffKill, bool OffUndef,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          const TargetInstrInfo *TII, bool isT2) {
  if (isDef) {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  }
}

bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
    const MachineOperand &BaseOp = MI->getOperand(2);
    unsigned BaseReg = BaseOp.getReg();
    unsigned EvenReg = MI->getOperand(0).getReg();
    unsigned OddReg  = MI->getOperand(1).getReg();
    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
    unsigned OddRegNum  = TRI->getDwarfRegNum(OddReg, false);
    // ARM errata 602117: LDRD with base in list may result in incorrect base
    // register when interrupted or faulted.
    bool Errata602117 = EvenReg == BaseReg && STI->isCortexM3();
    if (!Errata602117 &&
        ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum))
      return false;

    MachineBasicBlock::iterator NewBBI = MBBI;
    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
    bool EvenDeadKill = isLd ?
      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
    bool EvenUndef = MI->getOperand(0).isUndef();
    bool OddDeadKill = isLd ?
      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
    bool OddUndef = MI->getOperand(1).isUndef();
    bool BaseKill = BaseOp.isKill();
    bool BaseUndef = BaseOp.isUndef();
    bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
    bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
    int OffImm = getMemoryOpOffset(MI);
    unsigned PredReg = 0;
    ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);

    if (OddRegNum > EvenRegNum && OffImm == 0) {
      // Ascending register numbers and no offset. It's safe to change it to
      // an ldm or stm.
      unsigned NewOpc = (isLd)
        ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA)
        : (isT2 ? ARM::t2STMIA : ARM::STMIA);
      if (isLd) {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(Pred).addReg(PredReg)
          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
          .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill));
        ++NumLDRD2LDM;
      } else {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(Pred).addReg(PredReg)
          .addReg(EvenReg,
                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
          .addReg(OddReg,
                  getKillRegState(OddDeadKill) | getUndefRegState(OddUndef));
        ++NumSTRD2STM;
      }
      NewBBI = std::prev(MBBI);
    } else {
      // Split into two instructions.
      unsigned NewOpc = (isLd)
        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
      // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset,
      // so adjust and use t2LDRi12 here for that.
      unsigned NewOpc2 = (isLd)
        ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
        : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
      DebugLoc dl = MBBI->getDebugLoc();
      // If this is a load and the base register is killed, it may have been
      // re-defed by the load, so make sure the first load does not clobber it.
      if (isLd &&
          (BaseKill || OffKill) &&
          (TRI->regsOverlap(EvenReg, BaseReg))) {
        assert(!TRI->regsOverlap(OddReg, BaseReg));
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
                      OddReg, OddDeadKill, false,
                      BaseReg, false, BaseUndef, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        NewBBI = std::prev(MBBI);
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, false,
                      BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      } else {
        if (OddReg == EvenReg && EvenDeadKill) {
          // If the two source operands are the same, the kill marker is
          // probably on the first one. e.g.
          // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
          EvenDeadKill = false;
          OddDeadKill = true;
        }
        // Never kill the base register in the first instruction.
        if (EvenReg == BaseReg)
          EvenDeadKill = false;
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, EvenUndef,
                      BaseReg, false, BaseUndef, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        NewBBI = std::prev(MBBI);
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
                      OddReg, OddDeadKill, OddUndef,
                      BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      }
      if (isLd)
        ++NumLDRD2LDR;
      else
        ++NumSTRD2STR;
    }

    MBB.erase(MI);
    MBBI = NewBBI;
    return true;
  }
  return false;
}

/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
/// ops of the same base and incrementing offset into LDM / STM ops.
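///
/// Roughly: scan the block, queueing memops that share a base register,
/// opcode and predicate (kept sorted by offset); when the chain is broken,
/// hand the queue to MergeLDR_STR and then try to fold any neighbouring base
/// register inc/dec into the result.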
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  unsigned NumMerges = 0;
  unsigned NumMemOps = 0;
  MemOpQueue MemOps;
  unsigned CurrBase = 0;
  int CurrOpc = -1;
  unsigned CurrSize = 0;
  ARMCC::CondCodes CurrPred = ARMCC::AL;
  unsigned CurrPredReg = 0;
  unsigned Position = 0;
  SmallVector<MachineBasicBlock::iterator,4> Merges;

  RS->enterBasicBlock(&MBB);
  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  while (MBBI != E) {
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;

    bool Advance = false;
    bool TryMerge = false;
    bool Clobber = false;

    bool isMemOp = isMemoryOp(MBBI);
    if (isMemOp) {
      int Opcode = MBBI->getOpcode();
      unsigned Size = getLSMultipleTransferSize(MBBI);
      const MachineOperand &MO = MBBI->getOperand(0);
      unsigned Reg = MO.getReg();
      bool isKill = MO.isDef() ? false : MO.isKill();
      unsigned Base = MBBI->getOperand(1).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      // Watch out for:
      // r4 := ldr [r5]
      // r5 := ldr [r5, #4]
      // r6 := ldr [r5, #8]
      //
      // The second ldr has effectively broken the chain even though it
      // looks like the later ldr(s) use the same base register. Try to
      // merge the ldr's so far, including this one. But don't try to
      // combine the following ldr(s).
      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());

      // Watch out for:
      // r4 := ldr [r0, #8]
      // r4 := ldr [r0, #4]
      //
      // The optimization may reorder the second ldr in front of the first
      // ldr, which violates write-after-write (WAW) dependence. The same
      // applies to stores. Try to merge inst(s) already in MemOps.
      bool Overlap = false;
      for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); I != E; ++I) {
        if (TRI->regsOverlap(Reg, I->MBBI->getOperand(0).getReg())) {
          Overlap = true;
          break;
        }
      }

      if (CurrBase == 0 && !Clobber) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc = Opcode;
        CurrSize = Size;
        CurrPred = Pred;
        CurrPredReg = PredReg;
        MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI));
        ++NumMemOps;
        Advance = true;
      } else if (!Overlap) {
        if (Clobber) {
          TryMerge = true;
          Advance = true;
        }

        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
          // No need to match PredReg.
          // Continue adding to the queue.
          if (Offset > MemOps.back().Offset) {
            MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill,
                                             Position, MBBI));
            ++NumMemOps;
            Advance = true;
          } else {
            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
                 I != E; ++I) {
              if (Offset < I->Offset) {
                MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill,
                                                 Position, MBBI));
                ++NumMemOps;
                Advance = true;
                break;
              } else if (Offset == I->Offset) {
                // Collision! This can't be merged!
                break;
              }
            }
          }
        }
      }
    }

    if (MBBI->isDebugValue()) {
      ++MBBI;
      if (MBBI == E)
        // Reached the end of the block, try merging the memory instructions.
        TryMerge = true;
    } else if (Advance) {
      ++Position;
      ++MBBI;
      if (MBBI == E)
        // Reached the end of the block, try merging the memory instructions.
        TryMerge = true;
    } else {
      TryMerge = true;
    }

    if (TryMerge) {
      if (NumMemOps > 1) {
        // Try to find a free register to use as a new base in case it's
        // needed. First advance to the instruction just before the start of
        // the chain.
        AdvanceRS(MBB, MemOps);

        // Find a scratch register.
        unsigned Scratch =
          RS->FindUnusedReg(isThumb1 ? &ARM::tGPRRegClass : &ARM::GPRRegClass);

        // Process the load / store instructions.
        RS->forward(std::prev(MBBI));

        // Merge ops.
        Merges.clear();
        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);

        // Try folding preceding/trailing base inc/dec into the generated
        // LDM/STM ops.
        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
            ++NumMerges;
        NumMerges += Merges.size();

        // Try folding preceding/trailing base inc/dec into those load/store
        // instructions that were not merged to form LDM/STM ops.
        for (unsigned i = 0; i != NumMemOps; ++i)
          if (!MemOps[i].Merged)
            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
              ++NumMerges;

        // RS may be pointing to an instruction that's deleted.
        RS->skipTo(std::prev(MBBI));
      } else if (NumMemOps == 1) {
        // Try folding preceding/trailing base inc/dec into the single
        // load/store.
        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
          ++NumMerges;
          RS->forward(std::prev(MBBI));
        }
      }

      CurrBase = 0;
      CurrOpc = -1;
      CurrSize = 0;
      CurrPred = ARMCC::AL;
      CurrPredReg = 0;
      if (NumMemOps) {
        MemOps.clear();
        NumMemOps = 0;
      }

      // If the iterator hasn't been advanced and this is not a memory op,
      // skip it. It can't start a new chain anyway.
      if (!Advance && !isMemOp && MBBI != E) {
        ++Position;
        ++MBBI;
      }
    }
  }
  return NumMerges > 0;
}

/// MergeReturnIntoLDM - If this is an exit BB, try merging the return ops
/// ("bx lr" and "mov pc, lr") into the preceding stack restore so it
/// directly restores the value of LR into pc.
///   ldmfd sp!, {..., lr}
///   bx lr
/// or
///   ldmfd sp!, {..., lr}
///   mov pc, lr
/// =>
///   ldmfd sp!, {..., pc}
bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
  // Thumb1 LDM doesn't allow high registers.
  if (isThumb1) return false;
  if (MBB.empty()) return false;

  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
  if (MBBI != MBB.begin() &&
      (MBBI->getOpcode() == ARM::BX_RET ||
       MBBI->getOpcode() == ARM::tBX_RET ||
       MBBI->getOpcode() == ARM::MOVPCLR)) {
    MachineInstr *PrevMI = std::prev(MBBI);
    unsigned Opcode = PrevMI->getOpcode();
    if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
        Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
        Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
      if (MO.getReg() != ARM::LR)
        return false;
      unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET);
      assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
              Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
      PrevMI->setDesc(TII->get(NewOpc));
      MO.setReg(ARM::PC);
      PrevMI->copyImplicitOps(*MBB.getParent(), &*MBBI);
      MBB.erase(MBBI);
      return true;
    }
  }
  return false;
}

bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  const TargetMachine &TM = Fn.getTarget();
  TL = TM.getSubtargetImpl()->getTargetLowering();
  AFI = Fn.getInfo<ARMFunctionInfo>();
  TII = TM.getSubtargetImpl()->getInstrInfo();
  TRI = TM.getSubtargetImpl()->getRegisterInfo();
  STI = &TM.getSubtarget<ARMSubtarget>();
  RS = new RegScavenger();
  isThumb2 = AFI->isThumb2Function();
  isThumb1 = AFI->isThumbFunction() && !isThumb2;

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI) {
    MachineBasicBlock &MBB = *MFI;
    Modified |= LoadStoreMultipleOpti(MBB);
    if (TM.getSubtarget<ARMSubtarget>().hasV5TOps())
      Modified |= MergeReturnIntoLDM(MBB);
  }

  delete RS;
  return Modified;
}


/// ARMPreAllocLoadStoreOpt - Pre-register allocation pass that moves
/// loads / stores from consecutive locations closer together to make it more
/// likely they will be combined later.

namespace {
  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}

    const DataLayout *TD;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    MachineRegisterInfo *MRI;
    MachineFunction *MF;

    bool runOnMachineFunction(MachineFunction &Fn) override;

    const char *getPassName() const override {
      return "ARM pre- register allocation load / store optimization pass";
    }

  private:
    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                          unsigned &NewOpc, unsigned &EvenReg,
                          unsigned &OddReg, unsigned &BaseReg,
                          int &Offset,
                          unsigned &PredReg, ARMCC::CondCodes &Pred,
                          bool &isT2);
    bool RescheduleOps(MachineBasicBlock *MBB,
                       SmallVectorImpl<MachineInstr *> &Ops,
                       unsigned Base, bool isLd,
                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
  };
  char ARMPreAllocLoadStoreOpt::ID = 0;
}

bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  TD = Fn.getSubtarget().getDataLayout();
  TII = Fn.getSubtarget().getInstrInfo();
  TRI = Fn.getSubtarget().getRegisterInfo();
  STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
  MRI = &Fn.getRegInfo();
  MF = &Fn;

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI)
    Modified |= RescheduleLoadStoreInstrs(MFI);

  return Modified;
}

static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
                                      MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator E,
                                      SmallPtrSet<MachineInstr*, 4> &MemOps,
                                      SmallSet<unsigned, 4> &MemRegs,
                                      const TargetRegisterInfo *TRI) {
  // Are there stores / loads / calls between them?
  // FIXME: This is overly conservative. We should make use of alias
  // information some day.
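  // AddedRegPressure approximates the set of values whose live ranges the
  // move would lengthen; it is compared against MemRegs below to bound the
  // register pressure increase.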
static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
                                      MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator E,
                                      SmallPtrSet<MachineInstr*, 4> &MemOps,
                                      SmallSet<unsigned, 4> &MemRegs,
                                      const TargetRegisterInfo *TRI) {
  // Are there stores / loads / calls between them?
  // FIXME: This is overly conservative. We should make use of alias information
  // some day.
  SmallSet<unsigned, 4> AddedRegPressure;
  while (++I != E) {
    if (I->isDebugValue() || MemOps.count(&*I))
      continue;
    if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
      return false;
    if (isLd && I->mayStore())
      return false;
    if (!isLd) {
      if (I->mayLoad())
        return false;
      // It's not safe to move the first 'str' down.
      // str r1, [r0]
      // strh r5, [r0]
      // str r4, [r0, #+4]
      if (I->mayStore())
        return false;
    }
    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
      MachineOperand &MO = I->getOperand(j);
      if (!MO.isReg())
        continue;
      unsigned Reg = MO.getReg();
      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
        return false;
      if (Reg != Base && !MemRegs.count(Reg))
        AddedRegPressure.insert(Reg);
    }
  }

  // Estimate the register pressure increase due to the transformation.
  if (MemRegs.size() <= 4)
    // Ok if we are moving a small number of instructions.
    return true;
  return AddedRegPressure.size() <= MemRegs.size() * 2;
}


/// Copy Op0 and Op1 operands into a new array assigned to MI.
static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
                                   MachineInstr *Op1) {
  assert(MI->memoperands_empty() && "expected a new machineinstr");
  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin())
    + (Op1->memoperands_end() - Op1->memoperands_begin());

  MachineFunction *MF = MI->getParent()->getParent();
  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
  MachineSDNode::mmo_iterator MemEnd =
    std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
  MemEnd =
    std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
  MI->setMemRefs(MemBegin, MemEnd);
}

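// Worked example of the offset checks in CanFormLdStDWord below (values are
// illustrative): for the Thumb2 LDRD/STRD forms, Scale is 4, so
// Limit = (1 << 8) * 4 = 1024; an immediate of 1020 is accepted, 1024 is
// rejected as out of range, and 6 is rejected by the (OffImm & (Scale-1))
// multiple-of-4 test. The ARM-mode forms use Scale = 1, so any magnitude
// below 256 is accepted and encoded via ARM_AM::getAM3Opc.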
bool
ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
                                          DebugLoc &dl,
                                          unsigned &NewOpc, unsigned &EvenReg,
                                          unsigned &OddReg, unsigned &BaseReg,
                                          int &Offset, unsigned &PredReg,
                                          ARMCC::CondCodes &Pred,
                                          bool &isT2) {
  // Make sure we're allowed to generate LDRD/STRD.
  if (!STI->hasV5TEOps())
    return false;

  // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
  unsigned Scale = 1;
  unsigned Opcode = Op0->getOpcode();
  if (Opcode == ARM::LDRi12) {
    NewOpc = ARM::LDRD;
  } else if (Opcode == ARM::STRi12) {
    NewOpc = ARM::STRD;
  } else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
    NewOpc = ARM::t2LDRDi8;
    Scale = 4;
    isT2 = true;
  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
    NewOpc = ARM::t2STRDi8;
    Scale = 4;
    isT2 = true;
  } else {
    return false;
  }

  // Make sure the base address satisfies the i64 ld / st alignment
  // requirement. At the moment, we ignore the memoryoperand's value.
  // If we want to use AliasAnalysis, we should check it accordingly.
  if (!Op0->hasOneMemOperand() ||
      (*Op0->memoperands_begin())->isVolatile())
    return false;

  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
  const Function *Func = MF->getFunction();
  unsigned ReqAlign = STI->hasV6Ops()
    ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext()))
    : 8;  // Pre-v6 needs 8-byte alignment.
  if (Align < ReqAlign)
    return false;

  // Then make sure the immediate offset fits.
  int OffImm = getMemoryOpOffset(Op0);
  if (isT2) {
    int Limit = (1 << 8) * Scale;
    if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1)))
      return false;
    Offset = OffImm;
  } else {
    ARM_AM::AddrOpc AddSub = ARM_AM::add;
    if (OffImm < 0) {
      AddSub = ARM_AM::sub;
      OffImm = -OffImm;
    }
    int Limit = (1 << 8) * Scale;
    if (OffImm >= Limit || (OffImm & (Scale-1)))
      return false;
    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
  }
  EvenReg = Op0->getOperand(0).getReg();
  OddReg  = Op1->getOperand(0).getReg();
  if (EvenReg == OddReg)
    return false;
  BaseReg = Op0->getOperand(1).getReg();
  Pred = getInstrPredicate(Op0, PredReg);
  dl = Op0->getDebugLoc();
  return true;
}

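// An illustrative example for RescheduleOps below (hypothetical virtual
// registers): when exactly two qualifying accesses are found, e.g.
//   ldr %vreg0, [%vreg2]
//   ldr %vreg1, [%vreg2, #4]
// they are replaced by a single paired access,
//   ldrd %vreg0, %vreg1, [%vreg2]
// and RegPairEven/RegPairOdd allocation hints are recorded so the register
// allocator tends to pick an even/odd physical pair. Otherwise the accesses
// are merely spliced next to the insertion point.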
bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                 SmallVectorImpl<MachineInstr *> &Ops,
                                 unsigned Base, bool isLd,
                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
  bool RetVal = false;

  // Sort by offset (in reverse order).
  std::sort(Ops.begin(), Ops.end(),
            [](const MachineInstr *LHS, const MachineInstr *RHS) {
    int LOffset = getMemoryOpOffset(LHS);
    int ROffset = getMemoryOpOffset(RHS);
    assert(LHS == RHS || LOffset != ROffset);
    return LOffset > ROffset;
  });

  // The loads / stores of the same base are in order. Scan them from first to
  // last and check for the following:
  // 1. Any def of base.
  // 2. Any gaps.
  while (Ops.size() > 1) {
    unsigned FirstLoc = ~0U;
    unsigned LastLoc = 0;
    MachineInstr *FirstOp = nullptr;
    MachineInstr *LastOp = nullptr;
    int LastOffset = 0;
    unsigned LastOpcode = 0;
    unsigned LastBytes = 0;
    unsigned NumMove = 0;
    for (int i = Ops.size() - 1; i >= 0; --i) {
      MachineInstr *Op = Ops[i];
      unsigned Loc = MI2LocMap[Op];
      if (Loc <= FirstLoc) {
        FirstLoc = Loc;
        FirstOp = Op;
      }
      if (Loc >= LastLoc) {
        LastLoc = Loc;
        LastOp = Op;
      }

      unsigned LSMOpcode
        = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia);
      if (LastOpcode && LSMOpcode != LastOpcode)
        break;

      int Offset = getMemoryOpOffset(Op);
      unsigned Bytes = getLSMultipleTransferSize(Op);
      if (LastBytes) {
        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
          break;
      }
      LastOffset = Offset;
      LastBytes = Bytes;
      LastOpcode = LSMOpcode;
      if (++NumMove == 8) // FIXME: Tune this limit.
        break;
    }

    if (NumMove <= 1)
      Ops.pop_back();
    else {
      SmallPtrSet<MachineInstr*, 4> MemOps;
      SmallSet<unsigned, 4> MemRegs;
      for (int i = NumMove-1; i >= 0; --i) {
        MemOps.insert(Ops[i]);
        MemRegs.insert(Ops[i]->getOperand(0).getReg());
      }

      // Be conservative; if the instructions are too far apart, don't
      // move them. We want to limit the increase of register pressure.
      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
      if (DoMove)
        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
                                           MemOps, MemRegs, TRI);
      if (!DoMove) {
        for (unsigned i = 0; i != NumMove; ++i)
          Ops.pop_back();
      } else {
        // This is the new location for the loads / stores.
        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
        while (InsertPos != MBB->end()
               && (MemOps.count(InsertPos) || InsertPos->isDebugValue()))
          ++InsertPos;

        // If we are moving a pair of loads / stores, see if it makes sense
        // to try to allocate a pair of registers that can form register pairs.
        MachineInstr *Op0 = Ops.back();
        MachineInstr *Op1 = Ops[Ops.size()-2];
        unsigned EvenReg = 0, OddReg = 0;
        unsigned BaseReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
        bool isT2 = false;
        unsigned NewOpc = 0;
        int Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             EvenReg, OddReg, BaseReg,
                                             Offset, PredReg, Pred, isT2)) {
          Ops.pop_back();
          Ops.pop_back();

          const MCInstrDesc &MCID = TII->get(NewOpc);
          const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
          MRI->constrainRegClass(EvenReg, TRC);
          MRI->constrainRegClass(OddReg, TRC);

          // Form the pair instruction.
          if (isLd) {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
              .addReg(EvenReg, RegState::Define)
              .addReg(OddReg, RegState::Define)
              .addReg(BaseReg);
            // FIXME: We're converting from LDRi12 to an insn that still
            // uses addrmode2, so we need an explicit offset reg. It should
            // always be reg0 since we're transforming LDRi12s.
            if (!isT2)
              MIB.addReg(0);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            concatenateMemOperands(MIB, Op0, Op1);
            DEBUG(dbgs() << "Formed " << *MIB << "\n");
            ++NumLDRDFormed;
          } else {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
              .addReg(EvenReg)
              .addReg(OddReg)
              .addReg(BaseReg);
            // FIXME: We're converting from STRi12 to an insn that still
            // uses addrmode2, so we need an explicit offset reg. It should
            // always be reg0 since we're transforming STRi12s.
            if (!isT2)
              MIB.addReg(0);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            concatenateMemOperands(MIB, Op0, Op1);
            DEBUG(dbgs() << "Formed " << *MIB << "\n");
            ++NumSTRDFormed;
          }
          MBB->erase(Op0);
          MBB->erase(Op1);

          // Add register allocation hints to form register pairs.
          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
          MRI->setRegAllocationHint(OddReg,  ARMRI::RegPairOdd, EvenReg);
        } else {
          for (unsigned i = 0; i != NumMove; ++i) {
            MachineInstr *Op = Ops.back();
            Ops.pop_back();
            MBB->splice(InsertPos, MBB, Op);
          }
        }

        NumLdStMoved += NumMove;
        RetVal = true;
      }
    }
  }

  return RetVal;
}

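// Illustrative walk-through of the grouping done below (hypothetical block
// contents): given
//   ldr r0, [r5]        (starts Base2LdsMap[r5] and records r5 in LdBases)
//   str r2, [r6, #8]    (starts Base2StsMap[r6] and records r6 in StBases)
//   ldr r1, [r5, #4]    (appended to Base2LdsMap[r5])
// RescheduleOps is then invoked once per base register that collected more
// than one access; here, only for the two loads off r5.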
bool
ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
  bool RetVal = false;

  DenseMap<MachineInstr*, unsigned> MI2LocMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
  SmallVector<unsigned, 4> LdBases;
  SmallVector<unsigned, 4> StBases;

  unsigned Loc = 0;
  MachineBasicBlock::iterator MBBI = MBB->begin();
  MachineBasicBlock::iterator E = MBB->end();
  while (MBBI != E) {
    for (; MBBI != E; ++MBBI) {
      MachineInstr *MI = MBBI;
      if (MI->isCall() || MI->isTerminator()) {
        // Stop at barriers.
        ++MBBI;
        break;
      }

      if (!MI->isDebugValue())
        MI2LocMap[MI] = ++Loc;

      if (!isMemoryOp(MI))
        continue;
      unsigned PredReg = 0;
      if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
        continue;

      int Opc = MI->getOpcode();
      bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
      unsigned Base = MI->getOperand(1).getReg();
      int Offset = getMemoryOpOffset(MI);

      bool StopHere = false;
      if (isLd) {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2LdsMap.find(Base);
        if (BI != Base2LdsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          Base2LdsMap[Base].push_back(MI);
          LdBases.push_back(Base);
        }
      } else {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2StsMap.find(Base);
        if (BI != Base2StsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          Base2StsMap[Base].push_back(MI);
          StBases.push_back(Base);
        }
      }

      if (StopHere) {
        // Found a duplicate (a base+offset combination that was seen earlier).
        // Backtrack.
        --Loc;
        break;
      }
    }

    // Re-schedule loads.
    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
      unsigned Base = LdBases[i];
      SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base];
      if (Lds.size() > 1)
        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
    }

    // Re-schedule stores.
    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
      unsigned Base = StBases[i];
      SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base];
      if (Sts.size() > 1)
        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
    }

    if (MBBI != E) {
      Base2LdsMap.clear();
      Base2StsMap.clear();
      LdBases.clear();
      StBases.clear();
    }
  }

  return RetVal;
}


/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
/// optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
  if (PreAlloc)
    return new ARMPreAllocLoadStoreOpt();
  return new ARMLoadStoreOpt();
}
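
// Usage sketch (an assumption about the surrounding pass pipeline, shown only
// for illustration; the actual scheduling lives in the target's pass config,
// e.g. ARMTargetMachine):
//   addPass(createARMLoadStoreOptimizationPass(true));  // pre-RA rescheduling
//   ... register allocation ...
//   addPass(createARMLoadStoreOptimizationPass());      // post-RA ldm/stm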