//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "arm-ldst-opt"
#include "ARM.h"
#include "ARMAddressingModes.h"
#include "ARMBaseInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMRegisterInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;

STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
STATISTIC(NumLDRDFormed, "Number of ldrd created before allocation");
STATISTIC(NumSTRDFormed, "Number of strd created before allocation");
STATISTIC(NumLDRD2LDM, "Number of ldrd instructions turned back into ldm");
STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's");

/// ARMLoadStoreOpt - Post-register allocation pass that combines
/// load / store instructions to form ldm / stm instructions.
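///
/// e.g. (assuming ascending register numbers and consecutive offsets; see
/// MergeLDR_STR for the exact conditions):
///   ldr r1, [r0]
///   ldr r2, [r0, #4]
/// =>
///   ldmia r0, {r1, r2}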

namespace {
  struct ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMLoadStoreOpt() : MachineFunctionPass(&ID) {}

    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    ARMFunctionInfo *AFI;
    RegScavenger *RS;
    bool isThumb2;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM load / store optimization pass";
    }

  private:
    struct MemOpQueueEntry {
      int Offset;
      unsigned Position;
      MachineBasicBlock::iterator MBBI;
      bool Merged;
      MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i)
        : Offset(o), Position(p), MBBI(i), Merged(false) {}
    };
    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
    typedef MemOpQueue::iterator MemOpQueueIter;

    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  int Offset, unsigned Base, bool BaseKill, int Opcode,
                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
                  DebugLoc dl, SmallVector<std::pair<unsigned, bool>, 8> &Regs);
    void MergeOpsUpdate(MachineBasicBlock &MBB,
                        MemOpQueue &MemOps,
                        unsigned memOpsBegin,
                        unsigned memOpsEnd,
                        unsigned insertAfter,
                        int Offset,
                        unsigned Base,
                        bool BaseKill,
                        int Opcode,
                        ARMCC::CondCodes Pred,
                        unsigned PredReg,
                        unsigned Scratch,
                        DebugLoc dl,
                        SmallVector<MachineBasicBlock::iterator, 4> &Merges);
    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
                      int Opcode, unsigned Size,
                      ARMCC::CondCodes Pred, unsigned PredReg,
                      unsigned Scratch, MemOpQueue &MemOps,
                      SmallVector<MachineBasicBlock::iterator, 4> &Merges);

    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
                                  const TargetInstrInfo *TII,
                                  bool &Advance,
                                  MachineBasicBlock::iterator &I);
    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   bool &Advance,
                                   MachineBasicBlock::iterator &I);
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
  };
  char ARMLoadStoreOpt::ID = 0;
}

static int getLoadStoreMultipleOpcode(int Opcode) {
  switch (Opcode) {
  case ARM::LDR:
    NumLDMGened++;
    return ARM::LDM;
  case ARM::STR:
    NumSTMGened++;
    return ARM::STM;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    NumLDMGened++;
    return ARM::t2LDM;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    NumSTMGened++;
    return ARM::t2STM;
  case ARM::VLDRS:
    NumVLDMGened++;
    return ARM::VLDMS;
  case ARM::VSTRS:
    NumVSTMGened++;
    return ARM::VSTMS;
  case ARM::VLDRD:
    NumVLDMGened++;
    return ARM::VLDMD;
  case ARM::VSTRD:
    NumVSTMGened++;
    return ARM::VSTMD;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}

static bool isT2i32Load(unsigned Opc) {
  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
}

static bool isi32Load(unsigned Opc) {
  return Opc == ARM::LDR || isT2i32Load(Opc);
}

static bool isT2i32Store(unsigned Opc) {
  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
}

static bool isi32Store(unsigned Opc) {
  return Opc == ARM::STR || isT2i32Store(Opc);
}

/// MergeOps - Create and insert a LDM or STM with Base as base register and
/// registers in Regs as the register operands that would be loaded / stored.
/// It returns true if the transformation is done.
bool
ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          int Offset, unsigned Base, bool BaseKill,
                          int Opcode, ARMCC::CondCodes Pred,
                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
                          SmallVector<std::pair<unsigned, bool>, 8> &Regs) {
  // Only a single register to load / store. Don't bother.
  unsigned NumRegs = Regs.size();
  if (NumRegs <= 1)
    return false;

  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  if (isAM4 && Offset == 4) {
    if (isThumb2)
      // Thumb2 does not support ldmib / stmib.
      return false;
    Mode = ARM_AM::ib;
  } else if (isAM4 && Offset == -4 * (int)NumRegs + 4) {
    if (isThumb2)
      // Thumb2 does not support ldmda / stmda.
      return false;
    Mode = ARM_AM::da;
  } else if (isAM4 && Offset == -4 * (int)NumRegs) {
    Mode = ARM_AM::db;
  } else if (Offset != 0) {
    // If starting offset isn't zero, insert a MI to materialize a new base.
    // But only do so if it is cost effective, i.e. merging more than two
    // loads / stores.
    if (NumRegs <= 2)
      return false;

    unsigned NewBase;
    if (isi32Load(Opcode))
      // If it is a load, then just use one of the destination registers
      // as the new base.
      NewBase = Regs[NumRegs-1].first;
    else {
      // Use the scratch register as the new base.
      NewBase = Scratch;
      if (NewBase == 0)
        return false;
    }
    int BaseOpc = !isThumb2
      ? ARM::ADDri
      : ((Base == ARM::SP) ? ARM::t2ADDrSPi : ARM::t2ADDri);
    if (Offset < 0) {
      BaseOpc = !isThumb2
        ? ARM::SUBri
        : ((Base == ARM::SP) ? ARM::t2SUBrSPi : ARM::t2SUBri);
      Offset = - Offset;
    }
    int ImmedOffset = isThumb2
      ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
    if (ImmedOffset == -1)
      // FIXME: Try t2ADDri12 or t2SUBri12?
      return false; // Probably not worth it then.

    BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
      .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
      .addImm(Pred).addReg(PredReg).addReg(0);
    Base = NewBase;
    BaseKill = true;  // New base is always killed right after its use.
  }

  bool isDPR = Opcode == ARM::VLDRD || Opcode == ARM::VSTRD;
  bool isDef = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
  Opcode = getLoadStoreMultipleOpcode(Opcode);
  MachineInstrBuilder MIB = (isAM4)
    ? BuildMI(MBB, MBBI, dl, TII->get(Opcode))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg)
    : BuildMI(MBB, MBBI, dl, TII->get(Opcode))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs))
        .addImm(Pred).addReg(PredReg);
  MIB.addReg(0); // Add optional writeback (0 for now).
  for (unsigned i = 0; i != NumRegs; ++i)
    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
                     | getKillRegState(Regs[i].second));

  return true;
}

// MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on
// success.
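// e.g. if an unmerged str of r3 that carries the kill flag ends up before
// the merged stm, the kill flag is transferred to the stm's use of r3 and
// cleared on the earlier str.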
void ARMLoadStoreOpt::
MergeOpsUpdate(MachineBasicBlock &MBB,
               MemOpQueue &memOps,
               unsigned memOpsBegin,
               unsigned memOpsEnd,
               unsigned insertAfter,
               int Offset,
               unsigned Base,
               bool BaseKill,
               int Opcode,
               ARMCC::CondCodes Pred,
               unsigned PredReg,
               unsigned Scratch,
               DebugLoc dl,
               SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  // First calculate which of the registers should be killed by the merged
  // instruction.
  SmallVector<std::pair<unsigned, bool>, 8> Regs;
  const unsigned insertPos = memOps[insertAfter].Position;
  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
    const MachineOperand &MO = memOps[i].MBBI->getOperand(0);
    unsigned Reg = MO.getReg();
    bool isKill = MO.isKill();

    // If we are inserting the merged operation after an unmerged operation
    // that uses the same register, make sure to transfer any kill flag.
    for (unsigned j = memOpsEnd, e = memOps.size(); !isKill && j != e; ++j)
      if (memOps[j].Position < insertPos) {
        const MachineOperand &MOJ = memOps[j].MBBI->getOperand(0);
        if (MOJ.getReg() == Reg && MOJ.isKill())
          isKill = true;
      }

    Regs.push_back(std::make_pair(Reg, isKill));
  }

  // Try to do the merge.
  MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI;
  Loc++;
  if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode,
                Pred, PredReg, Scratch, dl, Regs))
    return;

  // Merge succeeded, update records.
  Merges.push_back(prior(Loc));
  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
    // Remove kill flags from any unmerged memops that come before insertPos.
    if (Regs[i-memOpsBegin].second)
      for (unsigned j = memOpsEnd, e = memOps.size(); j != e; ++j)
        if (memOps[j].Position < insertPos) {
          MachineOperand &MOJ = memOps[j].MBBI->getOperand(0);
          if (MOJ.getReg() == Regs[i-memOpsBegin].first && MOJ.isKill())
            MOJ.setIsKill(false);
        }
    MBB.erase(memOps[i].MBBI);
    memOps[i].Merged = true;
  }
}

/// MergeLDR_STR - Merge a number of load / store instructions into one or more
/// load / store multiple instructions.
void
ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
                              unsigned Base, int Opcode, unsigned Size,
                              ARMCC::CondCodes Pred, unsigned PredReg,
                              unsigned Scratch, MemOpQueue &MemOps,
                              SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  int Offset = MemOps[SIndex].Offset;
  int SOffset = Offset;
  unsigned insertAfter = SIndex;
  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
  DebugLoc dl = Loc->getDebugLoc();
  const MachineOperand &PMO = Loc->getOperand(0);
  unsigned PReg = PMO.getReg();
  unsigned PRegNum = PMO.isUndef() ? UINT_MAX
    : ARMRegisterInfo::getRegisterNumbering(PReg);

  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
    int NewOffset = MemOps[i].Offset;
    const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
    unsigned Reg = MO.getReg();
    unsigned RegNum = MO.isUndef() ? UINT_MAX
      : ARMRegisterInfo::getRegisterNumbering(Reg);
    // AM4 - register numbers in ascending order.
    // AM5 - consecutive register numbers in ascending order.
    if (NewOffset == Offset + (int)Size &&
        ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) {
      Offset += Size;
      PRegNum = RegNum;
    } else {
      // Can't merge this in. Try to merge the earlier ones first.
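      // e.g. with offsets {0, 4, 12} off the same base, the first two ops
      // are merged here and the recursive call retries from the op at #12.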
      MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset,
                     Base, false, Opcode, Pred, PredReg, Scratch, dl, Merges);
      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
                   MemOps, Merges);
      return;
    }

    if (MemOps[i].Position > MemOps[insertAfter].Position)
      insertAfter = i;
  }

  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
  MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset,
                 Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
  return;
}

static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
                                       unsigned Bytes, unsigned Limit,
                                       ARMCC::CondCodes Pred, unsigned PredReg){
  unsigned MyPredReg = 0;
  if (!MI)
    return false;
  if (MI->getOpcode() != ARM::t2SUBri &&
      MI->getOpcode() != ARM::t2SUBrSPi &&
      MI->getOpcode() != ARM::t2SUBrSPi12 &&
      MI->getOpcode() != ARM::tSUBspi &&
      MI->getOpcode() != ARM::SUBri)
    return false;

  // Make sure the offset fits in 8 bits.
  if (Bytes <= 0 || (Limit && Bytes >= Limit))
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME
  return (MI->getOperand(0).getReg() == Base &&
          MI->getOperand(1).getReg() == Base &&
          (MI->getOperand(2).getImm()*Scale) == Bytes &&
          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
          MyPredReg == PredReg);
}

static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
                                       unsigned Bytes, unsigned Limit,
                                       ARMCC::CondCodes Pred, unsigned PredReg){
  unsigned MyPredReg = 0;
  if (!MI)
    return false;
  if (MI->getOpcode() != ARM::t2ADDri &&
      MI->getOpcode() != ARM::t2ADDrSPi &&
      MI->getOpcode() != ARM::t2ADDrSPi12 &&
      MI->getOpcode() != ARM::tADDspi &&
      MI->getOpcode() != ARM::ADDri)
    return false;

  if (Bytes <= 0 || (Limit && Bytes >= Limit))
    // Make sure the offset fits in 8 bits.
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME
  return (MI->getOperand(0).getReg() == Base &&
          MI->getOperand(1).getReg() == Base &&
          (MI->getOperand(2).getImm()*Scale) == Bytes &&
          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
          MyPredReg == PredReg);
}

static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default: return 0;
  case ARM::LDR:
  case ARM::STR:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
  case ARM::VLDRS:
  case ARM::VSTRS:
    return 4;
  case ARM::VLDRD:
  case ARM::VSTRD:
    return 8;
  case ARM::LDM:
  case ARM::STM:
  case ARM::t2LDM:
  case ARM::t2STM:
    return (MI->getNumOperands() - 5) * 4;
  case ARM::VLDMS:
  case ARM::VSTMS:
  case ARM::VLDMD:
  case ARM::VSTMD:
    return ARM_AM::getAM5Offset(MI->getOperand(1).getImm()) * 4;
  }
}

/// MergeBaseUpdateLSMultiple - Fold preceding/trailing inc/dec of base
/// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
/// rn := rn + 4 * 3;
/// =>
/// stmia rn!, <ra, rb, rc>
///
/// rn := rn - 4 * 3;
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(0).getReg();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
  int Opcode = MI->getOpcode();
  bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::t2LDM ||
    Opcode == ARM::STM || Opcode == ARM::t2STM;

  if (isAM4) {
    if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm()))
      return false;

    // Can't use the updating AM4 sub-mode if the base register is also a dest
    // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
    for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) {
      if (MI->getOperand(i).getReg() == Base)
        return false;
    }

    ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm());
    if (MBBI != MBB.begin()) {
      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
      if (Mode == ARM_AM::ia &&
          isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true));
        MI->getOperand(4).setReg(Base);
        MI->getOperand(4).setIsDef();
        MBB.erase(PrevMBBI);
        return true;
      } else if (Mode == ARM_AM::ib &&
                 isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true));
        MI->getOperand(4).setReg(Base);  // WB to base
        MI->getOperand(4).setIsDef();
        MBB.erase(PrevMBBI);
        return true;
      }
    }

    if (MBBI != MBB.end()) {
      MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
      if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
          isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
        MI->getOperand(4).setReg(Base);  // WB to base
        MI->getOperand(4).setIsDef();
        if (NextMBBI == I) {
          Advance = true;
          ++I;
        }
        MBB.erase(NextMBBI);
        return true;
      } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
                 isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
        MI->getOperand(4).setReg(Base);  // WB to base
        MI->getOperand(4).setIsDef();
        if (NextMBBI == I) {
          Advance = true;
          ++I;
        }
        MBB.erase(NextMBBI);
        return true;
      }
    }
  } else {
    // VLDM{D|S}, VSTM{D|S} addressing mode 5 ops.
    if (ARM_AM::getAM5WBFlag(MI->getOperand(1).getImm()))
      return false;

    ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MI->getOperand(1).getImm());
    unsigned Offset = ARM_AM::getAM5Offset(MI->getOperand(1).getImm());
    if (MBBI != MBB.begin()) {
      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
      if (Mode == ARM_AM::ia &&
          isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset));
        MI->getOperand(4).setReg(Base);  // WB to base
        MI->getOperand(4).setIsDef();
        MBB.erase(PrevMBBI);
        return true;
      }
    }

    if (MBBI != MBB.end()) {
      MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
      if (Mode == ARM_AM::ia &&
          isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset));
        MI->getOperand(4).setReg(Base);  // WB to base
        MI->getOperand(4).setIsDef();
        if (NextMBBI == I) {
          Advance = true;
          ++I;
        }
        MBB.erase(NextMBBI);
      }
      return true;
    }
  }

  return false;
}

static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
  switch (Opc) {
  case ARM::LDR: return ARM::LDR_PRE;
  case ARM::STR: return ARM::STR_PRE;
  case ARM::VLDRS: return ARM::VLDMS;
  case ARM::VLDRD: return ARM::VLDMD;
  case ARM::VSTRS: return ARM::VSTMS;
  case ARM::VSTRD: return ARM::VSTMD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_PRE;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_PRE;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}

static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
  switch (Opc) {
  case ARM::LDR: return ARM::LDR_POST;
  case ARM::STR: return ARM::STR_POST;
  case ARM::VLDRS: return ARM::VLDMS;
  case ARM::VLDRD: return ARM::VLDMD;
  case ARM::VSTRS: return ARM::VSTMS;
  case ARM::VSTRD: return ARM::VSTMD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_POST;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_POST;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}

/// MergeBaseUpdateLoadStore - Fold preceding/trailing inc/dec of base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               const TargetInstrInfo *TII,
                                               bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(1).getReg();
  bool BaseKill = MI->getOperand(1).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();
  bool isAM5 = Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
    Opcode == ARM::VSTRD || Opcode == ARM::VSTRS;
  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
  if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0)
    return false;
  else if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;
  else if (isT2i32Load(Opcode) || isT2i32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;

  bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
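  // e.g. "ldr r0, [r0], #4": loading into the base being written back is
  // unpredictable on ARM.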
  if (isLd && MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
  bool DoMerge = false;
  ARM_AM::AddrOpc AddSub = ARM_AM::add;
  unsigned NewOpc = 0;
  // AM2 - 12 bits, thumb2 - 8 bits.
  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
  if (MBBI != MBB.begin()) {
    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    } else if (!isAM5 &&
               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge)
      MBB.erase(PrevMBBI);
  }

  if (!DoMerge && MBBI != MBB.end()) {
    MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
    if (!isAM5 &&
        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge) {
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  bool isDPR = NewOpc == ARM::VLDMD || NewOpc == ARM::VSTMD;
  unsigned Offset = 0;
  if (isAM5)
    Offset = ARM_AM::getAM5Opc((AddSub == ARM_AM::sub)
                               ? ARM_AM::db
                               : ARM_AM::ia, true, (isDPR ? 2 : 1));
  else if (isAM2)
    Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
  else
    Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
  if (isLd) {
    if (isAM5)
      // VLDMS, VLDMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(Offset).addImm(Pred).addReg(PredReg)
        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MI->getOperand(0).getReg(), RegState::Define);
    else if (isAM2)
      // LDR_PRE, LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineOperand &MO = MI->getOperand(0);
    if (isAM5)
      // VSTMS, VSTMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc)).addReg(Base).addImm(Offset)
        .addImm(Pred).addReg(PredReg)
        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MO.getReg(), getKillRegState(MO.isKill()));
    else if (isAM2)
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  }
  MBB.erase(MBBI);

  return true;
}

/// isMemoryOp - Returns true if instruction is a memory operation (that this
/// pass is capable of operating on).
static bool isMemoryOp(const MachineInstr *MI) {
  if (MI->hasOneMemOperand()) {
    const MachineMemOperand *MMO = *MI->memoperands_begin();

    // Don't touch volatile memory accesses - we may be changing their order.
    if (MMO->isVolatile())
      return false;

    // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
    // not.
    if (MMO->getAlignment() < 4)
      return false;
  }

  int Opcode = MI->getOpcode();
  switch (Opcode) {
  default: break;
  case ARM::LDR:
  case ARM::STR:
    return MI->getOperand(1).isReg() && MI->getOperand(2).getReg() == 0;
  case ARM::VLDRS:
  case ARM::VSTRS:
    return MI->getOperand(1).isReg();
  case ARM::VLDRD:
  case ARM::VSTRD:
    return MI->getOperand(1).isReg();
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return MI->getOperand(1).isReg();
  }
  return false;
}

/// AdvanceRS - Advance register scavenger to just before the earliest memory
/// op that is being merged.
void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
  unsigned Position = MemOps[0].Position;
  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
    if (MemOps[i].Position < Position) {
      Position = MemOps[i].Position;
      Loc = MemOps[i].MBBI;
    }
  }

  if (Loc != MBB.begin())
    RS->forward(prior(Loc));
}

static int getMemoryOpOffset(const MachineInstr *MI) {
  int Opcode = MI->getOpcode();
  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
  unsigned NumOperands = MI->getDesc().getNumOperands();
  unsigned OffField = MI->getOperand(NumOperands-3).getImm();

  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8)
    return OffField;

  int Offset = isAM2
    ? ARM_AM::getAM2Offset(OffField)
    : (isAM3 ? ARM_AM::getAM3Offset(OffField)
             : ARM_AM::getAM5Offset(OffField) * 4);
  if (isAM2) {
    if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  } else if (isAM3) {
    if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  } else {
    if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  }
  return Offset;
}

static void InsertLDR_STR(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator &MBBI,
                          int OffImm, bool isDef,
                          DebugLoc dl, unsigned NewOpc,
                          unsigned Reg, bool RegDeadKill, bool RegUndef,
                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
                          unsigned OffReg, bool OffKill, bool OffUndef,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          const TargetInstrInfo *TII, bool isT2) {
  int Offset = OffImm;
  if (!isT2) {
    if (OffImm < 0)
      Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift);
    else
      Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift);
  }
  if (isDef) {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    if (!isT2)
      MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    if (!isT2)
      MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  }
}

bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
    unsigned EvenReg = MI->getOperand(0).getReg();
    unsigned OddReg = MI->getOperand(1).getReg();
    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
    unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false);
    if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum)
      return false;

    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
    bool EvenDeadKill = isLd ?
      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
    bool EvenUndef = MI->getOperand(0).isUndef();
    bool OddDeadKill = isLd ?
      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
    bool OddUndef = MI->getOperand(1).isUndef();
    const MachineOperand &BaseOp = MI->getOperand(2);
    unsigned BaseReg = BaseOp.getReg();
    bool BaseKill = BaseOp.isKill();
    bool BaseUndef = BaseOp.isUndef();
    unsigned OffReg = isT2 ? 0 : MI->getOperand(3).getReg();
    bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
    bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
    int OffImm = getMemoryOpOffset(MI);
    unsigned PredReg = 0;
    ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);

    if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) {
      // Ascending register numbers and no offset. It's safe to change it to a
      // ldm or stm.
      unsigned NewOpc = (isLd)
        ? (isT2 ? ARM::t2LDM : ARM::LDM)
        : (isT2 ? ARM::t2STM : ARM::STM);
      if (isLd) {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(0)
          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
          .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill));
        ++NumLDRD2LDM;
      } else {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(0)
          .addReg(EvenReg,
                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
          .addReg(OddReg,
                  getKillRegState(OddDeadKill) | getUndefRegState(OddUndef));
        ++NumSTRD2STM;
      }
    } else {
      // Split into two instructions.
      assert((!isT2 || !OffReg) &&
             "Thumb2 ldrd / strd does not encode offset register!");
      unsigned NewOpc = (isLd)
        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDR)
        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STR);
      DebugLoc dl = MBBI->getDebugLoc();
      // If this is a load and the base register is killed, it may have been
      // re-defed by the load; make sure the first load does not clobber it.
      if (isLd &&
          (BaseKill || OffKill) &&
          (TRI->regsOverlap(EvenReg, BaseReg) ||
           (OffReg && TRI->regsOverlap(EvenReg, OffReg)))) {
        assert(!TRI->regsOverlap(OddReg, BaseReg) &&
               (!OffReg || !TRI->regsOverlap(OddReg, OffReg)));
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, false,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, false,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      } else {
        if (OddReg == EvenReg && EvenDeadKill) {
          // If the two source operands are the same, the kill marker is
          // probably on the first one. e.g.
          //   t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
          EvenDeadKill = false;
          OddDeadKill = true;
        }
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, EvenUndef,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, OddUndef,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      }
      if (isLd)
        ++NumLDRD2LDR;
      else
        ++NumSTRD2STR;
    }

    MBBI = prior(MBBI);
    MBB.erase(MI);
  }
  return false;
}

/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
/// ops of the same base and incrementing offset into LDM / STM ops.
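/// e.g. (assuming r4 < r6 so the register order stays ascending):
///   ldr r4, [r5]
///   ldr r6, [r5, #4]
/// =>
///   ldmia r5, {r4, r6}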
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  unsigned NumMerges = 0;
  unsigned NumMemOps = 0;
  MemOpQueue MemOps;
  unsigned CurrBase = 0;
  int CurrOpc = -1;
  unsigned CurrSize = 0;
  ARMCC::CondCodes CurrPred = ARMCC::AL;
  unsigned CurrPredReg = 0;
  unsigned Position = 0;
  SmallVector<MachineBasicBlock::iterator,4> Merges;

  RS->enterBasicBlock(&MBB);
  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  while (MBBI != E) {
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;

    bool Advance = false;
    bool TryMerge = false;
    bool Clobber = false;

    bool isMemOp = isMemoryOp(MBBI);
    if (isMemOp) {
      int Opcode = MBBI->getOpcode();
      unsigned Size = getLSMultipleTransferSize(MBBI);
      unsigned Base = MBBI->getOperand(1).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      // Watch out for:
      //   r4 := ldr [r5]
      //   r5 := ldr [r5, #4]
      //   r6 := ldr [r5, #8]
      //
      // The second ldr has effectively broken the chain even though it
      // looks like the later ldr(s) use the same base register. Try to
      // merge the ldr's so far, including this one. But don't try to
      // combine the following ldr(s).
      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());
      if (CurrBase == 0 && !Clobber) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc = Opcode;
        CurrSize = Size;
        CurrPred = Pred;
        CurrPredReg = PredReg;
        MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
        NumMemOps++;
        Advance = true;
      } else {
        if (Clobber) {
          TryMerge = true;
          Advance = true;
        }

        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
          // No need to match PredReg.
          // Continue adding to the queue.
          if (Offset > MemOps.back().Offset) {
            MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
            NumMemOps++;
            Advance = true;
          } else {
            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
                 I != E; ++I) {
              if (Offset < I->Offset) {
                MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI));
                NumMemOps++;
                Advance = true;
                break;
              } else if (Offset == I->Offset) {
                // Collision! This can't be merged!
                break;
              }
            }
          }
        }
      }
    }

    if (Advance) {
      ++Position;
      ++MBBI;
      if (MBBI == E)
        // Reached the end of the block; try merging the memory instructions.
        TryMerge = true;
    } else
      TryMerge = true;

    if (TryMerge) {
      if (NumMemOps > 1) {
        // Try to find a free register to use as a new base in case it's
        // needed. First advance to the instruction just before the start of
        // the chain.
        AdvanceRS(MBB, MemOps);
        // Find a scratch register.
        unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass);
        // Process the load / store instructions.
        RS->forward(prior(MBBI));

        // Merge ops.
        Merges.clear();
        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);

        // Try folding preceding/trailing base inc/dec into the generated
        // LDM/STM ops.
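        // e.g. "ldmia r0, {r1, r2}" followed by "add r0, r0, #8" becomes
        // "ldmia r0!, {r1, r2}".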
        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
            ++NumMerges;
        NumMerges += Merges.size();

        // Try folding preceding/trailing base inc/dec into those load/stores
        // that were not merged to form LDM/STM ops.
        for (unsigned i = 0; i != NumMemOps; ++i)
          if (!MemOps[i].Merged)
            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII, Advance, MBBI))
              ++NumMerges;

        // RS may be pointing to an instruction that's deleted.
        RS->skipTo(prior(MBBI));
      } else if (NumMemOps == 1) {
        // Try folding preceding/trailing base inc/dec into the single
        // load/store.
        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
          ++NumMerges;
          RS->forward(prior(MBBI));
        }
      }

      CurrBase = 0;
      CurrOpc = -1;
      CurrSize = 0;
      CurrPred = ARMCC::AL;
      CurrPredReg = 0;
      if (NumMemOps) {
        MemOps.clear();
        NumMemOps = 0;
      }

      // If iterator hasn't been advanced and this is not a memory op, skip it.
      // It can't start a new chain anyway.
      if (!Advance && !isMemOp && MBBI != E) {
        ++Position;
        ++MBBI;
      }
    }
  }
  return NumMerges > 0;
}

namespace {
  struct OffsetCompare {
    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
      int LOffset = getMemoryOpOffset(LHS);
      int ROffset = getMemoryOpOffset(RHS);
      assert(LHS == RHS || LOffset != ROffset);
      return LOffset > ROffset;
    }
  };
}

/// MergeReturnIntoLDM - If this is an exit BB, try merging the return op
/// (bx lr) into the preceding stack restore so it directly restores the value
/// of LR into pc.
/// ldmfd sp!, {r7, lr}
/// bx lr
/// =>
/// ldmfd sp!, {r7, pc}
bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
  if (MBB.empty()) return false;

  MachineBasicBlock::iterator MBBI = prior(MBB.end());
  if (MBBI != MBB.begin() &&
      (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET)) {
    MachineInstr *PrevMI = prior(MBBI);
    if (PrevMI->getOpcode() == ARM::LDM || PrevMI->getOpcode() == ARM::t2LDM) {
      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
      if (MO.getReg() != ARM::LR)
        return false;
      unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET;
      PrevMI->setDesc(TII->get(NewOpc));
      MO.setReg(ARM::PC);
      MBB.erase(MBBI);
      return true;
    }
  }
  return false;
}

bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  const TargetMachine &TM = Fn.getTarget();
  AFI = Fn.getInfo<ARMFunctionInfo>();
  TII = TM.getInstrInfo();
  TRI = TM.getRegisterInfo();
  RS = new RegScavenger();
  isThumb2 = AFI->isThumb2Function();

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI) {
    MachineBasicBlock &MBB = *MFI;
    Modified |= LoadStoreMultipleOpti(MBB);
    Modified |= MergeReturnIntoLDM(MBB);
  }

  delete RS;
  return Modified;
}


/// ARMPreAllocLoadStoreOpt - Pre-register allocation pass that moves
/// loads / stores from consecutive locations closer together to make it more
/// likely they will be combined later.
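///
/// e.g. (assuming nothing between the loads reads or writes the moved
/// registers or the base):
///   ldr r1, [r0]
///   mov r7, r3
///   ldr r2, [r0, #4]
/// =>
///   ldr r1, [r0]
///   ldr r2, [r0, #4]
///   mov r7, r3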

namespace {
  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(&ID) {}

    const TargetData *TD;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    MachineRegisterInfo *MRI;
    MachineFunction *MF;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM pre- register allocation load / store optimization pass";
    }

  private:
    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                          unsigned &NewOpc, unsigned &EvenReg,
                          unsigned &OddReg, unsigned &BaseReg,
                          unsigned &OffReg, int &Offset,
                          unsigned &PredReg, ARMCC::CondCodes &Pred,
                          bool &isT2);
    bool RescheduleOps(MachineBasicBlock *MBB,
                       SmallVector<MachineInstr*, 4> &Ops,
                       unsigned Base, bool isLd,
                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
  };
  char ARMPreAllocLoadStoreOpt::ID = 0;
}

bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  TD = Fn.getTarget().getTargetData();
  TII = Fn.getTarget().getInstrInfo();
  TRI = Fn.getTarget().getRegisterInfo();
  STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
  MRI = &Fn.getRegInfo();
  MF = &Fn;

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI)
    Modified |= RescheduleLoadStoreInstrs(MFI);

  return Modified;
}

static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
                                      MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator E,
                                      SmallPtrSet<MachineInstr*, 4> &MemOps,
                                      SmallSet<unsigned, 4> &MemRegs,
                                      const TargetRegisterInfo *TRI) {
  // Are there stores / loads / calls between them?
  // FIXME: This is overly conservative. We should make use of alias information
  // some day.
  SmallSet<unsigned, 4> AddedRegPressure;
  while (++I != E) {
    if (MemOps.count(&*I))
      continue;
    const TargetInstrDesc &TID = I->getDesc();
    if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects())
      return false;
    if (isLd && TID.mayStore())
      return false;
    if (!isLd) {
      if (TID.mayLoad())
        return false;
      // It's not safe to move the first 'str' down.
      // str r1, [r0]
      // strh r5, [r0]
      // str r4, [r0, #+4]
      if (TID.mayStore())
        return false;
    }
    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
      MachineOperand &MO = I->getOperand(j);
      if (!MO.isReg())
        continue;
      unsigned Reg = MO.getReg();
      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
        return false;
      if (Reg != Base && !MemRegs.count(Reg))
        AddedRegPressure.insert(Reg);
    }
  }

  // Estimate register pressure increase due to the transformation.
  if (MemRegs.size() <= 4)
    // Ok if we are moving a small number of instructions.
    return true;
  return AddedRegPressure.size() <= MemRegs.size() * 2;
}

bool
ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
                                          DebugLoc &dl,
                                          unsigned &NewOpc, unsigned &EvenReg,
                                          unsigned &OddReg, unsigned &BaseReg,
                                          unsigned &OffReg, int &Offset,
                                          unsigned &PredReg,
                                          ARMCC::CondCodes &Pred,
                                          bool &isT2) {
  // Make sure we're allowed to generate LDRD/STRD.
  if (!STI->hasV5TEOps())
    return false;

  // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
  unsigned Scale = 1;
  unsigned Opcode = Op0->getOpcode();
  if (Opcode == ARM::LDR)
    NewOpc = ARM::LDRD;
  else if (Opcode == ARM::STR)
    NewOpc = ARM::STRD;
  else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
    NewOpc = ARM::t2LDRDi8;
    Scale = 4;
    isT2 = true;
  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
    NewOpc = ARM::t2STRDi8;
    Scale = 4;
    isT2 = true;
  } else
    return false;

  // Make sure the offset registers match.
  if (!isT2 &&
      (Op0->getOperand(2).getReg() != Op1->getOperand(2).getReg()))
    return false;

  // Make sure the base address satisfies the i64 ld / st alignment
  // requirement.
  if (!Op0->hasOneMemOperand() ||
      !(*Op0->memoperands_begin())->getValue() ||
      (*Op0->memoperands_begin())->isVolatile())
    return false;

  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
  Function *Func = MF->getFunction();
  unsigned ReqAlign = STI->hasV6Ops()
    ? TD->getPrefTypeAlignment(Type::getInt64Ty(Func->getContext()))
    : 8; // Pre-v6 needs 8-byte alignment.
  if (Align < ReqAlign)
    return false;

  // Then make sure the immediate offset fits.
  int OffImm = getMemoryOpOffset(Op0);
  if (isT2) {
    if (OffImm < 0) {
      if (OffImm < -255)
        // Can't fall back to t2LDRi8 / t2STRi8.
        return false;
    } else {
      int Limit = (1 << 8) * Scale;
      if (OffImm >= Limit || (OffImm & (Scale-1)))
        return false;
    }
    Offset = OffImm;
  } else {
    ARM_AM::AddrOpc AddSub = ARM_AM::add;
    if (OffImm < 0) {
      AddSub = ARM_AM::sub;
      OffImm = - OffImm;
    }
    int Limit = (1 << 8) * Scale;
    if (OffImm >= Limit || (OffImm & (Scale-1)))
      return false;
    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
  }
  EvenReg = Op0->getOperand(0).getReg();
  OddReg = Op1->getOperand(0).getReg();
  if (EvenReg == OddReg)
    return false;
  BaseReg = Op0->getOperand(1).getReg();
  if (!isT2)
    OffReg = Op0->getOperand(2).getReg();
  Pred = llvm::getInstrPredicate(Op0, PredReg);
  dl = Op0->getDebugLoc();
  return true;
}

bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                            SmallVector<MachineInstr*, 4> &Ops,
                                            unsigned Base, bool isLd,
                                            DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
  bool RetVal = false;

  // Sort by offset (in reverse order).
  std::sort(Ops.begin(), Ops.end(), OffsetCompare());

  // The loads / stores of the same base are in order. Scan them from first to
  // last and check for the following:
  // 1. Any def of base.
  // 2. Any gaps.
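  // e.g. offsets {0, 4, 12}: the missing #8 slot is a gap, so only the
  // 0/#4 pair is a candidate for rescheduling.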
  while (Ops.size() > 1) {
    unsigned FirstLoc = ~0U;
    unsigned LastLoc = 0;
    MachineInstr *FirstOp = 0;
    MachineInstr *LastOp = 0;
    int LastOffset = 0;
    unsigned LastOpcode = 0;
    unsigned LastBytes = 0;
    unsigned NumMove = 0;
    for (int i = Ops.size() - 1; i >= 0; --i) {
      MachineInstr *Op = Ops[i];
      unsigned Loc = MI2LocMap[Op];
      if (Loc <= FirstLoc) {
        FirstLoc = Loc;
        FirstOp = Op;
      }
      if (Loc >= LastLoc) {
        LastLoc = Loc;
        LastOp = Op;
      }

      unsigned Opcode = Op->getOpcode();
      if (LastOpcode && Opcode != LastOpcode)
        break;

      int Offset = getMemoryOpOffset(Op);
      unsigned Bytes = getLSMultipleTransferSize(Op);
      if (LastBytes) {
        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
          break;
      }
      LastOffset = Offset;
      LastBytes = Bytes;
      LastOpcode = Opcode;
      if (++NumMove == 8) // FIXME: Tune this limit.
        break;
    }

    if (NumMove <= 1)
      Ops.pop_back();
    else {
      SmallPtrSet<MachineInstr*, 4> MemOps;
      SmallSet<unsigned, 4> MemRegs;
      for (int i = NumMove-1; i >= 0; --i) {
        MemOps.insert(Ops[i]);
        MemRegs.insert(Ops[i]->getOperand(0).getReg());
      }

      // Be conservative: if the instructions are too far apart, don't
      // move them. We want to limit the increase of register pressure.
      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
      if (DoMove)
        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
                                           MemOps, MemRegs, TRI);
      if (!DoMove) {
        for (unsigned i = 0; i != NumMove; ++i)
          Ops.pop_back();
      } else {
        // This is the new location for the loads / stores.
        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
        while (InsertPos != MBB->end() && MemOps.count(InsertPos))
          ++InsertPos;

        // If we are moving a pair of loads / stores, see if it makes sense
        // to try to allocate a pair of registers that can form register pairs.
        MachineInstr *Op0 = Ops.back();
        MachineInstr *Op1 = Ops[Ops.size()-2];
        unsigned EvenReg = 0, OddReg = 0;
        unsigned BaseReg = 0, OffReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
        bool isT2 = false;
        unsigned NewOpc = 0;
        int Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             EvenReg, OddReg, BaseReg, OffReg,
                                             Offset, PredReg, Pred, isT2)) {
          Ops.pop_back();
          Ops.pop_back();

          // Form the pair instruction.
          if (isLd) {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg, RegState::Define)
              .addReg(OddReg, RegState::Define)
              .addReg(BaseReg);
            if (!isT2)
              MIB.addReg(OffReg);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumLDRDFormed;
          } else {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg)
              .addReg(OddReg)
              .addReg(BaseReg);
            if (!isT2)
              MIB.addReg(OffReg);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumSTRDFormed;
          }
          MBB->erase(Op0);
          MBB->erase(Op1);

          // Add register allocation hints to form register pairs.
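          // ldrd / strd need an even/odd GPR pair (e.g. r0/r1), so nudge
          // the register allocator toward assigning one.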
          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
          MRI->setRegAllocationHint(OddReg, ARMRI::RegPairOdd, EvenReg);
        } else {
          for (unsigned i = 0; i != NumMove; ++i) {
            MachineInstr *Op = Ops.back();
            Ops.pop_back();
            MBB->splice(InsertPos, MBB, Op);
          }
        }

        NumLdStMoved += NumMove;
        RetVal = true;
      }
    }
  }

  return RetVal;
}

bool
ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
  bool RetVal = false;

  DenseMap<MachineInstr*, unsigned> MI2LocMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
  SmallVector<unsigned, 4> LdBases;
  SmallVector<unsigned, 4> StBases;

  unsigned Loc = 0;
  MachineBasicBlock::iterator MBBI = MBB->begin();
  MachineBasicBlock::iterator E = MBB->end();
  while (MBBI != E) {
    for (; MBBI != E; ++MBBI) {
      MachineInstr *MI = MBBI;
      const TargetInstrDesc &TID = MI->getDesc();
      if (TID.isCall() || TID.isTerminator()) {
        // Stop at barriers.
        ++MBBI;
        break;
      }

      MI2LocMap[MI] = Loc++;
      if (!isMemoryOp(MI))
        continue;
      unsigned PredReg = 0;
      if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL)
        continue;

      int Opc = MI->getOpcode();
      bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
      unsigned Base = MI->getOperand(1).getReg();
      int Offset = getMemoryOpOffset(MI);

      bool StopHere = false;
      if (isLd) {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2LdsMap.find(Base);
        if (BI != Base2LdsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2LdsMap[Base] = MIs;
          LdBases.push_back(Base);
        }
      } else {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2StsMap.find(Base);
        if (BI != Base2StsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2StsMap[Base] = MIs;
          StBases.push_back(Base);
        }
      }

      if (StopHere) {
        // Found a duplicate (a base+offset combination that's seen earlier).
        // Backtrack.
        --Loc;
        break;
      }
    }

    // Re-schedule loads.
    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
      unsigned Base = LdBases[i];
      SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base];
      if (Lds.size() > 1)
        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
    }

    // Re-schedule stores.
    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
      unsigned Base = StBases[i];
      SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base];
      if (Sts.size() > 1)
        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
    }

    if (MBBI != E) {
      Base2LdsMap.clear();
      Base2StsMap.clear();
      LdBases.clear();
      StBases.clear();
    }
  }

  return RetVal;
}


/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
/// optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
  if (PreAlloc)
    return new ARMPreAllocLoadStoreOpt();
  return new ARMLoadStoreOpt();
}