//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "arm-ldst-opt"
#include "ARM.h"
#include "ARMAddressingModes.h"
#include "ARMBaseInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMRegisterInfo.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;

STATISTIC(NumLDMGened , "Number of ldm instructions generated");
STATISTIC(NumSTMGened , "Number of stm instructions generated");
STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
STATISTIC(NumLDRD2LDM, "Number of ldrd instructions turned back into ldm");
STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm");
STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's");
STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's");

/// ARMLoadStoreOpt - Post-register allocation pass that combines
/// load / store instructions to form ldm / stm instructions.
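///
/// e.g. (register assignments are illustrative)
///   ldr r2, [r0]
///   ldr r3, [r0, #4]
/// =>
///   ldmia r0, {r2, r3}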

namespace {
  struct ARMLoadStoreOpt : public MachineFunctionPass {
    static char ID;
    ARMLoadStoreOpt() : MachineFunctionPass(&ID) {}

    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    ARMFunctionInfo *AFI;
    RegScavenger *RS;
    bool isThumb2;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM load / store optimization pass";
    }

  private:
    struct MemOpQueueEntry {
      int Offset;
      unsigned Position;
      MachineBasicBlock::iterator MBBI;
      bool Merged;
      MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i)
        : Offset(o), Position(p), MBBI(i), Merged(false) {}
    };
    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
    typedef MemOpQueue::iterator MemOpQueueIter;

    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  int Offset, unsigned Base, bool BaseKill, int Opcode,
                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
                  DebugLoc dl, SmallVector<std::pair<unsigned, bool>, 8> &Regs);
    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
                      int Opcode, unsigned Size,
                      ARMCC::CondCodes Pred, unsigned PredReg,
                      unsigned Scratch, MemOpQueue &MemOps,
                      SmallVector<MachineBasicBlock::iterator, 4> &Merges);

    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &MBBI);
    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
                                  const TargetInstrInfo *TII,
                                  bool &Advance,
                                  MachineBasicBlock::iterator &I);
    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   bool &Advance,
                                   MachineBasicBlock::iterator &I);
    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
  };
  char ARMLoadStoreOpt::ID = 0;
}

static int getLoadStoreMultipleOpcode(int Opcode) {
  switch (Opcode) {
  case ARM::LDR:
    NumLDMGened++;
    return ARM::LDM;
  case ARM::STR:
    NumSTMGened++;
    return ARM::STM;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    NumLDMGened++;
    return ARM::t2LDM;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    NumSTMGened++;
    return ARM::t2STM;
  case ARM::VLDRS:
    NumVLDMGened++;
    return ARM::VLDMS;
  case ARM::VSTRS:
    NumVSTMGened++;
    return ARM::VSTMS;
  case ARM::VLDRD:
    NumVLDMGened++;
    return ARM::VLDMD;
  case ARM::VSTRD:
    NumVSTMGened++;
    return ARM::VSTMD;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}

static bool isT2i32Load(unsigned Opc) {
  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
}

static bool isi32Load(unsigned Opc) {
  return Opc == ARM::LDR || isT2i32Load(Opc);
}

static bool isT2i32Store(unsigned Opc) {
  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
}

static bool isi32Store(unsigned Opc) {
  return Opc == ARM::STR || isT2i32Store(Opc);
}

/// MergeOps - Create and insert a LDM or STM with Base as base register and
/// registers in Regs as the register operands that would be loaded / stored.
/// It returns true if the transformation is done.
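///
/// If the starting offset is non-zero and at least three registers are being
/// merged, a new base is materialized first. For a load, the last destination
/// register doubles as the new base, e.g. (registers and offset illustrative):
///   add r6, r0, #16
///   ldmia r6, {r4, r5, r6}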
bool
ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI,
                          int Offset, unsigned Base, bool BaseKill,
                          int Opcode, ARMCC::CondCodes Pred,
                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
                          SmallVector<std::pair<unsigned, bool>, 8> &Regs) {
  // Only a single register to load / store. Don't bother.
  unsigned NumRegs = Regs.size();
  if (NumRegs <= 1)
    return false;

  ARM_AM::AMSubMode Mode = ARM_AM::ia;
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  if (isAM4 && Offset == 4) {
    if (isThumb2)
      // Thumb2 does not support ldmib / stmib.
      return false;
    Mode = ARM_AM::ib;
  } else if (isAM4 && Offset == -4 * (int)NumRegs + 4) {
    if (isThumb2)
      // Thumb2 does not support ldmda / stmda.
      return false;
    Mode = ARM_AM::da;
  } else if (isAM4 && Offset == -4 * (int)NumRegs) {
    Mode = ARM_AM::db;
  } else if (Offset != 0) {
    // If the starting offset isn't zero, insert an instruction to materialize
    // a new base. But only do so if it is cost effective, i.e. merging more
    // than two loads / stores.
    if (NumRegs <= 2)
      return false;

    unsigned NewBase;
    if (isi32Load(Opcode))
      // If it is a load, then just use one of the destination registers
      // as the new base.
      NewBase = Regs[NumRegs-1].first;
    else {
      // Use the scratch register as the new base.
      NewBase = Scratch;
      if (NewBase == 0)
        return false;
    }
    int BaseOpc = !isThumb2
      ? ARM::ADDri
      : ((Base == ARM::SP) ? ARM::t2ADDrSPi : ARM::t2ADDri);
    if (Offset < 0) {
      BaseOpc = !isThumb2
        ? ARM::SUBri
        : ((Base == ARM::SP) ? ARM::t2SUBrSPi : ARM::t2SUBri);
      Offset = - Offset;
    }
    int ImmedOffset = isThumb2
      ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
    if (ImmedOffset == -1)
      // FIXME: Try t2ADDri12 or t2SUBri12?
      return false; // Probably not worth it then.

    BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
      .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
      .addImm(Pred).addReg(PredReg).addReg(0);
    Base = NewBase;
    BaseKill = true;  // New base is always killed right after its use.
  }

  bool isDPR = Opcode == ARM::VLDRD || Opcode == ARM::VSTRD;
  bool isDef = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
  Opcode = getLoadStoreMultipleOpcode(Opcode);
  MachineInstrBuilder MIB = (isAM4)
    ? BuildMI(MBB, MBBI, dl, TII->get(Opcode))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg)
    : BuildMI(MBB, MBBI, dl, TII->get(Opcode))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs))
        .addImm(Pred).addReg(PredReg);
  MIB.addReg(0); // Add optional writeback (0 for now).
  for (unsigned i = 0; i != NumRegs; ++i)
    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
                     | getKillRegState(Regs[i].second));

  return true;
}

/// MergeLDR_STR - Merge a number of load / store instructions into one or more
/// load / store multiple instructions.
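///
/// Entries in MemOps starting at SIndex are grouped as long as offsets ascend
/// by Size and register numbers ascend; the first entry that breaks the run
/// ends the current group, and the remaining entries are retried recursively.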
void
ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
                          unsigned Base, int Opcode, unsigned Size,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          unsigned Scratch, MemOpQueue &MemOps,
                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  int Offset = MemOps[SIndex].Offset;
  int SOffset = Offset;
  unsigned Pos = MemOps[SIndex].Position;
  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
  DebugLoc dl = Loc->getDebugLoc();
  unsigned PReg = Loc->getOperand(0).getReg();
  unsigned PRegNum = ARMRegisterInfo::getRegisterNumbering(PReg);
  bool isKill = Loc->getOperand(0).isKill();

  SmallVector<std::pair<unsigned,bool>, 8> Regs;
  Regs.push_back(std::make_pair(PReg, isKill));
  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
    int NewOffset = MemOps[i].Offset;
    unsigned Reg = MemOps[i].MBBI->getOperand(0).getReg();
    unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
    isKill = MemOps[i].MBBI->getOperand(0).isKill();
    // AM4 - register numbers in ascending order.
    // AM5 - consecutive register numbers in ascending order.
    if (NewOffset == Offset + (int)Size &&
        ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) {
      Offset += Size;
      Regs.push_back(std::make_pair(Reg, isKill));
      PRegNum = RegNum;
    } else {
      // Can't merge this in. Try to merge the earlier ones first.
      if (MergeOps(MBB, ++Loc, SOffset, Base, false, Opcode, Pred, PredReg,
                   Scratch, dl, Regs)) {
        Merges.push_back(prior(Loc));
        for (unsigned j = SIndex; j < i; ++j) {
          MBB.erase(MemOps[j].MBBI);
          MemOps[j].Merged = true;
        }
      }
      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
                   MemOps, Merges);
      return;
    }

    if (MemOps[i].Position > Pos) {
      Pos = MemOps[i].Position;
      Loc = MemOps[i].MBBI;
    }
  }

  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
  if (MergeOps(MBB, ++Loc, SOffset, Base, BaseKill, Opcode, Pred, PredReg,
               Scratch, dl, Regs)) {
    Merges.push_back(prior(Loc));
    for (unsigned i = SIndex, e = MemOps.size(); i != e; ++i) {
      MBB.erase(MemOps[i].MBBI);
      MemOps[i].Merged = true;
    }
  }

  return;
}

static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
                                       unsigned Bytes, unsigned Limit,
                                       ARMCC::CondCodes Pred, unsigned PredReg){
  unsigned MyPredReg = 0;
  if (!MI)
    return false;
  if (MI->getOpcode() != ARM::t2SUBri &&
      MI->getOpcode() != ARM::t2SUBrSPi &&
      MI->getOpcode() != ARM::t2SUBrSPi12 &&
      MI->getOpcode() != ARM::tSUBspi &&
      MI->getOpcode() != ARM::SUBri)
    return false;

  // Make sure the offset fits in 8 bits.
  if (Bytes <= 0 || (Limit && Bytes >= Limit))
    return false;

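  // tSUBspi encodes its immediate in words, so scale it by 4 when comparing
  // against the byte count below.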
  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME
  return (MI->getOperand(0).getReg() == Base &&
          MI->getOperand(1).getReg() == Base &&
          (MI->getOperand(2).getImm()*Scale) == Bytes &&
          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
          MyPredReg == PredReg);
}

static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
                                       unsigned Bytes, unsigned Limit,
                                       ARMCC::CondCodes Pred, unsigned PredReg){
  unsigned MyPredReg = 0;
  if (!MI)
    return false;
  if (MI->getOpcode() != ARM::t2ADDri &&
      MI->getOpcode() != ARM::t2ADDrSPi &&
      MI->getOpcode() != ARM::t2ADDrSPi12 &&
      MI->getOpcode() != ARM::tADDspi &&
      MI->getOpcode() != ARM::ADDri)
    return false;

  // Make sure the offset fits in 8 bits.
  if (Bytes <= 0 || (Limit && Bytes >= Limit))
    return false;

  unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME
  return (MI->getOperand(0).getReg() == Base &&
          MI->getOperand(1).getReg() == Base &&
          (MI->getOperand(2).getImm()*Scale) == Bytes &&
          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
          MyPredReg == PredReg);
}

static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default: return 0;
  case ARM::LDR:
  case ARM::STR:
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
  case ARM::VLDRS:
  case ARM::VSTRS:
    return 4;
  case ARM::VLDRD:
  case ARM::VSTRD:
    return 8;
  case ARM::LDM:
  case ARM::STM:
  case ARM::t2LDM:
  case ARM::t2STM:
    return (MI->getNumOperands() - 5) * 4;
  case ARM::VLDMS:
  case ARM::VSTMS:
  case ARM::VLDMD:
  case ARM::VSTMD:
    return ARM_AM::getAM5Offset(MI->getOperand(1).getImm()) * 4;
  }
}

/// MergeBaseUpdateLSMultiple - Fold preceding/trailing inc/dec of base
/// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
///
/// stmia rn, <ra, rb, rc>
/// rn := rn + 4 * 3;
/// =>
/// stmia rn!, <ra, rb, rc>
///
/// rn := rn - 4 * 3;
/// ldmia rn, <ra, rb, rc>
/// =>
/// ldmdb rn!, <ra, rb, rc>
bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(0).getReg();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
  int Opcode = MI->getOpcode();
  bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::t2LDM ||
    Opcode == ARM::STM || Opcode == ARM::t2STM;

  if (isAM4) {
    if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm()))
      return false;

    // Can't use the updating AM4 sub-mode if the base register is also a dest
    // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
    for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) {
      if (MI->getOperand(i).getReg() == Base)
        return false;
    }

    ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm());
    if (MBBI != MBB.begin()) {
      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
      if (Mode == ARM_AM::ia &&
          isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true));
        MI->getOperand(4).setReg(Base);
        MI->getOperand(4).setIsDef();
        MBB.erase(PrevMBBI);
        return true;
      } else if (Mode == ARM_AM::ib &&
                 isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true));
        MI->getOperand(4).setReg(Base);  // WB to base
        MI->getOperand(4).setIsDef();
        MBB.erase(PrevMBBI);
        return true;
      }
    }

    if (MBBI != MBB.end()) {
      MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
      if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
          isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
        MI->getOperand(4).setReg(Base);  // WB to base
        MI->getOperand(4).setIsDef();
        if (NextMBBI == I) {
          Advance = true;
          ++I;
        }
        MBB.erase(NextMBBI);
        return true;
      } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
                 isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
        MI->getOperand(4).setReg(Base);  // WB to base
        MI->getOperand(4).setIsDef();
        if (NextMBBI == I) {
          Advance = true;
          ++I;
        }
        MBB.erase(NextMBBI);
        return true;
      }
    }
  } else {
    // VLDM{D|S}, VSTM{D|S} addressing mode 5 ops.
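    // e.g. (illustrative) "vldmia r0, {d0, d1}" followed by "r0 := r0 + 16"
    // can become "vldmia r0!, {d0, d1}".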
    if (ARM_AM::getAM5WBFlag(MI->getOperand(1).getImm()))
      return false;

    ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MI->getOperand(1).getImm());
    unsigned Offset = ARM_AM::getAM5Offset(MI->getOperand(1).getImm());
    if (MBBI != MBB.begin()) {
      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
      if (Mode == ARM_AM::ia &&
          isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset));
        MI->getOperand(4).setReg(Base);  // WB to base
        MI->getOperand(4).setIsDef();
        MBB.erase(PrevMBBI);
        return true;
      }
    }

    if (MBBI != MBB.end()) {
      MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
      if (Mode == ARM_AM::ia &&
          isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset));
        MI->getOperand(4).setReg(Base);  // WB to base
        MI->getOperand(4).setIsDef();
        if (NextMBBI == I) {
          Advance = true;
          ++I;
        }
        MBB.erase(NextMBBI);
        // Only report success when a merge was actually performed.
        return true;
      }
    }
  }

  return false;
}

static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
  switch (Opc) {
  case ARM::LDR: return ARM::LDR_PRE;
  case ARM::STR: return ARM::STR_PRE;
  case ARM::VLDRS: return ARM::VLDMS;
  case ARM::VLDRD: return ARM::VLDMD;
  case ARM::VSTRS: return ARM::VSTMS;
  case ARM::VSTRD: return ARM::VSTMD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_PRE;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_PRE;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}

static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
  switch (Opc) {
  case ARM::LDR: return ARM::LDR_POST;
  case ARM::STR: return ARM::STR_POST;
  case ARM::VLDRS: return ARM::VLDMS;
  case ARM::VLDRD: return ARM::VLDMD;
  case ARM::VSTRS: return ARM::VSTMS;
  case ARM::VSTRD: return ARM::VSTMD;
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
    return ARM::t2LDR_POST;
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return ARM::t2STR_POST;
  default: llvm_unreachable("Unhandled opcode!");
  }
  return 0;
}

/// MergeBaseUpdateLoadStore - Fold preceding/trailing inc/dec of base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
///
/// ldr r1, [r0]
/// r0 := r0 + 4
/// =>
/// ldr r1, [r0], #4
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const TargetInstrInfo *TII,
                                     bool &Advance,
                                     MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(1).getReg();
  bool BaseKill = MI->getOperand(1).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();
  bool isAM5 = Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
    Opcode == ARM::VSTRD || Opcode == ARM::VSTRS;
  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
  if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0)
    return false;
  else if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;
  else if (isT2i32Load(Opcode) || isT2i32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;

  bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
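  // e.g. a post-indexed "ldr r0, [r0], #4" would be UNPREDICTABLE.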
  if (isLd && MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
  bool DoMerge = false;
  ARM_AM::AddrOpc AddSub = ARM_AM::add;
  unsigned NewOpc = 0;
  // AM2 - 12 bits, thumb2 - 8 bits.
  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
  if (MBBI != MBB.begin()) {
    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    } else if (!isAM5 &&
               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge)
      MBB.erase(PrevMBBI);
  }

  if (!DoMerge && MBBI != MBB.end()) {
    MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
    if (!isAM5 &&
        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge) {
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  bool isDPR = NewOpc == ARM::VLDMD || NewOpc == ARM::VSTMD;
  unsigned Offset = 0;
  if (isAM5)
    Offset = ARM_AM::getAM5Opc((AddSub == ARM_AM::sub)
                               ? ARM_AM::db
                               : ARM_AM::ia, true, (isDPR ? 2 : 1));
  else if (isAM2)
    Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
  else
    Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
  if (isLd) {
    if (isAM5)
      // VLDMS, VLDMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(Offset).addImm(Pred).addReg(PredReg)
        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MI->getOperand(0).getReg(), RegState::Define);
    else if (isAM2)
      // LDR_PRE, LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineOperand &MO = MI->getOperand(0);
    if (isAM5)
      // VSTMS, VSTMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc)).addReg(Base).addImm(Offset)
        .addImm(Pred).addReg(PredReg)
        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MO.getReg(), getKillRegState(MO.isKill()));
    else if (isAM2)
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  }
  MBB.erase(MBBI);

  return true;
}

/// isMemoryOp - Returns true if instruction is a memory operation (that this
/// pass is capable of operating on).
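/// Only the simple immediate-offset forms are handled; an ARM LDR / STR with
/// a register offset is rejected.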
static bool isMemoryOp(const MachineInstr *MI) {
  int Opcode = MI->getOpcode();
  switch (Opcode) {
  default: break;
  case ARM::LDR:
  case ARM::STR:
    return MI->getOperand(1).isReg() && MI->getOperand(2).getReg() == 0;
  case ARM::VLDRS:
  case ARM::VSTRS:
    return MI->getOperand(1).isReg();
  case ARM::VLDRD:
  case ARM::VSTRD:
    return MI->getOperand(1).isReg();
  case ARM::t2LDRi8:
  case ARM::t2LDRi12:
  case ARM::t2STRi8:
  case ARM::t2STRi12:
    return MI->getOperand(1).isReg();
  }
  return false;
}

/// AdvanceRS - Advance register scavenger to just before the earliest memory
/// op that is being merged.
void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
  unsigned Position = MemOps[0].Position;
  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
    if (MemOps[i].Position < Position) {
      Position = MemOps[i].Position;
      Loc = MemOps[i].MBBI;
    }
  }

  if (Loc != MBB.begin())
    RS->forward(prior(Loc));
}

static int getMemoryOpOffset(const MachineInstr *MI) {
  int Opcode = MI->getOpcode();
  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
  unsigned NumOperands = MI->getDesc().getNumOperands();
  unsigned OffField = MI->getOperand(NumOperands-3).getImm();

  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8)
    return OffField;

  int Offset = isAM2
    ? ARM_AM::getAM2Offset(OffField)
    : (isAM3 ? ARM_AM::getAM3Offset(OffField)
             : ARM_AM::getAM5Offset(OffField) * 4);
  if (isAM2) {
    if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  } else if (isAM3) {
    if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  } else {
    if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
      Offset = -Offset;
  }
  return Offset;
}

static void InsertLDR_STR(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator &MBBI,
                          int OffImm, bool isDef,
                          DebugLoc dl, unsigned NewOpc,
                          unsigned Reg, bool RegDeadKill, bool RegUndef,
                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
                          unsigned OffReg, bool OffKill, bool OffUndef,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          const TargetInstrInfo *TII, bool isT2) {
  int Offset = OffImm;
  if (!isT2) {
    if (OffImm < 0)
      Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift);
    else
      Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift);
  }
  if (isDef) {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    if (!isT2)
      MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
                                      TII->get(NewOpc))
      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
    if (!isT2)
      MIB.addReg(OffReg, getKillRegState(OffKill)|getUndefRegState(OffUndef));
    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
  }
}

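/// FixInvalidRegPairOp - Expand an LDRD / STRD whose register operands do not
/// form a legal even / odd pair into either a single LDM / STM or a pair of
/// LDR / STR instructions.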
bool
ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
    unsigned EvenReg = MI->getOperand(0).getReg();
    unsigned OddReg = MI->getOperand(1).getReg();
    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
    unsigned OddRegNum = TRI->getDwarfRegNum(OddReg, false);
    if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum)
      return false;

    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
    bool EvenDeadKill = isLd ?
      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
    bool EvenUndef = MI->getOperand(0).isUndef();
    bool OddDeadKill = isLd ?
      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
    bool OddUndef = MI->getOperand(1).isUndef();
    const MachineOperand &BaseOp = MI->getOperand(2);
    unsigned BaseReg = BaseOp.getReg();
    bool BaseKill = BaseOp.isKill();
    bool BaseUndef = BaseOp.isUndef();
    unsigned OffReg = isT2 ? 0 : MI->getOperand(3).getReg();
    bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
    bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
    int OffImm = getMemoryOpOffset(MI);
    unsigned PredReg = 0;
    ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);

    if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) {
      // Ascending register numbers and no offset. It's safe to change it to a
      // ldm or stm.
      unsigned NewOpc = (isLd)
        ? (isT2 ? ARM::t2LDM : ARM::LDM)
        : (isT2 ? ARM::t2STM : ARM::STM);
      if (isLd) {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(0)
          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
          .addReg(OddReg, getDefRegState(isLd) | getDeadRegState(OddDeadKill));
        ++NumLDRD2LDM;
      } else {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(0)
          .addReg(EvenReg,
                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
          .addReg(OddReg,
                  getKillRegState(OddDeadKill) | getUndefRegState(OddUndef));
        ++NumSTRD2STM;
      }
    } else {
      // Split into two instructions.
      assert((!isT2 || !OffReg) &&
             "Thumb2 ldrd / strd does not encode offset register!");
      unsigned NewOpc = (isLd)
        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDR)
        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STR);
      DebugLoc dl = MBBI->getDebugLoc();
      // If this is a load and the base register is killed, it may have been
      // re-defed by the load; make sure the first load does not clobber it.
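      // e.g. for "ldrd r0, r1, [r0<kill>]" the second half must be loaded
      // into r1 first, while the base is still live (registers illustrative).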
      if (isLd &&
          (BaseKill || OffKill) &&
          (TRI->regsOverlap(EvenReg, BaseReg) ||
           (OffReg && TRI->regsOverlap(EvenReg, OffReg)))) {
        assert(!TRI->regsOverlap(OddReg, BaseReg) &&
               (!OffReg || !TRI->regsOverlap(OddReg, OffReg)));
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, false,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, false,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      } else {
        if (OddReg == EvenReg && EvenDeadKill) {
          // If the two source operands are the same, the kill marker is
          // probably on the first one. e.g.
          // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
          EvenDeadKill = false;
          OddDeadKill = true;
        }
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, EvenUndef,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, OddUndef,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      }
      if (isLd)
        ++NumLDRD2LDR;
      else
        ++NumSTRD2STR;
    }

    MBBI = prior(MBBI);
    MBB.erase(MI);
  }
  return false;
}

/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
/// ops of the same base and incrementing offset into LDM / STM ops.
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  unsigned NumMerges = 0;
  unsigned NumMemOps = 0;
  MemOpQueue MemOps;
  unsigned CurrBase = 0;
  int CurrOpc = -1;
  unsigned CurrSize = 0;
  ARMCC::CondCodes CurrPred = ARMCC::AL;
  unsigned CurrPredReg = 0;
  unsigned Position = 0;
  SmallVector<MachineBasicBlock::iterator,4> Merges;

  RS->enterBasicBlock(&MBB);
  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  while (MBBI != E) {
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;

    bool Advance  = false;
    bool TryMerge = false;
    bool Clobber  = false;

    bool isMemOp = isMemoryOp(MBBI);
    if (isMemOp) {
      int Opcode = MBBI->getOpcode();
      unsigned Size = getLSMultipleTransferSize(MBBI);
      unsigned Base = MBBI->getOperand(1).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      // Watch out for:
      // r4 := ldr [r5]
      // r5 := ldr [r5, #4]
      // r6 := ldr [r5, #8]
      //
      // The second ldr has effectively broken the chain even though it
      // looks like the later ldr(s) use the same base register. Try to
      // merge the ldr's so far, including this one. But don't try to
      // combine the following ldr(s).
      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());
      if (CurrBase == 0 && !Clobber) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc  = Opcode;
        CurrSize = Size;
        CurrPred = Pred;
        CurrPredReg = PredReg;
        MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
        NumMemOps++;
        Advance = true;
      } else {
        if (Clobber) {
          TryMerge = true;
          Advance = true;
        }

        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
          // No need to match PredReg.
          // Continue adding to the queue.
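          // Keep MemOps sorted by increasing offset; an exact offset
          // collision below terminates the current chain.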
          if (Offset > MemOps.back().Offset) {
            MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
            NumMemOps++;
            Advance = true;
          } else {
            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
                 I != E; ++I) {
              if (Offset < I->Offset) {
                MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI));
                NumMemOps++;
                Advance = true;
                break;
              } else if (Offset == I->Offset) {
                // Collision! This can't be merged!
                break;
              }
            }
          }
        }
      }
    }

    if (Advance) {
      ++Position;
      ++MBBI;
      if (MBBI == E)
        // Reached the end of the block; try merging the memory instructions.
        TryMerge = true;
    } else
      TryMerge = true;

    if (TryMerge) {
      if (NumMemOps > 1) {
        // Try to find a free register to use as a new base in case it's needed.
        // First advance to the instruction just before the start of the chain.
        AdvanceRS(MBB, MemOps);
        // Find a scratch register.
        unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass);
        // Process the load / store instructions.
        RS->forward(prior(MBBI));

        // Merge ops.
        Merges.clear();
        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);

        // Try folding preceding/trailing base inc/dec into the generated
        // LDM/STM ops.
        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
            ++NumMerges;
        NumMerges += Merges.size();

        // Try folding preceding/trailing base inc/dec into those load/store
        // that were not merged to form LDM/STM ops.
        for (unsigned i = 0; i != NumMemOps; ++i)
          if (!MemOps[i].Merged)
            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
              ++NumMerges;

        // RS may be pointing to an instruction that's deleted.
        RS->skipTo(prior(MBBI));
      } else if (NumMemOps == 1) {
        // Try folding preceding/trailing base inc/dec into the single
        // load/store.
        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
          ++NumMerges;
          RS->forward(prior(MBBI));
        }
      }

      CurrBase = 0;
      CurrOpc = -1;
      CurrSize = 0;
      CurrPred = ARMCC::AL;
      CurrPredReg = 0;
      if (NumMemOps) {
        MemOps.clear();
        NumMemOps = 0;
      }

      // If iterator hasn't been advanced and this is not a memory op, skip it.
      // It can't start a new chain anyway.
      if (!Advance && !isMemOp && MBBI != E) {
        ++Position;
        ++MBBI;
      }
    }
  }
  return NumMerges > 0;
}

namespace {
  struct OffsetCompare {
    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
      int LOffset = getMemoryOpOffset(LHS);
      int ROffset = getMemoryOpOffset(RHS);
      assert(LHS == RHS || LOffset != ROffset);
      return LOffset > ROffset;
    }
  };
}

/// MergeReturnIntoLDM - If this is an exit BB, try merging the return op
/// (bx lr) into the preceding stack restore so it directly restores the value
/// of LR into pc.
///   ldmfd sp!, {r7, lr}
///   bx lr
/// =>
///   ldmfd sp!, {r7, pc}
bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
  if (MBB.empty()) return false;

  MachineBasicBlock::iterator MBBI = prior(MBB.end());
  if (MBBI != MBB.begin() &&
      (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET)) {
    MachineInstr *PrevMI = prior(MBBI);
    if (PrevMI->getOpcode() == ARM::LDM || PrevMI->getOpcode() == ARM::t2LDM) {
      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
      if (MO.getReg() != ARM::LR)
        return false;
      unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET;
      PrevMI->setDesc(TII->get(NewOpc));
      MO.setReg(ARM::PC);
      MBB.erase(MBBI);
      return true;
    }
  }
  return false;
}

bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  const TargetMachine &TM = Fn.getTarget();
  AFI = Fn.getInfo<ARMFunctionInfo>();
  TII = TM.getInstrInfo();
  TRI = TM.getRegisterInfo();
  RS = new RegScavenger();
  isThumb2 = AFI->isThumb2Function();

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI) {
    MachineBasicBlock &MBB = *MFI;
    Modified |= LoadStoreMultipleOpti(MBB);
    Modified |= MergeReturnIntoLDM(MBB);
  }

  delete RS;
  return Modified;
}


/// ARMPreAllocLoadStoreOpt - Pre-register allocation pass that moves
/// loads / stores from consecutive locations closer together to make it more
/// likely they will be combined later.

namespace {
  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
    static char ID;
    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(&ID) {}

    const TargetData *TD;
    const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
    const ARMSubtarget *STI;
    MachineRegisterInfo *MRI;
    MachineFunction *MF;

    virtual bool runOnMachineFunction(MachineFunction &Fn);

    virtual const char *getPassName() const {
      return "ARM pre- register allocation load / store optimization pass";
    }

  private:
    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                          unsigned &NewOpc, unsigned &EvenReg,
                          unsigned &OddReg, unsigned &BaseReg,
                          unsigned &OffReg, int &Offset,
                          unsigned &PredReg, ARMCC::CondCodes &Pred,
                          bool &isT2);
    bool RescheduleOps(MachineBasicBlock *MBB,
                       SmallVector<MachineInstr*, 4> &Ops,
                       unsigned Base, bool isLd,
                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
  };
  char ARMPreAllocLoadStoreOpt::ID = 0;
}

bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  TD  = Fn.getTarget().getTargetData();
  TII = Fn.getTarget().getInstrInfo();
  TRI = Fn.getTarget().getRegisterInfo();
  STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
  MRI = &Fn.getRegInfo();
  MF  = &Fn;

  bool Modified = false;
  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
       ++MFI)
    Modified |= RescheduleLoadStoreInstrs(MFI);

  return Modified;
}

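/// IsSafeAndProfitableToMove - Check that no instruction between the memory
/// ops being grouped would make moving them together unsafe (calls, aliasing
/// loads / stores, defs of the base), and estimate the register pressure the
/// move would add.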
static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
                                      MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator E,
                                      SmallPtrSet<MachineInstr*, 4> &MemOps,
                                      SmallSet<unsigned, 4> &MemRegs,
                                      const TargetRegisterInfo *TRI) {
  // Are there stores / loads / calls between them?
  // FIXME: This is overly conservative. We should make use of alias information
  // some day.
  SmallSet<unsigned, 4> AddedRegPressure;
  while (++I != E) {
    if (MemOps.count(&*I))
      continue;
    const TargetInstrDesc &TID = I->getDesc();
    if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects())
      return false;
    if (isLd && TID.mayStore())
      return false;
    if (!isLd) {
      if (TID.mayLoad())
        return false;
      // It's not safe to move the first 'str' down.
      // str r1, [r0]
      // strh r5, [r0]
      // str r4, [r0, #+4]
      if (TID.mayStore())
        return false;
    }
    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
      MachineOperand &MO = I->getOperand(j);
      if (!MO.isReg())
        continue;
      unsigned Reg = MO.getReg();
      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
        return false;
      if (Reg != Base && !MemRegs.count(Reg))
        AddedRegPressure.insert(Reg);
    }
  }

  // Estimate register pressure increase due to the transformation.
  if (MemRegs.size() <= 4)
    // Ok if we are moving small number of instructions.
    return true;
  return AddedRegPressure.size() <= MemRegs.size() * 2;
}

bool
ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
                                          DebugLoc &dl,
                                          unsigned &NewOpc, unsigned &EvenReg,
                                          unsigned &OddReg, unsigned &BaseReg,
                                          unsigned &OffReg, int &Offset,
                                          unsigned &PredReg,
                                          ARMCC::CondCodes &Pred,
                                          bool &isT2) {
  // Make sure we're allowed to generate LDRD/STRD.
  if (!STI->hasV5TEOps())
    return false;

  // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
  unsigned Scale = 1;
  unsigned Opcode = Op0->getOpcode();
  if (Opcode == ARM::LDR)
    NewOpc = ARM::LDRD;
  else if (Opcode == ARM::STR)
    NewOpc = ARM::STRD;
  else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
    NewOpc = ARM::t2LDRDi8;
    Scale = 4;
    isT2 = true;
  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
    NewOpc = ARM::t2STRDi8;
    Scale = 4;
    isT2 = true;
  } else
    return false;

  // Make sure the offset registers match.
  if (!isT2 &&
      (Op0->getOperand(2).getReg() != Op1->getOperand(2).getReg()))
    return false;

  // Make sure the base address satisfies the i64 ld / st alignment
  // requirement.
  if (!Op0->hasOneMemOperand() ||
      !(*Op0->memoperands_begin())->getValue() ||
      (*Op0->memoperands_begin())->isVolatile())
    return false;

  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
  Function *Func = MF->getFunction();
  unsigned ReqAlign = STI->hasV6Ops()
    ? TD->getPrefTypeAlignment(Type::getInt64Ty(Func->getContext()))
    : 8;  // Pre-v6 needs 8-byte alignment.
  if (Align < ReqAlign)
    return false;

  // Then make sure the immediate offset fits.
  int OffImm = getMemoryOpOffset(Op0);
  if (isT2) {
    if (OffImm < 0) {
      if (OffImm < -255)
        // Can't fall back to t2LDRi8 / t2STRi8.
        return false;
    } else {
      int Limit = (1 << 8) * Scale;
      if (OffImm >= Limit || (OffImm & (Scale-1)))
        return false;
    }
    Offset = OffImm;
  } else {
    ARM_AM::AddrOpc AddSub = ARM_AM::add;
    if (OffImm < 0) {
      AddSub = ARM_AM::sub;
      OffImm = - OffImm;
    }
    int Limit = (1 << 8) * Scale;
    if (OffImm >= Limit || (OffImm & (Scale-1)))
      return false;
    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
  }
  EvenReg = Op0->getOperand(0).getReg();
  OddReg  = Op1->getOperand(0).getReg();
  if (EvenReg == OddReg)
    return false;
  BaseReg = Op0->getOperand(1).getReg();
  if (!isT2)
    OffReg = Op0->getOperand(2).getReg();
  Pred = llvm::getInstrPredicate(Op0, PredReg);
  dl = Op0->getDebugLoc();
  return true;
}

bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
                                 SmallVector<MachineInstr*, 4> &Ops,
                                 unsigned Base, bool isLd,
                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
  bool RetVal = false;

  // Sort by offset (in reverse order).
  std::sort(Ops.begin(), Ops.end(), OffsetCompare());

  // The loads / stores of the same base are in order. Scan them from first to
  // last and check for the following:
  // 1. Any def of base.
  // 2. Any gaps.
  while (Ops.size() > 1) {
    unsigned FirstLoc = ~0U;
    unsigned LastLoc = 0;
    MachineInstr *FirstOp = 0;
    MachineInstr *LastOp = 0;
    int LastOffset = 0;
    unsigned LastOpcode = 0;
    unsigned LastBytes = 0;
    unsigned NumMove = 0;
    for (int i = Ops.size() - 1; i >= 0; --i) {
      MachineInstr *Op = Ops[i];
      unsigned Loc = MI2LocMap[Op];
      if (Loc <= FirstLoc) {
        FirstLoc = Loc;
        FirstOp = Op;
      }
      if (Loc >= LastLoc) {
        LastLoc = Loc;
        LastOp = Op;
      }

      unsigned Opcode = Op->getOpcode();
      if (LastOpcode && Opcode != LastOpcode)
        break;

      int Offset = getMemoryOpOffset(Op);
      unsigned Bytes = getLSMultipleTransferSize(Op);
      if (LastBytes) {
        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
          break;
      }
      LastOffset = Offset;
      LastBytes = Bytes;
      LastOpcode = Opcode;
      if (++NumMove == 8) // FIXME: Tune this limit.
        break;
    }

    if (NumMove <= 1)
      Ops.pop_back();
    else {
      SmallPtrSet<MachineInstr*, 4> MemOps;
      SmallSet<unsigned, 4> MemRegs;
      for (int i = NumMove-1; i >= 0; --i) {
        MemOps.insert(Ops[i]);
        MemRegs.insert(Ops[i]->getOperand(0).getReg());
      }

      // Be conservative, if the instructions are too far apart, don't
      // move them. We want to limit the increase of register pressure.
      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
      if (DoMove)
        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
                                           MemOps, MemRegs, TRI);
      if (!DoMove) {
        for (unsigned i = 0; i != NumMove; ++i)
          Ops.pop_back();
      } else {
        // This is the new location for the loads / stores.
        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
        while (InsertPos != MBB->end() && MemOps.count(InsertPos))
          ++InsertPos;

        // If we are moving a pair of loads / stores, see if it makes sense
        // to try to allocate a pair of registers that can form register pairs.
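        // e.g. two loads from [rb] and [rb, #4] into r2 and r3 may become a
        // single "ldrd r2, r3, [rb]" (registers illustrative).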
        MachineInstr *Op0 = Ops.back();
        MachineInstr *Op1 = Ops[Ops.size()-2];
        unsigned EvenReg = 0, OddReg = 0;
        unsigned BaseReg = 0, OffReg = 0, PredReg = 0;
        ARMCC::CondCodes Pred = ARMCC::AL;
        bool isT2 = false;
        unsigned NewOpc = 0;
        int Offset = 0;
        DebugLoc dl;
        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
                                             EvenReg, OddReg, BaseReg, OffReg,
                                             Offset, PredReg, Pred, isT2)) {
          Ops.pop_back();
          Ops.pop_back();

          // Form the pair instruction.
          if (isLd) {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg, RegState::Define)
              .addReg(OddReg, RegState::Define)
              .addReg(BaseReg);
            if (!isT2)
              MIB.addReg(OffReg);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumLDRDFormed;
          } else {
            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
                                              dl, TII->get(NewOpc))
              .addReg(EvenReg)
              .addReg(OddReg)
              .addReg(BaseReg);
            if (!isT2)
              MIB.addReg(OffReg);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
            ++NumSTRDFormed;
          }
          MBB->erase(Op0);
          MBB->erase(Op1);

          // Add register allocation hints to form register pairs.
          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
          MRI->setRegAllocationHint(OddReg,  ARMRI::RegPairOdd, EvenReg);
        } else {
          for (unsigned i = 0; i != NumMove; ++i) {
            MachineInstr *Op = Ops.back();
            Ops.pop_back();
            MBB->splice(InsertPos, MBB, Op);
          }
        }

        NumLdStMoved += NumMove;
        RetVal = true;
      }
    }
  }

  return RetVal;
}

bool
ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
  bool RetVal = false;

  DenseMap<MachineInstr*, unsigned> MI2LocMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
  SmallVector<unsigned, 4> LdBases;
  SmallVector<unsigned, 4> StBases;

  unsigned Loc = 0;
  MachineBasicBlock::iterator MBBI = MBB->begin();
  MachineBasicBlock::iterator E = MBB->end();
  while (MBBI != E) {
    for (; MBBI != E; ++MBBI) {
      MachineInstr *MI = MBBI;
      const TargetInstrDesc &TID = MI->getDesc();
      if (TID.isCall() || TID.isTerminator()) {
        // Stop at barriers.
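        // The next outer iteration resumes scanning past the barrier.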
        ++MBBI;
        break;
      }

      MI2LocMap[MI] = Loc++;
      if (!isMemoryOp(MI))
        continue;
      unsigned PredReg = 0;
      if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL)
        continue;

      int Opc = MI->getOpcode();
      bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
      unsigned Base = MI->getOperand(1).getReg();
      int Offset = getMemoryOpOffset(MI);

      bool StopHere = false;
      if (isLd) {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2LdsMap.find(Base);
        if (BI != Base2LdsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2LdsMap[Base] = MIs;
          LdBases.push_back(Base);
        }
      } else {
        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
          Base2StsMap.find(Base);
        if (BI != Base2StsMap.end()) {
          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
            if (Offset == getMemoryOpOffset(BI->second[i])) {
              StopHere = true;
              break;
            }
          }
          if (!StopHere)
            BI->second.push_back(MI);
        } else {
          SmallVector<MachineInstr*, 4> MIs;
          MIs.push_back(MI);
          Base2StsMap[Base] = MIs;
          StBases.push_back(Base);
        }
      }

      if (StopHere) {
        // Found a duplicate (a base+offset combination that's seen earlier).
        // Backtrack.
        --Loc;
        break;
      }
    }

    // Re-schedule loads.
    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
      unsigned Base = LdBases[i];
      SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base];
      if (Lds.size() > 1)
        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
    }

    // Re-schedule stores.
    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
      unsigned Base = StBases[i];
      SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base];
      if (Sts.size() > 1)
        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
    }

    if (MBBI != E) {
      Base2LdsMap.clear();
      Base2StsMap.clear();
      LdBases.clear();
      StBases.clear();
    }
  }

  return RetVal;
}


/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
/// optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
  if (PreAlloc)
    return new ARMPreAllocLoadStoreOpt();
  return new ARMLoadStoreOpt();
}