//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

/// AArch64LoadStoreOpt - Post-register allocation pass to combine
/// load / store instructions to form ldp / stp instructions.

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store from unscaled generated");
STATISTIC(NumSmallTypeMerged, "Number of small type loads merged");

static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

namespace llvm {
void initializeAArch64LoadStoreOptPass(PassRegistry &);
}

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {

typedef struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx;

  LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }

} LdStPairFlags;

struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;
  AArch64LoadStoreOpt() : MachineFunctionPass(ID), IsStrictAlign(false) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  bool IsStrictAlign;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit);
  // Merge the two instructions indicated into a single pair-wise instruction.
  // If MergeForward is true, erase the first instruction and fold its
  // operation into the second. If false, the reverse. Return the instruction
  // following the first instruction (which may change during processing).
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
                                int UnscaledOffset);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge foldable ldr/str instructions.
  bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  const char *getPassName() const override {
    return AARCH64_LOAD_STORE_OPT_NAME;
  }
};
char AArch64LoadStoreOpt::ID = 0;
} // namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

static bool isUnscaledLdSt(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  case AArch64::LDURSWi:
  case AArch64::LDURHHi:
    return true;
  }
}

static bool isUnscaledLdSt(MachineInstr *MI) {
  return isUnscaledLdSt(MI->getOpcode());
}

static bool isSmallTypeLdMerge(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
    return true;
    // FIXME: Add other instructions (e.g, LDRBBui, LDURSHWi, LDRSHWui, etc.).
  }
}
static bool isSmallTypeLdMerge(MachineInstr *MI) {
  return isSmallTypeLdMerge(MI->getOpcode());
}

// Scaling factor for unscaled load or store.
static int getMemScale(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::STRBBui:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::STRHHui:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::LDPQi:
  case AArch64::STPQi:
    return 16;
  }
}

static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return UINT_MAX;
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  }
}

static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  case AArch64::LDRHHui:
    return AArch64::LDRWui;
  case AArch64::LDURHHi:
    return AArch64::LDURWi;
  }
}

static unsigned getPreIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  }
}

static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpost;
  case AArch64::STRDui:
    return AArch64::STRDpost;
  case AArch64::STRQui:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
    return AArch64::STRWpost;
  case AArch64::STRXui:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  }
}

static bool isPairedLdSt(const MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    return true;
  }
}

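// The accessors below rely on the operand layout of the load/store opcodes
// handled by this pass: a non-paired instruction has the transfer register at
// operand 0, the base register at operand 1 and the immediate offset at
// operand 2; a paired instruction has its two transfer registers at operands
// 0 and 1, the base register at operand 2 and the immediate offset at
// operand 3.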
static const MachineOperand &getLdStRegOp(const MachineInstr *MI,
                                          unsigned PairedRegOp = 0) {
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
  return MI->getOperand(Idx);
}

static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
  return MI->getOperand(Idx);
}

static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
  return MI->getOperand(Idx);
}

// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
                                   MachineInstr *Op1) {
  assert(MI->memoperands_empty() && "expected a new machineinstr");
  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
                      (Op1->memoperands_end() - Op1->memoperands_begin());

  MachineFunction *MF = MI->getParent()->getParent();
  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
  MachineSDNode::mmo_iterator MemEnd =
      std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
  MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
  MI->setMemRefs(MemBegin, MemEnd);
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = isUnscaledLdSt(Opc);
  int OffsetStride = IsUnscaled ? getMemScale(I) : 1;

  bool MergeForward = Flags.getMergeForward();
  unsigned NewOpc = getMatchingPairOpcode(Opc);
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines which instruction we copy the base register
  // operand from, so that the flags stay compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (getLdStOffsetOp(I).getImm() ==
      getLdStOffsetOp(Paired).getImm() + OffsetStride) {
    RtMI = Paired;
    Rt2MI = I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = I;
    Rt2MI = Paired;
  }

  int OffsetImm = getLdStOffsetOp(RtMI).getImm();

  if (isSmallTypeLdMerge(Opc)) {
    // Change the scaled offset from small to large type.
    if (!IsUnscaled)
      OffsetImm /= 2;
    MachineInstr *RtNewDest = MergeForward ? I : Paired;
    // Construct the new load instruction.
    // FIXME: currently we support only halfword unsigned load. We need to
    // handle byte type, signed, and store instructions as well.
    MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
    NewMemMI = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
                   .addOperand(getLdStRegOp(RtNewDest))
                   .addOperand(BaseRegOp)
                   .addImm(OffsetImm);

    // Copy MachineMemOperands from the original loads.
    concatenateMemOperands(NewMemMI, I, Paired);

    DEBUG(
        dbgs()
        << "Creating the new load and extract. Replacing instructions:\n ");
    DEBUG(I->print(dbgs()));
    DEBUG(dbgs() << " ");
    DEBUG(Paired->print(dbgs()));
    DEBUG(dbgs() << " with instructions:\n ");
    DEBUG((NewMemMI)->print(dbgs()));

    MachineInstr *ExtDestMI = MergeForward ? Paired : I;
    if (ExtDestMI == Rt2MI) {
      // Create the bitfield extract for high half.
      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::UBFMWri))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(16)
                      .addImm(31);
      // Create the bitfield extract for low half.
      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::ANDWri))
                      .addOperand(getLdStRegOp(RtMI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(15);
    } else {
      // Create the bitfield extract for low half.
      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::ANDWri))
                      .addOperand(getLdStRegOp(RtMI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(15);
      // Create the bitfield extract for high half.
      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::UBFMWri))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(16)
                      .addImm(31);
    }
    DEBUG(dbgs() << " ");
    DEBUG((BitExtMI1)->print(dbgs()));
    DEBUG(dbgs() << " ");
    DEBUG((BitExtMI2)->print(dbgs()));
    DEBUG(dbgs() << "\n");

    // Erase the old instructions.
    I->eraseFromParent();
    Paired->eraseFromParent();
    return NextI;
  }

  // Handle Unscaled
  if (IsUnscaled)
    OffsetImm /= OffsetStride;

  // Construct the new instruction.
  MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
                                    I->getDebugLoc(), TII->get(NewOpc))
                                .addOperand(getLdStRegOp(RtMI))
                                .addOperand(getLdStRegOp(Rt2MI))
                                .addOperand(BaseRegOp)
                                .addImm(OffsetImm);
  (void)MIB;

  // FIXME: Do we need/want to copy the mem operands from the source
  // instructions? Probably. What uses them after this?

  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << " ");
  DEBUG(Paired->print(dbgs()));
  DEBUG(dbgs() << " with instruction:\n ");

  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %W1<def> = KILL %W1, %X1<imp-def>
    // %X1<def> = SBFMXri %X1<kill>, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    unsigned DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    DEBUG(dbgs() << " Extend operand:\n ");
    DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  } else {
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  }

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}

/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
                             BitVector &UsedRegs,
                             const TargetRegisterInfo *TRI) {
  for (const MachineOperand &MO : MI->operands()) {
    if (MO.isRegMask())
      ModifiedRegs.setBitsNotInMask(MO.getRegMask());

    if (!MO.isReg())
      continue;
    unsigned Reg = MO.getReg();
    if (MO.isDef()) {
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        ModifiedRegs.set(*AI);
    } else {
      assert(MO.isUse() && "Reg operand not a def and not a use?!?");
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        UsedRegs.set(*AI);
    }
  }
}

static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  if (IsUnscaled)
    Offset /= OffsetStride;

  return Offset <= 63 && Offset >= -64;
}

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}

static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb,
                     const AArch64InstrInfo *TII) {
  // One of the instructions must modify memory.
  if (!MIa->mayStore() && !MIb->mayStore())
    return false;

  // Both instructions must be memory operations.
  if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore())
    return false;

  return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
}

static bool mayAlias(MachineInstr *MIa,
                     SmallVectorImpl<MachineInstr *> &MemInsns,
                     const AArch64InstrInfo *TII) {
  for (auto &MIb : MemInsns)
    if (mayAlias(MIa, MIb, TII))
      return true;

  return false;
}

/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr *FirstMI = I;
  ++MBBI;

  unsigned Opc = FirstMI->getOpcode();
  bool MayLoad = FirstMI->mayLoad();
  bool IsUnscaled = isUnscaledLdSt(FirstMI);
  unsigned Reg = getLdStRegOp(FirstMI).getReg();
  unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
  int Offset = getLdStOffsetOp(FirstMI).getImm();

  // Early exit if the first instruction modifies the base register.
  // e.g., ldr x0, [x0]
  if (FirstMI->modifiesRegister(BaseReg, TRI))
    return E;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1)
  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
  if (!isSmallTypeLdMerge(Opc) &&
      !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    bool CanMergeOpc = Opc == MI->getOpcode();
    Flags.setSExtIdx(-1);
    if (!CanMergeOpc) {
      bool IsValidLdStrOpc;
      unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
      assert(IsValidLdStrOpc &&
             "Given Opc should be a Load or Store with an immediate");
      // Opc will be the first instruction in the pair.
      Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0);
      CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
    }

    if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) {
      assert(MI->mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      //
      // Pairwise instructions have a 7-bit signed offset field. Single insns
      // have a 12-bit unsigned offset field. To be a valid combine, the
      // final offset must be in range.
      unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
      int MIOffset = getLdStOffsetOp(MI).getImm();
      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        // If this is a volatile load/store that otherwise matched, stop
        // looking as something is going on that we don't have enough
        // information to safely transform. Similarly, stop if we see a hint
        // to avoid pairs.
        if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
          return E;
        // If the resultant immediate offset of merging these instructions
        // is out of range for a pairwise instruction, bail and keep looking.
        bool MIIsUnscaled = isUnscaledLdSt(MI);
        bool IsSmallTypeLd = isSmallTypeLdMerge(MI->getOpcode());
        if (!IsSmallTypeLd &&
            !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(MI);
          continue;
        }

        if (IsSmallTypeLd) {
          // If the alignment requirements of the larger type scaled load
          // instruction can't express the scaled offset of the smaller type
          // input, bail and keep looking.
          if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(MI);
            continue;
          }
        } else {
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(MI);
            continue;
          }
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(MI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions and none of the instructions between the second
        // and first alias with the second, we can combine the second into the
        // first.
        if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
            !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
            !mayAlias(MI, MemInsns, TII)) {
          Flags.setMergeForward(false);
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between
        // the first and the second alias with the first, we can combine the
        // first into the second.
        if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
            !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
            !mayAlias(FirstMI, MemInsns, TII)) {
          Flags.setMergeForward(true);
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // If the instruction wasn't a matching load or store, stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI->isCall())
      return E;

    // Update modified / uses register lists.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg])
      return E;

    // Update list of instructions that read/write memory.
    if (MI->mayLoadOrStore())
      MemInsns.push_back(MI);
  }
  return E;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  if (!isPairedLdSt(I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .addOperand(getLdStRegOp(Update))
              .addOperand(getLdStRegOp(I))
              .addOperand(getLdStBaseOp(I))
              .addImm(Value);
  } else {
    // Paired instruction.
    int Scale = getMemScale(I);
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .addOperand(getLdStRegOp(Update))
              .addOperand(getLdStRegOp(I, 0))
              .addOperand(getLdStRegOp(I, 1))
              .addOperand(getLdStBaseOp(I))
              .addImm(Value / Scale);
  }
  (void)MIB;

  if (IsPreIdx)
    DEBUG(dbgs() << "Creating pre-indexed load/store.");
  else
    DEBUG(dbgs() << "Creating post-indexed load/store.");
  DEBUG(dbgs() << " Replacing instructions:\n ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << " ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << " with instruction:\n ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
                                               MachineInstr *MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI->getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
    // Negate the offset for a SUB instruction.
    Offset *= -1;
    // FALLTHROUGH
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI->getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI->getOperand(0).getReg() != BaseReg ||
        MI->getOperand(1).getReg() != BaseReg)
      break;

    bool IsPairedInsn = isPairedLdSt(MemMI);
    int UpdateOffset = MI->getOperand(2).getImm();
    // For non-paired load/store instructions, the immediate must fit in a
    // signed 9-bit integer.
    if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
      break;

    // For paired load/store instructions, the immediate must be a multiple of
    // the scaling factor. The scaled offset must also fit into a signed 7-bit
    // integer.
    if (IsPairedInsn) {
      int Scale = getMemScale(MemMI);
      if (UpdateOffset % Scale != 0)
        break;

      int ScaledOffset = UpdateOffset / Scale;
      if (ScaledOffset > 64 || ScaledOffset < -64)
        break;
    }

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == MI->getOperand(2).getImm())
      return true;
    break;
  }
  return false;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;

  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a destination register, we can't
  // merge the update.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  ++MBBI;
  for (unsigned Count = 0; MBBI != E; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;

  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
  int Offset = getLdStOffsetOp(MemMI).getImm();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps a destination register, we can't
  // merge the update.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  --MBBI;
  for (unsigned Count = 0; MBBI != B; --MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(I, MI, BaseReg, Offset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

bool AArch64LoadStoreOpt::tryToMergeLdStInst(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = MBBI;
  MachineBasicBlock::iterator E = MI->getParent()->end();
  // If this is a volatile load/store, don't mess with it.
  if (MI->hasOrderedMemoryRef())
    return false;

  // Make sure this is a reg+imm (as opposed to an address reloc).
  if (!getLdStOffsetOp(MI).isImm())
    return false;

  // Check if this load/store has a hint to avoid pair formation.
  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
  if (TII->isLdStPairSuppressed(MI))
    return false;

  // Look ahead up to ScanLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
  if (Paired != E) {
    if (isSmallTypeLdMerge(MI)) {
      ++NumSmallTypeMerged;
    } else {
      ++NumPairCreated;
      if (isUnscaledLdSt(MI))
        ++NumUnscaledPairCreated;
    }

    // Merge the loads into a pair. Keeping the iterator straight is a
    // pain, so we let the merge routine tell us what the next instruction
    // is after it's done mucking about.
    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    return true;
  }
  return false;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;
  // Three transformations to do here:
  // 1) Find halfword loads that can be merged into a single 32-bit word load
  //    with bitfield extract instructions.
  //      e.g.,
  //        ldrh w0, [x2]
  //        ldrh w1, [x2, #2]
  //        ; becomes
  //        ldr w0, [x2]
  //        ubfx w1, w0, #16, #16
  //        and w0, w0, #0xffff
  // 2) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  // 3) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       !IsStrictAlign && MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::LDRHHui:
    // Unscaled instructions.
    case AArch64::LDURHHi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRSWui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    case AArch64::LDURSWi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    // Do update merging. It's simpler to keep this separate from the above
    // switch, though not strictly necessary.
    unsigned Opc = MI->getOpcode();
    switch (Opc) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::STRHHui:
    case AArch64::STRBBui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRHHui:
    case AArch64::LDRBBui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    // Paired instructions.
    case AArch64::LDPSi:
    case AArch64::LDPSWi:
    case AArch64::LDPDi:
    case AArch64::LDPQi:
    case AArch64::LDPWi:
    case AArch64::LDPXi:
    case AArch64::STPSi:
    case AArch64::STPDi:
    case AArch64::STPQi:
    case AArch64::STPWi:
    case AArch64::STPXi: {
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!getLdStOffsetOp(MI).isImm()) {
        ++MBBI;
        break;
      }
      // Look forward to try to form a post-index instruction. For example,
      //   ldr x0, [x20]
      //   add x20, x20, #32
      // merged into:
      //   ldr x0, [x20], #32
      MachineBasicBlock::iterator Update =
          findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
        Modified = true;
        ++NumPostFolded;
        break;
      }
      // Don't know how to handle pre/post-index versions, so move to the next
      // instruction.
      if (isUnscaledLdSt(Opc)) {
        ++MBBI;
        break;
      }

      // Look back to try to find a pre-index instruction. For example,
      //   add x0, x0, #8
      //   ldr x1, [x0]
      // merged into:
      //   ldr x1, [x0, #8]!
      Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }
      // The immediate in the load/store is scaled by the size of the memory
      // operation. The immediate in the add we're looking for,
      // however, is not, so adjust here.
      int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);

      // Look forward to try to find a pre-index instruction. For example,
      //   ldr x1, [x0, #64]
      //   add x0, x0, #64
      // merged into:
      //   ldr x1, [x0, #64]!
      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Nothing found. Just move to the next instruction.
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
  TRI = Fn.getSubtarget().getRegisterInfo();
  IsStrictAlign = (static_cast<const AArch64Subtarget &>(Fn.getSubtarget()))
                      .requiresStrictAlign();

  bool Modified = false;
  for (auto &MBB : Fn)
    Modified |= optimizeBlock(MBB);

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}