//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

/// AArch64LoadStoreOpt - Post-register allocation pass to combine
/// load / store instructions to form ldp / stp instructions.

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pairs created from unscaled instructions");
STATISTIC(NumSmallTypeMerged, "Number of small type loads merged");

static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

namespace llvm {
void initializeAArch64LoadStoreOptPass(PassRegistry &);
}

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {

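// Bookkeeping for a pairing candidate found by findMatchingInsn: which
// direction the merge should go and which result of the pair, if any, needs
// a sign extension afterwards.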
struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx;

  LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }
};

struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;
  AArch64LoadStoreOpt() : MachineFunctionPass(ID), IsStrictAlign(false) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  bool IsStrictAlign;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit);
  // Merge the two instructions indicated into a single pair-wise instruction.
  // If MergeForward is true, erase the first instruction and fold its
  // operation into the second. If false, the reverse. Return the instruction
  // following the first instruction (which may change during processing).
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
                                int UnscaledOffset);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge foldable ldr/str instructions.
  bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  const char *getPassName() const override {
    return AARCH64_LOAD_STORE_OPT_NAME;
  }
};
char AArch64LoadStoreOpt::ID = 0;
} // namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

static bool isUnscaledLdSt(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  case AArch64::LDURSWi:
  case AArch64::LDURHHi:
    return true;
  }
}

static bool isUnscaledLdSt(MachineInstr *MI) {
  return isUnscaledLdSt(MI->getOpcode());
}

static bool isSmallTypeLdMerge(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
    return true;
    // FIXME: Add other instructions (e.g, LDRBBui, LDURSHWi, LDRSHWui, etc.).
  }
}
static bool isSmallTypeLdMerge(MachineInstr *MI) {
  return isSmallTypeLdMerge(MI->getOpcode());
}

// Size of the memory access in bytes; also the scaling factor used to
// convert an unscaled byte offset into the element offset used by the
// scaled and paired forms.
static int getMemScale(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::STRBBui:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::STRHHui:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::LDPQi:
  case AArch64::STPQi:
    return 16;
  }
}

static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return UINT_MAX;
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  }
}

static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  case AArch64::LDRHHui:
    return AArch64::LDRWui;
  case AArch64::LDURHHi:
    return AArch64::LDURWi;
  }
}

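// Map a scaled load/store opcode to its pre-indexed (base register writeback)
// equivalent.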
static unsigned getPreIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  }
}

static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpost;
  case AArch64::STRDui:
    return AArch64::STRDpost;
  case AArch64::STRQui:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
    return AArch64::STRWpost;
  case AArch64::STRXui:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  }
}

static bool isPairedLdSt(const MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    return true;
  }
}

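// Accessors for the register, base register, and immediate offset operands of
// a load/store. Paired instructions carry two register operands before the
// base register and offset; single instructions carry one.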
static const MachineOperand &getLdStRegOp(const MachineInstr *MI,
                                          unsigned PairedRegOp = 0) {
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
  return MI->getOperand(Idx);
}

static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
  return MI->getOperand(Idx);
}

static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
  return MI->getOperand(Idx);
}

// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
                                   MachineInstr *Op1) {
  assert(MI->memoperands_empty() && "expected a new machineinstr");
  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
                      (Op1->memoperands_end() - Op1->memoperands_begin());

  MachineFunction *MF = MI->getParent()->getParent();
  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
  MachineSDNode::mmo_iterator MemEnd =
      std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
  MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
  MI->setMemRefs(MemBegin, MemEnd);
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = isUnscaledLdSt(Opc);
  int OffsetStride = IsUnscaled ? getMemScale(I) : 1;

  bool MergeForward = Flags.getMergeForward();
  unsigned NewOpc = getMatchingPairOpcode(Opc);
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines which instruction we take the base register
  // operand from, so that the resulting flags stay compatible with the input
  // code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (getLdStOffsetOp(I).getImm() ==
      getLdStOffsetOp(Paired).getImm() + OffsetStride) {
    RtMI = Paired;
    Rt2MI = I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = I;
    Rt2MI = Paired;
  }

  int OffsetImm = getLdStOffsetOp(RtMI).getImm();

  if (isSmallTypeLdMerge(Opc)) {
    // Change the scaled offset from small to large type.
    if (!IsUnscaled)
      OffsetImm /= 2;
    MachineInstr *RtNewDest = MergeForward ? I : Paired;
    // Construct the new load instruction.
    // FIXME: currently we support only halfword unsigned load. We need to
    // handle byte type, signed, and store instructions as well.
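    // Emit a single wide load into RtNewDest, then materialize the two
    // original halfword results from it: a UBFM extracts the high 16 bits and
    // an AND (logical-immediate encoding of 0xffff) extracts the low 16 bits.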
    MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
    NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                       TII->get(NewOpc))
                   .addOperand(getLdStRegOp(RtNewDest))
                   .addOperand(BaseRegOp)
                   .addImm(OffsetImm);

    // Copy MachineMemOperands from the original loads.
    concatenateMemOperands(NewMemMI, I, Paired);

    DEBUG(
        dbgs()
        << "Creating the new load and extract. Replacing instructions:\n    ");
    DEBUG(I->print(dbgs()));
    DEBUG(dbgs() << "    ");
    DEBUG(Paired->print(dbgs()));
    DEBUG(dbgs() << "  with instructions:\n    ");
    DEBUG((NewMemMI)->print(dbgs()));

    MachineInstr *ExtDestMI = MergeForward ? Paired : I;
    if (ExtDestMI == Rt2MI) {
      // Create the bitfield extract for high half.
      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::UBFMWri))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(16)
                      .addImm(31);
      // Create the bitfield extract for low half.
      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::ANDWri))
                      .addOperand(getLdStRegOp(RtMI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(15);
    } else {
      // Create the bitfield extract for low half.
      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::ANDWri))
                      .addOperand(getLdStRegOp(RtMI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(15);
      // Create the bitfield extract for high half.
      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::UBFMWri))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(16)
                      .addImm(31);
    }
    DEBUG(dbgs() << "    ");
    DEBUG((BitExtMI1)->print(dbgs()));
    DEBUG(dbgs() << "    ");
    DEBUG((BitExtMI2)->print(dbgs()));
    DEBUG(dbgs() << "\n");

    // Erase the old instructions.
    I->eraseFromParent();
    Paired->eraseFromParent();
    return NextI;
  }

  // For unscaled instructions, convert the byte offset into the element
  // offset used by the paired instruction.
  if (IsUnscaled)
    OffsetImm /= OffsetStride;

  // Construct the new instruction.
  MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
                                    I->getDebugLoc(), TII->get(NewOpc))
                                .addOperand(getLdStRegOp(RtMI))
                                .addOperand(getLdStRegOp(Rt2MI))
                                .addOperand(BaseRegOp)
                                .addImm(OffsetImm);
  (void)MIB;

  // FIXME: Do we need/want to copy the mem operands from the source
  // instructions? Probably. What uses them after this?

  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Paired->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");

  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %W1<def> = KILL %W1, %X1<imp-def>
    // %X1<def> = SBFMXri %X1<kill>, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    unsigned DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    DEBUG(dbgs() << "  Extend operand:\n    ");
    DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  } else {
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  }

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}

/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
                             BitVector &UsedRegs,
                             const TargetRegisterInfo *TRI) {
  for (const MachineOperand &MO : MI->operands()) {
    if (MO.isRegMask())
      ModifiedRegs.setBitsNotInMask(MO.getRegMask());

    if (!MO.isReg())
      continue;
    unsigned Reg = MO.getReg();
    if (MO.isDef()) {
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        ModifiedRegs.set(*AI);
    } else {
      assert(MO.isUse() && "Reg operand not a def and not a use?!?");
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        UsedRegs.set(*AI);
    }
  }
}

static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  if (IsUnscaled)
    Offset /= OffsetStride;

  return Offset <= 63 && Offset >= -64;
}

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}

static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb,
                     const AArch64InstrInfo *TII) {
  // One of the instructions must modify memory.
  if (!MIa->mayStore() && !MIb->mayStore())
    return false;

  // Both instructions must be memory operations.
  if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore())
    return false;

  return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
}

static bool mayAlias(MachineInstr *MIa,
                     SmallVectorImpl<MachineInstr *> &MemInsns,
                     const AArch64InstrInfo *TII) {
  for (auto &MIb : MemInsns)
    if (mayAlias(MIa, MIb, TII))
      return true;

  return false;
}

/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr *FirstMI = I;
  ++MBBI;

  unsigned Opc = FirstMI->getOpcode();
  bool MayLoad = FirstMI->mayLoad();
  bool IsUnscaled = isUnscaledLdSt(FirstMI);
  unsigned Reg = getLdStRegOp(FirstMI).getReg();
  unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
  int Offset = getLdStOffsetOp(FirstMI).getImm();

  // Early exit if the first instruction modifies the base register.
  // e.g., ldr x0, [x0]
  if (FirstMI->modifiesRegister(BaseReg, TRI))
    return E;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1)
  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
  if (!isSmallTypeLdMerge(Opc) &&
      !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    bool CanMergeOpc = Opc == MI->getOpcode();
    Flags.setSExtIdx(-1);
    if (!CanMergeOpc) {
      bool IsValidLdStrOpc;
      unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
      assert(IsValidLdStrOpc &&
             "Given Opc should be a Load or Store with an immediate");
      // Opc will be the first instruction in the pair.
      Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0);
      CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
    }

    if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) {
      assert(MI->mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      //
      // Pairwise instructions have a 7-bit signed offset field. Single insns
      // have a 12-bit unsigned offset field. To be a valid combine, the
      // final offset must be in range.
      unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
      int MIOffset = getLdStOffsetOp(MI).getImm();
      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        // If this is a volatile load/store that otherwise matched, stop
        // looking as something is going on that we don't have enough
        // information to safely transform. Similarly, stop if we see a hint
        // to avoid pairs.
        if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
          return E;
        // If the resultant immediate offset of merging these instructions
        // is out of range for a pairwise instruction, bail and keep looking.
        bool MIIsUnscaled = isUnscaledLdSt(MI);
        bool IsSmallTypeLd = isSmallTypeLdMerge(MI->getOpcode());
        if (!IsSmallTypeLd &&
            !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(MI);
          continue;
        }

        if (IsSmallTypeLd) {
          // If the alignment requirements of the larger type scaled load
          // instruction can't express the scaled offset of the smaller type
          // input, bail and keep looking.
          if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(MI);
            continue;
          }
        } else {
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(MI);
            continue;
          }
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(MI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions and none of the instructions between the second
        // and first alias with the second, we can combine the second into the
        // first.
        if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
            !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
            !mayAlias(MI, MemInsns, TII)) {
          Flags.setMergeForward(false);
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between
        // the first and the second alias with the first, we can combine the
        // first into the second.
        if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
            !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
            !mayAlias(FirstMI, MemInsns, TII)) {
          Flags.setMergeForward(true);
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // The instruction wasn't a matching load or store. Stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI->isCall())
      return E;

    // Update modified / uses register lists.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg])
      return E;

    // Update list of instructions that read/write memory.
    if (MI->mayLoadOrStore())
      MemInsns.push_back(MI);
  }
  return E;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  if (!isPairedLdSt(I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .addOperand(getLdStRegOp(Update))
              .addOperand(getLdStRegOp(I))
              .addOperand(getLdStBaseOp(I))
              .addImm(Value);
  } else {
    // Paired instruction.
    int Scale = getMemScale(I);
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .addOperand(getLdStRegOp(Update))
              .addOperand(getLdStRegOp(I, 0))
              .addOperand(getLdStRegOp(I, 1))
              .addOperand(getLdStBaseOp(I))
              .addImm(Value / Scale);
  }
  (void)MIB;

  if (IsPreIdx)
    DEBUG(dbgs() << "Creating pre-indexed load/store.");
  else
    DEBUG(dbgs() << "Creating post-indexed load/store.");
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

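// Check whether MI is an add/sub immediate of the base register of MemMI whose
// immediate can be folded in as a pre-/post-index update. A zero Offset means
// any update amount that fits the indexed encoding is acceptable; otherwise
// the update must match Offset exactly.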
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
                                               MachineInstr *MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI->getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
    // Negate the offset for a SUB instruction.
    Offset *= -1;
  // FALLTHROUGH
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI->getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI->getOperand(0).getReg() != BaseReg ||
        MI->getOperand(1).getReg() != BaseReg)
      break;

    bool IsPairedInsn = isPairedLdSt(MemMI);
    int UpdateOffset = MI->getOperand(2).getImm();
    // For non-paired load/store instructions, the immediate must fit in a
    // signed 9-bit integer.
    if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
      break;

    // For paired load/store instructions, the immediate must be a multiple of
    // the scaling factor. The scaled offset must also fit into a signed 7-bit
    // integer.
    if (IsPairedInsn) {
      int Scale = getMemScale(MemMI);
      if (UpdateOffset % Scale != 0)
        break;

      int ScaledOffset = UpdateOffset / Scale;
      if (ScaledOffset > 64 || ScaledOffset < -64)
        break;
    }

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == MI->getOperand(2).getImm())
      return true;
    break;
  }
  return false;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;

  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a destination register, we can't
  // merge the update.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  ++MBBI;
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

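// Walk backwards from a zero-offset load/store looking for an add/sub of its
// base register that can be folded in as a pre-index writeback.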
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;

  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
  int Offset = getLdStOffsetOp(MemMI).getImm();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps a destination register, we can't
  // merge the update.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  --MBBI;
  for (unsigned Count = 0; MBBI != B && Count < Limit; --MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(I, MI, BaseReg, Offset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

bool AArch64LoadStoreOpt::tryToMergeLdStInst(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = MBBI;
  MachineBasicBlock::iterator E = MI->getParent()->end();
  // If this is a volatile load/store, don't mess with it.
  if (MI->hasOrderedMemoryRef())
    return false;

  // Make sure this is a reg+imm (as opposed to an address reloc).
  if (!getLdStOffsetOp(MI).isImm())
    return false;

  // Check if this load/store has a hint to avoid pair formation.
  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
  if (TII->isLdStPairSuppressed(MI))
    return false;

  // Look ahead up to ScanLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
  if (Paired != E) {
    if (isSmallTypeLdMerge(MI)) {
      ++NumSmallTypeMerged;
    } else {
      ++NumPairCreated;
      if (isUnscaledLdSt(MI))
        ++NumUnscaledPairCreated;
    }

    // Merge the loads into a pair. Keeping the iterator straight is a
    // pain, so we let the merge routine tell us what the next instruction
    // is after it's done mucking about.
    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    return true;
  }
  return false;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;
  // Three transformations to do here:
  // 1) Find halfword loads that can be merged into a single 32-bit word load
  //    with bitfield extract instructions.
  //      e.g.,
  //        ldrh w0, [x2]
  //        ldrh w1, [x2, #2]
  //        ; becomes
  //        ldr w0, [x2]
  //        ubfx w1, w0, #16, #16
  //        and w0, w0, #0xffff
  // 2) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  // 3) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       !IsStrictAlign && MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::LDRHHui:
    // Unscaled instructions.
    case AArch64::LDURHHi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRSWui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    case AArch64::LDURSWi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    // Do update merging. It's simpler to keep this separate from the above
    // switch, though not strictly necessary.
    unsigned Opc = MI->getOpcode();
    switch (Opc) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::STRHHui:
    case AArch64::STRBBui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRHHui:
    case AArch64::LDRBBui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    // Paired instructions.
    case AArch64::LDPSi:
    case AArch64::LDPSWi:
    case AArch64::LDPDi:
    case AArch64::LDPQi:
    case AArch64::LDPWi:
    case AArch64::LDPXi:
    case AArch64::STPSi:
    case AArch64::STPDi:
    case AArch64::STPQi:
    case AArch64::STPWi:
    case AArch64::STPXi: {
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!getLdStOffsetOp(MI).isImm()) {
        ++MBBI;
        break;
      }
      // Look forward to try to form a post-index instruction. For example,
      // ldr x0, [x20]
      // add x20, x20, #32
      //   merged into:
      // ldr x0, [x20], #32
      MachineBasicBlock::iterator Update =
          findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
        Modified = true;
        ++NumPostFolded;
        break;
      }
      // Don't know how to handle pre/post-index versions, so move to the next
      // instruction.
      if (isUnscaledLdSt(Opc)) {
        ++MBBI;
        break;
      }

      // Look back to try to find a pre-index instruction. For example,
      // add x0, x0, #8
      // ldr x1, [x0]
      //   merged into:
      // ldr x1, [x0, #8]!
      Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }
      // The immediate in the load/store is scaled by the size of the memory
      // operation. The immediate in the add we're looking for,
      // however, is not, so adjust here.
      int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);

      // Look forward for an update of the base register that matches the
      // existing memory offset, and merge it in as a pre-index writeback.
      // For example,
      // ldr x1, [x0, #64]
      // add x0, x0, #64
      //   merged into:
      // ldr x1, [x0, #64]!
      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Nothing found. Just move to the next instruction.
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
  TRI = Fn.getSubtarget().getRegisterInfo();
  IsStrictAlign = (static_cast<const AArch64Subtarget &>(Fn.getSubtarget()))
                      .requiresStrictAlign();

  bool Modified = false;
  for (auto &MBB : Fn)
    Modified |= optimizeBlock(MBB);

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}