//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

/// AArch64LoadStoreOpt - Post-register allocation pass to combine
/// load / store instructions to form ldp / stp instructions.

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store from unscaled generated");
STATISTIC(NumSmallTypeMerged, "Number of small type loads merged");

static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

namespace llvm {
void initializeAArch64LoadStoreOptPass(PassRegistry &);
}

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {

typedef struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx;

  LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }

} LdStPairFlags;

struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;
  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
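  // The search is bounded by Limit real (non-debug) instructions.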
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit);
  // Merge the two instructions indicated into a single pair-wise instruction.
  // If MergeForward is true, erase the first instruction and fold its
  // operation into the second. If false, the reverse. Return the instruction
  // following the first instruction (which may change during processing).
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
                                int UnscaledOffset);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge foldable ldr/str instructions.
  bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);

  // Check if converting two narrow loads into a single wider load with
  // bitfield extracts could be enabled.
  bool enableNarrowLdMerge(MachineFunction &Fn);

  bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  const char *getPassName() const override {
    return AARCH64_LOAD_STORE_OPT_NAME;
  }
};
char AArch64LoadStoreOpt::ID = 0;
} // namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

static bool isUnscaledLdSt(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  case AArch64::LDURSWi:
  case AArch64::LDURHHi:
    return true;
  }
}

static bool isUnscaledLdSt(MachineInstr *MI) {
  return isUnscaledLdSt(MI->getOpcode());
}

static bool isSmallTypeLdMerge(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
    return true;
    // FIXME: Add other instructions (e.g, LDRBBui, LDURSHWi, LDRSHWui, etc.).
  }
}
static bool isSmallTypeLdMerge(MachineInstr *MI) {
  return isSmallTypeLdMerge(MI->getOpcode());
}

// Scaling factor for unscaled load or store.
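// The returned value is the access size in bytes; it is also used to convert
// byte offsets into element offsets for the paired (scaled) forms.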
static int getMemScale(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::STRBBui:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::STRHHui:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::LDPQi:
  case AArch64::STPQi:
    return 16;
  }
}

static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return UINT_MAX;
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  }
}

static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  case AArch64::LDRHHui:
    return AArch64::LDRWui;
  case AArch64::LDURHHi:
    return AArch64::LDURWi;
  }
}

static unsigned getPreIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  }
}

static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpost;
  case AArch64::STRDui:
    return AArch64::STRDpost;
  case AArch64::STRQui:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
    return AArch64::STRWpost;
  case AArch64::STRXui:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  }
}

static bool isPairedLdSt(const MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    return true;
  }
}

static const MachineOperand &getLdStRegOp(const MachineInstr *MI,
                                          unsigned PairedRegOp = 0) {
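  // For paired instructions, PairedRegOp selects the register operand: 0 for
  // Rt and 1 for Rt2. Non-paired instructions only have the single Rt operand.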
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
  return MI->getOperand(Idx);
}

static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
  return MI->getOperand(Idx);
}

static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
  return MI->getOperand(Idx);
}

// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
                                   MachineInstr *Op1) {
  assert(MI->memoperands_empty() && "expected a new machineinstr");
  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
                      (Op1->memoperands_end() - Op1->memoperands_begin());

  MachineFunction *MF = MI->getParent()->getParent();
  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
  MachineSDNode::mmo_iterator MemEnd =
      std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
  MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
  MI->setMemRefs(MemBegin, MemEnd);
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = isUnscaledLdSt(Opc);
  int OffsetStride = IsUnscaled ? getMemScale(I) : 1;

  bool MergeForward = Flags.getMergeForward();
  unsigned NewOpc = getMatchingPairOpcode(Opc);
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines from which instruction we copy the base
  // register operand, so that the flags stay compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (getLdStOffsetOp(I).getImm() ==
      getLdStOffsetOp(Paired).getImm() + OffsetStride) {
    RtMI = Paired;
    Rt2MI = I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = I;
    Rt2MI = Paired;
  }

  int OffsetImm = getLdStOffsetOp(RtMI).getImm();

  if (isSmallTypeLdMerge(Opc)) {
    // Change the scaled offset from small to large type.
    if (!IsUnscaled)
      OffsetImm /= 2;
    MachineInstr *RtNewDest = MergeForward ? I : Paired;
    // Construct the new load instruction.
    // FIXME: currently we support only halfword unsigned load. We need to
    // handle byte type, signed, and store instructions as well.
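    // The wider load is followed by two bitfield extracts (UBFM for the high
    // half, AND for the low half) that recreate the two original destinations.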
    MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
    NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                       TII->get(NewOpc))
                   .addOperand(getLdStRegOp(RtNewDest))
                   .addOperand(BaseRegOp)
                   .addImm(OffsetImm);

    // Copy MachineMemOperands from the original loads.
    concatenateMemOperands(NewMemMI, I, Paired);

    DEBUG(
        dbgs()
        << "Creating the new load and extract. Replacing instructions:\n    ");
    DEBUG(I->print(dbgs()));
    DEBUG(dbgs() << "    ");
    DEBUG(Paired->print(dbgs()));
    DEBUG(dbgs() << "  with instructions:\n    ");
    DEBUG((NewMemMI)->print(dbgs()));

    MachineInstr *ExtDestMI = MergeForward ? Paired : I;
    if (ExtDestMI == Rt2MI) {
      // Create the bitfield extract for high half.
      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::UBFMWri))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(16)
                      .addImm(31);
      // Create the bitfield extract for low half.
      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::ANDWri))
                      .addOperand(getLdStRegOp(RtMI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(15);
    } else {
      // Create the bitfield extract for low half.
      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::ANDWri))
                      .addOperand(getLdStRegOp(RtMI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(15);
      // Create the bitfield extract for high half.
      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::UBFMWri))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(16)
                      .addImm(31);
    }
    DEBUG(dbgs() << "  ");
    DEBUG((BitExtMI1)->print(dbgs()));
    DEBUG(dbgs() << "  ");
    DEBUG((BitExtMI2)->print(dbgs()));
    DEBUG(dbgs() << "\n");

    // Erase the old instructions.
    I->eraseFromParent();
    Paired->eraseFromParent();
    return NextI;
  }

  // Handle Unscaled
  if (IsUnscaled)
    OffsetImm /= OffsetStride;

  // Construct the new instruction.
  MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
                                    I->getDebugLoc(), TII->get(NewOpc))
                                .addOperand(getLdStRegOp(RtMI))
                                .addOperand(getLdStRegOp(Rt2MI))
                                .addOperand(BaseRegOp)
                                .addImm(OffsetImm);
  (void)MIB;

  // FIXME: Do we need/want to copy the mem operands from the source
  // instructions? Probably. What uses them after this?

  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Paired->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");

  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %W1<def> = KILL %W1, %X1<imp-def>
    // %X1<def> = SBFMXri %X1<kill>, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    unsigned DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    DEBUG(dbgs() << "  Extend operand:\n    ");
    DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  } else {
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  }

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}

/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
                             BitVector &UsedRegs,
                             const TargetRegisterInfo *TRI) {
  for (const MachineOperand &MO : MI->operands()) {
    if (MO.isRegMask())
      ModifiedRegs.setBitsNotInMask(MO.getRegMask());

    if (!MO.isReg())
      continue;
    unsigned Reg = MO.getReg();
    if (MO.isDef()) {
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        ModifiedRegs.set(*AI);
    } else {
      assert(MO.isUse() && "Reg operand not a def and not a use?!?");
      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
        UsedRegs.set(*AI);
    }
  }
}

static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  if (IsUnscaled)
    Offset /= OffsetStride;

  return Offset <= 63 && Offset >= -64;
}

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using RoundUpToAlignment from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}

static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb,
                     const AArch64InstrInfo *TII) {
  // One of the instructions must modify memory.
  if (!MIa->mayStore() && !MIb->mayStore())
    return false;

  // Both instructions must be memory operations.
  if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore())
    return false;

  return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
}

static bool mayAlias(MachineInstr *MIa,
                     SmallVectorImpl<MachineInstr *> &MemInsns,
                     const AArch64InstrInfo *TII) {
  for (auto &MIb : MemInsns)
    if (mayAlias(MIa, MIb, TII))
      return true;

  return false;
}

/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.
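/// Returns MBB->end() if no suitable candidate is found within Limit
/// non-debug instructions.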
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr *FirstMI = I;
  ++MBBI;

  unsigned Opc = FirstMI->getOpcode();
  bool MayLoad = FirstMI->mayLoad();
  bool IsUnscaled = isUnscaledLdSt(FirstMI);
  unsigned Reg = getLdStRegOp(FirstMI).getReg();
  unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
  int Offset = getLdStOffsetOp(FirstMI).getImm();

  // Early exit if the first instruction modifies the base register.
  // e.g., ldr x0, [x0]
  if (FirstMI->modifiesRegister(BaseReg, TRI))
    return E;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1)
  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
  if (!isSmallTypeLdMerge(Opc) &&
      !inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    bool CanMergeOpc = Opc == MI->getOpcode();
    Flags.setSExtIdx(-1);
    if (!CanMergeOpc) {
      bool IsValidLdStrOpc;
      unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc);
      assert(IsValidLdStrOpc &&
             "Given Opc should be a Load or Store with an immediate");
      // Opc will be the first instruction in the pair.
      Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0);
      CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode());
    }

    if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) {
      assert(MI->mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      //
      // Pairwise instructions have a 7-bit signed offset field. Single insns
      // have a 12-bit unsigned offset field. To be a valid combine, the
      // final offset must be in range.
      unsigned MIBaseReg = getLdStBaseOp(MI).getReg();
      int MIOffset = getLdStOffsetOp(MI).getImm();
      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        // If this is a volatile load/store that otherwise matched, stop looking
        // as something is going on that we don't have enough information to
        // safely transform. Similarly, stop if we see a hint to avoid pairs.
        if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
          return E;
        // If the resultant immediate offset of merging these instructions
        // is out of range for a pairwise instruction, bail and keep looking.
        bool MIIsUnscaled = isUnscaledLdSt(MI);
        bool IsSmallTypeLd = isSmallTypeLdMerge(MI->getOpcode());
        if (!IsSmallTypeLd &&
            !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(MI);
          continue;
        }

        if (IsSmallTypeLd) {
          // If the alignment requirements of the larger type scaled load
          // instruction can't express the scaled offset of the smaller type
          // input, bail and keep looking.
          if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(MI);
            continue;
          }
        } else {
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
            MemInsns.push_back(MI);
            continue;
          }
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        if (MayLoad && Reg == getLdStRegOp(MI).getReg()) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          MemInsns.push_back(MI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions and none of the instructions between the second
        // and first alias with the second, we can combine the second into the
        // first.
        if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
            !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
            !mayAlias(MI, MemInsns, TII)) {
          Flags.setMergeForward(false);
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
            !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
            !mayAlias(FirstMI, MemInsns, TII)) {
          Flags.setMergeForward(true);
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // The instruction wasn't a matching load or store. Stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI->isCall())
      return E;

    // Update modified / uses register lists.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg])
      return E;

    // Update list of instructions that read/write memory.
    if (MI->mayLoadOrStore())
      MemInsns.push_back(MI);
  }
  return E;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  if (!isPairedLdSt(I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .addOperand(getLdStRegOp(Update))
              .addOperand(getLdStRegOp(I))
              .addOperand(getLdStBaseOp(I))
              .addImm(Value);
  } else {
    // Paired instruction.
    int Scale = getMemScale(I);
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .addOperand(getLdStRegOp(Update))
              .addOperand(getLdStRegOp(I, 0))
              .addOperand(getLdStRegOp(I, 1))
              .addOperand(getLdStBaseOp(I))
              .addImm(Value / Scale);
  }
  (void)MIB;

  if (IsPreIdx)
    DEBUG(dbgs() << "Creating pre-indexed load/store.");
  else
    DEBUG(dbgs() << "Creating post-indexed load/store.");
  DEBUG(dbgs() << "    Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI,
                                               MachineInstr *MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI->getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
    // Negate the offset for a SUB instruction.
    Offset *= -1;
  // FALLTHROUGH
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI->getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI->getOperand(0).getReg() != BaseReg ||
        MI->getOperand(1).getReg() != BaseReg)
      break;

    bool IsPairedInsn = isPairedLdSt(MemMI);
    int UpdateOffset = MI->getOperand(2).getImm();
    // For non-paired load/store instructions, the immediate must fit in a
    // signed 9-bit integer.
    if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256))
      break;

    // For paired load/store instructions, the immediate must be a multiple of
    // the scaling factor. The scaled offset must also fit into a signed 7-bit
    // integer.
    if (IsPairedInsn) {
      int Scale = getMemScale(MemMI);
      if (UpdateOffset % Scale != 0)
        break;

      int ScaledOffset = UpdateOffset / Scale;
      if (ScaledOffset > 64 || ScaledOffset < -64)
        break;
    }

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == MI->getOperand(2).getImm())
      return true;
    break;
  }
  return false;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;

  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a destination register, we can't
  // merge the update.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  ++MBBI;
  for (unsigned Count = 0; MBBI != E; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr *MemMI = I;
  MachineBasicBlock::iterator MBBI = I;

  unsigned BaseReg = getLdStBaseOp(MemMI).getReg();
  int Offset = getLdStOffsetOp(MemMI).getImm();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps a destination register, we can't
  // merge the update.
  bool IsPairedInsn = isPairedLdSt(MemMI);
  for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
    unsigned DestReg = getLdStRegOp(MemMI, i).getReg();
    if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
      return E;
  }

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  --MBBI;
  for (unsigned Count = 0; MBBI != B; --MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(I, MI, BaseReg, Offset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is used or modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg])
      return E;
  }
  return E;
}

bool AArch64LoadStoreOpt::tryToMergeLdStInst(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = MBBI;
  MachineBasicBlock::iterator E = MI->getParent()->end();
  // If this is a volatile load/store, don't mess with it.
  if (MI->hasOrderedMemoryRef())
    return false;

  // Make sure this is a reg+imm (as opposed to an address reloc).
  if (!getLdStOffsetOp(MI).isImm())
    return false;

  // Check if this load/store has a hint to avoid pair formation.
  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
  if (TII->isLdStPairSuppressed(MI))
    return false;

  // Look ahead up to ScanLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
  if (Paired != E) {
    if (isSmallTypeLdMerge(MI)) {
      ++NumSmallTypeMerged;
    } else {
      ++NumPairCreated;
      if (isUnscaledLdSt(MI))
        ++NumUnscaledPairCreated;
    }

    // Merge the loads into a pair. Keeping the iterator straight is a
    // pain, so we let the merge routine tell us what the next instruction
    // is after it's done mucking about.
    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    return true;
  }
  return false;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool enableNarrowLdOpt) {
  bool Modified = false;
  // Three transformations to do here:
  // 1) Find halfword loads that can be merged into a single 32-bit word load
  //    with bitfield extract instructions.
  //      e.g.,
  //        ldrh w0, [x2]
  //        ldrh w1, [x2, #2]
  //        ; becomes
  //        ldr w0, [x2]
  //        ubfx w1, w0, #16, #16
  //        and w0, w0, #ffff
  // 2) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  // 3) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       enableNarrowLdOpt && MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::LDRHHui:
    // Unscaled instructions.
    case AArch64::LDURHHi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRSWui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    case AArch64::LDURSWi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    // Do update merging. It's simpler to keep this separate from the above
    // switch, though not strictly necessary.
    unsigned Opc = MI->getOpcode();
    switch (Opc) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::STRHHui:
    case AArch64::STRBBui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRHHui:
    case AArch64::LDRBBui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    // Paired instructions.
    case AArch64::LDPSi:
    case AArch64::LDPSWi:
    case AArch64::LDPDi:
    case AArch64::LDPQi:
    case AArch64::LDPWi:
    case AArch64::LDPXi:
    case AArch64::STPSi:
    case AArch64::STPDi:
    case AArch64::STPQi:
    case AArch64::STPWi:
    case AArch64::STPXi: {
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!getLdStOffsetOp(MI).isImm()) {
        ++MBBI;
        break;
      }
      // Look forward to try to form a post-index instruction. For example,
      // ldr x0, [x20]
      // add x20, x20, #32
      //   merged into:
      // ldr x0, [x20], #32
      MachineBasicBlock::iterator Update =
          findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
        Modified = true;
        ++NumPostFolded;
        break;
      }
      // Don't know how to handle pre/post-index versions, so move to the next
      // instruction.
      if (isUnscaledLdSt(Opc)) {
        ++MBBI;
        break;
      }

      // Look back to try to find a pre-index instruction. For example,
      // add x0, x0, #8
      // ldr x1, [x0]
      //   merged into:
      // ldr x1, [x0, #8]!
      Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }
      // The immediate in the load/store is scaled by the size of the memory
      // operation. The immediate in the add we're looking for,
      // however, is not, so adjust here.
      int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);

      // Look forward to try to find a base register update that can be folded
      // in as a pre-index writeback. For example,
      // ldr x1, [x0, #64]
      // add x0, x0, #64
      //   merged into:
      // ldr x1, [x0, #64]!
      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Nothing found. Just move to the next instruction.
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  return Modified;
}

bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
  const AArch64Subtarget *SubTarget =
      &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
  bool ProfitableArch = SubTarget->isCortexA57();
  // FIXME: The benefit from converting narrow loads into a wider load could be
  // microarchitectural as it assumes that a single load with two bitfield
  // extracts is cheaper than two narrow loads. Currently, this conversion is
  // enabled only in cortex-a57 on which performance benefits were verified.
  return ProfitableArch && !SubTarget->requiresStrictAlign();
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo());
  TRI = Fn.getSubtarget().getRegisterInfo();

  bool Modified = false;
  bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
  for (auto &MBB : Fn)
    Modified |= optimizeBlock(MBB, enableNarrowLdOpt);

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
// loads and stores near one another?

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}