1 //=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file contains a pass that performs load / store related peephole 11 // optimizations. This pass should be run after register allocation. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "AArch64InstrInfo.h" 16 #include "AArch64Subtarget.h" 17 #include "MCTargetDesc/AArch64AddressingModes.h" 18 #include "llvm/ADT/BitVector.h" 19 #include "llvm/ADT/SmallVector.h" 20 #include "llvm/ADT/Statistic.h" 21 #include "llvm/CodeGen/MachineBasicBlock.h" 22 #include "llvm/CodeGen/MachineFunctionPass.h" 23 #include "llvm/CodeGen/MachineInstr.h" 24 #include "llvm/CodeGen/MachineInstrBuilder.h" 25 #include "llvm/Support/CommandLine.h" 26 #include "llvm/Support/Debug.h" 27 #include "llvm/Support/ErrorHandling.h" 28 #include "llvm/Support/raw_ostream.h" 29 #include "llvm/Target/TargetInstrInfo.h" 30 #include "llvm/Target/TargetMachine.h" 31 #include "llvm/Target/TargetRegisterInfo.h" 32 using namespace llvm; 33 34 #define DEBUG_TYPE "aarch64-ldst-opt" 35 36 /// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine 37 /// load / store instructions to form ldp / stp instructions. 
STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store from unscaled generated");
STATISTIC(NumSmallTypeMerged, "Number of small type loads merged");

// Upper bound on the number of real (non-debug) instructions the pairing and
// update-folding scans will examine before giving up.
static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

namespace llvm {
void initializeAArch64LoadStoreOptPass(PassRegistry &);
}

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {

typedef struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx;

  LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }

} LdStPairFlags;

struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;
  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit);
  // Merge the two instructions indicated into a single pair-wise instruction.
  // If MergeForward is true, erase the first instruction and fold its
  // operation into the second. If false, the reverse. Return the instruction
  // following the first instruction (which may change during processing).
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
                                int UnscaledOffset);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge foldable ldr/str instructions.
  bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);

  // Check if converting two narrow loads into a single wider load with
  // bitfield extracts could be enabled.
  bool enableNarrowLdMerge(MachineFunction &Fn);

  bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  const char *getPassName() const override {
    return AARCH64_LOAD_STORE_OPT_NAME;
  }
};
char AArch64LoadStoreOpt::ID = 0;
} // namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

// Return true if \p Opc is an unscaled (9-bit signed immediate) load/store
// opcode, as opposed to the scaled (12-bit unsigned immediate) form.
static bool isUnscaledLdSt(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  case AArch64::LDURSWi:
  case AArch64::LDURHHi:
    return true;
  }
}

static bool isUnscaledLdSt(MachineInstr *MI) {
  return isUnscaledLdSt(MI->getOpcode());
}

// Return true if \p Opc is a narrow load that the pass knows how to merge
// into a single wider load plus bitfield extracts.
static bool isSmallTypeLdMerge(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
    return true;
    // FIXME: Add other instructions (e.g, LDRBBui, LDURSHWi, LDRSHWui, etc.).
  }
}
static bool isSmallTypeLdMerge(MachineInstr *MI) {
  return isSmallTypeLdMerge(MI->getOpcode());
}

// Scaling factor for unscaled load or store.
static int getMemScale(MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    llvm_unreachable("Opcode has unknown scale!");
  case AArch64::LDRBBui:
  case AArch64::STRBBui:
    return 1;
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
  case AArch64::STRHHui:
    return 2;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPWi:
  case AArch64::STPSi:
  case AArch64::STPWi:
    return 4;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDPDi:
  case AArch64::LDPXi:
  case AArch64::STPDi:
  case AArch64::STPXi:
    return 8;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::LDPQi:
  case AArch64::STPQi:
    return 16;
  }
}

// Map a sign-extending load opcode to its zero/no-extend equivalent so that
// opcodes can be compared for pairing; non-extending opcodes map to
// themselves. If \p IsValidLdStrOpc is given, it reports whether \p Opc is a
// load/store this pass understands at all.
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return UINT_MAX;
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRWui:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRHHui:
  case AArch64::LDURHHi:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  }
}

// Map a single load/store opcode to the instruction generated when merging:
// a pair (LDP/STP) opcode, or for narrow loads the next-wider single load.
static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  // Narrow loads merge into a wider single load, not a pair.
  case AArch64::LDRHHui:
    return AArch64::LDRWui;
  case AArch64::LDURHHi:
    return AArch64::LDURWi;
  }
}

static unsigned getPreIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  }
}

static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed wise equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpost;
  case AArch64::STRDui:
    return AArch64::STRDpost;
  case AArch64::STRQui:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
    return AArch64::STRWpost;
  case AArch64::STRXui:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  }
}

static bool isPairedLdSt(const MachineInstr *MI) {
  switch (MI->getOpcode()) {
  default:
    return false;
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    return true;
  }
}

// Operand accessors. Paired instructions carry two data registers (operands
// 0 and 1) ahead of the base (2) and offset (3); single instructions have
// data at 0, base at 1, offset at 2.
static const MachineOperand &getLdStRegOp(const MachineInstr *MI,
                                          unsigned PairedRegOp = 0) {
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
  return MI->getOperand(Idx);
}

static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
  unsigned Idx = isPairedLdSt(MI) ? 2 : 1;
  return MI->getOperand(Idx);
}

static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
  unsigned Idx = isPairedLdSt(MI) ? 3 : 2;
  return MI->getOperand(Idx);
}

// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
                                   MachineInstr *Op1) {
  assert(MI->memoperands_empty() && "expected a new machineinstr");
  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
                      (Op1->memoperands_end() - Op1->memoperands_begin());

  // Memoperand arrays are allocated from the MachineFunction so their
  // lifetime matches the instruction that owns them.
  MachineFunction *MF = MI->getParent()->getParent();
  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
  MachineSDNode::mmo_iterator MemEnd =
      std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
  MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
  MI->setMemRefs(MemBegin, MemEnd);
}

// Merge the two instructions I and Paired into a single pair-wise
// instruction (or, for narrow loads, one wider load plus bitfield
// extracts). Returns the iterator to resume scanning from: the instruction
// after I, skipping Paired if it immediately follows.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  int SExtIdx = Flags.getSExtIdx();
  // When one of the two loads is sign-extending, work with the non-extending
  // opcode and patch up the extension after building the pair.
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = isUnscaledLdSt(Opc);
  int OffsetStride = IsUnscaled ? getMemScale(I) : 1;

  bool MergeForward = Flags.getMergeForward();
  unsigned NewOpc = getMatchingPairOpcode(Opc);
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // Also based on MergeForward is from where we copy the base register operand
  // so we get the flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  MachineInstr *RtMI, *Rt2MI;
  if (getLdStOffsetOp(I).getImm() ==
      getLdStOffsetOp(Paired).getImm() + OffsetStride) {
    RtMI = Paired;
    Rt2MI = I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = I;
    Rt2MI = Paired;
  }

  int OffsetImm = getLdStOffsetOp(RtMI).getImm();

  if (isSmallTypeLdMerge(Opc)) {
    // Change the scaled offset from small to large type.
    if (!IsUnscaled)
      OffsetImm /= 2;
    MachineInstr *RtNewDest = MergeForward ? I : Paired;
    // When merging small (< 32 bit) loads for big-endian targets, the order of
    // the component parts gets swapped.
    if (!Subtarget->isLittleEndian())
      std::swap(RtMI, Rt2MI);
    // Construct the new load instruction.
    // FIXME: currently we support only halfword unsigned load. We need to
    // handle byte type, signed, and store instructions as well.
    MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
    NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                       TII->get(NewOpc))
                   .addOperand(getLdStRegOp(RtNewDest))
                   .addOperand(BaseRegOp)
                   .addImm(OffsetImm);

    // Copy MachineMemOperands from the original loads.
    concatenateMemOperands(NewMemMI, I, Paired);

    DEBUG(
        dbgs()
        << "Creating the new load and extract. Replacing instructions:\n    ");
    DEBUG(I->print(dbgs()));
    DEBUG(dbgs() << "    ");
    DEBUG(Paired->print(dbgs()));
    DEBUG(dbgs() << "  with instructions:\n    ");
    DEBUG((NewMemMI)->print(dbgs()));

    MachineInstr *ExtDestMI = MergeForward ? Paired : I;
    // Emit the extract for the surviving destination register first so the
    // other extract (which reads the wide register) is not clobbered.
    if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
      // Create the bitfield extract for high half.
      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::UBFMWri))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(16)
                      .addImm(31);
      // Create the bitfield extract for low half.
      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::ANDWri))
                      .addOperand(getLdStRegOp(RtMI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(15);
    } else {
      // Create the bitfield extract for low half.
      BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::ANDWri))
                      .addOperand(getLdStRegOp(RtMI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(15);
      // Create the bitfield extract for high half.
      BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                          TII->get(AArch64::UBFMWri))
                      .addOperand(getLdStRegOp(Rt2MI))
                      .addReg(getLdStRegOp(RtNewDest).getReg())
                      .addImm(16)
                      .addImm(31);
    }
    DEBUG(dbgs() << "    ");
    DEBUG((BitExtMI1)->print(dbgs()));
    DEBUG(dbgs() << "    ");
    DEBUG((BitExtMI2)->print(dbgs()));
    DEBUG(dbgs() << "\n");

    // Erase the old instructions.
    I->eraseFromParent();
    Paired->eraseFromParent();
    return NextI;
  }

  // Handle Unscaled
  if (IsUnscaled)
    OffsetImm /= OffsetStride;

  // Construct the new instruction.
  MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
                                    I->getDebugLoc(), TII->get(NewOpc))
                                .addOperand(getLdStRegOp(RtMI))
                                .addOperand(getLdStRegOp(Rt2MI))
                                .addOperand(BaseRegOp)
                                .addImm(OffsetImm);
  (void)MIB;

  // FIXME: Do we need/want to copy the mem operands from the source
  // instructions? Probably. What uses them after this?

  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << "    ");
  DEBUG(Paired->print(dbgs()));
  DEBUG(dbgs() << "  with instruction:\n    ");

  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %W1<def> = KILL %W1, %X1<imp-def>
    // %X1<def> = SBFMXri %X1<kill>, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    unsigned DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(),
                TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    DEBUG(dbgs() << "  Extend operand:\n    ");
    DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  } else {
    DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    DEBUG(dbgs() << "\n");
  }

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}

/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
681 static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs, 682 BitVector &UsedRegs, 683 const TargetRegisterInfo *TRI) { 684 for (const MachineOperand &MO : MI->operands()) { 685 if (MO.isRegMask()) 686 ModifiedRegs.setBitsNotInMask(MO.getRegMask()); 687 688 if (!MO.isReg()) 689 continue; 690 unsigned Reg = MO.getReg(); 691 if (MO.isDef()) { 692 for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) 693 ModifiedRegs.set(*AI); 694 } else { 695 assert(MO.isUse() && "Reg operand not a def and not a use?!?"); 696 for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) 697 UsedRegs.set(*AI); 698 } 699 } 700 } 701 702 static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { 703 // Convert the byte-offset used by unscaled into an "element" offset used 704 // by the scaled pair load/store instructions. 705 if (IsUnscaled) 706 Offset /= OffsetStride; 707 708 return Offset <= 63 && Offset >= -64; 709 } 710 711 // Do alignment, specialized to power of 2 and for signed ints, 712 // avoiding having to do a C-style cast from uint_64t to int when 713 // using RoundUpToAlignment from include/llvm/Support/MathExtras.h. 714 // FIXME: Move this function to include/MathExtras.h? 715 static int alignTo(int Num, int PowOf2) { 716 return (Num + PowOf2 - 1) & ~(PowOf2 - 1); 717 } 718 719 static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb, 720 const AArch64InstrInfo *TII) { 721 // One of the instructions must modify memory. 722 if (!MIa->mayStore() && !MIb->mayStore()) 723 return false; 724 725 // Both instructions must be memory operations. 
726 if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore()) 727 return false; 728 729 return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb); 730 } 731 732 static bool mayAlias(MachineInstr *MIa, 733 SmallVectorImpl<MachineInstr *> &MemInsns, 734 const AArch64InstrInfo *TII) { 735 for (auto &MIb : MemInsns) 736 if (mayAlias(MIa, MIb, TII)) 737 return true; 738 739 return false; 740 } 741 742 /// findMatchingInsn - Scan the instructions looking for a load/store that can 743 /// be combined with the current instruction into a load/store pair. 744 MachineBasicBlock::iterator 745 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, 746 LdStPairFlags &Flags, unsigned Limit) { 747 MachineBasicBlock::iterator E = I->getParent()->end(); 748 MachineBasicBlock::iterator MBBI = I; 749 MachineInstr *FirstMI = I; 750 ++MBBI; 751 752 unsigned Opc = FirstMI->getOpcode(); 753 bool MayLoad = FirstMI->mayLoad(); 754 bool IsUnscaled = isUnscaledLdSt(FirstMI); 755 unsigned Reg = getLdStRegOp(FirstMI).getReg(); 756 unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); 757 int Offset = getLdStOffsetOp(FirstMI).getImm(); 758 759 // Early exit if the first instruction modifies the base register. 760 // e.g., ldr x0, [x0] 761 if (FirstMI->modifiesRegister(BaseReg, TRI)) 762 return E; 763 764 // Early exit if the offset if not possible to match. (6 bits of positive 765 // range, plus allow an extra one in case we find a later insn that matches 766 // with Offset-1) 767 int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1; 768 if (!isSmallTypeLdMerge(Opc) && 769 !inBoundsForPair(IsUnscaled, Offset, OffsetStride)) 770 return E; 771 772 // Track which registers have been modified and used between the first insn 773 // (inclusive) and the second insn. 774 BitVector ModifiedRegs, UsedRegs; 775 ModifiedRegs.resize(TRI->getNumRegs()); 776 UsedRegs.resize(TRI->getNumRegs()); 777 778 // Remember any instructions that read/write memory between FirstMI and MI. 
779 SmallVector<MachineInstr *, 4> MemInsns; 780 781 for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { 782 MachineInstr *MI = MBBI; 783 // Skip DBG_VALUE instructions. Otherwise debug info can affect the 784 // optimization by changing how far we scan. 785 if (MI->isDebugValue()) 786 continue; 787 788 // Now that we know this is a real instruction, count it. 789 ++Count; 790 791 bool CanMergeOpc = Opc == MI->getOpcode(); 792 Flags.setSExtIdx(-1); 793 if (!CanMergeOpc) { 794 bool IsValidLdStrOpc; 795 unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc); 796 assert(IsValidLdStrOpc && 797 "Given Opc should be a Load or Store with an immediate"); 798 // Opc will be the first instruction in the pair. 799 Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0); 800 CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode()); 801 } 802 803 if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) { 804 assert(MI->mayLoadOrStore() && "Expected memory operation."); 805 // If we've found another instruction with the same opcode, check to see 806 // if the base and offset are compatible with our starting instruction. 807 // These instructions all have scaled immediate operands, so we just 808 // check for +1/-1. Make sure to check the new instruction offset is 809 // actually an immediate and not a symbolic reference destined for 810 // a relocation. 811 // 812 // Pairwise instructions have a 7-bit signed offset field. Single insns 813 // have a 12-bit unsigned offset field. To be a valid combine, the 814 // final offset must be in range. 815 unsigned MIBaseReg = getLdStBaseOp(MI).getReg(); 816 int MIOffset = getLdStOffsetOp(MI).getImm(); 817 if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || 818 (Offset + OffsetStride == MIOffset))) { 819 int MinOffset = Offset < MIOffset ? 
Offset : MIOffset; 820 // If this is a volatile load/store that otherwise matched, stop looking 821 // as something is going on that we don't have enough information to 822 // safely transform. Similarly, stop if we see a hint to avoid pairs. 823 if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) 824 return E; 825 // If the resultant immediate offset of merging these instructions 826 // is out of range for a pairwise instruction, bail and keep looking. 827 bool MIIsUnscaled = isUnscaledLdSt(MI); 828 bool IsSmallTypeLd = isSmallTypeLdMerge(MI->getOpcode()); 829 if (!IsSmallTypeLd && 830 !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { 831 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 832 MemInsns.push_back(MI); 833 continue; 834 } 835 836 if (IsSmallTypeLd) { 837 // If the alignment requirements of the larger type scaled load 838 // instruction can't express the scaled offset of the smaller type 839 // input, bail and keep looking. 840 if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) { 841 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 842 MemInsns.push_back(MI); 843 continue; 844 } 845 } else { 846 // If the alignment requirements of the paired (scaled) instruction 847 // can't express the offset of the unscaled input, bail and keep 848 // looking. 849 if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) { 850 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 851 MemInsns.push_back(MI); 852 continue; 853 } 854 } 855 // If the destination register of the loads is the same register, bail 856 // and keep looking. A load-pair instruction with both destination 857 // registers the same is UNPREDICTABLE and will result in an exception. 
858 if (MayLoad && Reg == getLdStRegOp(MI).getReg()) { 859 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 860 MemInsns.push_back(MI); 861 continue; 862 } 863 864 // If the Rt of the second instruction was not modified or used between 865 // the two instructions and none of the instructions between the second 866 // and first alias with the second, we can combine the second into the 867 // first. 868 if (!ModifiedRegs[getLdStRegOp(MI).getReg()] && 869 !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && 870 !mayAlias(MI, MemInsns, TII)) { 871 Flags.setMergeForward(false); 872 return MBBI; 873 } 874 875 // Likewise, if the Rt of the first instruction is not modified or used 876 // between the two instructions and none of the instructions between the 877 // first and the second alias with the first, we can combine the first 878 // into the second. 879 if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] && 880 !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) && 881 !mayAlias(FirstMI, MemInsns, TII)) { 882 Flags.setMergeForward(true); 883 return MBBI; 884 } 885 // Unable to combine these instructions due to interference in between. 886 // Keep looking. 887 } 888 } 889 890 // If the instruction wasn't a matching load or store. Stop searching if we 891 // encounter a call instruction that might modify memory. 892 if (MI->isCall()) 893 return E; 894 895 // Update modified / uses register lists. 896 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 897 898 // Otherwise, if the base register is modified, we have no match, so 899 // return early. 900 if (ModifiedRegs[BaseReg]) 901 return E; 902 903 // Update list of instructions that read/write memory. 
904 if (MI->mayLoadOrStore()) 905 MemInsns.push_back(MI); 906 } 907 return E; 908 } 909 910 MachineBasicBlock::iterator 911 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, 912 MachineBasicBlock::iterator Update, 913 bool IsPreIdx) { 914 assert((Update->getOpcode() == AArch64::ADDXri || 915 Update->getOpcode() == AArch64::SUBXri) && 916 "Unexpected base register update instruction to merge!"); 917 MachineBasicBlock::iterator NextI = I; 918 // Return the instruction following the merged instruction, which is 919 // the instruction following our unmerged load. Unless that's the add/sub 920 // instruction we're merging, in which case it's the one after that. 921 if (++NextI == Update) 922 ++NextI; 923 924 int Value = Update->getOperand(2).getImm(); 925 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && 926 "Can't merge 1 << 12 offset into pre-/post-indexed load / store"); 927 if (Update->getOpcode() == AArch64::SUBXri) 928 Value = -Value; 929 930 unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) 931 : getPostIndexedOpcode(I->getOpcode()); 932 MachineInstrBuilder MIB; 933 if (!isPairedLdSt(I)) { 934 // Non-paired instruction. 935 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) 936 .addOperand(getLdStRegOp(Update)) 937 .addOperand(getLdStRegOp(I)) 938 .addOperand(getLdStBaseOp(I)) 939 .addImm(Value); 940 } else { 941 // Paired instruction. 
942 int Scale = getMemScale(I); 943 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) 944 .addOperand(getLdStRegOp(Update)) 945 .addOperand(getLdStRegOp(I, 0)) 946 .addOperand(getLdStRegOp(I, 1)) 947 .addOperand(getLdStBaseOp(I)) 948 .addImm(Value / Scale); 949 } 950 (void)MIB; 951 952 if (IsPreIdx) 953 DEBUG(dbgs() << "Creating pre-indexed load/store."); 954 else 955 DEBUG(dbgs() << "Creating post-indexed load/store."); 956 DEBUG(dbgs() << " Replacing instructions:\n "); 957 DEBUG(I->print(dbgs())); 958 DEBUG(dbgs() << " "); 959 DEBUG(Update->print(dbgs())); 960 DEBUG(dbgs() << " with instruction:\n "); 961 DEBUG(((MachineInstr *)MIB)->print(dbgs())); 962 DEBUG(dbgs() << "\n"); 963 964 // Erase the old instructions for the block. 965 I->eraseFromParent(); 966 Update->eraseFromParent(); 967 968 return NextI; 969 } 970 971 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, 972 MachineInstr *MI, 973 unsigned BaseReg, int Offset) { 974 switch (MI->getOpcode()) { 975 default: 976 break; 977 case AArch64::SUBXri: 978 // Negate the offset for a SUB instruction. 979 Offset *= -1; 980 // FALLTHROUGH 981 case AArch64::ADDXri: 982 // Make sure it's a vanilla immediate operand, not a relocation or 983 // anything else we can't handle. 984 if (!MI->getOperand(2).isImm()) 985 break; 986 // Watch out for 1 << 12 shifted value. 987 if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) 988 break; 989 990 // The update instruction source and destination register must be the 991 // same as the load/store base register. 992 if (MI->getOperand(0).getReg() != BaseReg || 993 MI->getOperand(1).getReg() != BaseReg) 994 break; 995 996 bool IsPairedInsn = isPairedLdSt(MemMI); 997 int UpdateOffset = MI->getOperand(2).getImm(); 998 // For non-paired load/store instructions, the immediate must fit in a 999 // signed 9-bit integer. 
1000 if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) 1001 break; 1002 1003 // For paired load/store instructions, the immediate must be a multiple of 1004 // the scaling factor. The scaled offset must also fit into a signed 7-bit 1005 // integer. 1006 if (IsPairedInsn) { 1007 int Scale = getMemScale(MemMI); 1008 if (UpdateOffset % Scale != 0) 1009 break; 1010 1011 int ScaledOffset = UpdateOffset / Scale; 1012 if (ScaledOffset > 64 || ScaledOffset < -64) 1013 break; 1014 } 1015 1016 // If we have a non-zero Offset, we check that it matches the amount 1017 // we're adding to the register. 1018 if (!Offset || Offset == MI->getOperand(2).getImm()) 1019 return true; 1020 break; 1021 } 1022 return false; 1023 } 1024 1025 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( 1026 MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) { 1027 MachineBasicBlock::iterator E = I->getParent()->end(); 1028 MachineInstr *MemMI = I; 1029 MachineBasicBlock::iterator MBBI = I; 1030 1031 unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); 1032 int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI); 1033 1034 // Scan forward looking for post-index opportunities. Updating instructions 1035 // can't be formed if the memory instruction doesn't have the offset we're 1036 // looking for. 1037 if (MIUnscaledOffset != UnscaledOffset) 1038 return E; 1039 1040 // If the base register overlaps a destination register, we can't 1041 // merge the update. 1042 bool IsPairedInsn = isPairedLdSt(MemMI); 1043 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { 1044 unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); 1045 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) 1046 return E; 1047 } 1048 1049 // Track which registers have been modified and used between the first insn 1050 // (inclusive) and the second insn. 
1051 BitVector ModifiedRegs, UsedRegs; 1052 ModifiedRegs.resize(TRI->getNumRegs()); 1053 UsedRegs.resize(TRI->getNumRegs()); 1054 ++MBBI; 1055 for (unsigned Count = 0; MBBI != E; ++MBBI) { 1056 MachineInstr *MI = MBBI; 1057 // Skip DBG_VALUE instructions. Otherwise debug info can affect the 1058 // optimization by changing how far we scan. 1059 if (MI->isDebugValue()) 1060 continue; 1061 1062 // Now that we know this is a real instruction, count it. 1063 ++Count; 1064 1065 // If we found a match, return it. 1066 if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset)) 1067 return MBBI; 1068 1069 // Update the status of what the instruction clobbered and used. 1070 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 1071 1072 // Otherwise, if the base register is used or modified, we have no match, so 1073 // return early. 1074 if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) 1075 return E; 1076 } 1077 return E; 1078 } 1079 1080 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( 1081 MachineBasicBlock::iterator I, unsigned Limit) { 1082 MachineBasicBlock::iterator B = I->getParent()->begin(); 1083 MachineBasicBlock::iterator E = I->getParent()->end(); 1084 MachineInstr *MemMI = I; 1085 MachineBasicBlock::iterator MBBI = I; 1086 1087 unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); 1088 int Offset = getLdStOffsetOp(MemMI).getImm(); 1089 1090 // If the load/store is the first instruction in the block, there's obviously 1091 // not any matching update. Ditto if the memory offset isn't zero. 1092 if (MBBI == B || Offset != 0) 1093 return E; 1094 // If the base register overlaps a destination register, we can't 1095 // merge the update. 1096 bool IsPairedInsn = isPairedLdSt(MemMI); 1097 for (unsigned i = 0, e = IsPairedInsn ? 
2 : 1; i != e; ++i) { 1098 unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); 1099 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) 1100 return E; 1101 } 1102 1103 // Track which registers have been modified and used between the first insn 1104 // (inclusive) and the second insn. 1105 BitVector ModifiedRegs, UsedRegs; 1106 ModifiedRegs.resize(TRI->getNumRegs()); 1107 UsedRegs.resize(TRI->getNumRegs()); 1108 --MBBI; 1109 for (unsigned Count = 0; MBBI != B; --MBBI) { 1110 MachineInstr *MI = MBBI; 1111 // Skip DBG_VALUE instructions. Otherwise debug info can affect the 1112 // optimization by changing how far we scan. 1113 if (MI->isDebugValue()) 1114 continue; 1115 1116 // Now that we know this is a real instruction, count it. 1117 ++Count; 1118 1119 // If we found a match, return it. 1120 if (isMatchingUpdateInsn(I, MI, BaseReg, Offset)) 1121 return MBBI; 1122 1123 // Update the status of what the instruction clobbered and used. 1124 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 1125 1126 // Otherwise, if the base register is used or modified, we have no match, so 1127 // return early. 1128 if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) 1129 return E; 1130 } 1131 return E; 1132 } 1133 1134 bool AArch64LoadStoreOpt::tryToMergeLdStInst( 1135 MachineBasicBlock::iterator &MBBI) { 1136 MachineInstr *MI = MBBI; 1137 MachineBasicBlock::iterator E = MI->getParent()->end(); 1138 // If this is a volatile load/store, don't mess with it. 1139 if (MI->hasOrderedMemoryRef()) 1140 return false; 1141 1142 // Make sure this is a reg+imm (as opposed to an address reloc). 1143 if (!getLdStOffsetOp(MI).isImm()) 1144 return false; 1145 1146 // Check if this load/store has a hint to avoid pair formation. 1147 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 1148 if (TII->isLdStPairSuppressed(MI)) 1149 return false; 1150 1151 // Look ahead up to ScanLimit instructions for a pairable instruction. 
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit);
  if (Paired != E) {
    // Bump the statistic matching the kind of merge performed.
    if (isSmallTypeLdMerge(MI)) {
      ++NumSmallTypeMerged;
    } else {
      ++NumPairCreated;
      if (isUnscaledLdSt(MI))
        ++NumUnscaledPairCreated;
    }

    // Merge the loads into a pair. Keeping the iterator straight is a
    // pain, so we let the merge routine tell us what the next instruction
    // is after it's done mucking about.
    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    return true;
  }
  return false;
}

/// Run the three load/store transformations (narrow-load merge, pairing,
/// base-update folding) over a single basic block. Returns true if the block
/// was changed.
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool enableNarrowLdOpt) {
  bool Modified = false;
  // Three transformations to do here:
  // 1) Find halfword loads that can be merged into a single 32-bit word load
  //    with bitfield extract instructions.
  //      e.g.,
  //        ldrh w0, [x2]
  //        ldrh w1, [x2, #2]
  //        ; becomes
  //        ldr w0, [x2]
  //        ubfx w1, w0, #16, #16
  //        and w0, w0, #ffff
  // 2) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  // 3) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4

  // Transformation 1: merge narrow (halfword) loads. Only runs when
  // enableNarrowLdOpt is set (see enableNarrowLdMerge).
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       enableNarrowLdOpt && MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::LDRHHui:
    // Unscaled instructions.
    case AArch64::LDURHHi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  // Transformation 2: form ldp/stp from adjacent loads/stores.
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRSWui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    case AArch64::LDURSWi: {
      if (tryToMergeLdStInst(MBBI)) {
        Modified = true;
        break;
      }
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  // Transformation 3: fold base-register add/sub updates into pre-/post-
  // indexed memory forms.
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    // Do update merging. It's simpler to keep this separate from the above
    // switch, though not strictly necessary.
    unsigned Opc = MI->getOpcode();
    switch (Opc) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    // Scaled instructions.
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::STRHHui:
    case AArch64::STRBBui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    case AArch64::LDRHHui:
    case AArch64::LDRBBui:
    // Unscaled instructions.
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi:
    // Paired instructions.
    case AArch64::LDPSi:
    case AArch64::LDPSWi:
    case AArch64::LDPDi:
    case AArch64::LDPQi:
    case AArch64::LDPWi:
    case AArch64::LDPXi:
    case AArch64::STPSi:
    case AArch64::STPDi:
    case AArch64::STPQi:
    case AArch64::STPWi:
    case AArch64::STPXi: {
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!getLdStOffsetOp(MI).isImm()) {
        ++MBBI;
        break;
      }
      // Look forward to try to form a post-index instruction. For example,
      // ldr x0, [x20]
      // add x20, x20, #32
      //   merged into:
      // ldr x0, [x20], #32
      MachineBasicBlock::iterator Update =
          findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
        Modified = true;
        ++NumPostFolded;
        break;
      }
      // Don't know how to handle pre/post-index versions, so move to the next
      // instruction.
      if (isUnscaledLdSt(Opc)) {
        ++MBBI;
        break;
      }

      // Look back to try to find a pre-index instruction. For example,
      // add x0, x0, #8
      // ldr x1, [x0]
      //   merged into:
      // ldr x1, [x0, #8]!
      Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }
      // The immediate in the load/store is scaled by the size of the memory
      // operation. The immediate in the add we're looking for,
      // however, is not, so adjust here.
      int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);

      // Look forward to try to find a pre-index instruction: the update
      // amount matches the load/store's own offset, so the merged result is
      // the writeback (pre-indexed) form -- note IsPreIdx=true and
      // NumPreFolded below. For example,
      // ldr x1, [x0, #64]
      // add x0, x0, #64
      //   merged into:
      // ldr x1, [x0, #64]!
      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Nothing found. Just move to the next instruction.
      ++MBBI;
      break;
    }
      // FIXME: Do the other instructions.
    }
  }

  return Modified;
}

/// Decide whether merging narrow (halfword) loads into a wider load is
/// expected to be profitable on this subtarget.
bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
  const AArch64Subtarget *SubTarget =
      &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
  bool ProfitableArch = SubTarget->isCortexA57();
  // FIXME: The benefit from converting narrow loads into a wider load could be
  // microarchitectural as it assumes that a single load with two bitfield
  // extracts is cheaper than two narrow loads. Currently, this conversion is
  // enabled only in cortex-a57 on which performance benefits were verified.
1392 return ProfitableArch & (!SubTarget->requiresStrictAlign()); 1393 } 1394 1395 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { 1396 Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); 1397 TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); 1398 TRI = Subtarget->getRegisterInfo(); 1399 1400 bool Modified = false; 1401 bool enableNarrowLdOpt = enableNarrowLdMerge(Fn); 1402 for (auto &MBB : Fn) 1403 Modified |= optimizeBlock(MBB, enableNarrowLdOpt); 1404 1405 return Modified; 1406 } 1407 1408 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep 1409 // loads and stores near one another? 1410 1411 /// createAArch64LoadStoreOptimizationPass - returns an instance of the 1412 /// load / store optimization pass. 1413 FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() { 1414 return new AArch64LoadStoreOpt(); 1415 } 1416