1 //=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file contains a pass that performs load / store related peephole 11 // optimizations. This pass should be run after register allocation. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "AArch64InstrInfo.h" 16 #include "AArch64Subtarget.h" 17 #include "MCTargetDesc/AArch64AddressingModes.h" 18 #include "llvm/ADT/BitVector.h" 19 #include "llvm/ADT/SmallVector.h" 20 #include "llvm/ADT/Statistic.h" 21 #include "llvm/CodeGen/MachineBasicBlock.h" 22 #include "llvm/CodeGen/MachineFunctionPass.h" 23 #include "llvm/CodeGen/MachineInstr.h" 24 #include "llvm/CodeGen/MachineInstrBuilder.h" 25 #include "llvm/Support/CommandLine.h" 26 #include "llvm/Support/Debug.h" 27 #include "llvm/Support/ErrorHandling.h" 28 #include "llvm/Support/raw_ostream.h" 29 #include "llvm/Target/TargetInstrInfo.h" 30 #include "llvm/Target/TargetMachine.h" 31 #include "llvm/Target/TargetRegisterInfo.h" 32 using namespace llvm; 33 34 #define DEBUG_TYPE "aarch64-ldst-opt" 35 36 STATISTIC(NumPairCreated, "Number of load/store pair instructions generated"); 37 STATISTIC(NumPostFolded, "Number of post-index updates folded"); 38 STATISTIC(NumPreFolded, "Number of pre-index updates folded"); 39 STATISTIC(NumUnscaledPairCreated, 40 "Number of load/store from unscaled generated"); 41 STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted"); 42 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); 43 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); 44 45 // The LdStLimit limits how far we search for load/store pairs. 46 static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit", 47 cl::init(20), cl::Hidden); 48 49 // The UpdateLimit limits how far we search for update instructions when we form 50 // pre-/post-index instructions. 51 static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100), 52 cl::Hidden); 53 54 namespace llvm { 55 void initializeAArch64LoadStoreOptPass(PassRegistry &); 56 } 57 58 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass" 59 60 namespace { 61 62 typedef struct LdStPairFlags { 63 // If a matching instruction is found, MergeForward is set to true if the 64 // merge is to remove the first instruction and replace the second with 65 // a pair-wise insn, and false if the reverse is true. 66 bool MergeForward; 67 68 // SExtIdx gives the index of the result of the load pair that must be 69 // extended. The value of SExtIdx assumes that the paired load produces the 70 // value in this order: (I, returned iterator), i.e., -1 means no value has 71 // to be extended, 0 means I, and 1 means the returned iterator. 72 int SExtIdx; 73 74 LdStPairFlags() : MergeForward(false), SExtIdx(-1) {} 75 76 void setMergeForward(bool V = true) { MergeForward = V; } 77 bool getMergeForward() const { return MergeForward; } 78 79 void setSExtIdx(int V) { SExtIdx = V; } 80 int getSExtIdx() const { return SExtIdx; } 81 82 } LdStPairFlags; 83 84 struct AArch64LoadStoreOpt : public MachineFunctionPass { 85 static char ID; 86 AArch64LoadStoreOpt() : MachineFunctionPass(ID) { 87 initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry()); 88 } 89 90 const AArch64InstrInfo *TII; 91 const TargetRegisterInfo *TRI; 92 const AArch64Subtarget *Subtarget; 93 94 // Track which registers have been modified and used. 95 BitVector ModifiedRegs, UsedRegs; 96 97 // Scan the instructions looking for a load/store that can be combined 98 // with the current instruction into a load/store pair. 99 // Return the matching instruction if one is found, else MBB->end(). 100 MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, 101 LdStPairFlags &Flags, 102 unsigned Limit); 103 104 // Scan the instructions looking for a store that writes to the address from 105 // which the current load instruction reads. Return true if one is found. 106 bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit, 107 MachineBasicBlock::iterator &StoreI); 108 109 // Merge the two instructions indicated into a wider instruction. 110 MachineBasicBlock::iterator 111 mergeNarrowInsns(MachineBasicBlock::iterator I, 112 MachineBasicBlock::iterator MergeMI, 113 const LdStPairFlags &Flags); 114 115 // Merge the two instructions indicated into a single pair-wise instruction. 116 MachineBasicBlock::iterator 117 mergePairedInsns(MachineBasicBlock::iterator I, 118 MachineBasicBlock::iterator Paired, 119 const LdStPairFlags &Flags); 120 121 // Promote the load that reads directly from the address stored to. 122 MachineBasicBlock::iterator 123 promoteLoadFromStore(MachineBasicBlock::iterator LoadI, 124 MachineBasicBlock::iterator StoreI); 125 126 // Scan the instruction list to find a base register update that can 127 // be combined with the current instruction (a load or store) using 128 // pre or post indexed addressing with writeback. Scan forwards. 129 MachineBasicBlock::iterator 130 findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, 131 int UnscaledOffset, unsigned Limit); 132 133 // Scan the instruction list to find a base register update that can 134 // be combined with the current instruction (a load or store) using 135 // pre or post indexed addressing with writeback. Scan backwards. 136 MachineBasicBlock::iterator 137 findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); 138 139 // Find an instruction that updates the base register of the ld/st 140 // instruction. 141 bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI, 142 unsigned BaseReg, int Offset); 143 144 // Merge a pre- or post-index base register update into a ld/st instruction. 145 MachineBasicBlock::iterator 146 mergeUpdateInsn(MachineBasicBlock::iterator I, 147 MachineBasicBlock::iterator Update, bool IsPreIdx); 148 149 // Is this a candidate for ld/st merging or pairing? For example, we don't 150 // touch volatiles or load/stores that have a hint to avoid pair formation. 151 bool isCandidateToMergeOrPair(MachineInstr *MI); 152 153 // Find and merge foldable ldr/str instructions. 154 bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI); 155 156 // Find and pair ldr/str instructions. 157 bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI); 158 159 // Find and promote load instructions which read directly from store. 160 bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI); 161 162 // Check if converting two narrow loads into a single wider load with 163 // bitfield extracts could be enabled. 164 bool enableNarrowLdMerge(MachineFunction &Fn); 165 166 bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt); 167 168 bool runOnMachineFunction(MachineFunction &Fn) override; 169 170 const char *getPassName() const override { 171 return AARCH64_LOAD_STORE_OPT_NAME; 172 } 173 }; 174 char AArch64LoadStoreOpt::ID = 0; 175 } // namespace 176 177 INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt", 178 AARCH64_LOAD_STORE_OPT_NAME, false, false) 179 180 static unsigned getBitExtrOpcode(MachineInstr *MI) { 181 switch (MI->getOpcode()) { 182 default: 183 llvm_unreachable("Unexpected opcode."); 184 case AArch64::LDRBBui: 185 case AArch64::LDURBBi: 186 case AArch64::LDRHHui: 187 case AArch64::LDURHHi: 188 return AArch64::UBFMWri; 189 case AArch64::LDRSBWui: 190 case AArch64::LDURSBWi: 191 case AArch64::LDRSHWui: 192 case AArch64::LDURSHWi: 193 return AArch64::SBFMWri; 194 } 195 } 196 197 static bool isNarrowStore(unsigned Opc) { 198 switch (Opc) { 199 default: 200 return false; 201 case AArch64::STRBBui: 202 case AArch64::STURBBi: 203 case AArch64::STRHHui: 204 case AArch64::STURHHi: 205 return true; 206 } 207 } 208 209 static bool isNarrowLoad(unsigned Opc) { 210 switch (Opc) { 211 default: 212 return false; 213 case AArch64::LDRHHui: 214 case AArch64::LDURHHi: 215 case AArch64::LDRBBui: 216 case AArch64::LDURBBi: 217 case AArch64::LDRSHWui: 218 case AArch64::LDURSHWi: 219 case AArch64::LDRSBWui: 220 case AArch64::LDURSBWi: 221 return true; 222 } 223 } 224 225 static bool isNarrowLoad(MachineInstr *MI) { 226 return isNarrowLoad(MI->getOpcode()); 227 } 228 229 static bool isNarrowLoadOrStore(unsigned Opc) { 230 return isNarrowLoad(Opc) || isNarrowStore(Opc); 231 } 232 233 // Scaling factor for unscaled load or store. 234 static int getMemScale(MachineInstr *MI) { 235 switch (MI->getOpcode()) { 236 default: 237 llvm_unreachable("Opcode has unknown scale!"); 238 case AArch64::LDRBBui: 239 case AArch64::LDURBBi: 240 case AArch64::LDRSBWui: 241 case AArch64::LDURSBWi: 242 case AArch64::STRBBui: 243 case AArch64::STURBBi: 244 return 1; 245 case AArch64::LDRHHui: 246 case AArch64::LDURHHi: 247 case AArch64::LDRSHWui: 248 case AArch64::LDURSHWi: 249 case AArch64::STRHHui: 250 case AArch64::STURHHi: 251 return 2; 252 case AArch64::LDRSui: 253 case AArch64::LDURSi: 254 case AArch64::LDRSWui: 255 case AArch64::LDURSWi: 256 case AArch64::LDRWui: 257 case AArch64::LDURWi: 258 case AArch64::STRSui: 259 case AArch64::STURSi: 260 case AArch64::STRWui: 261 case AArch64::STURWi: 262 case AArch64::LDPSi: 263 case AArch64::LDPSWi: 264 case AArch64::LDPWi: 265 case AArch64::STPSi: 266 case AArch64::STPWi: 267 return 4; 268 case AArch64::LDRDui: 269 case AArch64::LDURDi: 270 case AArch64::LDRXui: 271 case AArch64::LDURXi: 272 case AArch64::STRDui: 273 case AArch64::STURDi: 274 case AArch64::STRXui: 275 case AArch64::STURXi: 276 case AArch64::LDPDi: 277 case AArch64::LDPXi: 278 case AArch64::STPDi: 279 case AArch64::STPXi: 280 return 8; 281 case AArch64::LDRQui: 282 case AArch64::LDURQi: 283 case AArch64::STRQui: 284 case AArch64::STURQi: 285 case AArch64::LDPQi: 286 case AArch64::STPQi: 287 return 16; 288 } 289 } 290 291 static unsigned getMatchingNonSExtOpcode(unsigned Opc, 292 bool *IsValidLdStrOpc = nullptr) { 293 if (IsValidLdStrOpc) 294 *IsValidLdStrOpc = true; 295 switch (Opc) { 296 default: 297 if (IsValidLdStrOpc) 298 *IsValidLdStrOpc = false; 299 return UINT_MAX; 300 case AArch64::STRDui: 301 case AArch64::STURDi: 302 case AArch64::STRQui: 303 case AArch64::STURQi: 304 case AArch64::STRBBui: 305 case AArch64::STURBBi: 306 case AArch64::STRHHui: 307 case AArch64::STURHHi: 308 case AArch64::STRWui: 309 case AArch64::STURWi: 310 case AArch64::STRXui: 311 case AArch64::STURXi: 312 case AArch64::LDRDui: 313 case AArch64::LDURDi: 314 case AArch64::LDRQui: 315 case AArch64::LDURQi: 316 case AArch64::LDRWui: 317 case AArch64::LDURWi: 318 case AArch64::LDRXui: 319 case AArch64::LDURXi: 320 case AArch64::STRSui: 321 case AArch64::STURSi: 322 case AArch64::LDRSui: 323 case AArch64::LDURSi: 324 case AArch64::LDRHHui: 325 case AArch64::LDURHHi: 326 case AArch64::LDRBBui: 327 case AArch64::LDURBBi: 328 return Opc; 329 case AArch64::LDRSWui: 330 return AArch64::LDRWui; 331 case AArch64::LDURSWi: 332 return AArch64::LDURWi; 333 case AArch64::LDRSBWui: 334 return AArch64::LDRBBui; 335 case AArch64::LDRSHWui: 336 return AArch64::LDRHHui; 337 case AArch64::LDURSBWi: 338 return AArch64::LDURBBi; 339 case AArch64::LDURSHWi: 340 return AArch64::LDURHHi; 341 } 342 } 343 344 static unsigned getMatchingWideOpcode(unsigned Opc) { 345 switch (Opc) { 346 default: 347 llvm_unreachable("Opcode has no wide equivalent!"); 348 case AArch64::STRBBui: 349 return AArch64::STRHHui; 350 case AArch64::STRHHui: 351 return AArch64::STRWui; 352 case AArch64::STURBBi: 353 return AArch64::STURHHi; 354 case AArch64::STURHHi: 355 return AArch64::STURWi; 356 case AArch64::STURWi: 357 return AArch64::STURXi; 358 case AArch64::STRWui: 359 return AArch64::STRXui; 360 case AArch64::LDRHHui: 361 case AArch64::LDRSHWui: 362 return AArch64::LDRWui; 363 case AArch64::LDURHHi: 364 case AArch64::LDURSHWi: 365 return AArch64::LDURWi; 366 case AArch64::LDRBBui: 367 case AArch64::LDRSBWui: 368 return AArch64::LDRHHui; 369 case AArch64::LDURBBi: 370 case AArch64::LDURSBWi: 371 return AArch64::LDURHHi; 372 } 373 } 374 375 static unsigned getMatchingPairOpcode(unsigned Opc) { 376 switch (Opc) { 377 default: 378 llvm_unreachable("Opcode has no pairwise equivalent!"); 379 case AArch64::STRSui: 380 case AArch64::STURSi: 381 return AArch64::STPSi; 382 case AArch64::STRDui: 383 case AArch64::STURDi: 384 return AArch64::STPDi; 385 case AArch64::STRQui: 386 case AArch64::STURQi: 387 return AArch64::STPQi; 388 case AArch64::STRWui: 389 case AArch64::STURWi: 390 return AArch64::STPWi; 391 case AArch64::STRXui: 392 case AArch64::STURXi: 393 return AArch64::STPXi; 394 case AArch64::LDRSui: 395 case AArch64::LDURSi: 396 return AArch64::LDPSi; 397 case AArch64::LDRDui: 398 case AArch64::LDURDi: 399 return AArch64::LDPDi; 400 case AArch64::LDRQui: 401 case AArch64::LDURQi: 402 return AArch64::LDPQi; 403 case AArch64::LDRWui: 404 case AArch64::LDURWi: 405 return AArch64::LDPWi; 406 case AArch64::LDRXui: 407 case AArch64::LDURXi: 408 return AArch64::LDPXi; 409 case AArch64::LDRSWui: 410 case AArch64::LDURSWi: 411 return AArch64::LDPSWi; 412 } 413 } 414 415 static unsigned isMatchingStore(MachineInstr *LoadInst, 416 MachineInstr *StoreInst) { 417 unsigned LdOpc = LoadInst->getOpcode(); 418 unsigned StOpc = StoreInst->getOpcode(); 419 switch (LdOpc) { 420 default: 421 llvm_unreachable("Unsupported load instruction!"); 422 case AArch64::LDRBBui: 423 return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui || 424 StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; 425 case AArch64::LDURBBi: 426 return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi || 427 StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; 428 case AArch64::LDRHHui: 429 return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui || 430 StOpc == AArch64::STRXui; 431 case AArch64::LDURHHi: 432 return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi || 433 StOpc == AArch64::STURXi; 434 case AArch64::LDRWui: 435 return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; 436 case AArch64::LDURWi: 437 return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; 438 case AArch64::LDRXui: 439 return StOpc == AArch64::STRXui; 440 case AArch64::LDURXi: 441 return StOpc == AArch64::STURXi; 442 } 443 } 444 445 static unsigned getPreIndexedOpcode(unsigned Opc) { 446 switch (Opc) { 447 default: 448 llvm_unreachable("Opcode has no pre-indexed equivalent!"); 449 case AArch64::STRSui: 450 return AArch64::STRSpre; 451 case AArch64::STRDui: 452 return AArch64::STRDpre; 453 case AArch64::STRQui: 454 return AArch64::STRQpre; 455 case AArch64::STRBBui: 456 return AArch64::STRBBpre; 457 case AArch64::STRHHui: 458 return AArch64::STRHHpre; 459 case AArch64::STRWui: 460 return AArch64::STRWpre; 461 case AArch64::STRXui: 462 return AArch64::STRXpre; 463 case AArch64::LDRSui: 464 return AArch64::LDRSpre; 465 case AArch64::LDRDui: 466 return AArch64::LDRDpre; 467 case AArch64::LDRQui: 468 return AArch64::LDRQpre; 469 case AArch64::LDRBBui: 470 return AArch64::LDRBBpre; 471 case AArch64::LDRHHui: 472 return AArch64::LDRHHpre; 473 case AArch64::LDRWui: 474 return AArch64::LDRWpre; 475 case AArch64::LDRXui: 476 return AArch64::LDRXpre; 477 case AArch64::LDRSWui: 478 return AArch64::LDRSWpre; 479 case AArch64::LDPSi: 480 return AArch64::LDPSpre; 481 case AArch64::LDPSWi: 482 return AArch64::LDPSWpre; 483 case AArch64::LDPDi: 484 return AArch64::LDPDpre; 485 case AArch64::LDPQi: 486 return AArch64::LDPQpre; 487 case AArch64::LDPWi: 488 return AArch64::LDPWpre; 489 case AArch64::LDPXi: 490 return AArch64::LDPXpre; 491 case AArch64::STPSi: 492 return AArch64::STPSpre; 493 case AArch64::STPDi: 494 return AArch64::STPDpre; 495 case AArch64::STPQi: 496 return AArch64::STPQpre; 497 case AArch64::STPWi: 498 return AArch64::STPWpre; 499 case AArch64::STPXi: 500 return AArch64::STPXpre; 501 } 502 } 503 504 static unsigned getPostIndexedOpcode(unsigned Opc) { 505 switch (Opc) { 506 default: 507 llvm_unreachable("Opcode has no post-indexed wise equivalent!"); 508 case AArch64::STRSui: 509 return AArch64::STRSpost; 510 case AArch64::STRDui: 511 return AArch64::STRDpost; 512 case AArch64::STRQui: 513 return AArch64::STRQpost; 514 case AArch64::STRBBui: 515 return AArch64::STRBBpost; 516 case AArch64::STRHHui: 517 return AArch64::STRHHpost; 518 case AArch64::STRWui: 519 return AArch64::STRWpost; 520 case AArch64::STRXui: 521 return AArch64::STRXpost; 522 case AArch64::LDRSui: 523 return AArch64::LDRSpost; 524 case AArch64::LDRDui: 525 return AArch64::LDRDpost; 526 case AArch64::LDRQui: 527 return AArch64::LDRQpost; 528 case AArch64::LDRBBui: 529 return AArch64::LDRBBpost; 530 case AArch64::LDRHHui: 531 return AArch64::LDRHHpost; 532 case AArch64::LDRWui: 533 return AArch64::LDRWpost; 534 case AArch64::LDRXui: 535 return AArch64::LDRXpost; 536 case AArch64::LDRSWui: 537 return AArch64::LDRSWpost; 538 case AArch64::LDPSi: 539 return AArch64::LDPSpost; 540 case AArch64::LDPSWi: 541 return AArch64::LDPSWpost; 542 case AArch64::LDPDi: 543 return AArch64::LDPDpost; 544 case AArch64::LDPQi: 545 return AArch64::LDPQpost; 546 case AArch64::LDPWi: 547 return AArch64::LDPWpost; 548 case AArch64::LDPXi: 549 return AArch64::LDPXpost; 550 case AArch64::STPSi: 551 return AArch64::STPSpost; 552 case AArch64::STPDi: 553 return AArch64::STPDpost; 554 case AArch64::STPQi: 555 return AArch64::STPQpost; 556 case AArch64::STPWi: 557 return AArch64::STPWpost; 558 case AArch64::STPXi: 559 return AArch64::STPXpost; 560 } 561 } 562 563 static bool isPairedLdSt(const MachineInstr *MI) { 564 switch (MI->getOpcode()) { 565 default: 566 return false; 567 case AArch64::LDPSi: 568 case AArch64::LDPSWi: 569 case AArch64::LDPDi: 570 case AArch64::LDPQi: 571 case AArch64::LDPWi: 572 case AArch64::LDPXi: 573 case AArch64::STPSi: 574 case AArch64::STPDi: 575 case AArch64::STPQi: 576 case AArch64::STPWi: 577 case AArch64::STPXi: 578 return true; 579 } 580 } 581 582 static const MachineOperand &getLdStRegOp(const MachineInstr *MI, 583 unsigned PairedRegOp = 0) { 584 assert(PairedRegOp < 2 && "Unexpected register operand idx."); 585 unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0; 586 return MI->getOperand(Idx); 587 } 588 589 static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) { 590 unsigned Idx = isPairedLdSt(MI) ? 2 : 1; 591 return MI->getOperand(Idx); 592 } 593 594 static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) { 595 unsigned Idx = isPairedLdSt(MI) ? 3 : 2; 596 return MI->getOperand(Idx); 597 } 598 599 static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst, 600 MachineInstr *StoreInst, 601 const AArch64InstrInfo *TII) { 602 assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); 603 int LoadSize = getMemScale(LoadInst); 604 int StoreSize = getMemScale(StoreInst); 605 int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst) 606 ? getLdStOffsetOp(StoreInst).getImm() 607 : getLdStOffsetOp(StoreInst).getImm() * StoreSize; 608 int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst) 609 ? getLdStOffsetOp(LoadInst).getImm() 610 : getLdStOffsetOp(LoadInst).getImm() * LoadSize; 611 return (UnscaledStOffset <= UnscaledLdOffset) && 612 (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); 613 } 614 615 static bool isPromotableZeroStoreOpcode(MachineInstr *MI) { 616 unsigned Opc = MI->getOpcode(); 617 return isNarrowStore(Opc) || Opc == AArch64::STRWui || Opc == AArch64::STURWi; 618 } 619 620 static bool isPromotableZeroStoreInst(MachineInstr *MI) { 621 return (isPromotableZeroStoreOpcode(MI)) && 622 getLdStRegOp(MI).getReg() == AArch64::WZR; 623 } 624 625 MachineBasicBlock::iterator 626 AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I, 627 MachineBasicBlock::iterator MergeMI, 628 const LdStPairFlags &Flags) { 629 MachineBasicBlock::iterator NextI = I; 630 ++NextI; 631 // If NextI is the second of the two instructions to be merged, we need 632 // to skip one further. Either way we merge will invalidate the iterator, 633 // and we don't need to scan the new instruction, as it's a pairwise 634 // instruction, which we're not considering for further action anyway. 635 if (NextI == MergeMI) 636 ++NextI; 637 638 unsigned Opc = I->getOpcode(); 639 bool IsScaled = !TII->isUnscaledLdSt(Opc); 640 int OffsetStride = IsScaled ? 1 : getMemScale(I); 641 642 bool MergeForward = Flags.getMergeForward(); 643 // Insert our new paired instruction after whichever of the paired 644 // instructions MergeForward indicates. 645 MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I; 646 // Also based on MergeForward is from where we copy the base register operand 647 // so we get the flags compatible with the input code. 648 const MachineOperand &BaseRegOp = 649 MergeForward ? getLdStBaseOp(MergeMI) : getLdStBaseOp(I); 650 651 // Which register is Rt and which is Rt2 depends on the offset order. 652 MachineInstr *RtMI, *Rt2MI; 653 if (getLdStOffsetOp(I).getImm() == 654 getLdStOffsetOp(MergeMI).getImm() + OffsetStride) { 655 RtMI = MergeMI; 656 Rt2MI = I; 657 } else { 658 RtMI = I; 659 Rt2MI = MergeMI; 660 } 661 662 int OffsetImm = getLdStOffsetOp(RtMI).getImm(); 663 // Change the scaled offset from small to large type. 664 if (IsScaled) { 665 assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); 666 OffsetImm /= 2; 667 } 668 669 DebugLoc DL = I->getDebugLoc(); 670 MachineBasicBlock *MBB = I->getParent(); 671 if (isNarrowLoad(Opc)) { 672 MachineInstr *RtNewDest = MergeForward ? I : MergeMI; 673 // When merging small (< 32 bit) loads for big-endian targets, the order of 674 // the component parts gets swapped. 675 if (!Subtarget->isLittleEndian()) 676 std::swap(RtMI, Rt2MI); 677 // Construct the new load instruction. 678 MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2; 679 NewMemMI = 680 BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) 681 .addOperand(getLdStRegOp(RtNewDest)) 682 .addOperand(BaseRegOp) 683 .addImm(OffsetImm) 684 .setMemRefs(I->mergeMemRefsWith(*MergeMI)); 685 686 DEBUG( 687 dbgs() 688 << "Creating the new load and extract. Replacing instructions:\n "); 689 DEBUG(I->print(dbgs())); 690 DEBUG(dbgs() << " "); 691 DEBUG(MergeMI->print(dbgs())); 692 DEBUG(dbgs() << " with instructions:\n "); 693 DEBUG((NewMemMI)->print(dbgs())); 694 695 int Width = getMemScale(I) == 1 ? 8 : 16; 696 int LSBLow = 0; 697 int LSBHigh = Width; 698 int ImmsLow = LSBLow + Width - 1; 699 int ImmsHigh = LSBHigh + Width - 1; 700 MachineInstr *ExtDestMI = MergeForward ? MergeMI : I; 701 if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) { 702 // Create the bitfield extract for high bits. 703 BitExtMI1 = 704 BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(Rt2MI))) 705 .addOperand(getLdStRegOp(Rt2MI)) 706 .addReg(getLdStRegOp(RtNewDest).getReg()) 707 .addImm(LSBHigh) 708 .addImm(ImmsHigh); 709 // Create the bitfield extract for low bits. 710 if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { 711 // For unsigned, prefer to use AND for low bits. 712 BitExtMI2 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri)) 713 .addOperand(getLdStRegOp(RtMI)) 714 .addReg(getLdStRegOp(RtNewDest).getReg()) 715 .addImm(ImmsLow); 716 } else { 717 BitExtMI2 = 718 BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(RtMI))) 719 .addOperand(getLdStRegOp(RtMI)) 720 .addReg(getLdStRegOp(RtNewDest).getReg()) 721 .addImm(LSBLow) 722 .addImm(ImmsLow); 723 } 724 } else { 725 // Create the bitfield extract for low bits. 726 if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { 727 // For unsigned, prefer to use AND for low bits. 728 BitExtMI1 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri)) 729 .addOperand(getLdStRegOp(RtMI)) 730 .addReg(getLdStRegOp(RtNewDest).getReg()) 731 .addImm(ImmsLow); 732 } else { 733 BitExtMI1 = 734 BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(RtMI))) 735 .addOperand(getLdStRegOp(RtMI)) 736 .addReg(getLdStRegOp(RtNewDest).getReg()) 737 .addImm(LSBLow) 738 .addImm(ImmsLow); 739 } 740 741 // Create the bitfield extract for high bits. 742 BitExtMI2 = 743 BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(Rt2MI))) 744 .addOperand(getLdStRegOp(Rt2MI)) 745 .addReg(getLdStRegOp(RtNewDest).getReg()) 746 .addImm(LSBHigh) 747 .addImm(ImmsHigh); 748 } 749 DEBUG(dbgs() << " "); 750 DEBUG((BitExtMI1)->print(dbgs())); 751 DEBUG(dbgs() << " "); 752 DEBUG((BitExtMI2)->print(dbgs())); 753 DEBUG(dbgs() << "\n"); 754 755 // Erase the old instructions. 756 I->eraseFromParent(); 757 MergeMI->eraseFromParent(); 758 return NextI; 759 } 760 assert(isPromotableZeroStoreInst(I) && "Expected promotable zero store"); 761 762 // Construct the new instruction. 763 MachineInstrBuilder MIB; 764 MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) 765 .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR) 766 .addOperand(BaseRegOp) 767 .addImm(OffsetImm) 768 .setMemRefs(I->mergeMemRefsWith(*MergeMI)); 769 770 (void)MIB; 771 772 DEBUG(dbgs() << "Creating wider load/store. Replacing instructions:\n "); 773 DEBUG(I->print(dbgs())); 774 DEBUG(dbgs() << " "); 775 DEBUG(MergeMI->print(dbgs())); 776 DEBUG(dbgs() << " with instruction:\n "); 777 DEBUG(((MachineInstr *)MIB)->print(dbgs())); 778 DEBUG(dbgs() << "\n"); 779 780 // Erase the old instructions. 781 I->eraseFromParent(); 782 MergeMI->eraseFromParent(); 783 return NextI; 784 } 785 786 MachineBasicBlock::iterator 787 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, 788 MachineBasicBlock::iterator Paired, 789 const LdStPairFlags &Flags) { 790 MachineBasicBlock::iterator NextI = I; 791 ++NextI; 792 // If NextI is the second of the two instructions to be merged, we need 793 // to skip one further. Either way we merge will invalidate the iterator, 794 // and we don't need to scan the new instruction, as it's a pairwise 795 // instruction, which we're not considering for further action anyway. 796 if (NextI == Paired) 797 ++NextI; 798 799 int SExtIdx = Flags.getSExtIdx(); 800 unsigned Opc = 801 SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); 802 bool IsUnscaled = TII->isUnscaledLdSt(Opc); 803 int OffsetStride = IsUnscaled ? getMemScale(I) : 1; 804 805 bool MergeForward = Flags.getMergeForward(); 806 // Insert our new paired instruction after whichever of the paired 807 // instructions MergeForward indicates. 808 MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; 809 // Also based on MergeForward is from where we copy the base register operand 810 // so we get the flags compatible with the input code. 811 const MachineOperand &BaseRegOp = 812 MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I); 813 814 int Offset = getLdStOffsetOp(I).getImm(); 815 int PairedOffset = getLdStOffsetOp(Paired).getImm(); 816 bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode()); 817 if (IsUnscaled != PairedIsUnscaled) { 818 // We're trying to pair instructions that differ in how they are scaled. If 819 // I is scaled then scale the offset of Paired accordingly. Otherwise, do 820 // the opposite (i.e., make Paired's offset unscaled). 821 int MemSize = getMemScale(Paired); 822 if (PairedIsUnscaled) { 823 // If the unscaled offset isn't a multiple of the MemSize, we can't 824 // pair the operations together. 825 assert(!(PairedOffset % getMemScale(Paired)) && 826 "Offset should be a multiple of the stride!"); 827 PairedOffset /= MemSize; 828 } else { 829 PairedOffset *= MemSize; 830 } 831 } 832 833 // Which register is Rt and which is Rt2 depends on the offset order. 834 MachineInstr *RtMI, *Rt2MI; 835 if (Offset == PairedOffset + OffsetStride) { 836 RtMI = Paired; 837 Rt2MI = I; 838 // Here we swapped the assumption made for SExtIdx. 839 // I.e., we turn ldp I, Paired into ldp Paired, I. 840 // Update the index accordingly. 841 if (SExtIdx != -1) 842 SExtIdx = (SExtIdx + 1) % 2; 843 } else { 844 RtMI = I; 845 Rt2MI = Paired; 846 } 847 int OffsetImm = getLdStOffsetOp(RtMI).getImm(); 848 // Scale the immediate offset, if necessary. 849 if (TII->isUnscaledLdSt(RtMI->getOpcode())) { 850 assert(!(OffsetImm % getMemScale(RtMI)) && 851 "Unscaled offset cannot be scaled."); 852 OffsetImm /= getMemScale(RtMI); 853 } 854 855 // Construct the new instruction. 856 MachineInstrBuilder MIB; 857 DebugLoc DL = I->getDebugLoc(); 858 MachineBasicBlock *MBB = I->getParent(); 859 MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc))) 860 .addOperand(getLdStRegOp(RtMI)) 861 .addOperand(getLdStRegOp(Rt2MI)) 862 .addOperand(BaseRegOp) 863 .addImm(OffsetImm) 864 .setMemRefs(I->mergeMemRefsWith(*Paired)); 865 866 (void)MIB; 867 868 DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n "); 869 DEBUG(I->print(dbgs())); 870 DEBUG(dbgs() << " "); 871 DEBUG(Paired->print(dbgs())); 872 DEBUG(dbgs() << " with instruction:\n "); 873 if (SExtIdx != -1) { 874 // Generate the sign extension for the proper result of the ldp. 875 // I.e., with X1, that would be: 876 // %W1<def> = KILL %W1, %X1<imp-def> 877 // %X1<def> = SBFMXri %X1<kill>, 0, 31 878 MachineOperand &DstMO = MIB->getOperand(SExtIdx); 879 // Right now, DstMO has the extended register, since it comes from an 880 // extended opcode. 881 unsigned DstRegX = DstMO.getReg(); 882 // Get the W variant of that register. 883 unsigned DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); 884 // Update the result of LDP to use the W instead of the X variant. 885 DstMO.setReg(DstRegW); 886 DEBUG(((MachineInstr *)MIB)->print(dbgs())); 887 DEBUG(dbgs() << "\n"); 888 // Make the machine verifier happy by providing a definition for 889 // the X register. 890 // Insert this definition right after the generated LDP, i.e., before 891 // InsertionPoint. 892 MachineInstrBuilder MIBKill = 893 BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW) 894 .addReg(DstRegW) 895 .addReg(DstRegX, RegState::Define); 896 MIBKill->getOperand(2).setImplicit(); 897 // Create the sign extension. 898 MachineInstrBuilder MIBSXTW = 899 BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX) 900 .addReg(DstRegX) 901 .addImm(0) 902 .addImm(31); 903 (void)MIBSXTW; 904 DEBUG(dbgs() << " Extend operand:\n "); 905 DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs())); 906 } else { 907 DEBUG(((MachineInstr *)MIB)->print(dbgs())); 908 } 909 DEBUG(dbgs() << "\n"); 910 911 // Erase the old instructions. 912 I->eraseFromParent(); 913 Paired->eraseFromParent(); 914 915 return NextI; 916 } 917 918 MachineBasicBlock::iterator 919 AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, 920 MachineBasicBlock::iterator StoreI) { 921 MachineBasicBlock::iterator NextI = LoadI; 922 ++NextI; 923 924 int LoadSize = getMemScale(LoadI); 925 int StoreSize = getMemScale(StoreI); 926 unsigned LdRt = getLdStRegOp(LoadI).getReg(); 927 unsigned StRt = getLdStRegOp(StoreI).getReg(); 928 bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); 929 930 assert((IsStoreXReg || 931 TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) && 932 "Unexpected RegClass"); 933 934 MachineInstr *BitExtMI; 935 if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) { 936 // Remove the load, if the destination register of the loads is the same 937 // register for stored value. 938 if (StRt == LdRt && LoadSize == 8) { 939 DEBUG(dbgs() << "Remove load instruction:\n "); 940 DEBUG(LoadI->print(dbgs())); 941 DEBUG(dbgs() << "\n"); 942 LoadI->eraseFromParent(); 943 return NextI; 944 } 945 // Replace the load with a mov if the load and store are in the same size. 946 BitExtMI = 947 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), 948 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt) 949 .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR) 950 .addReg(StRt) 951 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 952 } else { 953 // FIXME: Currently we disable this transformation in big-endian targets as 954 // performance and correctness are verified only in little-endian. 955 if (!Subtarget->isLittleEndian()) 956 return NextI; 957 bool IsUnscaled = TII->isUnscaledLdSt(LoadI); 958 assert(IsUnscaled == TII->isUnscaledLdSt(StoreI) && 959 "Unsupported ld/st match"); 960 assert(LoadSize <= StoreSize && "Invalid load size"); 961 int UnscaledLdOffset = IsUnscaled 962 ? getLdStOffsetOp(LoadI).getImm() 963 : getLdStOffsetOp(LoadI).getImm() * LoadSize; 964 int UnscaledStOffset = IsUnscaled 965 ? getLdStOffsetOp(StoreI).getImm() 966 : getLdStOffsetOp(StoreI).getImm() * StoreSize; 967 int Width = LoadSize * 8; 968 int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); 969 int Imms = Immr + Width - 1; 970 unsigned DestReg = IsStoreXReg 971 ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32, 972 &AArch64::GPR64RegClass) 973 : LdRt; 974 975 assert((UnscaledLdOffset >= UnscaledStOffset && 976 (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) && 977 "Invalid offset"); 978 979 Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); 980 Imms = Immr + Width - 1; 981 if (UnscaledLdOffset == UnscaledStOffset) { 982 uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N 983 | ((Immr) << 6) // immr 984 | ((Imms) << 0) // imms 985 ; 986 987 BitExtMI = 988 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), 989 TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri), 990 DestReg) 991 .addReg(StRt) 992 .addImm(AndMaskEncoded); 993 } else { 994 BitExtMI = 995 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), 996 TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri), 997 DestReg) 998 .addReg(StRt) 999 .addImm(Immr) 1000 .addImm(Imms); 1001 } 1002 } 1003 1004 DEBUG(dbgs() << "Promoting load by replacing :\n "); 1005 DEBUG(StoreI->print(dbgs())); 1006 DEBUG(dbgs() << " "); 1007 DEBUG(LoadI->print(dbgs())); 1008 DEBUG(dbgs() << " with instructions:\n "); 1009 DEBUG(StoreI->print(dbgs())); 1010 DEBUG(dbgs() << " "); 1011 DEBUG((BitExtMI)->print(dbgs())); 1012 DEBUG(dbgs() << "\n"); 1013 1014 // Erase the old instructions. 1015 LoadI->eraseFromParent(); 1016 return NextI; 1017 } 1018 1019 /// trackRegDefsUses - Remember what registers the specified instruction uses 1020 /// and modifies. 1021 static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs, 1022 BitVector &UsedRegs, 1023 const TargetRegisterInfo *TRI) { 1024 for (const MachineOperand &MO : MI->operands()) { 1025 if (MO.isRegMask()) 1026 ModifiedRegs.setBitsNotInMask(MO.getRegMask()); 1027 1028 if (!MO.isReg()) 1029 continue; 1030 unsigned Reg = MO.getReg(); 1031 if (!Reg) 1032 continue; 1033 if (MO.isDef()) { 1034 for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) 1035 ModifiedRegs.set(*AI); 1036 } else { 1037 assert(MO.isUse() && "Reg operand not a def and not a use?!?"); 1038 for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) 1039 UsedRegs.set(*AI); 1040 } 1041 } 1042 } 1043 1044 static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { 1045 // Convert the byte-offset used by unscaled into an "element" offset used 1046 // by the scaled pair load/store instructions. 1047 if (IsUnscaled) { 1048 // If the byte-offset isn't a multiple of the stride, there's no point 1049 // trying to match it. 1050 if (Offset % OffsetStride) 1051 return false; 1052 Offset /= OffsetStride; 1053 } 1054 return Offset <= 63 && Offset >= -64; 1055 } 1056 1057 // Do alignment, specialized to power of 2 and for signed ints, 1058 // avoiding having to do a C-style cast from uint_64t to int when 1059 // using alignTo from include/llvm/Support/MathExtras.h. 1060 // FIXME: Move this function to include/MathExtras.h? 1061 static int alignTo(int Num, int PowOf2) { 1062 return (Num + PowOf2 - 1) & ~(PowOf2 - 1); 1063 } 1064 1065 static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb, 1066 const AArch64InstrInfo *TII) { 1067 // One of the instructions must modify memory. 1068 if (!MIa->mayStore() && !MIb->mayStore()) 1069 return false; 1070 1071 // Both instructions must be memory operations. 1072 if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore()) 1073 return false; 1074 1075 return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb); 1076 } 1077 1078 static bool mayAlias(MachineInstr *MIa, 1079 SmallVectorImpl<MachineInstr *> &MemInsns, 1080 const AArch64InstrInfo *TII) { 1081 for (auto &MIb : MemInsns) 1082 if (mayAlias(MIa, MIb, TII)) 1083 return true; 1084 1085 return false; 1086 } 1087 1088 bool AArch64LoadStoreOpt::findMatchingStore( 1089 MachineBasicBlock::iterator I, unsigned Limit, 1090 MachineBasicBlock::iterator &StoreI) { 1091 MachineBasicBlock::iterator B = I->getParent()->begin(); 1092 MachineBasicBlock::iterator MBBI = I; 1093 MachineInstr *LoadMI = I; 1094 unsigned BaseReg = getLdStBaseOp(LoadMI).getReg(); 1095 1096 // If the load is the first instruction in the block, there's obviously 1097 // not any matching store. 1098 if (MBBI == B) 1099 return false; 1100 1101 // Track which registers have been modified and used between the first insn 1102 // and the second insn. 1103 ModifiedRegs.reset(); 1104 UsedRegs.reset(); 1105 1106 unsigned Count = 0; 1107 do { 1108 --MBBI; 1109 MachineInstr *MI = MBBI; 1110 1111 // Don't count DBG_VALUE instructions towards the search limit. 1112 if (!MI->isDebugValue()) 1113 ++Count; 1114 1115 // If the load instruction reads directly from the address to which the 1116 // store instruction writes and the stored value is not modified, we can 1117 // promote the load. Since we do not handle stores with pre-/post-index, 1118 // it's unnecessary to check if BaseReg is modified by the store itself. 1119 if (MI->mayStore() && isMatchingStore(LoadMI, MI) && 1120 BaseReg == getLdStBaseOp(MI).getReg() && 1121 isLdOffsetInRangeOfSt(LoadMI, MI, TII) && 1122 !ModifiedRegs[getLdStRegOp(MI).getReg()]) { 1123 StoreI = MBBI; 1124 return true; 1125 } 1126 1127 if (MI->isCall()) 1128 return false; 1129 1130 // Update modified / uses register lists. 1131 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 1132 1133 // Otherwise, if the base register is modified, we have no match, so 1134 // return early. 1135 if (ModifiedRegs[BaseReg]) 1136 return false; 1137 1138 // If we encounter a store aliased with the load, return early. 1139 if (MI->mayStore() && mayAlias(LoadMI, MI, TII)) 1140 return false; 1141 } while (MBBI != B && Count < Limit); 1142 return false; 1143 } 1144 1145 // Returns true if these two opcodes can be merged or paired. Otherwise, 1146 // returns false. 1147 static bool canMergeOpc(unsigned OpcA, unsigned OpcB, LdStPairFlags &Flags, 1148 const AArch64InstrInfo *TII) { 1149 // Opcodes match: nothing more to check. 1150 if (OpcA == OpcB) 1151 return true; 1152 1153 // Try to match a sign-extended load/store with a zero-extended load/store. 1154 bool IsValidLdStrOpc, PairIsValidLdStrOpc; 1155 unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc); 1156 assert(IsValidLdStrOpc && 1157 "Given Opc should be a Load or Store with an immediate"); 1158 // OpcA will be the first instruction in the pair. 1159 if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) { 1160 Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0); 1161 return true; 1162 } 1163 1164 // If the second instruction isn't even a load/store, bail out. 1165 if (!PairIsValidLdStrOpc) 1166 return false; 1167 1168 // FIXME: We don't support merging narrow loads/stores with mixed 1169 // scaled/unscaled offsets. 1170 if (isNarrowLoadOrStore(OpcA) || isNarrowLoadOrStore(OpcB)) 1171 return false; 1172 1173 // Try to match an unscaled load/store with a scaled load/store. 1174 return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) && 1175 getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB); 1176 1177 // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair? 1178 } 1179 1180 /// Scan the instructions looking for a load/store that can be combined with the 1181 /// current instruction into a wider equivalent or a load/store pair. 1182 MachineBasicBlock::iterator 1183 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, 1184 LdStPairFlags &Flags, unsigned Limit) { 1185 MachineBasicBlock::iterator E = I->getParent()->end(); 1186 MachineBasicBlock::iterator MBBI = I; 1187 MachineInstr *FirstMI = I; 1188 ++MBBI; 1189 1190 unsigned Opc = FirstMI->getOpcode(); 1191 bool MayLoad = FirstMI->mayLoad(); 1192 bool IsUnscaled = TII->isUnscaledLdSt(FirstMI); 1193 unsigned Reg = getLdStRegOp(FirstMI).getReg(); 1194 unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); 1195 int Offset = getLdStOffsetOp(FirstMI).getImm(); 1196 int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1; 1197 bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); 1198 1199 // Track which registers have been modified and used between the first insn 1200 // (inclusive) and the second insn. 1201 ModifiedRegs.reset(); 1202 UsedRegs.reset(); 1203 1204 // Remember any instructions that read/write memory between FirstMI and MI. 1205 SmallVector<MachineInstr *, 4> MemInsns; 1206 1207 for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { 1208 MachineInstr *MI = MBBI; 1209 // Skip DBG_VALUE instructions. Otherwise debug info can affect the 1210 // optimization by changing how far we scan. 1211 if (MI->isDebugValue()) 1212 continue; 1213 1214 // Now that we know this is a real instruction, count it. 1215 ++Count; 1216 1217 Flags.setSExtIdx(-1); 1218 if (canMergeOpc(Opc, MI->getOpcode(), Flags, TII) && 1219 getLdStOffsetOp(MI).isImm()) { 1220 assert(MI->mayLoadOrStore() && "Expected memory operation."); 1221 // If we've found another instruction with the same opcode, check to see 1222 // if the base and offset are compatible with our starting instruction. 1223 // These instructions all have scaled immediate operands, so we just 1224 // check for +1/-1. Make sure to check the new instruction offset is 1225 // actually an immediate and not a symbolic reference destined for 1226 // a relocation. 1227 // 1228 // Pairwise instructions have a 7-bit signed offset field. Single insns 1229 // have a 12-bit unsigned offset field. To be a valid combine, the 1230 // final offset must be in range. 1231 unsigned MIBaseReg = getLdStBaseOp(MI).getReg(); 1232 int MIOffset = getLdStOffsetOp(MI).getImm(); 1233 bool MIIsUnscaled = TII->isUnscaledLdSt(MI); 1234 if (IsUnscaled != MIIsUnscaled) { 1235 // We're trying to pair instructions that differ in how they are scaled. 1236 // If FirstMI is scaled then scale the offset of MI accordingly. 1237 // Otherwise, do the opposite (i.e., make MI's offset unscaled). 1238 int MemSize = getMemScale(MI); 1239 if (MIIsUnscaled) { 1240 // If the unscaled offset isn't a multiple of the MemSize, we can't 1241 // pair the operations together: bail and keep looking. 1242 if (MIOffset % MemSize) 1243 continue; 1244 MIOffset /= MemSize; 1245 } else { 1246 MIOffset *= MemSize; 1247 } 1248 } 1249 1250 if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || 1251 (Offset + OffsetStride == MIOffset))) { 1252 int MinOffset = Offset < MIOffset ? Offset : MIOffset; 1253 // If this is a volatile load/store that otherwise matched, stop looking 1254 // as something is going on that we don't have enough information to 1255 // safely transform. Similarly, stop if we see a hint to avoid pairs. 1256 if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) 1257 return E; 1258 // If the resultant immediate offset of merging these instructions 1259 // is out of range for a pairwise instruction, bail and keep looking. 1260 bool IsNarrowLoad = isNarrowLoad(MI->getOpcode()); 1261 if (!IsNarrowLoad && 1262 !inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) { 1263 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 1264 MemInsns.push_back(MI); 1265 continue; 1266 } 1267 1268 if (IsNarrowLoad || IsPromotableZeroStore) { 1269 // If the alignment requirements of the scaled wide load/store 1270 // instruction can't express the offset of the scaled narrow 1271 // input, bail and keep looking. 1272 if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) { 1273 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 1274 MemInsns.push_back(MI); 1275 continue; 1276 } 1277 } else { 1278 // If the alignment requirements of the paired (scaled) instruction 1279 // can't express the offset of the unscaled input, bail and keep 1280 // looking. 1281 if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) { 1282 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 1283 MemInsns.push_back(MI); 1284 continue; 1285 } 1286 } 1287 // If the destination register of the loads is the same register, bail 1288 // and keep looking. A load-pair instruction with both destination 1289 // registers the same is UNPREDICTABLE and will result in an exception. 1290 // For narrow stores, allow only when the stored value is the same 1291 // (i.e., WZR). 1292 if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) || 1293 (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) { 1294 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 1295 MemInsns.push_back(MI); 1296 continue; 1297 } 1298 1299 // If the Rt of the second instruction was not modified or used between 1300 // the two instructions and none of the instructions between the second 1301 // and first alias with the second, we can combine the second into the 1302 // first. 1303 if (!ModifiedRegs[getLdStRegOp(MI).getReg()] && 1304 !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && 1305 !mayAlias(MI, MemInsns, TII)) { 1306 Flags.setMergeForward(false); 1307 return MBBI; 1308 } 1309 1310 // Likewise, if the Rt of the first instruction is not modified or used 1311 // between the two instructions and none of the instructions between the 1312 // first and the second alias with the first, we can combine the first 1313 // into the second. 1314 if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] && 1315 !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) && 1316 !mayAlias(FirstMI, MemInsns, TII)) { 1317 Flags.setMergeForward(true); 1318 return MBBI; 1319 } 1320 // Unable to combine these instructions due to interference in between. 1321 // Keep looking. 1322 } 1323 } 1324 1325 // If the instruction wasn't a matching load or store. Stop searching if we 1326 // encounter a call instruction that might modify memory. 1327 if (MI->isCall()) 1328 return E; 1329 1330 // Update modified / uses register lists. 1331 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 1332 1333 // Otherwise, if the base register is modified, we have no match, so 1334 // return early. 1335 if (ModifiedRegs[BaseReg]) 1336 return E; 1337 1338 // Update list of instructions that read/write memory. 1339 if (MI->mayLoadOrStore()) 1340 MemInsns.push_back(MI); 1341 } 1342 return E; 1343 } 1344 1345 MachineBasicBlock::iterator 1346 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, 1347 MachineBasicBlock::iterator Update, 1348 bool IsPreIdx) { 1349 assert((Update->getOpcode() == AArch64::ADDXri || 1350 Update->getOpcode() == AArch64::SUBXri) && 1351 "Unexpected base register update instruction to merge!"); 1352 MachineBasicBlock::iterator NextI = I; 1353 // Return the instruction following the merged instruction, which is 1354 // the instruction following our unmerged load. Unless that's the add/sub 1355 // instruction we're merging, in which case it's the one after that. 1356 if (++NextI == Update) 1357 ++NextI; 1358 1359 int Value = Update->getOperand(2).getImm(); 1360 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && 1361 "Can't merge 1 << 12 offset into pre-/post-indexed load / store"); 1362 if (Update->getOpcode() == AArch64::SUBXri) 1363 Value = -Value; 1364 1365 unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) 1366 : getPostIndexedOpcode(I->getOpcode()); 1367 MachineInstrBuilder MIB; 1368 if (!isPairedLdSt(I)) { 1369 // Non-paired instruction. 1370 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) 1371 .addOperand(getLdStRegOp(Update)) 1372 .addOperand(getLdStRegOp(I)) 1373 .addOperand(getLdStBaseOp(I)) 1374 .addImm(Value) 1375 .setMemRefs(I->memoperands_begin(), I->memoperands_end()); 1376 } else { 1377 // Paired instruction. 1378 int Scale = getMemScale(I); 1379 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) 1380 .addOperand(getLdStRegOp(Update)) 1381 .addOperand(getLdStRegOp(I, 0)) 1382 .addOperand(getLdStRegOp(I, 1)) 1383 .addOperand(getLdStBaseOp(I)) 1384 .addImm(Value / Scale) 1385 .setMemRefs(I->memoperands_begin(), I->memoperands_end()); 1386 } 1387 (void)MIB; 1388 1389 if (IsPreIdx) 1390 DEBUG(dbgs() << "Creating pre-indexed load/store."); 1391 else 1392 DEBUG(dbgs() << "Creating post-indexed load/store."); 1393 DEBUG(dbgs() << " Replacing instructions:\n "); 1394 DEBUG(I->print(dbgs())); 1395 DEBUG(dbgs() << " "); 1396 DEBUG(Update->print(dbgs())); 1397 DEBUG(dbgs() << " with instruction:\n "); 1398 DEBUG(((MachineInstr *)MIB)->print(dbgs())); 1399 DEBUG(dbgs() << "\n"); 1400 1401 // Erase the old instructions for the block. 1402 I->eraseFromParent(); 1403 Update->eraseFromParent(); 1404 1405 return NextI; 1406 } 1407 1408 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, 1409 MachineInstr *MI, 1410 unsigned BaseReg, int Offset) { 1411 switch (MI->getOpcode()) { 1412 default: 1413 break; 1414 case AArch64::SUBXri: 1415 // Negate the offset for a SUB instruction. 1416 Offset *= -1; 1417 // FALLTHROUGH 1418 case AArch64::ADDXri: 1419 // Make sure it's a vanilla immediate operand, not a relocation or 1420 // anything else we can't handle. 1421 if (!MI->getOperand(2).isImm()) 1422 break; 1423 // Watch out for 1 << 12 shifted value. 1424 if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) 1425 break; 1426 1427 // The update instruction source and destination register must be the 1428 // same as the load/store base register. 1429 if (MI->getOperand(0).getReg() != BaseReg || 1430 MI->getOperand(1).getReg() != BaseReg) 1431 break; 1432 1433 bool IsPairedInsn = isPairedLdSt(MemMI); 1434 int UpdateOffset = MI->getOperand(2).getImm(); 1435 // For non-paired load/store instructions, the immediate must fit in a 1436 // signed 9-bit integer. 1437 if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) 1438 break; 1439 1440 // For paired load/store instructions, the immediate must be a multiple of 1441 // the scaling factor. The scaled offset must also fit into a signed 7-bit 1442 // integer. 1443 if (IsPairedInsn) { 1444 int Scale = getMemScale(MemMI); 1445 if (UpdateOffset % Scale != 0) 1446 break; 1447 1448 int ScaledOffset = UpdateOffset / Scale; 1449 if (ScaledOffset > 64 || ScaledOffset < -64) 1450 break; 1451 } 1452 1453 // If we have a non-zero Offset, we check that it matches the amount 1454 // we're adding to the register. 1455 if (!Offset || Offset == MI->getOperand(2).getImm()) 1456 return true; 1457 break; 1458 } 1459 return false; 1460 } 1461 1462 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( 1463 MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { 1464 MachineBasicBlock::iterator E = I->getParent()->end(); 1465 MachineInstr *MemMI = I; 1466 MachineBasicBlock::iterator MBBI = I; 1467 1468 unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); 1469 int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI); 1470 1471 // Scan forward looking for post-index opportunities. Updating instructions 1472 // can't be formed if the memory instruction doesn't have the offset we're 1473 // looking for. 1474 if (MIUnscaledOffset != UnscaledOffset) 1475 return E; 1476 1477 // If the base register overlaps a destination register, we can't 1478 // merge the update. 1479 bool IsPairedInsn = isPairedLdSt(MemMI); 1480 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { 1481 unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); 1482 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) 1483 return E; 1484 } 1485 1486 // Track which registers have been modified and used between the first insn 1487 // (inclusive) and the second insn. 1488 ModifiedRegs.reset(); 1489 UsedRegs.reset(); 1490 ++MBBI; 1491 for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { 1492 MachineInstr *MI = MBBI; 1493 // Skip DBG_VALUE instructions. 1494 if (MI->isDebugValue()) 1495 continue; 1496 1497 // Now that we know this is a real instruction, count it. 1498 ++Count; 1499 1500 // If we found a match, return it. 1501 if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset)) 1502 return MBBI; 1503 1504 // Update the status of what the instruction clobbered and used. 1505 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 1506 1507 // Otherwise, if the base register is used or modified, we have no match, so 1508 // return early. 1509 if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) 1510 return E; 1511 } 1512 return E; 1513 } 1514 1515 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( 1516 MachineBasicBlock::iterator I, unsigned Limit) { 1517 MachineBasicBlock::iterator B = I->getParent()->begin(); 1518 MachineBasicBlock::iterator E = I->getParent()->end(); 1519 MachineInstr *MemMI = I; 1520 MachineBasicBlock::iterator MBBI = I; 1521 1522 unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); 1523 int Offset = getLdStOffsetOp(MemMI).getImm(); 1524 1525 // If the load/store is the first instruction in the block, there's obviously 1526 // not any matching update. Ditto if the memory offset isn't zero. 1527 if (MBBI == B || Offset != 0) 1528 return E; 1529 // If the base register overlaps a destination register, we can't 1530 // merge the update. 1531 bool IsPairedInsn = isPairedLdSt(MemMI); 1532 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { 1533 unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); 1534 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) 1535 return E; 1536 } 1537 1538 // Track which registers have been modified and used between the first insn 1539 // (inclusive) and the second insn. 1540 ModifiedRegs.reset(); 1541 UsedRegs.reset(); 1542 unsigned Count = 0; 1543 do { 1544 --MBBI; 1545 MachineInstr *MI = MBBI; 1546 1547 // Don't count DBG_VALUE instructions towards the search limit. 1548 if (!MI->isDebugValue()) 1549 ++Count; 1550 1551 // If we found a match, return it. 1552 if (isMatchingUpdateInsn(I, MI, BaseReg, Offset)) 1553 return MBBI; 1554 1555 // Update the status of what the instruction clobbered and used. 1556 trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); 1557 1558 // Otherwise, if the base register is used or modified, we have no match, so 1559 // return early. 1560 if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) 1561 return E; 1562 } while (MBBI != B && Count < Limit); 1563 return E; 1564 } 1565 1566 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( 1567 MachineBasicBlock::iterator &MBBI) { 1568 MachineInstr *MI = MBBI; 1569 // If this is a volatile load, don't mess with it. 1570 if (MI->hasOrderedMemoryRef()) 1571 return false; 1572 1573 // Make sure this is a reg+imm. 1574 // FIXME: It is possible to extend it to handle reg+reg cases. 1575 if (!getLdStOffsetOp(MI).isImm()) 1576 return false; 1577 1578 // Look backward up to LdStLimit instructions. 1579 MachineBasicBlock::iterator StoreI; 1580 if (findMatchingStore(MBBI, LdStLimit, StoreI)) { 1581 ++NumLoadsFromStoresPromoted; 1582 // Promote the load. Keeping the iterator straight is a 1583 // pain, so we let the merge routine tell us what the next instruction 1584 // is after it's done mucking about. 1585 MBBI = promoteLoadFromStore(MBBI, StoreI); 1586 return true; 1587 } 1588 return false; 1589 } 1590 1591 bool AArch64LoadStoreOpt::isCandidateToMergeOrPair(MachineInstr *MI) { 1592 // If this is a volatile load/store, don't mess with it. 1593 if (MI->hasOrderedMemoryRef()) 1594 return false; 1595 1596 // Make sure this is a reg+imm (as opposed to an address reloc). 1597 if (!getLdStOffsetOp(MI).isImm()) 1598 return false; 1599 1600 // Can't merge/pair if the instruction modifies the base register. 1601 // e.g., ldr x0, [x0] 1602 unsigned BaseReg = getLdStBaseOp(MI).getReg(); 1603 if (MI->modifiesRegister(BaseReg, TRI)) 1604 return false; 1605 1606 // Check if this load/store has a hint to avoid pair formation. 1607 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 1608 if (TII->isLdStPairSuppressed(MI)) 1609 return false; 1610 1611 return true; 1612 } 1613 1614 // Find narrow loads that can be converted into a single wider load with 1615 // bitfield extract instructions. Also merge adjacent zero stores into a wider 1616 // store. 1617 bool AArch64LoadStoreOpt::tryToMergeLdStInst( 1618 MachineBasicBlock::iterator &MBBI) { 1619 assert((isNarrowLoad(MBBI) || isPromotableZeroStoreOpcode(MBBI)) && 1620 "Expected narrow op."); 1621 MachineInstr *MI = MBBI; 1622 MachineBasicBlock::iterator E = MI->getParent()->end(); 1623 1624 if (!isCandidateToMergeOrPair(MI)) 1625 return false; 1626 1627 // For promotable zero stores, the stored value should be WZR. 1628 if (isPromotableZeroStoreOpcode(MI) && 1629 getLdStRegOp(MI).getReg() != AArch64::WZR) 1630 return false; 1631 1632 // Look ahead up to LdStLimit instructions for a mergable instruction. 1633 LdStPairFlags Flags; 1634 MachineBasicBlock::iterator MergeMI = 1635 findMatchingInsn(MBBI, Flags, LdStLimit); 1636 if (MergeMI != E) { 1637 if (isNarrowLoad(MI)) { 1638 ++NumNarrowLoadsPromoted; 1639 } else if (isPromotableZeroStoreInst(MI)) { 1640 ++NumZeroStoresPromoted; 1641 } 1642 // Keeping the iterator straight is a pain, so we let the merge routine tell 1643 // us what the next instruction is after it's done mucking about. 1644 MBBI = mergeNarrowInsns(MBBI, MergeMI, Flags); 1645 return true; 1646 } 1647 return false; 1648 } 1649 1650 // Find loads and stores that can be merged into a single load or store pair 1651 // instruction. 1652 bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { 1653 MachineInstr *MI = MBBI; 1654 MachineBasicBlock::iterator E = MI->getParent()->end(); 1655 1656 if (!isCandidateToMergeOrPair(MI)) 1657 return false; 1658 1659 // Early exit if the offset is not possible to match. (6 bits of positive 1660 // range, plus allow an extra one in case we find a later insn that matches 1661 // with Offset-1) 1662 bool IsUnscaled = TII->isUnscaledLdSt(MI); 1663 int Offset = getLdStOffsetOp(MI).getImm(); 1664 int OffsetStride = IsUnscaled ? getMemScale(MI) : 1; 1665 if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) 1666 return false; 1667 1668 // Look ahead up to LdStLimit instructions for a pairable instruction. 1669 LdStPairFlags Flags; 1670 MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, LdStLimit); 1671 if (Paired != E) { 1672 ++NumPairCreated; 1673 if (TII->isUnscaledLdSt(MI)) 1674 ++NumUnscaledPairCreated; 1675 // Keeping the iterator straight is a pain, so we let the merge routine tell 1676 // us what the next instruction is after it's done mucking about. 1677 MBBI = mergePairedInsns(MBBI, Paired, Flags); 1678 return true; 1679 } 1680 return false; 1681 } 1682 1683 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, 1684 bool enableNarrowLdOpt) { 1685 bool Modified = false; 1686 // Four tranformations to do here: 1687 // 1) Find loads that directly read from stores and promote them by 1688 // replacing with mov instructions. If the store is wider than the load, 1689 // the load will be replaced with a bitfield extract. 1690 // e.g., 1691 // str w1, [x0, #4] 1692 // ldrh w2, [x0, #6] 1693 // ; becomes 1694 // str w1, [x0, #4] 1695 // lsr w2, w1, #16 1696 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); 1697 MBBI != E;) { 1698 MachineInstr *MI = MBBI; 1699 switch (MI->getOpcode()) { 1700 default: 1701 // Just move on to the next instruction. 1702 ++MBBI; 1703 break; 1704 // Scaled instructions. 1705 case AArch64::LDRBBui: 1706 case AArch64::LDRHHui: 1707 case AArch64::LDRWui: 1708 case AArch64::LDRXui: 1709 // Unscaled instructions. 1710 case AArch64::LDURBBi: 1711 case AArch64::LDURHHi: 1712 case AArch64::LDURWi: 1713 case AArch64::LDURXi: { 1714 if (tryToPromoteLoadFromStore(MBBI)) { 1715 Modified = true; 1716 break; 1717 } 1718 ++MBBI; 1719 break; 1720 } 1721 } 1722 } 1723 // 2) Find narrow loads that can be converted into a single wider load 1724 // with bitfield extract instructions. 1725 // e.g., 1726 // ldrh w0, [x2] 1727 // ldrh w1, [x2, #2] 1728 // ; becomes 1729 // ldr w0, [x2] 1730 // ubfx w1, w0, #16, #16 1731 // and w0, w0, #ffff 1732 // 1733 // Also merge adjacent zero stores into a wider store. 1734 // e.g., 1735 // strh wzr, [x0] 1736 // strh wzr, [x0, #2] 1737 // ; becomes 1738 // str wzr, [x0] 1739 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); 1740 enableNarrowLdOpt && MBBI != E;) { 1741 MachineInstr *MI = MBBI; 1742 switch (MI->getOpcode()) { 1743 default: 1744 // Just move on to the next instruction. 1745 ++MBBI; 1746 break; 1747 // Scaled instructions. 1748 case AArch64::LDRBBui: 1749 case AArch64::LDRHHui: 1750 case AArch64::LDRSBWui: 1751 case AArch64::LDRSHWui: 1752 case AArch64::STRBBui: 1753 case AArch64::STRHHui: 1754 case AArch64::STRWui: 1755 // Unscaled instructions. 1756 case AArch64::LDURBBi: 1757 case AArch64::LDURHHi: 1758 case AArch64::LDURSBWi: 1759 case AArch64::LDURSHWi: 1760 case AArch64::STURBBi: 1761 case AArch64::STURHHi: 1762 case AArch64::STURWi: { 1763 if (tryToMergeLdStInst(MBBI)) { 1764 Modified = true; 1765 break; 1766 } 1767 ++MBBI; 1768 break; 1769 } 1770 } 1771 } 1772 // 3) Find loads and stores that can be merged into a single load or store 1773 // pair instruction. 1774 // e.g., 1775 // ldr x0, [x2] 1776 // ldr x1, [x2, #8] 1777 // ; becomes 1778 // ldp x0, x1, [x2] 1779 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); 1780 MBBI != E;) { 1781 MachineInstr *MI = MBBI; 1782 switch (MI->getOpcode()) { 1783 default: 1784 // Just move on to the next instruction. 1785 ++MBBI; 1786 break; 1787 // Scaled instructions. 1788 case AArch64::STRSui: 1789 case AArch64::STRDui: 1790 case AArch64::STRQui: 1791 case AArch64::STRXui: 1792 case AArch64::STRWui: 1793 case AArch64::LDRSui: 1794 case AArch64::LDRDui: 1795 case AArch64::LDRQui: 1796 case AArch64::LDRXui: 1797 case AArch64::LDRWui: 1798 case AArch64::LDRSWui: 1799 // Unscaled instructions. 1800 case AArch64::STURSi: 1801 case AArch64::STURDi: 1802 case AArch64::STURQi: 1803 case AArch64::STURWi: 1804 case AArch64::STURXi: 1805 case AArch64::LDURSi: 1806 case AArch64::LDURDi: 1807 case AArch64::LDURQi: 1808 case AArch64::LDURWi: 1809 case AArch64::LDURXi: 1810 case AArch64::LDURSWi: { 1811 if (tryToPairLdStInst(MBBI)) { 1812 Modified = true; 1813 break; 1814 } 1815 ++MBBI; 1816 break; 1817 } 1818 } 1819 } 1820 // 4) Find base register updates that can be merged into the load or store 1821 // as a base-reg writeback. 1822 // e.g., 1823 // ldr x0, [x2] 1824 // add x2, x2, #4 1825 // ; becomes 1826 // ldr x0, [x2], #4 1827 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); 1828 MBBI != E;) { 1829 MachineInstr *MI = MBBI; 1830 // Do update merging. It's simpler to keep this separate from the above 1831 // switchs, though not strictly necessary. 1832 unsigned Opc = MI->getOpcode(); 1833 switch (Opc) { 1834 default: 1835 // Just move on to the next instruction. 1836 ++MBBI; 1837 break; 1838 // Scaled instructions. 1839 case AArch64::STRSui: 1840 case AArch64::STRDui: 1841 case AArch64::STRQui: 1842 case AArch64::STRXui: 1843 case AArch64::STRWui: 1844 case AArch64::STRHHui: 1845 case AArch64::STRBBui: 1846 case AArch64::LDRSui: 1847 case AArch64::LDRDui: 1848 case AArch64::LDRQui: 1849 case AArch64::LDRXui: 1850 case AArch64::LDRWui: 1851 case AArch64::LDRHHui: 1852 case AArch64::LDRBBui: 1853 // Unscaled instructions. 1854 case AArch64::STURSi: 1855 case AArch64::STURDi: 1856 case AArch64::STURQi: 1857 case AArch64::STURWi: 1858 case AArch64::STURXi: 1859 case AArch64::LDURSi: 1860 case AArch64::LDURDi: 1861 case AArch64::LDURQi: 1862 case AArch64::LDURWi: 1863 case AArch64::LDURXi: 1864 // Paired instructions. 1865 case AArch64::LDPSi: 1866 case AArch64::LDPSWi: 1867 case AArch64::LDPDi: 1868 case AArch64::LDPQi: 1869 case AArch64::LDPWi: 1870 case AArch64::LDPXi: 1871 case AArch64::STPSi: 1872 case AArch64::STPDi: 1873 case AArch64::STPQi: 1874 case AArch64::STPWi: 1875 case AArch64::STPXi: { 1876 // Make sure this is a reg+imm (as opposed to an address reloc). 1877 if (!getLdStOffsetOp(MI).isImm()) { 1878 ++MBBI; 1879 break; 1880 } 1881 // Look forward to try to form a post-index instruction. For example, 1882 // ldr x0, [x20] 1883 // add x20, x20, #32 1884 // merged into: 1885 // ldr x0, [x20], #32 1886 MachineBasicBlock::iterator Update = 1887 findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit); 1888 if (Update != E) { 1889 // Merge the update into the ld/st. 1890 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); 1891 Modified = true; 1892 ++NumPostFolded; 1893 break; 1894 } 1895 // Don't know how to handle pre/post-index versions, so move to the next 1896 // instruction. 1897 if (TII->isUnscaledLdSt(Opc)) { 1898 ++MBBI; 1899 break; 1900 } 1901 1902 // Look back to try to find a pre-index instruction. For example, 1903 // add x0, x0, #8 1904 // ldr x1, [x0] 1905 // merged into: 1906 // ldr x1, [x0, #8]! 1907 Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit); 1908 if (Update != E) { 1909 // Merge the update into the ld/st. 1910 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); 1911 Modified = true; 1912 ++NumPreFolded; 1913 break; 1914 } 1915 // The immediate in the load/store is scaled by the size of the memory 1916 // operation. The immediate in the add we're looking for, 1917 // however, is not, so adjust here. 1918 int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI); 1919 1920 // Look forward to try to find a post-index instruction. For example, 1921 // ldr x1, [x0, #64] 1922 // add x0, x0, #64 1923 // merged into: 1924 // ldr x1, [x0, #64]! 1925 Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit); 1926 if (Update != E) { 1927 // Merge the update into the ld/st. 1928 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); 1929 Modified = true; 1930 ++NumPreFolded; 1931 break; 1932 } 1933 1934 // Nothing found. Just move to the next instruction. 1935 ++MBBI; 1936 break; 1937 } 1938 } 1939 } 1940 1941 return Modified; 1942 } 1943 1944 bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) { 1945 bool ProfitableArch = Subtarget->isCortexA57() || Subtarget->isKryo(); 1946 // FIXME: The benefit from converting narrow loads into a wider load could be 1947 // microarchitectural as it assumes that a single load with two bitfield 1948 // extracts is cheaper than two narrow loads. Currently, this conversion is 1949 // enabled only in cortex-a57 on which performance benefits were verified. 1950 return ProfitableArch && !Subtarget->requiresStrictAlign(); 1951 } 1952 1953 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { 1954 Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); 1955 TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); 1956 TRI = Subtarget->getRegisterInfo(); 1957 1958 // Resize the modified and used register bitfield trackers. We do this once 1959 // per function and then clear the bitfield each time we optimize a load or 1960 // store. 1961 ModifiedRegs.resize(TRI->getNumRegs()); 1962 UsedRegs.resize(TRI->getNumRegs()); 1963 1964 bool Modified = false; 1965 bool enableNarrowLdOpt = enableNarrowLdMerge(Fn); 1966 for (auto &MBB : Fn) 1967 Modified |= optimizeBlock(MBB, enableNarrowLdOpt); 1968 1969 return Modified; 1970 } 1971 1972 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep 1973 // loads and stores near one another? 1974 1975 // FIXME: When pairing store instructions it's very possible for this pass to 1976 // hoist a store with a KILL marker above another use (without a KILL marker). 1977 // The resulting IR is invalid, but nothing uses the KILL markers after this 1978 // pass, so it's never caused a problem in practice. 1979 1980 /// createAArch64LoadStoreOptimizationPass - returns an instance of the 1981 /// load / store optimization pass. 1982 FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() { 1983 return new AArch64LoadStoreOpt(); 1984 } 1985