//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads have offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in the 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

// Class of memory instruction being considered for merging. The MUBUF
// enumerators deliberately alias the corresponding *_DWORD opcode values so
// the class itself can be handed to AMDGPU::getMUBUFOpcode() when selecting
// the widened opcode (see getNewOpcode).
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
};

// Bitmask of the named address operands an instruction class may carry
// (see getRegs / findMatchingInst).
enum RegisterEnum {
  SBASE = 0x1,
  SRSRC = 0x2,
  SOFFSET = 0x4,
  VADDR = 0x8,
  ADDR = 0x10,
};

class SILoadStoreOptimizer : public MachineFunctionPass {
  // Everything known about a candidate pair of mergeable instructions,
  // filled in by findMatchingInst and consumed by the merge* methods.
  struct CombineInfo {
    MachineBasicBlock::iterator I;      // First instruction of the pair.
    MachineBasicBlock::iterator Paired; // Matching second instruction.
    unsigned EltSize;   // DS: element size in bytes; SMEM/VMEM: offset unit.
    unsigned Offset0;   // Offset of I (may be rescaled by
                        // offsetsCanBeCombined for the DS case).
    unsigned Offset1;   // Offset of Paired (may be rescaled likewise).
    unsigned Width0;    // Width of I in dwords.
    unsigned Width1;    // Width of Paired in dwords.
    unsigned BaseOff;   // Nonzero when a new base must be materialized so the
                        // remaining offsets fit the 8-bit DS offset fields.
    InstClassEnum InstClass;
    bool GLC0;          // glc bit of I.
    bool GLC1;          // glc bit of Paired.
    bool SLC0;          // slc bit of I.
    bool SLC1;          // slc bit of Paired.
    bool UseST64;       // Use the stride-64 DS read2/write2 variants.
    // Instructions that must be re-inserted after the merged instruction to
    // keep def-before-use ordering valid (see moveInstsAfter).
    SmallVector<MachineInstr *, 8> InstsToMove;
  };

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  // Set when a merge leaves room for further widening; runOnMachineFunction
  // presumably re-runs optimizeBlock while this is set — confirm in the tail
  // of this file (truncated in this view).
  bool OptimizeAgain;

  static bool offsetsCanBeCombined(CombineInfo &CI);
  static bool widthsFit(const CombineInfo &CI);
  static unsigned getNewOpcode(const CombineInfo &CI);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
  unsigned getOpcodeWidth(const MachineInstr &MI);
  InstClassEnum getInstClass(unsigned Opc);
  unsigned getRegs(unsigned Opc);

  bool findMatchingInst(CombineInfo &CI);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

// Re-insert every instruction in InstsToMove immediately after I, preserving
// their relative order.
static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

// Record MI's register defs in RegDefs and its physical-register reads in
// PhysRegUses, so later instructions can be tested for dependence on MI
// (see addToListsIfDependent).
static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<unsigned> &RegDefs,
                              DenseSet<unsigned> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      // Only physical-register reads are tracked; virtual registers are in
      // SSA form, so a later def cannot clobber an earlier virtual use.
      else if (Op.readsReg() &&
               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

// Return true if the two memory accesses can legally trade places.
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) ||
         TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
                                  DenseSet<unsigned> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

// Return true if every load/store in InstsToMove can be safely reordered past
// MemOp. Non-memory instructions in the list are ignored.
static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    const SIInstrInfo *TII, AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}

// Decide whether CI's two offsets can be encoded in one merged instruction.
// For the DS case this rewrites CI.Offset0/Offset1 into the instruction's
// element-offset form, and may set CI.UseST64 (stride-64 variants) or
// CI.BaseOff (materialize a new base) to make them fit in 8 bits.
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    // The two accesses must be exactly adjacent (either order), and their
    // cache-policy bits must agree for the merged encoding to be equivalent.
    // S_BUFFER_LOAD_IMM has no slc operand, so slc is only compared for VMEM.
    return (EltOffset0 + CI.Width0 == EltOffset1 ||
            EltOffset1 + CI.Width1 == EltOffset0) &&
           CI.GLC0 == CI.GLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  // Note: BaseOff is the smaller *byte* offset, while EltOffset0/1 are in
  // elements, hence the BaseOff / CI.EltSize rescaling below.
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

// Check that the combined width is one the merged opcode can actually encode.
bool SILoadStoreOptimizer::widthsFit(const CombineInfo &CI) {
  const unsigned Width = (CI.Width0 + CI.Width1);
  switch (CI.InstClass) {
  default:
    return Width <= 4;
  case S_BUFFER_LOAD_IMM:
    // Only the x2 and x4 SMEM forms are produced (see getNewOpcode).
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
      return true;
    }
  }
}

// Width of MI's data operand in dwords, or 0 if the opcode is not handled.
unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
  const unsigned Opc = MI.getOpcode();

  if (TII->isMUBUF(MI)) {
    return AMDGPU::getMUBUFDwords(Opc);
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return 4;
  }
}

// Classify Opc into one of the mergeable instruction classes, or UNKNOWN.
InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
  if (TII->isMUBUF(Opc)) {
    const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);

    // If we couldn't identify the opcode, bail out.
    if (baseOpcode == -1) {
      return UNKNOWN;
    }

    switch (baseOpcode) {
    default:
      return UNKNOWN;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      return BUFFER_LOAD_OFFEN;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      return BUFFER_LOAD_OFFSET;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      return BUFFER_STORE_OFFEN;
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      return BUFFER_STORE_OFFSET;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      return BUFFER_LOAD_OFFEN_exact;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      return BUFFER_LOAD_OFFSET_exact;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      return BUFFER_STORE_OFFEN_exact;
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      return BUFFER_STORE_OFFSET_exact;
    }
  }

  switch (Opc) {
  default:
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  }
}

// RegisterEnum bitmask of the named address operands Opc carries.
unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
  if (TII->isMUBUF(Opc)) {
    unsigned result = 0;

    if (AMDGPU::getMUBUFHasVAddr(Opc)) {
      result |= VADDR;
    }

    if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
      result |= SRSRC;
    }

    if (AMDGPU::getMUBUFHasSoffset(Opc)) {
      result |= SOFFSET;
    }

    return result;
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return SBASE;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return ADDR;
  }
}

// Scan forward from CI.I for an instruction of the same class with the same
// base address that can be merged with it. On success, fills in the rest of
// CI (Paired, offsets, widths, cache bits, InstsToMove) and returns true.
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  const unsigned Opc = CI.I->getOpcode();
  const InstClassEnum InstClass = getInstClass(Opc);

  if (InstClass == UNKNOWN) {
    return false;
  }

  const unsigned Regs = getRegs(Opc);

  unsigned AddrOpName[5] = {0};
  int AddrIdx[5];
  const MachineOperand *AddrReg[5];
  unsigned NumAddresses = 0;

  if (Regs & ADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
  }

  if (Regs & SBASE) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
  }

  if (Regs & SRSRC) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
  }

  if (Regs & SOFFSET) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
  }

  if (Regs & VADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);

    // We only ever merge operations with the same base address register, so
    // don't bother scanning forward if there are no other uses.
    if (AddrReg[i]->isReg() &&
        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      return false;
  }

  ++MBBI;

  DenseSet<unsigned> RegDefsToMove;
  DenseSet<unsigned> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  for (; MBBI != E; ++MBBI) {
    // DS pairs must use the identical opcode; the other classes only need to
    // agree on the instruction class (widths may differ).
    const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);

    if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
        (IsDS && (MBBI->getOpcode() != Opc))) {
      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions are met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2.  Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction any uses of I will need to
      // be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              CI.InstsToMove))
      continue;

    bool Match = true;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
          Match = false;
          break;
        }
        continue;
      }

      // Check same base pointer. Be careful of subregisters, which can occur
      // with vectors of pointers.
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
        Match = false;
        break;
      }
    }

    if (Match) {
      int OffsetIdx =
          AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Width0 = getOpcodeWidth(*CI.I);
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Width1 = getOpcodeWidth(*MBBI);
      CI.Paired = MBBI;

      if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
      }

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (widthsFit(CI) && offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instruction in InstsToMove
    // down past this instruction.
    // check if we can move I across MBBI and if we can move all I's users
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
      break;
  }
  return false;
}

// Pre-gfx9 targets need M0 initialized for LDS access and use the original
// DS_READ2 opcodes; gfx9 uses the _gfx9 variants.
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

// Stride-64 variant selection, same M0-init split as read2Opcode.
unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

// Replace the DS_READ pair in CI with a single DS_READ2/DS_READ2ST64 into a
// fresh super-register, then copy each half to the original destinations.
// Returns the iterator from which scanning should resume.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC =
      (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    // Materialize BaseOff and add it to the pointer so the remaining offsets
    // fit the instruction's 8-bit offset fields.
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg);
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  (void)Read2; // Only used by the LLVM_DEBUG dump below.

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

// DS_WRITE2 opcode selection; same M0-init split as read2Opcode.
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

// Replace the DS_WRITE pair in CI with a single DS_WRITE2/DS_WRITE2ST64.
// Returns the iterator from which scanning should resume.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    // Materialize BaseOff and add it to the pointer so the remaining offsets
    // fit the instruction's 8-bit offset fields.
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg);
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, CI.Paired, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

// Replace the S_BUFFER_LOAD_*_IMM pair in CI with one wider load into a fresh
// super-register, then copy each half back to the original destinations.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  // The merged load starts at the lower of the two offsets.
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

// Replace the MUBUF load pair in CI with one wider load, mirroring
// mergeSBufferLoadImmPair but with the MUBUF operand set (vaddr/srsrc/...).
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  // Copy to the new source register.
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  const unsigned Regs = getRegs(Opcode);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.SLC0)      // slc
      .addImm(0)            // tfe
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

// Pick the widened opcode for the merged pair.
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
  const unsigned Width = CI.Width0 + CI.Width1;

  switch (CI.InstClass) {
  default:
    // The MUBUF class enumerators alias the corresponding DWORD opcodes, so
    // the class value itself selects the widened MUBUF opcode.
    return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    }
  }
}

// Sub-register indices for (I, Paired) within the merged super-register,
// keyed on which instruction covers the lower offset and on the two widths.
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
  if (CI.Offset0 > CI.Offset1) {
    // Paired occupies the low part of the super-register.
    switch (CI.Width0) {
    default:
      return std::make_pair(0, 0);
    case 1:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
      case 2:
        return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
      case 3:
        return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
      }
    case 2:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
      case 2:
        return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
      }
    case 3:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
      }
    }
  } else {
    // I occupies the low part of the super-register.
    switch (CI.Width0) {
    default:
      return std::make_pair(0, 0);
    case 1:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return
std::make_pair(AMDGPU::sub0, AMDGPU::sub1); 954 case 2: 955 return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2); 956 case 3: 957 return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3); 958 } 959 case 2: 960 switch (CI.Width1) { 961 default: 962 return std::make_pair(0, 0); 963 case 1: 964 return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2); 965 case 2: 966 return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3); 967 } 968 case 3: 969 switch (CI.Width1) { 970 default: 971 return std::make_pair(0, 0); 972 case 1: 973 return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3); 974 } 975 } 976 } 977 } 978 979 const TargetRegisterClass * 980 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) { 981 if (CI.InstClass == S_BUFFER_LOAD_IMM) { 982 switch (CI.Width0 + CI.Width1) { 983 default: 984 return nullptr; 985 case 2: 986 return &AMDGPU::SReg_64_XEXECRegClass; 987 case 4: 988 return &AMDGPU::SReg_128RegClass; 989 case 8: 990 return &AMDGPU::SReg_256RegClass; 991 case 16: 992 return &AMDGPU::SReg_512RegClass; 993 } 994 } else { 995 switch (CI.Width0 + CI.Width1) { 996 default: 997 return nullptr; 998 case 2: 999 return &AMDGPU::VReg_64RegClass; 1000 case 3: 1001 return &AMDGPU::VReg_96RegClass; 1002 case 4: 1003 return &AMDGPU::VReg_128RegClass; 1004 } 1005 } 1006 } 1007 1008 MachineBasicBlock::iterator 1009 SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { 1010 MachineBasicBlock *MBB = CI.I->getParent(); 1011 DebugLoc DL = CI.I->getDebugLoc(); 1012 1013 const unsigned Opcode = getNewOpcode(CI); 1014 1015 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); 1016 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1017 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1018 1019 // Copy to the new source register. 
// (body of mergeBufferStorePair continues)
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  // Pack the two original data operands into the wide source register,
  // each at the sub-register position computed by getSubRegIdxs.
  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  // Operand set the merged opcode expects (see RegisterEnum).
  const unsigned Regs = getRegs(Opcode);

  // Only the variants that take a vaddr operand get one.
  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      // The merged store starts at the lower of the two offsets.
      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
      .addImm(CI.GLC0)                          // glc
      .addImm(CI.SLC0)                          // slc
      .addImm(0)                                // tfe
      // Memory operands of both originals are carried over.
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  // Re-insert the instructions that were moved aside to make the pair
  // adjacent, then delete the originals.
  moveInstsAfter(MIB, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    // (body of optimizeBlock continues) Skip volatile/ordered accesses —
    // they must not be combined or reordered.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    const unsigned Opc = MI.getOpcode();

    CombineInfo CI;
    CI.I = I;
    CI.InstClass = getInstClass(Opc);

    switch (CI.InstClass) {
    default:
      break;
    case DS_READ:
      // LDS element size: 8 bytes for the B64 forms, 4 otherwise.
      CI.EltSize =
          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                          : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }
      continue;
    case DS_WRITE:
      CI.EltSize =
          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                            : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }
      continue;
    case S_BUFFER_LOAD_IMM:
      // EltSize is in units of the offset encoding: the encoded form of a
      // 4-byte element, which depends on how the subtarget encodes SMRD
      // offsets.
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
        // A merged result narrower than the 16-dword maximum may itself be
        // mergeable on a later pass.
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
      } else {
        ++I;
      }
      continue;
    case BUFFER_LOAD_OFFEN:
    case BUFFER_LOAD_OFFSET:
    case BUFFER_LOAD_OFFEN_exact:
    case BUFFER_LOAD_OFFSET_exact:
      CI.EltSize = 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
        // MUBUF merges top out at 4 dwords.
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
      } else {
        ++I;
      }
      continue;
    case BUFFER_STORE_OFFEN:
    case BUFFER_STORE_OFFSET:
    case BUFFER_STORE_OFFEN_exact:
    case BUFFER_STORE_OFFSET_exact:
      CI.EltSize = 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
      } else {
        ++I;
      }
      continue;
    }

    ++I;
  }

  return Modified;
}

// Pass entry point: caches subtarget/instruction/register info, then runs
// optimizeBlock over every basic block, repeating each block until no further
// merge opportunities remain (OptimizeAgain fixpoint). Returns true if any
// instruction was changed.
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  // Bail out when the subtarget disables this optimization.
  if
  // (condition of the bail-out check started on the previous line)
  (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  // The pass relies on virtual-register SSA form (e.g. for the merge
  // rewrites above).
  assert(MRI->isSSA() && "Must be run on SSA");

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF) {
    // Iterate to a fixpoint: a merge can itself enable further merges
    // (optimizeBlock sets OptimizeAgain when that is possible).
    do {
      OptimizeAgain = false;
      Modified |= optimizeBlock(MBB);
    } while (OptimizeAgain);
  }

  return Modified;
}