//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close by immediate offsets.
// This will fuse operations such as
//   ds_read_b32 v0, v2 offset:16
//   ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//   s_buffer_load_dword s4, s[0:3], 4
//   s_buffer_load_dword s5, s[0:3], 8
// ==>
//   s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//   s_movk_i32 s0, 0x1800
//   v_add_co_u32_e32 v0, vcc, s0, v2
//   v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//   s_movk_i32 s0, 0x1000
//   v_add_co_u32_e32 v5, vcc, s0, v2
//   v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//   global_load_dwordx2 v[5:6], v[5:6], off
//   global_load_dwordx2 v[0:1], v[0:1], off
// =>
//   s_movk_i32 s0, 0x1000
//   v_add_co_u32_e32 v5, vcc, s0, v2
//   v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//   global_load_dwordx2 v[5:6], v[5:6], off
//   global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads have offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in the 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//
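// A quick sanity check on the ds_read2 example above: offset0/offset1 are
// expressed in units of the element size, so byte offsets 16 and 32 with
// 4-byte (b32) elements encode as offset0:4 and offset1:8.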

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
};

enum RegisterEnum {
  SBASE = 0x1,
  SRSRC = 0x2,
  SOFFSET = 0x4,
  VADDR = 0x8,
  ADDR = 0x10,
};

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned Width0;
    unsigned Width1;
    unsigned BaseOff;
    InstClassEnum InstClass;
    bool GLC0;
    bool GLC1;
    bool SLC0;
    bool SLC1;
    bool DLC0;
    bool DLC1;
    bool UseST64;
    SmallVector<MachineInstr *, 8> InstsToMove;
  };

  struct BaseRegisters {
    unsigned LoReg = 0;
    unsigned HiReg = 0;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  static bool offsetsCanBeCombined(CombineInfo &CI);
  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
  static unsigned getNewOpcode(const CombineInfo &CI);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
  unsigned getOpcodeWidth(const MachineInstr &MI) const;
  InstClassEnum getInstClass(unsigned Opc) const;
  unsigned getRegs(unsigned Opc) const;

  bool findMatchingInst(CombineInfo &CI);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
                           int32_t NewOffset) const;
  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base,
                                  MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<unsigned> &RegDefs,
                              DenseSet<unsigned> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
                                  DenseSet<unsigned> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

// This function assumes that \p A and \p B are identical except for
// size and offset, and they reference adjacent memory.
static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
                                                   const MachineMemOperand *A,
                                                   const MachineMemOperand *B) {
  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
  unsigned Size = A->getSize() + B->getSize();
  // This function adds the offset parameter to the existing offset for A,
  // so we pass 0 here as the offset and then manually set it to the correct
  // value after the call.
  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
  MMO->setOffset(MinOffset);
  return MMO;
}

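// offsetsCanBeCombined() below decides whether the two recorded offsets can be
// encoded in one merged instruction, updating CI.Offset0/1 (and possibly
// CI.BaseOff and CI.UseST64) in place. As a purely illustrative example (the
// numbers are made up, not taken from a real kernel): two ds_read_b32 with
// byte offsets 1028 and 1032 give element offsets 257 and 258, which do not
// fit the 8-bit offset fields, so the code picks BaseOff = 1028 and encodes
// offsets 0 and 1 relative to that shifted base.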
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width0 == EltOffset1 ||
            EltOffset1 + CI.Width1 == EltOffset0) &&
           CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI) {
  const unsigned Width = (CI.Width0 + CI.Width1);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
      return true;
    }
  }
}

unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) const {
  const unsigned Opc = MI.getOpcode();

  if (TII->isMUBUF(MI)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return 4;
  }
}

InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) const {
  if (TII->isMUBUF(Opc)) {
    const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);

    // If we couldn't identify the opcode, bail out.
    if (baseOpcode == -1) {
      return UNKNOWN;
    }

    switch (baseOpcode) {
    default:
      return UNKNOWN;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      return BUFFER_LOAD_OFFEN;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      return BUFFER_LOAD_OFFSET;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      return BUFFER_STORE_OFFEN;
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      return BUFFER_STORE_OFFSET;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      return BUFFER_LOAD_OFFEN_exact;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      return BUFFER_LOAD_OFFSET_exact;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      return BUFFER_STORE_OFFEN_exact;
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      return BUFFER_STORE_OFFSET_exact;
    }
  }

  switch (Opc) {
  default:
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  }
}

unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) const {
  if (TII->isMUBUF(Opc)) {
    unsigned result = 0;

    if (AMDGPU::getMUBUFHasVAddr(Opc)) {
      result |= VADDR;
    }

    if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
      result |= SRSRC;
    }

    if (AMDGPU::getMUBUFHasSoffset(Opc)) {
      result |= SOFFSET;
    }

    return result;
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return SBASE;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return ADDR;
  }
}

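// findMatchingInst() below scans forward from CI.I for another instruction of
// the same class (and, for DS, the exact same opcode) whose address operands
// match, recording the pair in CI on success. Along the way, instructions that
// would otherwise block the merge are collected in CI.InstsToMove so that they
// can later be re-placed below the merged instruction.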
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  const unsigned Opc = CI.I->getOpcode();
  const InstClassEnum InstClass = getInstClass(Opc);

  if (InstClass == UNKNOWN) {
    return false;
  }

  const unsigned Regs = getRegs(Opc);

  unsigned AddrOpName[5] = {0};
  int AddrIdx[5];
  const MachineOperand *AddrReg[5];
  unsigned NumAddresses = 0;

  if (Regs & ADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
  }

  if (Regs & SBASE) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
  }

  if (Regs & SRSRC) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
  }

  if (Regs & SOFFSET) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
  }

  if (Regs & VADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);

    // We only ever merge operations with the same base address register, so
    // don't bother scanning forward if there are no other uses.
    if (AddrReg[i]->isReg() &&
        (Register::isPhysicalRegister(AddrReg[i]->getReg()) ||
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      return false;
  }

  ++MBBI;

  DenseSet<unsigned> RegDefsToMove;
  DenseSet<unsigned> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  for (; MBBI != E; ++MBBI) {
    const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);

    if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
        (IsDS && (MBBI->getOpcode() != Opc))) {
      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction, so any uses of I will need
      // to be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              CI.InstsToMove))
      continue;

    bool Match = true;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
          Match = false;
          break;
        }
        continue;
      }

      // Check same base pointer. Be careful of subregisters, which can occur
      // with vectors of pointers.
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
        Match = false;
        break;
      }
    }

    if (Match) {
      int OffsetIdx =
          AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Width0 = getOpcodeWidth(*CI.I);
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Width1 = getOpcodeWidth(*MBBI);
      CI.Paired = MBBI;

      if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
        CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm();
        CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm();
      }

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
      break;
  }
  return false;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC =
      (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add() and not .addReg() with these operands, so that the
  // subregister index and any register flags set on them are preserved.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, CI.Paired, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.DLC0)      // dlc
      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  const unsigned Regs = getRegs(Opcode);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.SLC0)      // slc
      .addImm(0)            // tfe
      .addImm(CI.DLC0)      // dlc
      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
  const unsigned Width = CI.Width0 + CI.Width1;

  switch (CI.InstClass) {
  default:
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    }
  }
}

std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
  if (CI.Offset0 > CI.Offset1) {
    switch (CI.Width0) {
    default:
      return std::make_pair(0, 0);
    case 1:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
      case 2:
        return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
      case 3:
        return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
      }
    case 2:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
      case 2:
        return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
      }
    case 3:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
      }
    }
  } else {
    switch (CI.Width0) {
    default:
      return std::make_pair(0, 0);
    case 1:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
      case 2:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
      case 3:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
      }
    case 2:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
      case 2:
        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
      }
    case 3:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
      }
    }
  }
}

const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
    switch (CI.Width0 + CI.Width1) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 4:
      return &AMDGPU::SReg_128RegClass;
    case 8:
      return &AMDGPU::SReg_256RegClass;
    case 16:
      return &AMDGPU::SReg_512RegClass;
    }
  } else {
    switch (CI.Width0 + CI.Width1) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::VReg_64RegClass;
    case 3:
      return &AMDGPU::VReg_96RegClass;
    case 4:
      return &AMDGPU::VReg_128RegClass;
    }
  }
}

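// For stores the merge goes the other way around: mergeBufferStorePair() below
// first builds a REG_SEQUENCE that packs the two vdata operands into one wide
// register (using the subregister indices from getSubRegIdxs), and then emits
// a single wider buffer store of that register at the smaller offset.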
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  const unsigned Regs = getRegs(Opcode);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
      .addImm(CI.GLC0)                          // glc
      .addImm(CI.SLC0)                          // slc
      .addImm(0)                                // tfe
      .addImm(CI.DLC0)                          // dlc
      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  moveInstsAfter(MIB, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

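// Roughly, the base recomputed by computeBase() below has this shape in MIR
// (register names here are illustrative placeholders, not names the pass
// actually creates):
//   %lo, %carry = V_ADD_I32_e64 %Addr.Base.LoReg, OffsetLo, 0
//   %hi, %dead  = V_ADDC_U32_e64 %Addr.Base.HiReg, OffsetHi, %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1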
// Compute base address using Addr and return the final register.
unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               unsigned NewBase,
                                               int32_t NewOffset) const {
  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
//  - 32-bit base registers and subregisters
//  - 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

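// promoteConstantOffsetToImm() below implements the offset promotion described
// in the file header. Whether a rewritten offset is acceptable is delegated to
// SITargetLowering::isLegalGlobalAddressingMode(), which, for the global
// instructions this pass currently handles, corresponds to the 13-bit
// immediate mentioned in the header comment.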
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base-registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor (that has the
  // same base-registers) with the highest 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic considers &a + 8192
  // as the new base (anchor) because of the maximum distance, which can
  // presumably accommodate more intermediate bases.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store addresses
    // or any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    unsigned Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  // Maps each visited instruction to the base registers and constant offset
  // extracted from its address.
  MemInfoMap Visited;
  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM.
  SmallPtrSet<MachineInstr *, 4> AnchorList;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    const unsigned Opc = MI.getOpcode();

    CombineInfo CI;
    CI.I = I;
    CI.InstClass = getInstClass(Opc);

    switch (CI.InstClass) {
    default:
      break;
    case DS_READ:
      CI.EltSize =
          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                          : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }
      continue;
    case DS_WRITE:
      CI.EltSize =
          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                            : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }
      continue;
    case S_BUFFER_LOAD_IMM:
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
      } else {
        ++I;
      }
      continue;
    case BUFFER_LOAD_OFFEN:
    case BUFFER_LOAD_OFFSET:
    case BUFFER_LOAD_OFFEN_exact:
    case BUFFER_LOAD_OFFSET_exact:
      CI.EltSize = 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
      } else {
        ++I;
      }
      continue;
    case BUFFER_STORE_OFFEN:
    case BUFFER_STORE_OFFSET:
    case BUFFER_STORE_OFFEN_exact:
    case BUFFER_STORE_OFFSET_exact:
      CI.EltSize = 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
      } else {
        ++I;
      }
      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF) {
    do {
      OptimizeAgain = false;
      Modified |= optimizeBlock(MBB);
    } while (OptimizeAgain);
  }

  return Modified;
}
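
// Note: for local debugging, the pass can typically be run in isolation on MIR
// input with something like
//   llc -march=amdgcn -mcpu=gfx900 -run-pass=si-load-store-opt in.mir -o -
// ("si-load-store-opt" being the name registered via DEBUG_TYPE above).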