//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close-by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in the 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
  enum InstClassEnum {
    DS_READ_WRITE,
    S_BUFFER_LOAD_IMM,
    BUFFER_LOAD_OFFEN,
    BUFFER_LOAD_OFFSET,
    BUFFER_STORE_OFFEN,
    BUFFER_STORE_OFFSET,
  };

  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned BaseOff;
    InstClassEnum InstClass;
    bool GLC0;
    bool GLC1;
    bool SLC0;
    bool SLC1;
    bool UseST64;
    bool IsX2;
    SmallVector<MachineInstr*, 8> InstsToMove;
  };

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  unsigned CreatedX2;

  static bool offsetsCanBeCombined(CombineInfo &CI);

  bool findMatchingInst(CombineInfo &CI);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
                                    bool &IsOffen) const;
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<unsigned> &RegDefs,
                              DenseSet<unsigned> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() &&
               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) ||
         TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
                      DenseSet<unsigned> &RegDefs,
                      DenseSet<unsigned> &PhysRegUses,
                      SmallVectorImpl<MachineInstr*> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                        ArrayRef<MachineInstr*> InstsToMove,
                        const SIInstrInfo *TII,
                        AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
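  // For example, a ds_read_b64 pair (EltSize == 8) whose byte offsets are not
  // multiples of 8 cannot be expressed in element units and is rejected below.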
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if (CI.InstClass != DS_READ_WRITE) {
    unsigned Diff = CI.IsX2 ? 2 : 1;
    return (EltOffset0 + Diff == EltOffset1 ||
            EltOffset1 + Diff == EltOffset0) &&
           CI.GLC0 == CI.GLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  unsigned AddrOpName[3] = {0};
  int AddrIdx[3];
  const MachineOperand *AddrReg[3];
  unsigned NumAddresses = 0;

  switch (CI.InstClass) {
  case DS_READ_WRITE:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
    break;
  case S_BUFFER_LOAD_IMM:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
    break;
  case BUFFER_LOAD_OFFEN:
  case BUFFER_STORE_OFFEN:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  case BUFFER_LOAD_OFFSET:
  case BUFFER_STORE_OFFSET:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);

    // We only ever merge operations with the same base address register, so
    // don't bother scanning forward if there are no other uses.
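    // Give up right away if CI.I is the only non-debug user of the base
    // register, since then there is nothing to pair with; physical base
    // registers are likewise not handled.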
    if (AddrReg[i]->isReg() &&
        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      return false;
  }

  ++MBBI;

  DenseSet<unsigned> RegDefsToMove;
  DenseSet<unsigned> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  for ( ; MBBI != E; ++MBBI) {
    if (MBBI->getOpcode() != CI.I->getOpcode()) {
      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction; any uses of I will need to
      // be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              CI.InstsToMove))
      continue;

    bool Match = true;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
          Match = false;
          break;
        }
        continue;
      }

      // Check same base pointer. Be careful of subregisters, which can occur
      // with vectors of pointers.
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
        Match = false;
        break;
      }
    }

    if (Match) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                                 AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Paired = MBBI;

      if (CI.InstClass == DS_READ_WRITE) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
      }

      // Check that both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
      break;
  }
  return false;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ?
    AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = CI.UseST64 ?
    read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (CI.EltSize == 4) ?
      &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
      .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
      .addReg(ImmReg)
      .addReg(AddrReg->getReg());
  }

  MachineInstrBuilder Read2 =
    BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
      .addReg(BaseReg, BaseRegFlags) // addr
      .addImm(NewOffset0)            // offset0
      .addImm(NewOffset1)            // offset1
      .addImm(0)                     // gds
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                          .add(*Dest1)
                          .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ?
    AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add(), and not .addReg(), with these. We want to be sure
  // we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = CI.UseST64 ?
    write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
      .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
      .addReg(ImmReg)
      .addReg(AddrReg->getReg());
  }

  MachineInstrBuilder Write2 =
    BuildMI(*MBB, CI.Paired, DL, Write2Desc)
      .addReg(BaseReg, BaseRegFlags) // addr
      .add(*Data0)                   // data0
      .add(*Data1)                   // data1
      .addImm(NewOffset0)            // offset0
      .addImm(NewOffset1)            // offset1
      .addImm(0)                     // gds
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
                              AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;

  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                          .add(*Dest1)
                          .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode;

  if (CI.InstClass == BUFFER_LOAD_OFFEN) {
    Opcode = CI.IsX2 ?
      AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN : AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  } else {
    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  }

  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  if (CI.InstClass == BUFFER_LOAD_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.SLC0)      // slc
      .addImm(0)            // tfe
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                          .add(*Dest1)
                          .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
  const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
  IsX2 = false;
  IsOffen = false;

  switch (I.getOpcode()) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    IsX2 = true;
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
    IsX2 = true;
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
    IsX2 = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
    IsX2 = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
  }
  return 0;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  bool Unused1, Unused2;
  unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ?
    AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
    .add(*Src0)
    .addImm(SubRegIdx0)
    .add(*Src1)
    .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  if (CI.InstClass == BUFFER_STORE_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
      .addImm(CI.GLC0)                          // glc
      .addImm(CI.SLC0)                          // slc
      .addImm(0)                                // tfe
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  moveInstsAfter(MIB, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    CombineInfo CI;
    CI.I = I;
    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
        Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {

      CI.InstClass = DS_READ_WRITE;
      CI.EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;

      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }

      continue;
    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
               Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
               Opc == AMDGPU::DS_WRITE_B64_gfx9) {
      CI.InstClass = DS_READ_WRITE;
      CI.EltSize
        = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;

      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }

      continue;
    }
    if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
        Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
      // EltSize is in units of the offset encoding.
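      // getSMRDEncodedOffset(*STM, 4) is the encoded form of a 4-byte offset,
      // i.e. 4 on targets that encode SMRD/SMEM offsets in bytes and 1 on
      // targets that encode them in dwords.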
      CI.InstClass = S_BUFFER_LOAD_IMM;
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }
    if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
      if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
          Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
        CI.InstClass = BUFFER_LOAD_OFFEN;
      else
        CI.InstClass = BUFFER_LOAD_OFFSET;

      CI.EltSize = 4;
      CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
                Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }

    bool StoreIsX2, IsOffen;
    if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
      CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
      CI.EltSize = 4;
      CI.IsX2 = StoreIsX2;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF) {
    CreatedX2 = 0;
    Modified |= optimizeBlock(MBB);

    // Run again to convert x2 to x4.
    if (CreatedX2 >= 1)
      Modified |= optimizeBlock(MBB);
  }

  return Modified;
}