//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
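//
// As a further illustration (not exhaustive), DS offsets that are multiples of
// 64 elements can be fused into the stride-64 forms, e.g.:
//  ds_read_b32 v0, v2 offset:256
//  ds_read_b32 v1, v2 offset:512
// ==>
//  ds_read2st64_b32 v[0:1], v2, offset0:1 offset1:2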
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
  enum InstClassEnum {
    DS_READ_WRITE,
    S_BUFFER_LOAD_IMM,
    BUFFER_LOAD_OFFEN,
    BUFFER_LOAD_OFFSET,
    BUFFER_STORE_OFFEN,
    BUFFER_STORE_OFFSET,
  };

  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned BaseOff;
    InstClassEnum InstClass;
    bool GLC0;
    bool GLC1;
    bool SLC0;
    bool SLC1;
    bool UseST64;
    bool IsX2;
    SmallVector<MachineInstr*, 8> InstsToMove;
  };

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  unsigned CreatedX2;

  static bool offsetsCanBeCombined(CombineInfo &CI);

  bool findMatchingInst(CombineInfo &CI);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
                                    bool &IsOffen) const;
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<unsigned> &RegDefs,
                              DenseSet<unsigned> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() &&
               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) ||
         TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
                      DenseSet<unsigned> &RegDefs,
                      DenseSet<unsigned> &PhysRegUses,
                      SmallVectorImpl<MachineInstr*> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                        ArrayRef<MachineInstr*> InstsToMove,
                        const SIInstrInfo *TII,
                        AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
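  // For example, a pair of b64 accesses (EltSize == 8) at byte offsets 8 and
  // 20 cannot be combined: 20 is not a multiple of 8, so there is no
  // element-offset encoding for it.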
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if (CI.InstClass != DS_READ_WRITE) {
    unsigned Diff = CI.IsX2 ? 2 : 1;
    return (EltOffset0 + Diff == EltOffset1 ||
            EltOffset1 + Diff == EltOffset0) &&
           CI.GLC0 == CI.GLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  unsigned AddrOpName[3] = {0};
  int AddrIdx[3];
  const MachineOperand *AddrReg[3];
  unsigned NumAddresses = 0;

  switch (CI.InstClass) {
  case DS_READ_WRITE:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
    break;
  case S_BUFFER_LOAD_IMM:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
    break;
  case BUFFER_LOAD_OFFEN:
  case BUFFER_STORE_OFFEN:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  case BUFFER_LOAD_OFFSET:
  case BUFFER_STORE_OFFSET:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);

    // We only ever merge operations with the same base address register, so
    // don't bother scanning forward if there are no other uses.
    if (AddrReg[i]->isReg() &&
        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      return false;
  }

  ++MBBI;

  DenseSet<unsigned> RegDefsToMove;
  DenseSet<unsigned> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  for ( ; MBBI != E; ++MBBI) {
    if (MBBI->getOpcode() != CI.I->getOpcode()) {
      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction; any uses of I will need to
      // be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              CI.InstsToMove))
      continue;

    bool Match = true;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
          Match = false;
          break;
        }
        continue;
      }

      // Check same base pointer. Be careful of subregisters, which can occur
      // with vectors of pointers.
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
        Match = false;
        break;
      }
    }

    if (Match) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                                 AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Paired = MBBI;

      if (CI.InstClass == DS_READ_WRITE) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
      }

      // Check that both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
      break;
  }
  return false;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ?
    AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = CI.UseST64 ?
    read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (CI.EltSize == 4) ?
      &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
      .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
      .addReg(ImmReg)
      .addReg(AddrReg->getReg());
  }

  MachineInstrBuilder Read2 = BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
    .addReg(BaseReg, BaseRegFlags) // addr
    .addImm(NewOffset0)            // offset0
    .addImm(NewOffset1)            // offset1
    .addImm(0)                     // gds
    .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ?
    AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ?
      AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ?
    AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add(), and not .addReg(), with these operands. We want to
  // preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = CI.UseST64 ?
    write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
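    // Swapping the data operands together with the offsets keeps each data
    // value paired with its original address.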
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
      .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
      .addReg(ImmReg)
      .addReg(AddrReg->getReg());
  }

  MachineInstrBuilder Write2 = BuildMI(*MBB, CI.Paired, DL, Write2Desc)
    .addReg(BaseReg, BaseRegFlags) // addr
    .add(*Data0)                   // data0
    .add(*Data1)                   // data1
    .addImm(NewOffset0)            // offset0
    .addImm(NewOffset1)            // offset1
    .addImm(0)                     // gds
    .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
    AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;

  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
    .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
    .addImm(MergedOffset) // offset
    .addImm(CI.GLC0)      // glc
    .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode;

  if (CI.InstClass == BUFFER_LOAD_OFFEN) {
    Opcode = CI.IsX2 ?
      AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
      AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  } else {
    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
      AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  }

  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  if (CI.InstClass == BUFFER_LOAD_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
    .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
    .addImm(MergedOffset) // offset
    .addImm(CI.GLC0)      // glc
    .addImm(CI.SLC0)      // slc
    .addImm(0)            // tfe
    .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
  const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
  IsX2 = false;
  IsOffen = false;

  switch (I.getOpcode()) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    IsX2 = true;
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
    IsX2 = true;
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
    IsX2 = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
    IsX2 = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
  }
  return 0;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  bool Unused1, Unused2;
  unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ?
    AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
    .add(*Src0)
    .addImm(SubRegIdx0)
    .add(*Src1)
    .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
    .addReg(SrcReg, RegState::Kill);

  if (CI.InstClass == BUFFER_STORE_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
    .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
    .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
    .addImm(CI.GLC0)                          // glc
    .addImm(CI.SLC0)                          // slc
    .addImm(0)                                // tfe
    .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(MIB, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    CombineInfo CI;
    CI.I = I;
    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
        Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {

      CI.InstClass = DS_READ_WRITE;
      CI.EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;

      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }

      continue;
    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
               Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
               Opc == AMDGPU::DS_WRITE_B64_gfx9) {
      CI.InstClass = DS_READ_WRITE;
      CI.EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 ||
         Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;

      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }

      continue;
    }
    if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
        Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
      // EltSize is in units of the offset encoding.
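      // (Assuming the usual getSMRDEncodedOffset behaviour, this is the
      // encoded form of a 4-byte step: 1 on subtargets that encode SMRD
      // offsets in dwords, 4 on those that encode them in bytes, so the
      // +/-Diff adjacency check in offsetsCanBeCombined works in either unit.)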
      CI.InstClass = S_BUFFER_LOAD_IMM;
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }
    if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
      if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
          Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
        CI.InstClass = BUFFER_LOAD_OFFEN;
      else
        CI.InstClass = BUFFER_LOAD_OFFSET;

      CI.EltSize = 4;
      CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
                Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }

    bool StoreIsX2, IsOffen;
    if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
      CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
      CI.EltSize = 4;
      CI.IsX2 = StoreIsX2;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF) {
    CreatedX2 = 0;
    Modified |= optimizeBlock(MBB);

    // Run again to convert x2 to x4.
    if (CreatedX2 >= 1)
      Modified |= optimizeBlock(MBB);
  }

  return Modified;
}