//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly to
//   run before scheduling. It currently misses stores of constants because
//   loading the constant into the data register is placed between the stores,
//   although this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and then moves on to the next pair.
//   It would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together to fit within the 8-bit range, we
//   can add to the base pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
  enum InstClassEnum {
    DS_READ_WRITE,
    S_BUFFER_LOAD_IMM,
    BUFFER_LOAD_OFFEN,
    BUFFER_LOAD_OFFSET,
    BUFFER_STORE_OFFEN,
    BUFFER_STORE_OFFSET,
  };

  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned BaseOff;
    InstClassEnum InstClass;
    bool GLC0;
    bool GLC1;
    bool SLC0;
    bool SLC1;
    bool UseST64;
    bool IsX2;
    SmallVector<MachineInstr*, 8> InstsToMove;
  };

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  // Number of x2 merges created in the current block; used to decide whether
  // to re-run and form x4 merges.
  unsigned CreatedX2;

  static bool offsetsCanBeCombined(CombineInfo &CI);

  bool findMatchingInst(CombineInfo &CI);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
                                    bool &IsOffen) const;
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.
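
// Illustrative note (an editorial sketch, not part of the upstream pass): a
// worked example of the DS offset encoding that offsetsCanBeCombined() below
// relies on, using the values from the ds_read2 example in the file header.
//   ds_read_b32 pair with byte offsets 16 and 32, EltSize = 4:
//     EltOffset0 = 16 / 4 = 4, EltOffset1 = 32 / 4 = 8
//     -> both fit in 8 bits, so the pair merges into
//        ds_read2_b32 ... offset0:4 offset1:8
//   ds_read_b32 pair with byte offsets 0 and 1024, EltSize = 4:
//     EltOffset0 = 0, EltOffset1 = 256; both are multiples of 64 and
//     256 / 64 = 4 fits in 8 bits, so the stride-64 form ds_read2st64_b32 is
//     used with offset0:0 offset1:4.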

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<unsigned> &RegDefs,
                              DenseSet<unsigned> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() &&
               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) ||
         TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
                      DenseSet<unsigned> &RegDefs,
                      DenseSet<unsigned> &PhysRegUses,
                      SmallVectorImpl<MachineInstr*> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                        ArrayRef<MachineInstr*> InstsToMove,
                        const SIInstrInfo *TII,
                        AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen
  // or be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if (CI.InstClass != DS_READ_WRITE) {
    unsigned Diff = CI.IsX2 ? 2 : 1;
    return (EltOffset0 + Diff == EltOffset1 ||
            EltOffset1 + Diff == EltOffset0) &&
           CI.GLC0 == CI.GLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  unsigned AddrOpName[3] = {0};
  int AddrIdx[3];
  const MachineOperand *AddrReg[3];
  unsigned NumAddresses = 0;

  switch (CI.InstClass) {
  case DS_READ_WRITE:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
    break;
  case S_BUFFER_LOAD_IMM:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
    break;
  case BUFFER_LOAD_OFFEN:
  case BUFFER_STORE_OFFEN:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  case BUFFER_LOAD_OFFSET:
  case BUFFER_STORE_OFFSET:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);

    // We only ever merge operations with the same base address register, so
    // don't bother scanning forward if there are no other uses.
    if (AddrReg[i]->isReg() &&
        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      return false;
  }

  ++MBBI;

  DenseSet<unsigned> RegDefsToMove;
  DenseSet<unsigned> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  for ( ; MBBI != E; ++MBBI) {
    if (MBBI->getOpcode() != CI.I->getOpcode()) {
      // This is not a matching instruction, but we can keep looking as long
      // as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching
        // instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another instruction, we will be moving I down to
      // the location of the matched instruction, so any uses of I will need
      // to be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              CI.InstsToMove))
      continue;

    bool Match = true;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
          Match = false;
          break;
        }
        continue;
      }

      // Check same base pointer. Be careful of subregisters, which can occur
      // with vectors of pointers.
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
        Match = false;
        break;
      }
    }

    if (Match) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                                 AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Paired = MBBI;

      if (CI.InstClass == DS_READ_WRITE) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
      }

      // Check that both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all of I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
      break;
  }
  return false;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ?
    AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
    CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = CI.UseST64 ?
    read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = (CI.EltSize == 4) ?
    &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
      .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
      .addReg(ImmReg)
      .addReg(AddrReg->getReg(), 0, BaseSubReg);
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 = BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
    .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
    .addImm(NewOffset0)                        // offset0
    .addImm(NewOffset1)                        // offset1
    .addImm(0)                                 // gds
    .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ?
    AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add() and not .addReg() with these. We want to be sure we
  // preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = CI.UseST64 ?
    write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
      .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
      .addReg(ImmReg)
      .addReg(AddrReg->getReg(), 0, BaseSubReg);
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 = BuildMI(*MBB, CI.Paired, DL, Write2Desc)
    .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
    .add(*Data0)                               // data0
    .add(*Data1)                               // data1
    .addImm(NewOffset0)                        // offset0
    .addImm(NewOffset1)                        // offset1
    .addImm(0)                                 // gds
    .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
    CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
                              AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;

  const TargetRegisterClass *SuperRC =
      CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
    .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
    .addImm(MergedOffset) // offset
    .addImm(CI.GLC0)      // glc
    .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets.
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode;

  if (CI.InstClass == BUFFER_LOAD_OFFEN) {
    Opcode = CI.IsX2 ?
      AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
      AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  } else {
    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  }

  const TargetRegisterClass *SuperRC =
      CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  if (CI.InstClass == BUFFER_LOAD_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
    .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
    .addImm(MergedOffset) // offset
    .addImm(CI.GLC0)      // glc
    .addImm(CI.SLC0)      // slc
    .addImm(0)            // tfe
    .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets.
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
    .add(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
    const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
  IsX2 = false;
  IsOffen = false;

  switch (I.getOpcode()) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    IsX2 = true;
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
    IsX2 = true;
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
    IsX2 = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
    IsX2 = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
  }
  return 0;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  bool Unused1, Unused2;
  unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ?
      AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets.
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC =
      CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
    .add(*Src0)
    .addImm(SubRegIdx0)
    .add(*Src1)
    .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
    .addReg(SrcReg, RegState::Kill);

  if (CI.InstClass == BUFFER_STORE_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
    .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
    .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
    .addImm(CI.GLC0)                          // glc
    .addImm(CI.SLC0)                          // slc
    .addImm(0)                                // tfe
    .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(MIB, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    CombineInfo CI;
    CI.I = I;
    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
        Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {

      CI.InstClass = DS_READ_WRITE;
      CI.EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;

      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }

      continue;
    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
               Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
               Opc == AMDGPU::DS_WRITE_B64_gfx9) {
      CI.InstClass = DS_READ_WRITE;
      CI.EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;

      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }

      continue;
    }
    if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
        Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
      // EltSize is in units of the offset encoding.
      CI.InstClass = S_BUFFER_LOAD_IMM;
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }
    if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
      if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
          Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
        CI.InstClass = BUFFER_LOAD_OFFEN;
      else
        CI.InstClass = BUFFER_LOAD_OFFSET;

      CI.EltSize = 4;
      CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
                Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }

    bool StoreIsX2, IsOffen;
    if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
      CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
      CI.EltSize = 4;
      CI.IsX2 = StoreIsX2;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF) {
    CreatedX2 = 0;
    Modified |= optimizeBlock(MBB);

    // Run again to convert x2 to x4.
    if (CreatedX2 >= 1)
      Modified |= optimizeBlock(MBB);
  }

  return Modified;
}