//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes that 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset field but are close enough together, we can add to the base
//   pointer and use the new, reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD,
  GLOBAL_LOAD_SADDR
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
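// For example, a MUBUF access uses up to three of these slots (vaddr, srsrc,
// soffset), a DS access uses the single addr operand, and a sampling MIMG
// access adds ssamp on top of its vaddrs and srsrc; getRegs() below fills in
// AddressRegs per opcode.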
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (AddrOp->getReg().isPhysical())
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
159 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 160 return false; 161 } 162 return true; 163 } 164 165 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); 166 }; 167 168 struct BaseRegisters { 169 Register LoReg; 170 Register HiReg; 171 172 unsigned LoSubReg = 0; 173 unsigned HiSubReg = 0; 174 }; 175 176 struct MemAddress { 177 BaseRegisters Base; 178 int64_t Offset = 0; 179 }; 180 181 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 182 183 private: 184 const GCNSubtarget *STM = nullptr; 185 const SIInstrInfo *TII = nullptr; 186 const SIRegisterInfo *TRI = nullptr; 187 MachineRegisterInfo *MRI = nullptr; 188 AliasAnalysis *AA = nullptr; 189 bool OptimizeAgain; 190 191 bool canSwapInstructions(const DenseSet<Register> &ARegDefs, 192 const DenseSet<Register> &ARegUses, 193 const MachineInstr &A, const MachineInstr &B) const; 194 static bool dmasksCanBeCombined(const CombineInfo &CI, 195 const SIInstrInfo &TII, 196 const CombineInfo &Paired); 197 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 198 CombineInfo &Paired, bool Modify = false); 199 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 200 const CombineInfo &Paired); 201 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 202 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 203 const CombineInfo &Paired); 204 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 205 const CombineInfo &Paired); 206 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 207 208 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); 209 210 unsigned read2Opcode(unsigned EltSize) const; 211 unsigned read2ST64Opcode(unsigned EltSize) const; 212 MachineBasicBlock::iterator 213 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 214 MachineBasicBlock::iterator InsertBefore); 215 216 unsigned write2Opcode(unsigned EltSize) const; 217 unsigned write2ST64Opcode(unsigned EltSize) const; 218 MachineBasicBlock::iterator 219 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 220 MachineBasicBlock::iterator InsertBefore); 221 MachineBasicBlock::iterator 222 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 223 MachineBasicBlock::iterator InsertBefore); 224 MachineBasicBlock::iterator 225 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 226 MachineBasicBlock::iterator InsertBefore); 227 MachineBasicBlock::iterator 228 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 229 MachineBasicBlock::iterator InsertBefore); 230 MachineBasicBlock::iterator 231 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 232 MachineBasicBlock::iterator InsertBefore); 233 MachineBasicBlock::iterator 234 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 235 MachineBasicBlock::iterator InsertBefore); 236 MachineBasicBlock::iterator 237 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 238 MachineBasicBlock::iterator InsertBefore); 239 MachineBasicBlock::iterator 240 mergeGlobalLoadPair(CombineInfo &CI, CombineInfo &Paired, 241 MachineBasicBlock::iterator InsertBefore); 242 243 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 244 int32_t NewOffset) const; 245 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 246 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 247 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 248 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress 
&Addr) const; 249 /// Promotes constant offset to the immediate by adjusting the base. It 250 /// tries to use a base from the nearby instructions that allows it to have 251 /// a 13bit constant offset which gets promoted to the immediate. 252 bool promoteConstantOffsetToImm(MachineInstr &CI, 253 MemInfoMap &Visited, 254 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 255 void addInstToMergeableList(const CombineInfo &CI, 256 std::list<std::list<CombineInfo> > &MergeableInsts) const; 257 258 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 259 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 260 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 261 std::list<std::list<CombineInfo>> &MergeableInsts) const; 262 263 public: 264 static char ID; 265 266 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 267 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 268 } 269 270 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 271 bool &OptimizeListAgain); 272 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 273 274 bool runOnMachineFunction(MachineFunction &MF) override; 275 276 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 277 278 void getAnalysisUsage(AnalysisUsage &AU) const override { 279 AU.setPreservesCFG(); 280 AU.addRequired<AAResultsWrapperPass>(); 281 282 MachineFunctionPass::getAnalysisUsage(AU); 283 } 284 285 MachineFunctionProperties getRequiredProperties() const override { 286 return MachineFunctionProperties() 287 .set(MachineFunctionProperties::Property::IsSSA); 288 } 289 }; 290 291 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 292 const unsigned Opc = MI.getOpcode(); 293 294 if (TII.isMUBUF(Opc)) { 295 // FIXME: Handle d16 correctly 296 return AMDGPU::getMUBUFElements(Opc); 297 } 298 if (TII.isMIMG(MI)) { 299 uint64_t DMaskImm = 300 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 301 return countPopulation(DMaskImm); 302 } 303 if (TII.isMTBUF(Opc)) { 304 return AMDGPU::getMTBUFElements(Opc); 305 } 306 307 switch (Opc) { 308 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 309 case AMDGPU::GLOBAL_LOAD_DWORD: 310 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 311 return 1; 312 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 313 case AMDGPU::GLOBAL_LOAD_DWORDX2: 314 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 315 return 2; 316 case AMDGPU::GLOBAL_LOAD_DWORDX3: 317 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 318 return 3; 319 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 320 case AMDGPU::GLOBAL_LOAD_DWORDX4: 321 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 322 return 4; 323 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 324 return 8; 325 case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH; 326 case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH; 327 case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH; 328 case AMDGPU::DS_WRITE_B32_gfx9: 329 return 1; 330 case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH; 331 case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH; 332 case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH; 333 case AMDGPU::DS_WRITE_B64_gfx9: 334 return 2; 335 default: 336 return 0; 337 } 338 } 339 340 /// Maps instruction opcode to enum InstClassEnum. 
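/// For example, any BUFFER_LOAD_DWORD_OFF{EN,SET} variant maps to BUFFER_LOAD,
/// DS_READ_B32/B64 (including the _gfx9 forms) map to DS_READ, and the SADDR
/// global loads get their own GLOBAL_LOAD_SADDR class. Opcodes this pass does
/// not know how to merge map to UNKNOWN.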
341 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 342 switch (Opc) { 343 default: 344 if (TII.isMUBUF(Opc)) { 345 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 346 default: 347 return UNKNOWN; 348 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 349 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 350 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 351 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 352 return BUFFER_LOAD; 353 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 354 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 355 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 356 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 357 return BUFFER_STORE; 358 } 359 } 360 if (TII.isMIMG(Opc)) { 361 // Ignore instructions encoded without vaddr. 362 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 && 363 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1) 364 return UNKNOWN; 365 // Ignore BVH instructions 366 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) 367 return UNKNOWN; 368 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 369 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 370 TII.isGather4(Opc)) 371 return UNKNOWN; 372 return MIMG; 373 } 374 if (TII.isMTBUF(Opc)) { 375 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 376 default: 377 return UNKNOWN; 378 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 379 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 380 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 381 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 382 return TBUFFER_LOAD; 383 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 384 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 385 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 386 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 387 return TBUFFER_STORE; 388 } 389 } 390 return UNKNOWN; 391 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 392 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 393 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 394 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 395 return S_BUFFER_LOAD_IMM; 396 case AMDGPU::DS_READ_B32: 397 case AMDGPU::DS_READ_B32_gfx9: 398 case AMDGPU::DS_READ_B64: 399 case AMDGPU::DS_READ_B64_gfx9: 400 return DS_READ; 401 case AMDGPU::DS_WRITE_B32: 402 case AMDGPU::DS_WRITE_B32_gfx9: 403 case AMDGPU::DS_WRITE_B64: 404 case AMDGPU::DS_WRITE_B64_gfx9: 405 return DS_WRITE; 406 case AMDGPU::GLOBAL_LOAD_DWORD: 407 case AMDGPU::GLOBAL_LOAD_DWORDX2: 408 case AMDGPU::GLOBAL_LOAD_DWORDX3: 409 case AMDGPU::GLOBAL_LOAD_DWORDX4: 410 return GLOBAL_LOAD; 411 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 412 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 413 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 414 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 415 return GLOBAL_LOAD_SADDR; 416 } 417 } 418 419 /// Determines instruction subclass from opcode. Only instructions 420 /// of the same subclass can be merged together. The merged instruction may have 421 /// a different subclass but must have the same class. 
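/// For example, GLOBAL_LOAD_DWORD and GLOBAL_LOAD_DWORDX2 both return
/// GLOBAL_LOAD_DWORD here, so loads of different widths within that class can
/// be paired, whereas DS opcodes return themselves and therefore only merge
/// with an identical opcode.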
422 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 423 switch (Opc) { 424 default: 425 if (TII.isMUBUF(Opc)) 426 return AMDGPU::getMUBUFBaseOpcode(Opc); 427 if (TII.isMIMG(Opc)) { 428 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 429 assert(Info); 430 return Info->BaseOpcode; 431 } 432 if (TII.isMTBUF(Opc)) 433 return AMDGPU::getMTBUFBaseOpcode(Opc); 434 return -1; 435 case AMDGPU::DS_READ_B32: 436 case AMDGPU::DS_READ_B32_gfx9: 437 case AMDGPU::DS_READ_B64: 438 case AMDGPU::DS_READ_B64_gfx9: 439 case AMDGPU::DS_WRITE_B32: 440 case AMDGPU::DS_WRITE_B32_gfx9: 441 case AMDGPU::DS_WRITE_B64: 442 case AMDGPU::DS_WRITE_B64_gfx9: 443 return Opc; 444 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 445 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 446 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 447 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 448 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 449 case AMDGPU::GLOBAL_LOAD_DWORD: 450 case AMDGPU::GLOBAL_LOAD_DWORDX2: 451 case AMDGPU::GLOBAL_LOAD_DWORDX3: 452 case AMDGPU::GLOBAL_LOAD_DWORDX4: 453 return AMDGPU::GLOBAL_LOAD_DWORD; 454 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 455 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 456 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 457 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 458 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; 459 } 460 } 461 462 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 463 AddressRegs Result; 464 465 if (TII.isMUBUF(Opc)) { 466 if (AMDGPU::getMUBUFHasVAddr(Opc)) 467 Result.VAddr = true; 468 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 469 Result.SRsrc = true; 470 if (AMDGPU::getMUBUFHasSoffset(Opc)) 471 Result.SOffset = true; 472 473 return Result; 474 } 475 476 if (TII.isMIMG(Opc)) { 477 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 478 if (VAddr0Idx >= 0) { 479 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 480 Result.NumVAddrs = SRsrcIdx - VAddr0Idx; 481 } else { 482 Result.VAddr = true; 483 } 484 Result.SRsrc = true; 485 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 486 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 487 Result.SSamp = true; 488 489 return Result; 490 } 491 if (TII.isMTBUF(Opc)) { 492 if (AMDGPU::getMTBUFHasVAddr(Opc)) 493 Result.VAddr = true; 494 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 495 Result.SRsrc = true; 496 if (AMDGPU::getMTBUFHasSoffset(Opc)) 497 Result.SOffset = true; 498 499 return Result; 500 } 501 502 switch (Opc) { 503 default: 504 return Result; 505 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 506 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 507 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 508 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 509 Result.SBase = true; 510 return Result; 511 case AMDGPU::DS_READ_B32: 512 case AMDGPU::DS_READ_B64: 513 case AMDGPU::DS_READ_B32_gfx9: 514 case AMDGPU::DS_READ_B64_gfx9: 515 case AMDGPU::DS_WRITE_B32: 516 case AMDGPU::DS_WRITE_B64: 517 case AMDGPU::DS_WRITE_B32_gfx9: 518 case AMDGPU::DS_WRITE_B64_gfx9: 519 Result.Addr = true; 520 return Result; 521 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 522 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 523 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 524 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 525 Result.SAddr = true; 526 LLVM_FALLTHROUGH; 527 case AMDGPU::GLOBAL_LOAD_DWORD: 528 case AMDGPU::GLOBAL_LOAD_DWORDX2: 529 case AMDGPU::GLOBAL_LOAD_DWORDX3: 530 case AMDGPU::GLOBAL_LOAD_DWORDX4: 531 Result.VAddr = true; 532 return Result; 533 } 534 } 535 536 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 537 const 
SILoadStoreOptimizer &LSO) { 538 I = MI; 539 unsigned Opc = MI->getOpcode(); 540 InstClass = getInstClass(Opc, *LSO.TII); 541 542 if (InstClass == UNKNOWN) 543 return; 544 545 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI)); 546 547 switch (InstClass) { 548 case DS_READ: 549 EltSize = 550 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 551 : 4; 552 break; 553 case DS_WRITE: 554 EltSize = 555 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 556 : 4; 557 break; 558 case S_BUFFER_LOAD_IMM: 559 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); 560 break; 561 default: 562 EltSize = 4; 563 break; 564 } 565 566 if (InstClass == MIMG) { 567 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 568 // Offset is not considered for MIMG instructions. 569 Offset = 0; 570 } else { 571 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 572 Offset = I->getOperand(OffsetIdx).getImm(); 573 } 574 575 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 576 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 577 578 Width = getOpcodeWidth(*I, *LSO.TII); 579 580 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 581 Offset &= 0xffff; 582 } else if (InstClass != MIMG) { 583 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); 584 } 585 586 AddressRegs Regs = getRegs(Opc, *LSO.TII); 587 588 NumAddresses = 0; 589 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 590 AddrIdx[NumAddresses++] = 591 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 592 if (Regs.Addr) 593 AddrIdx[NumAddresses++] = 594 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 595 if (Regs.SBase) 596 AddrIdx[NumAddresses++] = 597 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 598 if (Regs.SRsrc) 599 AddrIdx[NumAddresses++] = 600 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 601 if (Regs.SOffset) 602 AddrIdx[NumAddresses++] = 603 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 604 if (Regs.SAddr) 605 AddrIdx[NumAddresses++] = 606 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); 607 if (Regs.VAddr) 608 AddrIdx[NumAddresses++] = 609 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 610 if (Regs.SSamp) 611 AddrIdx[NumAddresses++] = 612 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); 613 assert(NumAddresses <= MaxAddressRegs); 614 615 for (unsigned J = 0; J < NumAddresses; J++) 616 AddrReg[J] = &I->getOperand(AddrIdx[J]); 617 } 618 619 } // end anonymous namespace. 

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// This function assumes that \p A and \p B are identical except for
// size and offset, and they reference adjacent memory.
static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
                                                   const MachineMemOperand *A,
                                                   const MachineMemOperand *B) {
  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
  unsigned Size = A->getSize() + B->getSize();
  // This function adds the offset parameter to the existing offset for A,
  // so we pass 0 here as the offset and then manually set it to the correct
  // value after the call.
  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
  MMO->setOffset(MinOffset);
  return MMO;
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
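  // The dmasks are only mergeable if every bit of the smaller mask lies
  // strictly below the lowest set bit of the larger one, e.g. dmask 0b0011 and
  // 0b1100 can combine into a single load writing four components, while
  // 0b0101 and 0b1010 interleave and are rejected even though they share no
  // bits.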
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen
  // or be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) ==
        0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.CPol == Paired.CPol;
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
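  // For example, two ds_read_b32 at byte offsets 0x4000 and 0x4100 have
  // element offsets 4096 and 4160. Neither fits in 8 bits, but both are
  // multiples of 64, and 4096/64 = 64 and 4160/64 = 65 do fit, so the pair can
  // become a ds_read2st64_b32 with offset0:64 offset1:65.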
805 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && 806 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { 807 if (Modify) { 808 CI.Offset = EltOffset0 / 64; 809 Paired.Offset = EltOffset1 / 64; 810 CI.UseST64 = true; 811 } 812 return true; 813 } 814 815 // Check if the new offsets fit in the reduced 8-bit range. 816 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { 817 if (Modify) { 818 CI.Offset = EltOffset0; 819 Paired.Offset = EltOffset1; 820 } 821 return true; 822 } 823 824 // Try to shift base address to decrease offsets. 825 uint32_t Min = std::min(EltOffset0, EltOffset1); 826 uint32_t Max = std::max(EltOffset0, EltOffset1); 827 828 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64; 829 if (((Max - Min) & ~Mask) == 0) { 830 if (Modify) { 831 // From the range of values we could use for BaseOff, choose the one that 832 // is aligned to the highest power of two, to maximise the chance that 833 // the same offset can be reused for other load/store pairs. 834 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min); 835 // Copy the low bits of the offsets, so that when we adjust them by 836 // subtracting BaseOff they will be multiples of 64. 837 BaseOff |= Min & maskTrailingOnes<uint32_t>(6); 838 CI.BaseOff = BaseOff * CI.EltSize; 839 CI.Offset = (EltOffset0 - BaseOff) / 64; 840 Paired.Offset = (EltOffset1 - BaseOff) / 64; 841 CI.UseST64 = true; 842 } 843 return true; 844 } 845 846 if (isUInt<8>(Max - Min)) { 847 if (Modify) { 848 // From the range of values we could use for BaseOff, choose the one that 849 // is aligned to the highest power of two, to maximise the chance that 850 // the same offset can be reused for other load/store pairs. 851 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min); 852 CI.BaseOff = BaseOff * CI.EltSize; 853 CI.Offset = EltOffset0 - BaseOff; 854 Paired.Offset = EltOffset1 - BaseOff; 855 } 856 return true; 857 } 858 859 return false; 860 } 861 862 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, 863 const CombineInfo &CI, 864 const CombineInfo &Paired) { 865 const unsigned Width = (CI.Width + Paired.Width); 866 switch (CI.InstClass) { 867 default: 868 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); 869 case S_BUFFER_LOAD_IMM: 870 switch (Width) { 871 default: 872 return false; 873 case 2: 874 case 4: 875 case 8: 876 return true; 877 } 878 } 879 } 880 881 const TargetRegisterClass * 882 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { 883 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 884 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 885 } 886 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) { 887 return TRI->getRegClassForReg(*MRI, Src->getReg()); 888 } 889 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) { 890 return TRI->getRegClassForReg(*MRI, Src->getReg()); 891 } 892 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { 893 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 894 } 895 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) { 896 return TRI->getRegClassForReg(*MRI, Src->getReg()); 897 } 898 return nullptr; 899 } 900 901 /// This function assumes that CI comes before Paired in a basic block. Return 902 /// an insertion point for the merged instruction or nullptr on failure. 
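/// For loads the merged instruction is inserted at CI and Paired is hoisted up
/// to it; for stores it is inserted at Paired and CI is sunk down to it. In
/// both cases every instruction in between must be safe to move across, which
/// is what the canSwapInstructions() scans below verify.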
903 SILoadStoreOptimizer::CombineInfo * 904 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, 905 CombineInfo &Paired) { 906 // If another instruction has already been merged into CI, it may now be a 907 // type that we can't do any further merging into. 908 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) 909 return nullptr; 910 assert(CI.InstClass == Paired.InstClass); 911 912 if (getInstSubclass(CI.I->getOpcode(), *TII) != 913 getInstSubclass(Paired.I->getOpcode(), *TII)) 914 return nullptr; 915 916 // Check both offsets (or masks for MIMG) can be combined and fit in the 917 // reduced range. 918 if (CI.InstClass == MIMG) { 919 if (!dmasksCanBeCombined(CI, *TII, Paired)) 920 return nullptr; 921 } else { 922 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)) 923 return nullptr; 924 } 925 926 DenseSet<Register> RegDefs; 927 DenseSet<Register> RegUses; 928 CombineInfo *Where; 929 if (CI.I->mayLoad()) { 930 // Try to hoist Paired up to CI. 931 addDefsUsesToList(*Paired.I, RegDefs, RegUses); 932 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) { 933 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI)) 934 return nullptr; 935 } 936 Where = &CI; 937 } else { 938 // Try to sink CI down to Paired. 939 addDefsUsesToList(*CI.I, RegDefs, RegUses); 940 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { 941 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI)) 942 return nullptr; 943 } 944 Where = &Paired; 945 } 946 947 // Call offsetsCanBeCombined with modify = true so that the offsets are 948 // correct for the new instruction. This should return true, because 949 // this function should only be called on CombineInfo objects that 950 // have already been confirmed to be mergeable. 951 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) 952 offsetsCanBeCombined(CI, *STM, Paired, true); 953 return Where; 954 } 955 956 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 957 if (STM->ldsRequiresM0Init()) 958 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 959 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 960 } 961 962 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 963 if (STM->ldsRequiresM0Init()) 964 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 965 966 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 967 : AMDGPU::DS_READ2ST64_B64_gfx9; 968 } 969 970 MachineBasicBlock::iterator 971 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 972 MachineBasicBlock::iterator InsertBefore) { 973 MachineBasicBlock *MBB = CI.I->getParent(); 974 975 // Be careful, since the addresses could be subregisters themselves in weird 976 // cases, like vectors of pointers. 977 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 978 979 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 980 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 981 982 unsigned NewOffset0 = CI.Offset; 983 unsigned NewOffset1 = Paired.Offset; 984 unsigned Opc = 985 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 986 987 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 988 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; 989 990 if (NewOffset0 > NewOffset1) { 991 // Canonicalize the merged instruction so the smaller offset comes first. 
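    // Swapping the sub-register indices together with the offsets keeps each
    // original destination paired with its own data: the COPYs emitted below
    // still route the value loaded from CI's offset into CI's old vdst, and
    // likewise for Paired.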
992 std::swap(NewOffset0, NewOffset1); 993 std::swap(SubRegIdx0, SubRegIdx1); 994 } 995 996 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 997 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 998 999 const MCInstrDesc &Read2Desc = TII->get(Opc); 1000 1001 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1002 Register DestReg = MRI->createVirtualRegister(SuperRC); 1003 1004 DebugLoc DL = CI.I->getDebugLoc(); 1005 1006 Register BaseReg = AddrReg->getReg(); 1007 unsigned BaseSubReg = AddrReg->getSubReg(); 1008 unsigned BaseRegFlags = 0; 1009 if (CI.BaseOff) { 1010 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1011 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1012 .addImm(CI.BaseOff); 1013 1014 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1015 BaseRegFlags = RegState::Kill; 1016 1017 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1018 .addReg(ImmReg) 1019 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1020 .addImm(0); // clamp bit 1021 BaseSubReg = 0; 1022 } 1023 1024 MachineInstrBuilder Read2 = 1025 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg) 1026 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1027 .addImm(NewOffset0) // offset0 1028 .addImm(NewOffset1) // offset1 1029 .addImm(0) // gds 1030 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1031 1032 (void)Read2; 1033 1034 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1035 1036 // Copy to the old destination registers. 1037 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1038 .add(*Dest0) // Copy to same destination including flags and sub reg. 1039 .addReg(DestReg, 0, SubRegIdx0); 1040 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1041 .add(*Dest1) 1042 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1043 1044 CI.I->eraseFromParent(); 1045 Paired.I->eraseFromParent(); 1046 1047 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1048 return Read2; 1049 } 1050 1051 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1052 if (STM->ldsRequiresM0Init()) 1053 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1054 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1055 : AMDGPU::DS_WRITE2_B64_gfx9; 1056 } 1057 1058 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1059 if (STM->ldsRequiresM0Init()) 1060 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1061 : AMDGPU::DS_WRITE2ST64_B64; 1062 1063 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1064 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1065 } 1066 1067 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( 1068 CombineInfo &CI, CombineInfo &Paired, 1069 MachineBasicBlock::iterator InsertBefore) { 1070 MachineBasicBlock *MBB = CI.I->getParent(); 1071 1072 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1073 // sure we preserve the subregister index and any register flags set on them. 1074 const MachineOperand *AddrReg = 1075 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1076 const MachineOperand *Data0 = 1077 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1078 const MachineOperand *Data1 = 1079 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1080 1081 unsigned NewOffset0 = CI.Offset; 1082 unsigned NewOffset1 = Paired.Offset; 1083 unsigned Opc = 1084 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1085 1086 if (NewOffset0 > NewOffset1) { 1087 // Canonicalize the merged instruction so the smaller offset comes first. 
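    // Here the data operands are swapped along with the offsets, so each value
    // is still written to its original address.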
1088 std::swap(NewOffset0, NewOffset1); 1089 std::swap(Data0, Data1); 1090 } 1091 1092 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1093 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1094 1095 const MCInstrDesc &Write2Desc = TII->get(Opc); 1096 DebugLoc DL = CI.I->getDebugLoc(); 1097 1098 Register BaseReg = AddrReg->getReg(); 1099 unsigned BaseSubReg = AddrReg->getSubReg(); 1100 unsigned BaseRegFlags = 0; 1101 if (CI.BaseOff) { 1102 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1103 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1104 .addImm(CI.BaseOff); 1105 1106 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1107 BaseRegFlags = RegState::Kill; 1108 1109 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1110 .addReg(ImmReg) 1111 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1112 .addImm(0); // clamp bit 1113 BaseSubReg = 0; 1114 } 1115 1116 MachineInstrBuilder Write2 = 1117 BuildMI(*MBB, InsertBefore, DL, Write2Desc) 1118 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1119 .add(*Data0) // data0 1120 .add(*Data1) // data1 1121 .addImm(NewOffset0) // offset0 1122 .addImm(NewOffset1) // offset1 1123 .addImm(0) // gds 1124 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1125 1126 CI.I->eraseFromParent(); 1127 Paired.I->eraseFromParent(); 1128 1129 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1130 return Write2; 1131 } 1132 1133 MachineBasicBlock::iterator 1134 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1135 MachineBasicBlock::iterator InsertBefore) { 1136 MachineBasicBlock *MBB = CI.I->getParent(); 1137 DebugLoc DL = CI.I->getDebugLoc(); 1138 const unsigned Opcode = getNewOpcode(CI, Paired); 1139 1140 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1141 1142 Register DestReg = MRI->createVirtualRegister(SuperRC); 1143 unsigned MergedDMask = CI.DMask | Paired.DMask; 1144 unsigned DMaskIdx = 1145 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1146 1147 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1148 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1149 if (I == DMaskIdx) 1150 MIB.addImm(MergedDMask); 1151 else 1152 MIB.add((*CI.I).getOperand(I)); 1153 } 1154 1155 // It shouldn't be possible to get this far if the two instructions 1156 // don't have a single memoperand, because MachineInstr::mayAlias() 1157 // will return true if this is the case. 1158 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1159 1160 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1161 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1162 1163 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1164 1165 unsigned SubRegIdx0, SubRegIdx1; 1166 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1167 1168 // Copy to the old destination registers. 1169 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1170 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1171 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1172 1173 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1174 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1175 .addReg(DestReg, 0, SubRegIdx0); 1176 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1177 .add(*Dest1) 1178 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1179 1180 CI.I->eraseFromParent(); 1181 Paired.I->eraseFromParent(); 1182 return New; 1183 } 1184 1185 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( 1186 CombineInfo &CI, CombineInfo &Paired, 1187 MachineBasicBlock::iterator InsertBefore) { 1188 MachineBasicBlock *MBB = CI.I->getParent(); 1189 DebugLoc DL = CI.I->getDebugLoc(); 1190 const unsigned Opcode = getNewOpcode(CI, Paired); 1191 1192 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1193 1194 Register DestReg = MRI->createVirtualRegister(SuperRC); 1195 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1196 1197 // It shouldn't be possible to get this far if the two instructions 1198 // don't have a single memoperand, because MachineInstr::mayAlias() 1199 // will return true if this is the case. 1200 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1201 1202 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1203 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1204 1205 MachineInstr *New = 1206 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) 1207 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) 1208 .addImm(MergedOffset) // offset 1209 .addImm(CI.CPol) // cpol 1210 .addMemOperand( 1211 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1212 1213 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1214 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1215 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1216 1217 // Copy to the old destination registers. 1218 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1219 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1220 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1221 1222 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1223 .add(*Dest0) // Copy to same destination including flags and sub reg. 1224 .addReg(DestReg, 0, SubRegIdx0); 1225 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1226 .add(*Dest1) 1227 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1228 1229 CI.I->eraseFromParent(); 1230 Paired.I->eraseFromParent(); 1231 return New; 1232 } 1233 1234 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1235 CombineInfo &CI, CombineInfo &Paired, 1236 MachineBasicBlock::iterator InsertBefore) { 1237 MachineBasicBlock *MBB = CI.I->getParent(); 1238 DebugLoc DL = CI.I->getDebugLoc(); 1239 1240 const unsigned Opcode = getNewOpcode(CI, Paired); 1241 1242 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1243 1244 // Copy to the new source register. 1245 Register DestReg = MRI->createVirtualRegister(SuperRC); 1246 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1247 1248 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1249 1250 AddressRegs Regs = getRegs(Opcode, *TII); 1251 1252 if (Regs.VAddr) 1253 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1254 1255 // It shouldn't be possible to get this far if the two instructions 1256 // don't have a single memoperand, because MachineInstr::mayAlias() 1257 // will return true if this is the case. 
1258 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1259 1260 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1261 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1262 1263 MachineInstr *New = 1264 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1265 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1266 .addImm(MergedOffset) // offset 1267 .addImm(CI.CPol) // cpol 1268 .addImm(0) // tfe 1269 .addImm(0) // swz 1270 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1271 1272 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1273 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1274 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1275 1276 // Copy to the old destination registers. 1277 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1278 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1279 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1280 1281 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1282 .add(*Dest0) // Copy to same destination including flags and sub reg. 1283 .addReg(DestReg, 0, SubRegIdx0); 1284 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1285 .add(*Dest1) 1286 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1287 1288 CI.I->eraseFromParent(); 1289 Paired.I->eraseFromParent(); 1290 return New; 1291 } 1292 1293 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1294 CombineInfo &CI, CombineInfo &Paired, 1295 MachineBasicBlock::iterator InsertBefore) { 1296 MachineBasicBlock *MBB = CI.I->getParent(); 1297 DebugLoc DL = CI.I->getDebugLoc(); 1298 1299 const unsigned Opcode = getNewOpcode(CI, Paired); 1300 1301 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1302 1303 // Copy to the new source register. 1304 Register DestReg = MRI->createVirtualRegister(SuperRC); 1305 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1306 1307 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1308 1309 AddressRegs Regs = getRegs(Opcode, *TII); 1310 1311 if (Regs.VAddr) 1312 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1313 1314 unsigned JoinedFormat = 1315 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1316 1317 // It shouldn't be possible to get this far if the two instructions 1318 // don't have a single memoperand, because MachineInstr::mayAlias() 1319 // will return true if this is the case. 1320 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1321 1322 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1323 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1324 1325 MachineInstr *New = 1326 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1327 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1328 .addImm(MergedOffset) // offset 1329 .addImm(JoinedFormat) // format 1330 .addImm(CI.CPol) // cpol 1331 .addImm(0) // tfe 1332 .addImm(0) // swz 1333 .addMemOperand( 1334 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1335 1336 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1337 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1338 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1339 1340 // Copy to the old destination registers. 
1341 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1342 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1343 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1344 1345 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1346 .add(*Dest0) // Copy to same destination including flags and sub reg. 1347 .addReg(DestReg, 0, SubRegIdx0); 1348 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1349 .add(*Dest1) 1350 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1351 1352 CI.I->eraseFromParent(); 1353 Paired.I->eraseFromParent(); 1354 return New; 1355 } 1356 1357 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1358 CombineInfo &CI, CombineInfo &Paired, 1359 MachineBasicBlock::iterator InsertBefore) { 1360 MachineBasicBlock *MBB = CI.I->getParent(); 1361 DebugLoc DL = CI.I->getDebugLoc(); 1362 1363 const unsigned Opcode = getNewOpcode(CI, Paired); 1364 1365 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1366 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1367 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1368 1369 // Copy to the new source register. 1370 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1371 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1372 1373 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1374 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1375 1376 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1377 .add(*Src0) 1378 .addImm(SubRegIdx0) 1379 .add(*Src1) 1380 .addImm(SubRegIdx1); 1381 1382 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1383 .addReg(SrcReg, RegState::Kill); 1384 1385 AddressRegs Regs = getRegs(Opcode, *TII); 1386 1387 if (Regs.VAddr) 1388 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1389 1390 unsigned JoinedFormat = 1391 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1392 1393 // It shouldn't be possible to get this far if the two instructions 1394 // don't have a single memoperand, because MachineInstr::mayAlias() 1395 // will return true if this is the case. 
1396 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1397 1398 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1399 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1400 1401 MachineInstr *New = 1402 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1403 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1404 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1405 .addImm(JoinedFormat) // format 1406 .addImm(CI.CPol) // cpol 1407 .addImm(0) // tfe 1408 .addImm(0) // swz 1409 .addMemOperand( 1410 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1411 1412 CI.I->eraseFromParent(); 1413 Paired.I->eraseFromParent(); 1414 return New; 1415 } 1416 1417 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalLoadPair( 1418 CombineInfo &CI, CombineInfo &Paired, 1419 MachineBasicBlock::iterator InsertBefore) { 1420 MachineBasicBlock *MBB = CI.I->getParent(); 1421 DebugLoc DL = CI.I->getDebugLoc(); 1422 1423 const unsigned Opcode = getNewOpcode(CI, Paired); 1424 1425 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1426 Register DestReg = MRI->createVirtualRegister(SuperRC); 1427 1428 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1429 1430 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1431 MIB.add(*SAddr); 1432 1433 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1434 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1435 1436 MachineInstr *New = 1437 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1438 .addImm(std::min(CI.Offset, Paired.Offset)) 1439 .addImm(CI.CPol) 1440 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1441 1442 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1443 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1444 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1445 1446 // Copy to the old destination registers. 1447 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1448 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1449 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1450 1451 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1452 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1453 .addReg(DestReg, 0, SubRegIdx0); 1454 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1455 .add(*Dest1) 1456 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1457 1458 CI.I->eraseFromParent(); 1459 Paired.I->eraseFromParent(); 1460 return New; 1461 } 1462 1463 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1464 const CombineInfo &Paired) { 1465 const unsigned Width = CI.Width + Paired.Width; 1466 1467 switch (CI.InstClass) { 1468 default: 1469 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1470 // FIXME: Handle d16 correctly 1471 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1472 Width); 1473 case TBUFFER_LOAD: 1474 case TBUFFER_STORE: 1475 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1476 Width); 1477 1478 case UNKNOWN: 1479 llvm_unreachable("Unknown instruction class"); 1480 case S_BUFFER_LOAD_IMM: 1481 switch (Width) { 1482 default: 1483 return 0; 1484 case 2: 1485 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1486 case 4: 1487 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1488 case 8: 1489 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; 1490 } 1491 case GLOBAL_LOAD: 1492 switch (Width) { 1493 default: 1494 return 0; 1495 case 2: 1496 return AMDGPU::GLOBAL_LOAD_DWORDX2; 1497 case 3: 1498 return AMDGPU::GLOBAL_LOAD_DWORDX3; 1499 case 4: 1500 return AMDGPU::GLOBAL_LOAD_DWORDX4; 1501 } 1502 case GLOBAL_LOAD_SADDR: 1503 switch (Width) { 1504 default: 1505 return 0; 1506 case 2: 1507 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; 1508 case 3: 1509 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; 1510 case 4: 1511 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; 1512 } 1513 case MIMG: 1514 assert((countPopulation(CI.DMask | Paired.DMask) == Width) && 1515 "No overlaps"); 1516 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1517 } 1518 } 1519 1520 std::pair<unsigned, unsigned> 1521 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, 1522 const CombineInfo &Paired) { 1523 bool ReverseOrder; 1524 if (CI.InstClass == MIMG) { 1525 assert( 1526 (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && 1527 "No overlaps"); 1528 ReverseOrder = CI.DMask > Paired.DMask; 1529 } else { 1530 ReverseOrder = CI.Offset > Paired.Offset; 1531 } 1532 1533 unsigned Idx0; 1534 unsigned Idx1; 1535 1536 static const unsigned Idxs[5][4] = { 1537 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1538 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, 1539 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, 1540 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, 1541 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, 1542 }; 1543 1544 assert(CI.Width >= 1 && CI.Width <= 4); 1545 assert(Paired.Width >= 1 && Paired.Width <= 4); 1546 1547 if (ReverseOrder) { 1548 Idx1 = Idxs[0][Paired.Width - 1]; 1549 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1550 } else { 1551 Idx0 = Idxs[0][CI.Width - 1]; 1552 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1553 } 1554 1555 return std::make_pair(Idx0, Idx1); 1556 } 1557 1558 const TargetRegisterClass * 1559 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1560 const CombineInfo &Paired) { 1561 if (CI.InstClass == S_BUFFER_LOAD_IMM) { 1562 switch (CI.Width + Paired.Width) { 1563 default: 1564 return nullptr; 1565 case 2: 1566 return &AMDGPU::SReg_64_XEXECRegClass; 1567 case 4: 1568 return 
&AMDGPU::SGPR_128RegClass; 1569 case 8: 1570 return &AMDGPU::SGPR_256RegClass; 1571 case 16: 1572 return &AMDGPU::SGPR_512RegClass; 1573 } 1574 } 1575 1576 unsigned BitWidth = 32 * (CI.Width + Paired.Width); 1577 return TRI->isAGPRClass(getDataRegClass(*CI.I)) 1578 ? TRI->getAGPRClassForBitWidth(BitWidth) 1579 : TRI->getVGPRClassForBitWidth(BitWidth); 1580 } 1581 1582 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1583 CombineInfo &CI, CombineInfo &Paired, 1584 MachineBasicBlock::iterator InsertBefore) { 1585 MachineBasicBlock *MBB = CI.I->getParent(); 1586 DebugLoc DL = CI.I->getDebugLoc(); 1587 1588 const unsigned Opcode = getNewOpcode(CI, Paired); 1589 1590 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1591 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1592 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1593 1594 // Copy to the new source register. 1595 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1596 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1597 1598 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1599 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1600 1601 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1602 .add(*Src0) 1603 .addImm(SubRegIdx0) 1604 .add(*Src1) 1605 .addImm(SubRegIdx1); 1606 1607 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1608 .addReg(SrcReg, RegState::Kill); 1609 1610 AddressRegs Regs = getRegs(Opcode, *TII); 1611 1612 if (Regs.VAddr) 1613 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1614 1615 1616 // It shouldn't be possible to get this far if the two instructions 1617 // don't have a single memoperand, because MachineInstr::mayAlias() 1618 // will return true if this is the case. 1619 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1620 1621 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1622 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1623 1624 MachineInstr *New = 1625 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1626 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1627 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1628 .addImm(CI.CPol) // cpol 1629 .addImm(0) // tfe 1630 .addImm(0) // swz 1631 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1632 1633 CI.I->eraseFromParent(); 1634 Paired.I->eraseFromParent(); 1635 return New; 1636 } 1637 1638 MachineOperand 1639 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 1640 APInt V(32, Val, true); 1641 if (TII->isInlineConstant(V)) 1642 return MachineOperand::CreateImm(Val); 1643 1644 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1645 MachineInstr *Mov = 1646 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1647 TII->get(AMDGPU::S_MOV_B32), Reg) 1648 .addImm(Val); 1649 (void)Mov; 1650 LLVM_DEBUG(dbgs() << " "; Mov->dump()); 1651 return MachineOperand::CreateReg(Reg, false); 1652 } 1653 1654 // Compute base address using Addr and return the final register. 
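// The base is materialized as a 64-bit add of Addr.Offset to
// (Addr.Base.LoReg, Addr.Base.HiReg): a V_ADD_CO_U32_e64 for the low half, a
// V_ADDC_U32_e64 consuming the carry for the high half, and a REG_SEQUENCE
// combining the two halves into the vreg_64 that is returned.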
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}
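
// If Op is an immediate, or a virtual register whose unique definition is an
// S_MOV_B32 of an immediate, return that constant; otherwise return None.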
Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
// - 32-bit base registers and subregisters
// - 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
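  // Only global accesses with an saddr variant are handled here;
  // getGlobalSaddrOp() returns a negative value for everything else.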
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset has the largest
  // 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because the larger distance can presumably
  // accommodate more intermediate bases.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr,  0)
  //   load4 = load(addr,  2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
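    // Only instructions with the same opcode and a still-unpromoted (zero)
    // immediate offset are considered as anchor candidates.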
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}
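
// Append CI to the existing list whose head has the same instruction class,
// the same AGPR-ness, and the same base address; otherwise start a new
// single-element list for this base.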
void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}
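
// Scan [Begin, End) and group the instructions this pass knows how to merge
// into lists, one per base address. Returns the iterator at which the scan
// stopped (End, or just past a memory barrier) and whether any instruction
// was modified along the way by constant-offset promotion.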
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potentially mergeable instructions into lists, one per base address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. We can look for separate merges after such a barrier.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      // operands. However we are reporting that ds_write2 shall have
      // only VGPR data so that machine copy propagation does not
      // create an illegal instruction with VGPR and AGPR sources.
      // Consequently, if we create such an instruction the verifier
      // will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with it. If such an
  // instruction is found, it is stored in the Paired field. If no instructions
  // are found, then the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets, this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
                       std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}
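
// Try to pair up and merge instructions from a single same-base-address list.
// A successfully merged instruction stays in the list, so, for example, two
// dword buffer loads can become an x2 load now and an x4 load on a later pass
// over the list; for the buffer, image and global classes OptimizeListAgain
// is set while the combined width is still below the class maximum (4 dwords
// for VMEM/MIMG, 8 for SMEM).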
bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
                                          std::list<CombineInfo> &MergeList,
                                          bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
      NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeGlobalLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}