//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset field but are close enough together, we can add a common amount to
//   the base pointer and use the new, reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE,
  GLOBAL_STORE_SADDR
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (AddrOp->getReg().isPhysical())
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Compare by pointer order.
    bool operator<(const CombineInfo& Other) const {
      return (InstClass == MIMG) ?
DMask < Other.DMask : Offset < Other.Offset; 172 } 173 }; 174 175 struct BaseRegisters { 176 Register LoReg; 177 Register HiReg; 178 179 unsigned LoSubReg = 0; 180 unsigned HiSubReg = 0; 181 }; 182 183 struct MemAddress { 184 BaseRegisters Base; 185 int64_t Offset = 0; 186 }; 187 188 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 189 190 private: 191 const GCNSubtarget *STM = nullptr; 192 const SIInstrInfo *TII = nullptr; 193 const SIRegisterInfo *TRI = nullptr; 194 MachineRegisterInfo *MRI = nullptr; 195 AliasAnalysis *AA = nullptr; 196 bool OptimizeAgain; 197 198 bool canSwapInstructions(const DenseSet<Register> &ARegDefs, 199 const DenseSet<Register> &ARegUses, 200 const MachineInstr &A, const MachineInstr &B) const; 201 static bool dmasksCanBeCombined(const CombineInfo &CI, 202 const SIInstrInfo &TII, 203 const CombineInfo &Paired); 204 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 205 CombineInfo &Paired, bool Modify = false); 206 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 207 const CombineInfo &Paired); 208 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 209 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 210 const CombineInfo &Paired); 211 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 212 const CombineInfo &Paired); 213 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 214 215 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); 216 217 unsigned read2Opcode(unsigned EltSize) const; 218 unsigned read2ST64Opcode(unsigned EltSize) const; 219 MachineBasicBlock::iterator 220 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 221 MachineBasicBlock::iterator InsertBefore); 222 223 unsigned write2Opcode(unsigned EltSize) const; 224 unsigned write2ST64Opcode(unsigned EltSize) const; 225 MachineBasicBlock::iterator 226 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 227 MachineBasicBlock::iterator InsertBefore); 228 MachineBasicBlock::iterator 229 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 230 MachineBasicBlock::iterator InsertBefore); 231 MachineBasicBlock::iterator 232 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 233 MachineBasicBlock::iterator InsertBefore); 234 MachineBasicBlock::iterator 235 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 236 MachineBasicBlock::iterator InsertBefore); 237 MachineBasicBlock::iterator 238 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 239 MachineBasicBlock::iterator InsertBefore); 240 MachineBasicBlock::iterator 241 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 242 MachineBasicBlock::iterator InsertBefore); 243 MachineBasicBlock::iterator 244 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 245 MachineBasicBlock::iterator InsertBefore); 246 MachineBasicBlock::iterator 247 mergeGlobalLoadPair(CombineInfo &CI, CombineInfo &Paired, 248 MachineBasicBlock::iterator InsertBefore); 249 MachineBasicBlock::iterator 250 mergeGlobalStorePair(CombineInfo &CI, CombineInfo &Paired, 251 MachineBasicBlock::iterator InsertBefore); 252 253 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 254 int32_t NewOffset) const; 255 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 256 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 257 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 258 void processBaseWithConstOffset(const MachineOperand 
&Base, MemAddress &Addr) const; 259 /// Promotes constant offset to the immediate by adjusting the base. It 260 /// tries to use a base from the nearby instructions that allows it to have 261 /// a 13bit constant offset which gets promoted to the immediate. 262 bool promoteConstantOffsetToImm(MachineInstr &CI, 263 MemInfoMap &Visited, 264 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 265 void addInstToMergeableList(const CombineInfo &CI, 266 std::list<std::list<CombineInfo> > &MergeableInsts) const; 267 268 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 269 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 270 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 271 std::list<std::list<CombineInfo>> &MergeableInsts) const; 272 273 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, 274 const CombineInfo &Paired); 275 276 public: 277 static char ID; 278 279 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 280 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 281 } 282 283 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 284 bool &OptimizeListAgain); 285 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 286 287 bool runOnMachineFunction(MachineFunction &MF) override; 288 289 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 290 291 void getAnalysisUsage(AnalysisUsage &AU) const override { 292 AU.setPreservesCFG(); 293 AU.addRequired<AAResultsWrapperPass>(); 294 295 MachineFunctionPass::getAnalysisUsage(AU); 296 } 297 298 MachineFunctionProperties getRequiredProperties() const override { 299 return MachineFunctionProperties() 300 .set(MachineFunctionProperties::Property::IsSSA); 301 } 302 }; 303 304 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 305 const unsigned Opc = MI.getOpcode(); 306 307 if (TII.isMUBUF(Opc)) { 308 // FIXME: Handle d16 correctly 309 return AMDGPU::getMUBUFElements(Opc); 310 } 311 if (TII.isMIMG(MI)) { 312 uint64_t DMaskImm = 313 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 314 return countPopulation(DMaskImm); 315 } 316 if (TII.isMTBUF(Opc)) { 317 return AMDGPU::getMTBUFElements(Opc); 318 } 319 320 switch (Opc) { 321 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 322 case AMDGPU::GLOBAL_LOAD_DWORD: 323 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 324 case AMDGPU::GLOBAL_STORE_DWORD: 325 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 326 return 1; 327 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 328 case AMDGPU::GLOBAL_LOAD_DWORDX2: 329 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 330 case AMDGPU::GLOBAL_STORE_DWORDX2: 331 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 332 return 2; 333 case AMDGPU::GLOBAL_LOAD_DWORDX3: 334 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 335 case AMDGPU::GLOBAL_STORE_DWORDX3: 336 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 337 return 3; 338 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 339 case AMDGPU::GLOBAL_LOAD_DWORDX4: 340 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 341 case AMDGPU::GLOBAL_STORE_DWORDX4: 342 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 343 return 4; 344 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 345 return 8; 346 case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH; 347 case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH; 348 case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH; 349 case AMDGPU::DS_WRITE_B32_gfx9: 350 return 1; 351 case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH; 352 case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH; 353 case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH; 354 case 
AMDGPU::DS_WRITE_B64_gfx9: 355 return 2; 356 default: 357 return 0; 358 } 359 } 360 361 /// Maps instruction opcode to enum InstClassEnum. 362 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 363 switch (Opc) { 364 default: 365 if (TII.isMUBUF(Opc)) { 366 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 367 default: 368 return UNKNOWN; 369 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 370 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 371 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 372 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 373 return BUFFER_LOAD; 374 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 375 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 376 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 377 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 378 return BUFFER_STORE; 379 } 380 } 381 if (TII.isMIMG(Opc)) { 382 // Ignore instructions encoded without vaddr. 383 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 && 384 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1) 385 return UNKNOWN; 386 // Ignore BVH instructions 387 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) 388 return UNKNOWN; 389 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 390 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 391 TII.isGather4(Opc)) 392 return UNKNOWN; 393 return MIMG; 394 } 395 if (TII.isMTBUF(Opc)) { 396 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 397 default: 398 return UNKNOWN; 399 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 400 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 401 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 402 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 403 return TBUFFER_LOAD; 404 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 405 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 406 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 407 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 408 return TBUFFER_STORE; 409 } 410 } 411 return UNKNOWN; 412 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 413 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 414 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 415 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 416 return S_BUFFER_LOAD_IMM; 417 case AMDGPU::DS_READ_B32: 418 case AMDGPU::DS_READ_B32_gfx9: 419 case AMDGPU::DS_READ_B64: 420 case AMDGPU::DS_READ_B64_gfx9: 421 return DS_READ; 422 case AMDGPU::DS_WRITE_B32: 423 case AMDGPU::DS_WRITE_B32_gfx9: 424 case AMDGPU::DS_WRITE_B64: 425 case AMDGPU::DS_WRITE_B64_gfx9: 426 return DS_WRITE; 427 case AMDGPU::GLOBAL_LOAD_DWORD: 428 case AMDGPU::GLOBAL_LOAD_DWORDX2: 429 case AMDGPU::GLOBAL_LOAD_DWORDX3: 430 case AMDGPU::GLOBAL_LOAD_DWORDX4: 431 return GLOBAL_LOAD; 432 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 433 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 434 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 435 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 436 return GLOBAL_LOAD_SADDR; 437 case AMDGPU::GLOBAL_STORE_DWORD: 438 case AMDGPU::GLOBAL_STORE_DWORDX2: 439 case AMDGPU::GLOBAL_STORE_DWORDX3: 440 case AMDGPU::GLOBAL_STORE_DWORDX4: 441 return GLOBAL_STORE; 442 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 443 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 444 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 445 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 446 return GLOBAL_STORE_SADDR; 447 } 448 } 449 450 /// Determines instruction subclass from opcode. Only instructions 451 /// of the same subclass can be merged together. The merged instruction may have 452 /// a different subclass but must have the same class. 
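/// For example, GLOBAL_LOAD_DWORD and GLOBAL_LOAD_DWORDX2 both map to the
/// GLOBAL_LOAD_DWORD subclass and may be paired, while GLOBAL_LOAD_DWORD and
/// GLOBAL_LOAD_DWORD_SADDR map to different subclasses and are never paired.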
453 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 454 switch (Opc) { 455 default: 456 if (TII.isMUBUF(Opc)) 457 return AMDGPU::getMUBUFBaseOpcode(Opc); 458 if (TII.isMIMG(Opc)) { 459 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 460 assert(Info); 461 return Info->BaseOpcode; 462 } 463 if (TII.isMTBUF(Opc)) 464 return AMDGPU::getMTBUFBaseOpcode(Opc); 465 return -1; 466 case AMDGPU::DS_READ_B32: 467 case AMDGPU::DS_READ_B32_gfx9: 468 case AMDGPU::DS_READ_B64: 469 case AMDGPU::DS_READ_B64_gfx9: 470 case AMDGPU::DS_WRITE_B32: 471 case AMDGPU::DS_WRITE_B32_gfx9: 472 case AMDGPU::DS_WRITE_B64: 473 case AMDGPU::DS_WRITE_B64_gfx9: 474 return Opc; 475 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 476 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 477 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 478 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 479 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 480 case AMDGPU::GLOBAL_LOAD_DWORD: 481 case AMDGPU::GLOBAL_LOAD_DWORDX2: 482 case AMDGPU::GLOBAL_LOAD_DWORDX3: 483 case AMDGPU::GLOBAL_LOAD_DWORDX4: 484 return AMDGPU::GLOBAL_LOAD_DWORD; 485 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 486 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 487 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 488 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 489 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; 490 case AMDGPU::GLOBAL_STORE_DWORD: 491 case AMDGPU::GLOBAL_STORE_DWORDX2: 492 case AMDGPU::GLOBAL_STORE_DWORDX3: 493 case AMDGPU::GLOBAL_STORE_DWORDX4: 494 return AMDGPU::GLOBAL_STORE_DWORD; 495 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 496 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 497 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 498 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 499 return AMDGPU::GLOBAL_STORE_DWORD_SADDR; 500 } 501 } 502 503 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 504 AddressRegs Result; 505 506 if (TII.isMUBUF(Opc)) { 507 if (AMDGPU::getMUBUFHasVAddr(Opc)) 508 Result.VAddr = true; 509 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 510 Result.SRsrc = true; 511 if (AMDGPU::getMUBUFHasSoffset(Opc)) 512 Result.SOffset = true; 513 514 return Result; 515 } 516 517 if (TII.isMIMG(Opc)) { 518 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 519 if (VAddr0Idx >= 0) { 520 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 521 Result.NumVAddrs = SRsrcIdx - VAddr0Idx; 522 } else { 523 Result.VAddr = true; 524 } 525 Result.SRsrc = true; 526 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 527 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 528 Result.SSamp = true; 529 530 return Result; 531 } 532 if (TII.isMTBUF(Opc)) { 533 if (AMDGPU::getMTBUFHasVAddr(Opc)) 534 Result.VAddr = true; 535 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 536 Result.SRsrc = true; 537 if (AMDGPU::getMTBUFHasSoffset(Opc)) 538 Result.SOffset = true; 539 540 return Result; 541 } 542 543 switch (Opc) { 544 default: 545 return Result; 546 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 547 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 548 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 549 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 550 Result.SBase = true; 551 return Result; 552 case AMDGPU::DS_READ_B32: 553 case AMDGPU::DS_READ_B64: 554 case AMDGPU::DS_READ_B32_gfx9: 555 case AMDGPU::DS_READ_B64_gfx9: 556 case AMDGPU::DS_WRITE_B32: 557 case AMDGPU::DS_WRITE_B64: 558 case AMDGPU::DS_WRITE_B32_gfx9: 559 case AMDGPU::DS_WRITE_B64_gfx9: 560 Result.Addr = true; 561 return Result; 562 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 563 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 564 case 
AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 565 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 566 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 567 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 568 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 569 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 570 Result.SAddr = true; 571 LLVM_FALLTHROUGH; 572 case AMDGPU::GLOBAL_LOAD_DWORD: 573 case AMDGPU::GLOBAL_LOAD_DWORDX2: 574 case AMDGPU::GLOBAL_LOAD_DWORDX3: 575 case AMDGPU::GLOBAL_LOAD_DWORDX4: 576 case AMDGPU::GLOBAL_STORE_DWORD: 577 case AMDGPU::GLOBAL_STORE_DWORDX2: 578 case AMDGPU::GLOBAL_STORE_DWORDX3: 579 case AMDGPU::GLOBAL_STORE_DWORDX4: 580 Result.VAddr = true; 581 return Result; 582 } 583 } 584 585 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 586 const SILoadStoreOptimizer &LSO) { 587 I = MI; 588 unsigned Opc = MI->getOpcode(); 589 InstClass = getInstClass(Opc, *LSO.TII); 590 591 if (InstClass == UNKNOWN) 592 return; 593 594 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI)); 595 596 switch (InstClass) { 597 case DS_READ: 598 EltSize = 599 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 600 : 4; 601 break; 602 case DS_WRITE: 603 EltSize = 604 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 605 : 4; 606 break; 607 case S_BUFFER_LOAD_IMM: 608 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); 609 break; 610 default: 611 EltSize = 4; 612 break; 613 } 614 615 if (InstClass == MIMG) { 616 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 617 // Offset is not considered for MIMG instructions. 618 Offset = 0; 619 } else { 620 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 621 Offset = I->getOperand(OffsetIdx).getImm(); 622 } 623 624 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 625 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 626 627 Width = getOpcodeWidth(*I, *LSO.TII); 628 629 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 630 Offset &= 0xffff; 631 } else if (InstClass != MIMG) { 632 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); 633 } 634 635 AddressRegs Regs = getRegs(Opc, *LSO.TII); 636 637 NumAddresses = 0; 638 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 639 AddrIdx[NumAddresses++] = 640 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 641 if (Regs.Addr) 642 AddrIdx[NumAddresses++] = 643 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 644 if (Regs.SBase) 645 AddrIdx[NumAddresses++] = 646 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 647 if (Regs.SRsrc) 648 AddrIdx[NumAddresses++] = 649 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 650 if (Regs.SOffset) 651 AddrIdx[NumAddresses++] = 652 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 653 if (Regs.SAddr) 654 AddrIdx[NumAddresses++] = 655 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); 656 if (Regs.VAddr) 657 AddrIdx[NumAddresses++] = 658 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 659 if (Regs.SSamp) 660 AddrIdx[NumAddresses++] = 661 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); 662 assert(NumAddresses <= MaxAddressRegs); 663 664 for (unsigned J = 0; J < NumAddresses; J++) 665 AddrReg[J] = &I->getOperand(AddrIdx[J]); 666 } 667 668 } // end anonymous namespace. 
669 670 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, 671 "SI Load Store Optimizer", false, false) 672 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 673 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", 674 false, false) 675 676 char SILoadStoreOptimizer::ID = 0; 677 678 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; 679 680 FunctionPass *llvm::createSILoadStoreOptimizerPass() { 681 return new SILoadStoreOptimizer(); 682 } 683 684 static void addDefsUsesToList(const MachineInstr &MI, 685 DenseSet<Register> &RegDefs, 686 DenseSet<Register> &RegUses) { 687 for (const auto &Op : MI.operands()) { 688 if (!Op.isReg()) 689 continue; 690 if (Op.isDef()) 691 RegDefs.insert(Op.getReg()); 692 if (Op.readsReg()) 693 RegUses.insert(Op.getReg()); 694 } 695 } 696 697 bool SILoadStoreOptimizer::canSwapInstructions( 698 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses, 699 const MachineInstr &A, const MachineInstr &B) const { 700 if (A.mayLoadOrStore() && B.mayLoadOrStore() && 701 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true)) 702 return false; 703 for (const auto &BOp : B.operands()) { 704 if (!BOp.isReg()) 705 continue; 706 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg())) 707 return false; 708 if (BOp.isDef() && ARegUses.contains(BOp.getReg())) 709 return false; 710 } 711 return true; 712 } 713 714 // Given that \p CI and \p Paired are adjacent memory operations produce a new 715 // MMO for the combined operation with a new access size. 716 MachineMemOperand * 717 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI, 718 const CombineInfo &Paired) { 719 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 720 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 721 722 unsigned Size = MMOa->getSize() + MMOb->getSize(); 723 724 // A base pointer for the combined operation is the same as the leading 725 // operation's pointer. 726 if (Paired < CI) 727 MMOa = MMOb; 728 729 MachineFunction *MF = CI.I->getMF(); 730 return MF->getMachineMemOperand(MMOa, MMOa->getPointerInfo(), Size); 731 } 732 733 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, 734 const SIInstrInfo &TII, 735 const CombineInfo &Paired) { 736 assert(CI.InstClass == MIMG); 737 738 // Ignore instructions with tfe/lwe set. 739 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); 740 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); 741 742 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) 743 return false; 744 745 // Check other optional immediate operands for equality. 746 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16, 747 AMDGPU::OpName::unorm, AMDGPU::OpName::da, 748 AMDGPU::OpName::r128, AMDGPU::OpName::a16}; 749 750 for (auto op : OperandsToMatch) { 751 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); 752 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) 753 return false; 754 if (Idx != -1 && 755 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) 756 return false; 757 } 758 759 // Check DMask for overlaps. 
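  // For example, dmasks 0x3 and 0xc can be combined: MaxMask = 0xc has two
  // trailing zeros, so MinMask may only use bits 0-1, and 0x3 < (1 << 2)
  // holds. Dmasks 0x3 and 0x6 cannot: countTrailingZeros(0x6) == 1 and
  // (1 << 1) <= 0x3, i.e. the two masks would interleave.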
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.CPol == Paired.CPol;
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
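  // For example, two ds_read_b32 instructions at byte offsets 8192 and 8448
  // have element offsets 2048 and 2112. Both are multiples of 64, and
  // 2048/64 = 32 and 2112/64 = 33 fit in 8 bits, so the pair can become a
  // single ds_read2st64_b32 with offset0:32 and offset1:33.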
858 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && 859 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { 860 if (Modify) { 861 CI.Offset = EltOffset0 / 64; 862 Paired.Offset = EltOffset1 / 64; 863 CI.UseST64 = true; 864 } 865 return true; 866 } 867 868 // Check if the new offsets fit in the reduced 8-bit range. 869 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { 870 if (Modify) { 871 CI.Offset = EltOffset0; 872 Paired.Offset = EltOffset1; 873 } 874 return true; 875 } 876 877 // Try to shift base address to decrease offsets. 878 uint32_t Min = std::min(EltOffset0, EltOffset1); 879 uint32_t Max = std::max(EltOffset0, EltOffset1); 880 881 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64; 882 if (((Max - Min) & ~Mask) == 0) { 883 if (Modify) { 884 // From the range of values we could use for BaseOff, choose the one that 885 // is aligned to the highest power of two, to maximise the chance that 886 // the same offset can be reused for other load/store pairs. 887 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min); 888 // Copy the low bits of the offsets, so that when we adjust them by 889 // subtracting BaseOff they will be multiples of 64. 890 BaseOff |= Min & maskTrailingOnes<uint32_t>(6); 891 CI.BaseOff = BaseOff * CI.EltSize; 892 CI.Offset = (EltOffset0 - BaseOff) / 64; 893 Paired.Offset = (EltOffset1 - BaseOff) / 64; 894 CI.UseST64 = true; 895 } 896 return true; 897 } 898 899 if (isUInt<8>(Max - Min)) { 900 if (Modify) { 901 // From the range of values we could use for BaseOff, choose the one that 902 // is aligned to the highest power of two, to maximise the chance that 903 // the same offset can be reused for other load/store pairs. 904 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min); 905 CI.BaseOff = BaseOff * CI.EltSize; 906 CI.Offset = EltOffset0 - BaseOff; 907 Paired.Offset = EltOffset1 - BaseOff; 908 } 909 return true; 910 } 911 912 return false; 913 } 914 915 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, 916 const CombineInfo &CI, 917 const CombineInfo &Paired) { 918 const unsigned Width = (CI.Width + Paired.Width); 919 switch (CI.InstClass) { 920 default: 921 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); 922 case S_BUFFER_LOAD_IMM: 923 switch (Width) { 924 default: 925 return false; 926 case 2: 927 case 4: 928 case 8: 929 return true; 930 } 931 } 932 } 933 934 const TargetRegisterClass * 935 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { 936 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 937 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 938 } 939 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) { 940 return TRI->getRegClassForReg(*MRI, Src->getReg()); 941 } 942 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) { 943 return TRI->getRegClassForReg(*MRI, Src->getReg()); 944 } 945 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { 946 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 947 } 948 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) { 949 return TRI->getRegClassForReg(*MRI, Src->getReg()); 950 } 951 return nullptr; 952 } 953 954 /// This function assumes that CI comes before Paired in a basic block. Return 955 /// an insertion point for the merged instruction or nullptr on failure. 
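/// When merging loads, the merged instruction is inserted at CI (Paired is
/// effectively hoisted up to CI); when merging stores, it is inserted at
/// Paired (CI is effectively sunk down to Paired).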
956 SILoadStoreOptimizer::CombineInfo * 957 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, 958 CombineInfo &Paired) { 959 // If another instruction has already been merged into CI, it may now be a 960 // type that we can't do any further merging into. 961 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) 962 return nullptr; 963 assert(CI.InstClass == Paired.InstClass); 964 965 if (getInstSubclass(CI.I->getOpcode(), *TII) != 966 getInstSubclass(Paired.I->getOpcode(), *TII)) 967 return nullptr; 968 969 // Check both offsets (or masks for MIMG) can be combined and fit in the 970 // reduced range. 971 if (CI.InstClass == MIMG) { 972 if (!dmasksCanBeCombined(CI, *TII, Paired)) 973 return nullptr; 974 } else { 975 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)) 976 return nullptr; 977 } 978 979 DenseSet<Register> RegDefs; 980 DenseSet<Register> RegUses; 981 CombineInfo *Where; 982 if (CI.I->mayLoad()) { 983 // Try to hoist Paired up to CI. 984 addDefsUsesToList(*Paired.I, RegDefs, RegUses); 985 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) { 986 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI)) 987 return nullptr; 988 } 989 Where = &CI; 990 } else { 991 // Try to sink CI down to Paired. 992 addDefsUsesToList(*CI.I, RegDefs, RegUses); 993 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { 994 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI)) 995 return nullptr; 996 } 997 Where = &Paired; 998 } 999 1000 // Call offsetsCanBeCombined with modify = true so that the offsets are 1001 // correct for the new instruction. This should return true, because 1002 // this function should only be called on CombineInfo objects that 1003 // have already been confirmed to be mergeable. 1004 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) 1005 offsetsCanBeCombined(CI, *STM, Paired, true); 1006 return Where; 1007 } 1008 1009 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 1010 if (STM->ldsRequiresM0Init()) 1011 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 1012 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 1013 } 1014 1015 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 1016 if (STM->ldsRequiresM0Init()) 1017 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 1018 1019 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 1020 : AMDGPU::DS_READ2ST64_B64_gfx9; 1021 } 1022 1023 MachineBasicBlock::iterator 1024 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1025 MachineBasicBlock::iterator InsertBefore) { 1026 MachineBasicBlock *MBB = CI.I->getParent(); 1027 1028 // Be careful, since the addresses could be subregisters themselves in weird 1029 // cases, like vectors of pointers. 1030 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1031 1032 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1033 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1034 1035 unsigned NewOffset0 = CI.Offset; 1036 unsigned NewOffset1 = Paired.Offset; 1037 unsigned Opc = 1038 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1039 1040 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1041 unsigned SubRegIdx1 = (CI.EltSize == 4) ? 
AMDGPU::sub1 : AMDGPU::sub2_sub3; 1042 1043 if (NewOffset0 > NewOffset1) { 1044 // Canonicalize the merged instruction so the smaller offset comes first. 1045 std::swap(NewOffset0, NewOffset1); 1046 std::swap(SubRegIdx0, SubRegIdx1); 1047 } 1048 1049 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1050 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1051 1052 const MCInstrDesc &Read2Desc = TII->get(Opc); 1053 1054 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1055 Register DestReg = MRI->createVirtualRegister(SuperRC); 1056 1057 DebugLoc DL = CI.I->getDebugLoc(); 1058 1059 Register BaseReg = AddrReg->getReg(); 1060 unsigned BaseSubReg = AddrReg->getSubReg(); 1061 unsigned BaseRegFlags = 0; 1062 if (CI.BaseOff) { 1063 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1064 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1065 .addImm(CI.BaseOff); 1066 1067 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1068 BaseRegFlags = RegState::Kill; 1069 1070 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1071 .addReg(ImmReg) 1072 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1073 .addImm(0); // clamp bit 1074 BaseSubReg = 0; 1075 } 1076 1077 MachineInstrBuilder Read2 = 1078 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg) 1079 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1080 .addImm(NewOffset0) // offset0 1081 .addImm(NewOffset1) // offset1 1082 .addImm(0) // gds 1083 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1084 1085 (void)Read2; 1086 1087 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1088 1089 // Copy to the old destination registers. 1090 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1091 .add(*Dest0) // Copy to same destination including flags and sub reg. 1092 .addReg(DestReg, 0, SubRegIdx0); 1093 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1094 .add(*Dest1) 1095 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1096 1097 CI.I->eraseFromParent(); 1098 Paired.I->eraseFromParent(); 1099 1100 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1101 return Read2; 1102 } 1103 1104 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1105 if (STM->ldsRequiresM0Init()) 1106 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1107 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1108 : AMDGPU::DS_WRITE2_B64_gfx9; 1109 } 1110 1111 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1112 if (STM->ldsRequiresM0Init()) 1113 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1114 : AMDGPU::DS_WRITE2ST64_B64; 1115 1116 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1117 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1118 } 1119 1120 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( 1121 CombineInfo &CI, CombineInfo &Paired, 1122 MachineBasicBlock::iterator InsertBefore) { 1123 MachineBasicBlock *MBB = CI.I->getParent(); 1124 1125 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1126 // sure we preserve the subregister index and any register flags set on them. 1127 const MachineOperand *AddrReg = 1128 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1129 const MachineOperand *Data0 = 1130 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1131 const MachineOperand *Data1 = 1132 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1133 1134 unsigned NewOffset0 = CI.Offset; 1135 unsigned NewOffset1 = Paired.Offset; 1136 unsigned Opc = 1137 CI.UseST64 ? 
write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1138 1139 if (NewOffset0 > NewOffset1) { 1140 // Canonicalize the merged instruction so the smaller offset comes first. 1141 std::swap(NewOffset0, NewOffset1); 1142 std::swap(Data0, Data1); 1143 } 1144 1145 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1146 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1147 1148 const MCInstrDesc &Write2Desc = TII->get(Opc); 1149 DebugLoc DL = CI.I->getDebugLoc(); 1150 1151 Register BaseReg = AddrReg->getReg(); 1152 unsigned BaseSubReg = AddrReg->getSubReg(); 1153 unsigned BaseRegFlags = 0; 1154 if (CI.BaseOff) { 1155 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1156 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1157 .addImm(CI.BaseOff); 1158 1159 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1160 BaseRegFlags = RegState::Kill; 1161 1162 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1163 .addReg(ImmReg) 1164 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1165 .addImm(0); // clamp bit 1166 BaseSubReg = 0; 1167 } 1168 1169 MachineInstrBuilder Write2 = 1170 BuildMI(*MBB, InsertBefore, DL, Write2Desc) 1171 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1172 .add(*Data0) // data0 1173 .add(*Data1) // data1 1174 .addImm(NewOffset0) // offset0 1175 .addImm(NewOffset1) // offset1 1176 .addImm(0) // gds 1177 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1178 1179 CI.I->eraseFromParent(); 1180 Paired.I->eraseFromParent(); 1181 1182 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1183 return Write2; 1184 } 1185 1186 MachineBasicBlock::iterator 1187 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1188 MachineBasicBlock::iterator InsertBefore) { 1189 MachineBasicBlock *MBB = CI.I->getParent(); 1190 DebugLoc DL = CI.I->getDebugLoc(); 1191 const unsigned Opcode = getNewOpcode(CI, Paired); 1192 1193 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1194 1195 Register DestReg = MRI->createVirtualRegister(SuperRC); 1196 unsigned MergedDMask = CI.DMask | Paired.DMask; 1197 unsigned DMaskIdx = 1198 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1199 1200 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1201 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1202 if (I == DMaskIdx) 1203 MIB.addImm(MergedDMask); 1204 else 1205 MIB.add((*CI.I).getOperand(I)); 1206 } 1207 1208 // It shouldn't be possible to get this far if the two instructions 1209 // don't have a single memoperand, because MachineInstr::mayAlias() 1210 // will return true if this is the case. 1211 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1212 1213 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1214 1215 unsigned SubRegIdx0, SubRegIdx1; 1216 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1217 1218 // Copy to the old destination registers. 1219 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1220 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1221 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1222 1223 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1224 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1225 .addReg(DestReg, 0, SubRegIdx0); 1226 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1227 .add(*Dest1) 1228 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1229 1230 CI.I->eraseFromParent(); 1231 Paired.I->eraseFromParent(); 1232 return New; 1233 } 1234 1235 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( 1236 CombineInfo &CI, CombineInfo &Paired, 1237 MachineBasicBlock::iterator InsertBefore) { 1238 MachineBasicBlock *MBB = CI.I->getParent(); 1239 DebugLoc DL = CI.I->getDebugLoc(); 1240 const unsigned Opcode = getNewOpcode(CI, Paired); 1241 1242 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1243 1244 Register DestReg = MRI->createVirtualRegister(SuperRC); 1245 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1246 1247 // It shouldn't be possible to get this far if the two instructions 1248 // don't have a single memoperand, because MachineInstr::mayAlias() 1249 // will return true if this is the case. 1250 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1251 1252 MachineInstr *New = 1253 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) 1254 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) 1255 .addImm(MergedOffset) // offset 1256 .addImm(CI.CPol) // cpol 1257 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1258 1259 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1260 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1261 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1262 1263 // Copy to the old destination registers. 1264 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1265 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1266 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1267 1268 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1269 .add(*Dest0) // Copy to same destination including flags and sub reg. 1270 .addReg(DestReg, 0, SubRegIdx0); 1271 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1272 .add(*Dest1) 1273 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1274 1275 CI.I->eraseFromParent(); 1276 Paired.I->eraseFromParent(); 1277 return New; 1278 } 1279 1280 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1281 CombineInfo &CI, CombineInfo &Paired, 1282 MachineBasicBlock::iterator InsertBefore) { 1283 MachineBasicBlock *MBB = CI.I->getParent(); 1284 DebugLoc DL = CI.I->getDebugLoc(); 1285 1286 const unsigned Opcode = getNewOpcode(CI, Paired); 1287 1288 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1289 1290 // Copy to the new source register. 1291 Register DestReg = MRI->createVirtualRegister(SuperRC); 1292 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1293 1294 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1295 1296 AddressRegs Regs = getRegs(Opcode, *TII); 1297 1298 if (Regs.VAddr) 1299 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1300 1301 // It shouldn't be possible to get this far if the two instructions 1302 // don't have a single memoperand, because MachineInstr::mayAlias() 1303 // will return true if this is the case. 
1304 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1305 1306 MachineInstr *New = 1307 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1308 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1309 .addImm(MergedOffset) // offset 1310 .addImm(CI.CPol) // cpol 1311 .addImm(0) // tfe 1312 .addImm(0) // swz 1313 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1314 1315 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1316 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1317 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1318 1319 // Copy to the old destination registers. 1320 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1321 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1322 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1323 1324 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1325 .add(*Dest0) // Copy to same destination including flags and sub reg. 1326 .addReg(DestReg, 0, SubRegIdx0); 1327 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1328 .add(*Dest1) 1329 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1330 1331 CI.I->eraseFromParent(); 1332 Paired.I->eraseFromParent(); 1333 return New; 1334 } 1335 1336 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1337 CombineInfo &CI, CombineInfo &Paired, 1338 MachineBasicBlock::iterator InsertBefore) { 1339 MachineBasicBlock *MBB = CI.I->getParent(); 1340 DebugLoc DL = CI.I->getDebugLoc(); 1341 1342 const unsigned Opcode = getNewOpcode(CI, Paired); 1343 1344 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1345 1346 // Copy to the new source register. 1347 Register DestReg = MRI->createVirtualRegister(SuperRC); 1348 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1349 1350 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1351 1352 AddressRegs Regs = getRegs(Opcode, *TII); 1353 1354 if (Regs.VAddr) 1355 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1356 1357 unsigned JoinedFormat = 1358 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1359 1360 // It shouldn't be possible to get this far if the two instructions 1361 // don't have a single memoperand, because MachineInstr::mayAlias() 1362 // will return true if this is the case. 1363 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1364 1365 MachineInstr *New = 1366 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1367 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1368 .addImm(MergedOffset) // offset 1369 .addImm(JoinedFormat) // format 1370 .addImm(CI.CPol) // cpol 1371 .addImm(0) // tfe 1372 .addImm(0) // swz 1373 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1374 1375 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1376 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1377 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1378 1379 // Copy to the old destination registers. 1380 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1381 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1382 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1383 1384 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1385 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1386 .addReg(DestReg, 0, SubRegIdx0); 1387 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1388 .add(*Dest1) 1389 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1390 1391 CI.I->eraseFromParent(); 1392 Paired.I->eraseFromParent(); 1393 return New; 1394 } 1395 1396 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1397 CombineInfo &CI, CombineInfo &Paired, 1398 MachineBasicBlock::iterator InsertBefore) { 1399 MachineBasicBlock *MBB = CI.I->getParent(); 1400 DebugLoc DL = CI.I->getDebugLoc(); 1401 1402 const unsigned Opcode = getNewOpcode(CI, Paired); 1403 1404 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1405 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1406 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1407 1408 // Copy to the new source register. 1409 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1410 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1411 1412 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1413 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1414 1415 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1416 .add(*Src0) 1417 .addImm(SubRegIdx0) 1418 .add(*Src1) 1419 .addImm(SubRegIdx1); 1420 1421 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1422 .addReg(SrcReg, RegState::Kill); 1423 1424 AddressRegs Regs = getRegs(Opcode, *TII); 1425 1426 if (Regs.VAddr) 1427 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1428 1429 unsigned JoinedFormat = 1430 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1431 1432 // It shouldn't be possible to get this far if the two instructions 1433 // don't have a single memoperand, because MachineInstr::mayAlias() 1434 // will return true if this is the case. 1435 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1436 1437 MachineInstr *New = 1438 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1439 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1440 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1441 .addImm(JoinedFormat) // format 1442 .addImm(CI.CPol) // cpol 1443 .addImm(0) // tfe 1444 .addImm(0) // swz 1445 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1446 1447 CI.I->eraseFromParent(); 1448 Paired.I->eraseFromParent(); 1449 return New; 1450 } 1451 1452 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalLoadPair( 1453 CombineInfo &CI, CombineInfo &Paired, 1454 MachineBasicBlock::iterator InsertBefore) { 1455 MachineBasicBlock *MBB = CI.I->getParent(); 1456 DebugLoc DL = CI.I->getDebugLoc(); 1457 1458 const unsigned Opcode = getNewOpcode(CI, Paired); 1459 1460 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1461 Register DestReg = MRI->createVirtualRegister(SuperRC); 1462 1463 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1464 1465 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1466 MIB.add(*SAddr); 1467 1468 MachineInstr *New = 1469 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1470 .addImm(std::min(CI.Offset, Paired.Offset)) 1471 .addImm(CI.CPol) 1472 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1473 1474 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1475 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1476 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1477 1478 // Copy to the old destination registers. 
1479 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1480 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1481 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1482 1483 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1484 .add(*Dest0) // Copy to same destination including flags and sub reg. 1485 .addReg(DestReg, 0, SubRegIdx0); 1486 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1487 .add(*Dest1) 1488 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1489 1490 CI.I->eraseFromParent(); 1491 Paired.I->eraseFromParent(); 1492 return New; 1493 } 1494 1495 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalStorePair( 1496 CombineInfo &CI, CombineInfo &Paired, 1497 MachineBasicBlock::iterator InsertBefore) { 1498 MachineBasicBlock *MBB = CI.I->getParent(); 1499 DebugLoc DL = CI.I->getDebugLoc(); 1500 1501 const unsigned Opcode = getNewOpcode(CI, Paired); 1502 1503 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1504 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1505 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1506 1507 // Copy to the new source register. 1508 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1509 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1510 1511 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1512 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1513 1514 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1515 .add(*Src0) 1516 .addImm(SubRegIdx0) 1517 .add(*Src1) 1518 .addImm(SubRegIdx1); 1519 1520 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1521 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1522 .addReg(SrcReg, RegState::Kill); 1523 1524 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1525 MIB.add(*SAddr); 1526 1527 MachineInstr *New = 1528 MIB.addImm(std::min(CI.Offset, Paired.Offset)) 1529 .addImm(CI.CPol) 1530 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1531 1532 CI.I->eraseFromParent(); 1533 Paired.I->eraseFromParent(); 1534 return New; 1535 } 1536 1537 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1538 const CombineInfo &Paired) { 1539 const unsigned Width = CI.Width + Paired.Width; 1540 1541 switch (CI.InstClass) { 1542 default: 1543 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1544 // FIXME: Handle d16 correctly 1545 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1546 Width); 1547 case TBUFFER_LOAD: 1548 case TBUFFER_STORE: 1549 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1550 Width); 1551 1552 case UNKNOWN: 1553 llvm_unreachable("Unknown instruction class"); 1554 case S_BUFFER_LOAD_IMM: 1555 switch (Width) { 1556 default: 1557 return 0; 1558 case 2: 1559 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1560 case 4: 1561 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1562 case 8: 1563 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; 1564 } 1565 case GLOBAL_LOAD: 1566 switch (Width) { 1567 default: 1568 return 0; 1569 case 2: 1570 return AMDGPU::GLOBAL_LOAD_DWORDX2; 1571 case 3: 1572 return AMDGPU::GLOBAL_LOAD_DWORDX3; 1573 case 4: 1574 return AMDGPU::GLOBAL_LOAD_DWORDX4; 1575 } 1576 case GLOBAL_LOAD_SADDR: 1577 switch (Width) { 1578 default: 1579 return 0; 1580 case 2: 1581 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; 1582 case 3: 1583 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; 1584 case 4: 1585 return 
AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; 1586 } 1587 case GLOBAL_STORE: 1588 switch (Width) { 1589 default: 1590 return 0; 1591 case 2: 1592 return AMDGPU::GLOBAL_STORE_DWORDX2; 1593 case 3: 1594 return AMDGPU::GLOBAL_STORE_DWORDX3; 1595 case 4: 1596 return AMDGPU::GLOBAL_STORE_DWORDX4; 1597 } 1598 case GLOBAL_STORE_SADDR: 1599 switch (Width) { 1600 default: 1601 return 0; 1602 case 2: 1603 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; 1604 case 3: 1605 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; 1606 case 4: 1607 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; 1608 } 1609 case MIMG: 1610 assert((countPopulation(CI.DMask | Paired.DMask) == Width) && 1611 "No overlaps"); 1612 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1613 } 1614 } 1615 1616 std::pair<unsigned, unsigned> 1617 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, 1618 const CombineInfo &Paired) { 1619 assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) == 1620 CI.Width + Paired.Width)) && 1621 "No overlaps"); 1622 1623 unsigned Idx0; 1624 unsigned Idx1; 1625 1626 static const unsigned Idxs[5][4] = { 1627 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1628 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, 1629 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, 1630 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, 1631 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, 1632 }; 1633 1634 assert(CI.Width >= 1 && CI.Width <= 4); 1635 assert(Paired.Width >= 1 && Paired.Width <= 4); 1636 1637 if (Paired < CI) { 1638 Idx1 = Idxs[0][Paired.Width - 1]; 1639 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1640 } else { 1641 Idx0 = Idxs[0][CI.Width - 1]; 1642 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1643 } 1644 1645 return std::make_pair(Idx0, Idx1); 1646 } 1647 1648 const TargetRegisterClass * 1649 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1650 const CombineInfo &Paired) { 1651 if (CI.InstClass == S_BUFFER_LOAD_IMM) { 1652 switch (CI.Width + Paired.Width) { 1653 default: 1654 return nullptr; 1655 case 2: 1656 return &AMDGPU::SReg_64_XEXECRegClass; 1657 case 4: 1658 return &AMDGPU::SGPR_128RegClass; 1659 case 8: 1660 return &AMDGPU::SGPR_256RegClass; 1661 case 16: 1662 return &AMDGPU::SGPR_512RegClass; 1663 } 1664 } 1665 1666 unsigned BitWidth = 32 * (CI.Width + Paired.Width); 1667 return TRI->isAGPRClass(getDataRegClass(*CI.I)) 1668 ? TRI->getAGPRClassForBitWidth(BitWidth) 1669 : TRI->getVGPRClassForBitWidth(BitWidth); 1670 } 1671 1672 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1673 CombineInfo &CI, CombineInfo &Paired, 1674 MachineBasicBlock::iterator InsertBefore) { 1675 MachineBasicBlock *MBB = CI.I->getParent(); 1676 DebugLoc DL = CI.I->getDebugLoc(); 1677 1678 const unsigned Opcode = getNewOpcode(CI, Paired); 1679 1680 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1681 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1682 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1683 1684 // Copy to the new source register. 
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // tfe
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
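// The 64-bit base is materialized as Addr.Base + Addr.Offset with a
// V_ADD_CO_U32 / V_ADDC_U32 pair for the low and high halves, which are then
// combined into a 64-bit VGPR pair via REG_SEQUENCE.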
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
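// The kill flag on the rewritten address operand is cleared because the new
// base register may still be used by other instructions that were promoted to
// the same anchor base.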
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
//  - 32-bit base registers and subregisters
//  - 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
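  // Only global memory instructions that have a SADDR form are handled here;
  // AMDGPU::getGlobalSaddrOp returns -1 for everything else.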
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
                    << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor (that has the
  // same base registers) with the highest 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate bases.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr,     0)
  //   load4 = load(addr,  2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
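    // Anchor candidates must use the same opcode as MI and still carry a zero
    // immediate offset; anything else is skipped.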
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: " << AnchorAddr.Offset
                      << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
    std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potentially mergeable instructions into lists. One list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. We can still look past such a barrier for separate merges.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      //        operands. However we are reporting that ds_write2 shall have
      //        only VGPR data so that machine copy propagation does not
      //        create an illegal instruction with VGPR and AGPR sources.
      //        Consequently, if we create such an instruction, the verifier
      //        will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with I. If an
  // instruction is found, it is stored in the Paired field. If no instructions
  // are found, then the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the list by offset; this way mergeable instructions are adjacent
    // to each other in the list, which makes it easier to find matches.
    MergeList.sort(
        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
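// Each merge list is processed until no further merges are found; lists that
// can no longer produce a merge are erased so they are not revisited.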
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
      NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeGlobalLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeGlobalStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}