//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields, but are close enough together, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
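// CombineInfo::AddrIdx/AddrReg below are sized by this worst case; the actual
// address operands present for a given opcode are collected by getRegs() when
// CombineInfo::setMI() is called.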
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (AddrOp->getReg().isPhysical())
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Compare by pointer order.
    bool operator<(const CombineInfo& Other) const {
      return (InstClass == MIMG) ?
DMask < Other.DMask : Offset < Other.Offset; 174 } 175 }; 176 177 struct BaseRegisters { 178 Register LoReg; 179 Register HiReg; 180 181 unsigned LoSubReg = 0; 182 unsigned HiSubReg = 0; 183 }; 184 185 struct MemAddress { 186 BaseRegisters Base; 187 int64_t Offset = 0; 188 }; 189 190 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 191 192 private: 193 const GCNSubtarget *STM = nullptr; 194 const SIInstrInfo *TII = nullptr; 195 const SIRegisterInfo *TRI = nullptr; 196 MachineRegisterInfo *MRI = nullptr; 197 AliasAnalysis *AA = nullptr; 198 bool OptimizeAgain; 199 200 bool canSwapInstructions(const DenseSet<Register> &ARegDefs, 201 const DenseSet<Register> &ARegUses, 202 const MachineInstr &A, const MachineInstr &B) const; 203 static bool dmasksCanBeCombined(const CombineInfo &CI, 204 const SIInstrInfo &TII, 205 const CombineInfo &Paired); 206 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 207 CombineInfo &Paired, bool Modify = false); 208 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 209 const CombineInfo &Paired); 210 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 211 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 212 const CombineInfo &Paired); 213 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 214 const CombineInfo &Paired); 215 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 216 217 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); 218 219 unsigned read2Opcode(unsigned EltSize) const; 220 unsigned read2ST64Opcode(unsigned EltSize) const; 221 MachineBasicBlock::iterator 222 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 223 MachineBasicBlock::iterator InsertBefore); 224 225 unsigned write2Opcode(unsigned EltSize) const; 226 unsigned write2ST64Opcode(unsigned EltSize) const; 227 MachineBasicBlock::iterator 228 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 229 MachineBasicBlock::iterator InsertBefore); 230 MachineBasicBlock::iterator 231 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 232 MachineBasicBlock::iterator InsertBefore); 233 MachineBasicBlock::iterator 234 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 235 MachineBasicBlock::iterator InsertBefore); 236 MachineBasicBlock::iterator 237 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 238 MachineBasicBlock::iterator InsertBefore); 239 MachineBasicBlock::iterator 240 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 241 MachineBasicBlock::iterator InsertBefore); 242 MachineBasicBlock::iterator 243 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 244 MachineBasicBlock::iterator InsertBefore); 245 MachineBasicBlock::iterator 246 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 247 MachineBasicBlock::iterator InsertBefore); 248 MachineBasicBlock::iterator 249 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired, 250 MachineBasicBlock::iterator InsertBefore); 251 MachineBasicBlock::iterator 252 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired, 253 MachineBasicBlock::iterator InsertBefore); 254 255 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 256 int32_t NewOffset) const; 257 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 258 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 259 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 260 void processBaseWithConstOffset(const MachineOperand &Base, 
MemAddress &Addr) const; 261 /// Promotes constant offset to the immediate by adjusting the base. It 262 /// tries to use a base from the nearby instructions that allows it to have 263 /// a 13bit constant offset which gets promoted to the immediate. 264 bool promoteConstantOffsetToImm(MachineInstr &CI, 265 MemInfoMap &Visited, 266 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 267 void addInstToMergeableList(const CombineInfo &CI, 268 std::list<std::list<CombineInfo> > &MergeableInsts) const; 269 270 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 271 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 272 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 273 std::list<std::list<CombineInfo>> &MergeableInsts) const; 274 275 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, 276 const CombineInfo &Paired); 277 278 public: 279 static char ID; 280 281 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 282 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 283 } 284 285 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 286 bool &OptimizeListAgain); 287 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 288 289 bool runOnMachineFunction(MachineFunction &MF) override; 290 291 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 292 293 void getAnalysisUsage(AnalysisUsage &AU) const override { 294 AU.setPreservesCFG(); 295 AU.addRequired<AAResultsWrapperPass>(); 296 297 MachineFunctionPass::getAnalysisUsage(AU); 298 } 299 300 MachineFunctionProperties getRequiredProperties() const override { 301 return MachineFunctionProperties() 302 .set(MachineFunctionProperties::Property::IsSSA); 303 } 304 }; 305 306 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 307 const unsigned Opc = MI.getOpcode(); 308 309 if (TII.isMUBUF(Opc)) { 310 // FIXME: Handle d16 correctly 311 return AMDGPU::getMUBUFElements(Opc); 312 } 313 if (TII.isMIMG(MI)) { 314 uint64_t DMaskImm = 315 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 316 return countPopulation(DMaskImm); 317 } 318 if (TII.isMTBUF(Opc)) { 319 return AMDGPU::getMTBUFElements(Opc); 320 } 321 322 switch (Opc) { 323 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 324 case AMDGPU::GLOBAL_LOAD_DWORD: 325 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 326 case AMDGPU::GLOBAL_STORE_DWORD: 327 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 328 case AMDGPU::FLAT_LOAD_DWORD: 329 case AMDGPU::FLAT_STORE_DWORD: 330 return 1; 331 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 332 case AMDGPU::GLOBAL_LOAD_DWORDX2: 333 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 334 case AMDGPU::GLOBAL_STORE_DWORDX2: 335 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 336 case AMDGPU::FLAT_LOAD_DWORDX2: 337 case AMDGPU::FLAT_STORE_DWORDX2: 338 return 2; 339 case AMDGPU::GLOBAL_LOAD_DWORDX3: 340 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 341 case AMDGPU::GLOBAL_STORE_DWORDX3: 342 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 343 case AMDGPU::FLAT_LOAD_DWORDX3: 344 case AMDGPU::FLAT_STORE_DWORDX3: 345 return 3; 346 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 347 case AMDGPU::GLOBAL_LOAD_DWORDX4: 348 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 349 case AMDGPU::GLOBAL_STORE_DWORDX4: 350 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 351 case AMDGPU::FLAT_LOAD_DWORDX4: 352 case AMDGPU::FLAT_STORE_DWORDX4: 353 return 4; 354 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 355 return 8; 356 case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH; 357 case AMDGPU::DS_READ_B32_gfx9: 
LLVM_FALLTHROUGH; 358 case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH; 359 case AMDGPU::DS_WRITE_B32_gfx9: 360 return 1; 361 case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH; 362 case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH; 363 case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH; 364 case AMDGPU::DS_WRITE_B64_gfx9: 365 return 2; 366 default: 367 return 0; 368 } 369 } 370 371 /// Maps instruction opcode to enum InstClassEnum. 372 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 373 switch (Opc) { 374 default: 375 if (TII.isMUBUF(Opc)) { 376 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 377 default: 378 return UNKNOWN; 379 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 380 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 381 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 382 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 383 return BUFFER_LOAD; 384 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 385 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 386 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 387 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 388 return BUFFER_STORE; 389 } 390 } 391 if (TII.isMIMG(Opc)) { 392 // Ignore instructions encoded without vaddr. 393 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 && 394 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1) 395 return UNKNOWN; 396 // Ignore BVH instructions 397 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) 398 return UNKNOWN; 399 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 400 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 401 TII.isGather4(Opc)) 402 return UNKNOWN; 403 return MIMG; 404 } 405 if (TII.isMTBUF(Opc)) { 406 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 407 default: 408 return UNKNOWN; 409 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 410 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 411 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 412 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 413 return TBUFFER_LOAD; 414 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 415 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 416 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 417 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 418 return TBUFFER_STORE; 419 } 420 } 421 return UNKNOWN; 422 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 423 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 424 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 425 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 426 return S_BUFFER_LOAD_IMM; 427 case AMDGPU::DS_READ_B32: 428 case AMDGPU::DS_READ_B32_gfx9: 429 case AMDGPU::DS_READ_B64: 430 case AMDGPU::DS_READ_B64_gfx9: 431 return DS_READ; 432 case AMDGPU::DS_WRITE_B32: 433 case AMDGPU::DS_WRITE_B32_gfx9: 434 case AMDGPU::DS_WRITE_B64: 435 case AMDGPU::DS_WRITE_B64_gfx9: 436 return DS_WRITE; 437 case AMDGPU::GLOBAL_LOAD_DWORD: 438 case AMDGPU::GLOBAL_LOAD_DWORDX2: 439 case AMDGPU::GLOBAL_LOAD_DWORDX3: 440 case AMDGPU::GLOBAL_LOAD_DWORDX4: 441 return GLOBAL_LOAD; 442 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 443 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 444 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 445 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 446 return GLOBAL_LOAD_SADDR; 447 case AMDGPU::GLOBAL_STORE_DWORD: 448 case AMDGPU::GLOBAL_STORE_DWORDX2: 449 case AMDGPU::GLOBAL_STORE_DWORDX3: 450 case AMDGPU::GLOBAL_STORE_DWORDX4: 451 return GLOBAL_STORE; 452 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 453 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 454 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 455 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 456 return GLOBAL_STORE_SADDR; 457 case AMDGPU::FLAT_LOAD_DWORD: 458 case AMDGPU::FLAT_LOAD_DWORDX2: 459 case AMDGPU::FLAT_LOAD_DWORDX3: 
460 case AMDGPU::FLAT_LOAD_DWORDX4: 461 return FLAT_LOAD; 462 case AMDGPU::FLAT_STORE_DWORD: 463 case AMDGPU::FLAT_STORE_DWORDX2: 464 case AMDGPU::FLAT_STORE_DWORDX3: 465 case AMDGPU::FLAT_STORE_DWORDX4: 466 return FLAT_STORE; 467 } 468 } 469 470 /// Determines instruction subclass from opcode. Only instructions 471 /// of the same subclass can be merged together. The merged instruction may have 472 /// a different subclass but must have the same class. 473 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 474 switch (Opc) { 475 default: 476 if (TII.isMUBUF(Opc)) 477 return AMDGPU::getMUBUFBaseOpcode(Opc); 478 if (TII.isMIMG(Opc)) { 479 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 480 assert(Info); 481 return Info->BaseOpcode; 482 } 483 if (TII.isMTBUF(Opc)) 484 return AMDGPU::getMTBUFBaseOpcode(Opc); 485 return -1; 486 case AMDGPU::DS_READ_B32: 487 case AMDGPU::DS_READ_B32_gfx9: 488 case AMDGPU::DS_READ_B64: 489 case AMDGPU::DS_READ_B64_gfx9: 490 case AMDGPU::DS_WRITE_B32: 491 case AMDGPU::DS_WRITE_B32_gfx9: 492 case AMDGPU::DS_WRITE_B64: 493 case AMDGPU::DS_WRITE_B64_gfx9: 494 return Opc; 495 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 496 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 497 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 498 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 499 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 500 case AMDGPU::GLOBAL_LOAD_DWORD: 501 case AMDGPU::GLOBAL_LOAD_DWORDX2: 502 case AMDGPU::GLOBAL_LOAD_DWORDX3: 503 case AMDGPU::GLOBAL_LOAD_DWORDX4: 504 return AMDGPU::GLOBAL_LOAD_DWORD; 505 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 506 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 507 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 508 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 509 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; 510 case AMDGPU::GLOBAL_STORE_DWORD: 511 case AMDGPU::GLOBAL_STORE_DWORDX2: 512 case AMDGPU::GLOBAL_STORE_DWORDX3: 513 case AMDGPU::GLOBAL_STORE_DWORDX4: 514 return AMDGPU::GLOBAL_STORE_DWORD; 515 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 516 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 517 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 518 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 519 return AMDGPU::GLOBAL_STORE_DWORD_SADDR; 520 case AMDGPU::FLAT_LOAD_DWORD: 521 case AMDGPU::FLAT_LOAD_DWORDX2: 522 case AMDGPU::FLAT_LOAD_DWORDX3: 523 case AMDGPU::FLAT_LOAD_DWORDX4: 524 return AMDGPU::FLAT_LOAD_DWORD; 525 case AMDGPU::FLAT_STORE_DWORD: 526 case AMDGPU::FLAT_STORE_DWORDX2: 527 case AMDGPU::FLAT_STORE_DWORDX3: 528 case AMDGPU::FLAT_STORE_DWORDX4: 529 return AMDGPU::FLAT_STORE_DWORD; 530 } 531 } 532 533 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 534 AddressRegs Result; 535 536 if (TII.isMUBUF(Opc)) { 537 if (AMDGPU::getMUBUFHasVAddr(Opc)) 538 Result.VAddr = true; 539 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 540 Result.SRsrc = true; 541 if (AMDGPU::getMUBUFHasSoffset(Opc)) 542 Result.SOffset = true; 543 544 return Result; 545 } 546 547 if (TII.isMIMG(Opc)) { 548 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 549 if (VAddr0Idx >= 0) { 550 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 551 Result.NumVAddrs = SRsrcIdx - VAddr0Idx; 552 } else { 553 Result.VAddr = true; 554 } 555 Result.SRsrc = true; 556 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 557 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 558 Result.SSamp = true; 559 560 return Result; 561 } 562 if (TII.isMTBUF(Opc)) { 563 if (AMDGPU::getMTBUFHasVAddr(Opc)) 564 Result.VAddr = true; 565 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 
566 Result.SRsrc = true; 567 if (AMDGPU::getMTBUFHasSoffset(Opc)) 568 Result.SOffset = true; 569 570 return Result; 571 } 572 573 switch (Opc) { 574 default: 575 return Result; 576 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 577 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 578 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 579 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 580 Result.SBase = true; 581 return Result; 582 case AMDGPU::DS_READ_B32: 583 case AMDGPU::DS_READ_B64: 584 case AMDGPU::DS_READ_B32_gfx9: 585 case AMDGPU::DS_READ_B64_gfx9: 586 case AMDGPU::DS_WRITE_B32: 587 case AMDGPU::DS_WRITE_B64: 588 case AMDGPU::DS_WRITE_B32_gfx9: 589 case AMDGPU::DS_WRITE_B64_gfx9: 590 Result.Addr = true; 591 return Result; 592 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 593 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 594 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 595 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 596 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 597 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 598 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 599 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 600 Result.SAddr = true; 601 LLVM_FALLTHROUGH; 602 case AMDGPU::GLOBAL_LOAD_DWORD: 603 case AMDGPU::GLOBAL_LOAD_DWORDX2: 604 case AMDGPU::GLOBAL_LOAD_DWORDX3: 605 case AMDGPU::GLOBAL_LOAD_DWORDX4: 606 case AMDGPU::GLOBAL_STORE_DWORD: 607 case AMDGPU::GLOBAL_STORE_DWORDX2: 608 case AMDGPU::GLOBAL_STORE_DWORDX3: 609 case AMDGPU::GLOBAL_STORE_DWORDX4: 610 case AMDGPU::FLAT_LOAD_DWORD: 611 case AMDGPU::FLAT_LOAD_DWORDX2: 612 case AMDGPU::FLAT_LOAD_DWORDX3: 613 case AMDGPU::FLAT_LOAD_DWORDX4: 614 case AMDGPU::FLAT_STORE_DWORD: 615 case AMDGPU::FLAT_STORE_DWORDX2: 616 case AMDGPU::FLAT_STORE_DWORDX3: 617 case AMDGPU::FLAT_STORE_DWORDX4: 618 Result.VAddr = true; 619 return Result; 620 } 621 } 622 623 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 624 const SILoadStoreOptimizer &LSO) { 625 I = MI; 626 unsigned Opc = MI->getOpcode(); 627 InstClass = getInstClass(Opc, *LSO.TII); 628 629 if (InstClass == UNKNOWN) 630 return; 631 632 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI)); 633 634 switch (InstClass) { 635 case DS_READ: 636 EltSize = 637 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 638 : 4; 639 break; 640 case DS_WRITE: 641 EltSize = 642 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 643 : 4; 644 break; 645 case S_BUFFER_LOAD_IMM: 646 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); 647 break; 648 default: 649 EltSize = 4; 650 break; 651 } 652 653 if (InstClass == MIMG) { 654 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 655 // Offset is not considered for MIMG instructions. 
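    // MIMG pairs are instead keyed and ordered by their dmask (see operator<
    // and dmasksCanBeCombined()).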
656 Offset = 0; 657 } else { 658 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 659 Offset = I->getOperand(OffsetIdx).getImm(); 660 } 661 662 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 663 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 664 665 Width = getOpcodeWidth(*I, *LSO.TII); 666 667 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 668 Offset &= 0xffff; 669 } else if (InstClass != MIMG) { 670 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); 671 } 672 673 AddressRegs Regs = getRegs(Opc, *LSO.TII); 674 675 NumAddresses = 0; 676 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 677 AddrIdx[NumAddresses++] = 678 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 679 if (Regs.Addr) 680 AddrIdx[NumAddresses++] = 681 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 682 if (Regs.SBase) 683 AddrIdx[NumAddresses++] = 684 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 685 if (Regs.SRsrc) 686 AddrIdx[NumAddresses++] = 687 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 688 if (Regs.SOffset) 689 AddrIdx[NumAddresses++] = 690 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 691 if (Regs.SAddr) 692 AddrIdx[NumAddresses++] = 693 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); 694 if (Regs.VAddr) 695 AddrIdx[NumAddresses++] = 696 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 697 if (Regs.SSamp) 698 AddrIdx[NumAddresses++] = 699 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); 700 assert(NumAddresses <= MaxAddressRegs); 701 702 for (unsigned J = 0; J < NumAddresses; J++) 703 AddrReg[J] = &I->getOperand(AddrIdx[J]); 704 } 705 706 } // end anonymous namespace. 707 708 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, 709 "SI Load Store Optimizer", false, false) 710 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 711 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", 712 false, false) 713 714 char SILoadStoreOptimizer::ID = 0; 715 716 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; 717 718 FunctionPass *llvm::createSILoadStoreOptimizerPass() { 719 return new SILoadStoreOptimizer(); 720 } 721 722 static void addDefsUsesToList(const MachineInstr &MI, 723 DenseSet<Register> &RegDefs, 724 DenseSet<Register> &RegUses) { 725 for (const auto &Op : MI.operands()) { 726 if (!Op.isReg()) 727 continue; 728 if (Op.isDef()) 729 RegDefs.insert(Op.getReg()); 730 if (Op.readsReg()) 731 RegUses.insert(Op.getReg()); 732 } 733 } 734 735 bool SILoadStoreOptimizer::canSwapInstructions( 736 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses, 737 const MachineInstr &A, const MachineInstr &B) const { 738 if (A.mayLoadOrStore() && B.mayLoadOrStore() && 739 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true)) 740 return false; 741 for (const auto &BOp : B.operands()) { 742 if (!BOp.isReg()) 743 continue; 744 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg())) 745 return false; 746 if (BOp.isDef() && ARegUses.contains(BOp.getReg())) 747 return false; 748 } 749 return true; 750 } 751 752 // Given that \p CI and \p Paired are adjacent memory operations produce a new 753 // MMO for the combined operation with a new access size. 
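// For example, merging two dword loads that each carry a 4-byte MMO produces a
// single 8-byte MMO whose pointer info comes from the lower-offset operand.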
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  unsigned Size = MMOa->getSize() + MMOb->getSize();

  // The base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    MMOa = MMOb;

  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, MMOa->getPointerInfo(), Size);
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
//  - if Lo == Hi, return that value
//  - if Lo == 0, return 0 (even though the "- 1" below underflows)
//  - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
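  // Note that the pairing logic below assumes two distinct accesses: the
  // non-DS classes require the offsets to be exactly adjacent, and the DS
  // merge routines assert that the final offsets differ.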
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.CPol == Paired.CPol;
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
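      // For example, element offsets 300 and 400 do not fit in 8 bits
      // themselves, but their difference does: mostAlignedValueInRange(145,
      // 300) picks BaseOff = 256, giving new offsets 44 and 144 and a base
      // adjustment of 256 * EltSize bytes.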
942 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min); 943 CI.BaseOff = BaseOff * CI.EltSize; 944 CI.Offset = EltOffset0 - BaseOff; 945 Paired.Offset = EltOffset1 - BaseOff; 946 } 947 return true; 948 } 949 950 return false; 951 } 952 953 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, 954 const CombineInfo &CI, 955 const CombineInfo &Paired) { 956 const unsigned Width = (CI.Width + Paired.Width); 957 switch (CI.InstClass) { 958 default: 959 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); 960 case S_BUFFER_LOAD_IMM: 961 switch (Width) { 962 default: 963 return false; 964 case 2: 965 case 4: 966 case 8: 967 return true; 968 } 969 } 970 } 971 972 const TargetRegisterClass * 973 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { 974 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 975 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 976 } 977 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) { 978 return TRI->getRegClassForReg(*MRI, Src->getReg()); 979 } 980 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) { 981 return TRI->getRegClassForReg(*MRI, Src->getReg()); 982 } 983 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { 984 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 985 } 986 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) { 987 return TRI->getRegClassForReg(*MRI, Src->getReg()); 988 } 989 return nullptr; 990 } 991 992 /// This function assumes that CI comes before Paired in a basic block. Return 993 /// an insertion point for the merged instruction or nullptr on failure. 994 SILoadStoreOptimizer::CombineInfo * 995 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, 996 CombineInfo &Paired) { 997 // If another instruction has already been merged into CI, it may now be a 998 // type that we can't do any further merging into. 999 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) 1000 return nullptr; 1001 assert(CI.InstClass == Paired.InstClass); 1002 1003 if (getInstSubclass(CI.I->getOpcode(), *TII) != 1004 getInstSubclass(Paired.I->getOpcode(), *TII)) 1005 return nullptr; 1006 1007 // Check both offsets (or masks for MIMG) can be combined and fit in the 1008 // reduced range. 1009 if (CI.InstClass == MIMG) { 1010 if (!dmasksCanBeCombined(CI, *TII, Paired)) 1011 return nullptr; 1012 } else { 1013 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)) 1014 return nullptr; 1015 } 1016 1017 DenseSet<Register> RegDefs; 1018 DenseSet<Register> RegUses; 1019 CombineInfo *Where; 1020 if (CI.I->mayLoad()) { 1021 // Try to hoist Paired up to CI. 1022 addDefsUsesToList(*Paired.I, RegDefs, RegUses); 1023 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) { 1024 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI)) 1025 return nullptr; 1026 } 1027 Where = &CI; 1028 } else { 1029 // Try to sink CI down to Paired. 1030 addDefsUsesToList(*CI.I, RegDefs, RegUses); 1031 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { 1032 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI)) 1033 return nullptr; 1034 } 1035 Where = &Paired; 1036 } 1037 1038 // Call offsetsCanBeCombined with modify = true so that the offsets are 1039 // correct for the new instruction. This should return true, because 1040 // this function should only be called on CombineInfo objects that 1041 // have already been confirmed to be mergeable. 
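  // Only the DS forms need this rewrite, as the check below shows: they
  // re-encode their offsets in element (and possibly stride-64) units relative
  // to an optional new base, while the other classes keep their original
  // offsets unchanged.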
1042 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) 1043 offsetsCanBeCombined(CI, *STM, Paired, true); 1044 return Where; 1045 } 1046 1047 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 1048 if (STM->ldsRequiresM0Init()) 1049 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 1050 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 1051 } 1052 1053 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 1054 if (STM->ldsRequiresM0Init()) 1055 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 1056 1057 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 1058 : AMDGPU::DS_READ2ST64_B64_gfx9; 1059 } 1060 1061 MachineBasicBlock::iterator 1062 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1063 MachineBasicBlock::iterator InsertBefore) { 1064 MachineBasicBlock *MBB = CI.I->getParent(); 1065 1066 // Be careful, since the addresses could be subregisters themselves in weird 1067 // cases, like vectors of pointers. 1068 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1069 1070 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1071 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1072 1073 unsigned NewOffset0 = CI.Offset; 1074 unsigned NewOffset1 = Paired.Offset; 1075 unsigned Opc = 1076 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1077 1078 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1079 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; 1080 1081 if (NewOffset0 > NewOffset1) { 1082 // Canonicalize the merged instruction so the smaller offset comes first. 1083 std::swap(NewOffset0, NewOffset1); 1084 std::swap(SubRegIdx0, SubRegIdx1); 1085 } 1086 1087 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1088 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1089 1090 const MCInstrDesc &Read2Desc = TII->get(Opc); 1091 1092 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1093 Register DestReg = MRI->createVirtualRegister(SuperRC); 1094 1095 DebugLoc DL = CI.I->getDebugLoc(); 1096 1097 Register BaseReg = AddrReg->getReg(); 1098 unsigned BaseSubReg = AddrReg->getSubReg(); 1099 unsigned BaseRegFlags = 0; 1100 if (CI.BaseOff) { 1101 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1102 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1103 .addImm(CI.BaseOff); 1104 1105 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1106 BaseRegFlags = RegState::Kill; 1107 1108 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1109 .addReg(ImmReg) 1110 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1111 .addImm(0); // clamp bit 1112 BaseSubReg = 0; 1113 } 1114 1115 MachineInstrBuilder Read2 = 1116 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg) 1117 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1118 .addImm(NewOffset0) // offset0 1119 .addImm(NewOffset1) // offset1 1120 .addImm(0) // gds 1121 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1122 1123 (void)Read2; 1124 1125 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1126 1127 // Copy to the old destination registers. 1128 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1129 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1130 .addReg(DestReg, 0, SubRegIdx0); 1131 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1132 .add(*Dest1) 1133 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1134 1135 CI.I->eraseFromParent(); 1136 Paired.I->eraseFromParent(); 1137 1138 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1139 return Read2; 1140 } 1141 1142 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1143 if (STM->ldsRequiresM0Init()) 1144 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1145 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1146 : AMDGPU::DS_WRITE2_B64_gfx9; 1147 } 1148 1149 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1150 if (STM->ldsRequiresM0Init()) 1151 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1152 : AMDGPU::DS_WRITE2ST64_B64; 1153 1154 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1155 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1156 } 1157 1158 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( 1159 CombineInfo &CI, CombineInfo &Paired, 1160 MachineBasicBlock::iterator InsertBefore) { 1161 MachineBasicBlock *MBB = CI.I->getParent(); 1162 1163 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1164 // sure we preserve the subregister index and any register flags set on them. 1165 const MachineOperand *AddrReg = 1166 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1167 const MachineOperand *Data0 = 1168 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1169 const MachineOperand *Data1 = 1170 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1171 1172 unsigned NewOffset0 = CI.Offset; 1173 unsigned NewOffset1 = Paired.Offset; 1174 unsigned Opc = 1175 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1176 1177 if (NewOffset0 > NewOffset1) { 1178 // Canonicalize the merged instruction so the smaller offset comes first. 
1179 std::swap(NewOffset0, NewOffset1); 1180 std::swap(Data0, Data1); 1181 } 1182 1183 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1184 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1185 1186 const MCInstrDesc &Write2Desc = TII->get(Opc); 1187 DebugLoc DL = CI.I->getDebugLoc(); 1188 1189 Register BaseReg = AddrReg->getReg(); 1190 unsigned BaseSubReg = AddrReg->getSubReg(); 1191 unsigned BaseRegFlags = 0; 1192 if (CI.BaseOff) { 1193 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1194 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1195 .addImm(CI.BaseOff); 1196 1197 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1198 BaseRegFlags = RegState::Kill; 1199 1200 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1201 .addReg(ImmReg) 1202 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1203 .addImm(0); // clamp bit 1204 BaseSubReg = 0; 1205 } 1206 1207 MachineInstrBuilder Write2 = 1208 BuildMI(*MBB, InsertBefore, DL, Write2Desc) 1209 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1210 .add(*Data0) // data0 1211 .add(*Data1) // data1 1212 .addImm(NewOffset0) // offset0 1213 .addImm(NewOffset1) // offset1 1214 .addImm(0) // gds 1215 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1216 1217 CI.I->eraseFromParent(); 1218 Paired.I->eraseFromParent(); 1219 1220 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1221 return Write2; 1222 } 1223 1224 MachineBasicBlock::iterator 1225 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1226 MachineBasicBlock::iterator InsertBefore) { 1227 MachineBasicBlock *MBB = CI.I->getParent(); 1228 DebugLoc DL = CI.I->getDebugLoc(); 1229 const unsigned Opcode = getNewOpcode(CI, Paired); 1230 1231 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1232 1233 Register DestReg = MRI->createVirtualRegister(SuperRC); 1234 unsigned MergedDMask = CI.DMask | Paired.DMask; 1235 unsigned DMaskIdx = 1236 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1237 1238 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1239 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1240 if (I == DMaskIdx) 1241 MIB.addImm(MergedDMask); 1242 else 1243 MIB.add((*CI.I).getOperand(I)); 1244 } 1245 1246 // It shouldn't be possible to get this far if the two instructions 1247 // don't have a single memoperand, because MachineInstr::mayAlias() 1248 // will return true if this is the case. 1249 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1250 1251 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1252 1253 unsigned SubRegIdx0, SubRegIdx1; 1254 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1255 1256 // Copy to the old destination registers. 1257 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1258 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1259 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1260 1261 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1262 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1263 .addReg(DestReg, 0, SubRegIdx0); 1264 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1265 .add(*Dest1) 1266 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1267 1268 CI.I->eraseFromParent(); 1269 Paired.I->eraseFromParent(); 1270 return New; 1271 } 1272 1273 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( 1274 CombineInfo &CI, CombineInfo &Paired, 1275 MachineBasicBlock::iterator InsertBefore) { 1276 MachineBasicBlock *MBB = CI.I->getParent(); 1277 DebugLoc DL = CI.I->getDebugLoc(); 1278 const unsigned Opcode = getNewOpcode(CI, Paired); 1279 1280 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1281 1282 Register DestReg = MRI->createVirtualRegister(SuperRC); 1283 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1284 1285 // It shouldn't be possible to get this far if the two instructions 1286 // don't have a single memoperand, because MachineInstr::mayAlias() 1287 // will return true if this is the case. 1288 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1289 1290 MachineInstr *New = 1291 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) 1292 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) 1293 .addImm(MergedOffset) // offset 1294 .addImm(CI.CPol) // cpol 1295 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1296 1297 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1298 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1299 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1300 1301 // Copy to the old destination registers. 1302 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1303 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1304 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1305 1306 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1307 .add(*Dest0) // Copy to same destination including flags and sub reg. 1308 .addReg(DestReg, 0, SubRegIdx0); 1309 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1310 .add(*Dest1) 1311 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1312 1313 CI.I->eraseFromParent(); 1314 Paired.I->eraseFromParent(); 1315 return New; 1316 } 1317 1318 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1319 CombineInfo &CI, CombineInfo &Paired, 1320 MachineBasicBlock::iterator InsertBefore) { 1321 MachineBasicBlock *MBB = CI.I->getParent(); 1322 DebugLoc DL = CI.I->getDebugLoc(); 1323 1324 const unsigned Opcode = getNewOpcode(CI, Paired); 1325 1326 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1327 1328 // Copy to the new source register. 1329 Register DestReg = MRI->createVirtualRegister(SuperRC); 1330 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1331 1332 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1333 1334 AddressRegs Regs = getRegs(Opcode, *TII); 1335 1336 if (Regs.VAddr) 1337 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1338 1339 // It shouldn't be possible to get this far if the two instructions 1340 // don't have a single memoperand, because MachineInstr::mayAlias() 1341 // will return true if this is the case. 
1342 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1343 1344 MachineInstr *New = 1345 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1346 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1347 .addImm(MergedOffset) // offset 1348 .addImm(CI.CPol) // cpol 1349 .addImm(0) // tfe 1350 .addImm(0) // swz 1351 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1352 1353 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1354 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1355 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1356 1357 // Copy to the old destination registers. 1358 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1359 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1360 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1361 1362 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1363 .add(*Dest0) // Copy to same destination including flags and sub reg. 1364 .addReg(DestReg, 0, SubRegIdx0); 1365 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1366 .add(*Dest1) 1367 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1368 1369 CI.I->eraseFromParent(); 1370 Paired.I->eraseFromParent(); 1371 return New; 1372 } 1373 1374 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1375 CombineInfo &CI, CombineInfo &Paired, 1376 MachineBasicBlock::iterator InsertBefore) { 1377 MachineBasicBlock *MBB = CI.I->getParent(); 1378 DebugLoc DL = CI.I->getDebugLoc(); 1379 1380 const unsigned Opcode = getNewOpcode(CI, Paired); 1381 1382 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1383 1384 // Copy to the new source register. 1385 Register DestReg = MRI->createVirtualRegister(SuperRC); 1386 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1387 1388 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1389 1390 AddressRegs Regs = getRegs(Opcode, *TII); 1391 1392 if (Regs.VAddr) 1393 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1394 1395 unsigned JoinedFormat = 1396 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1397 1398 // It shouldn't be possible to get this far if the two instructions 1399 // don't have a single memoperand, because MachineInstr::mayAlias() 1400 // will return true if this is the case. 1401 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1402 1403 MachineInstr *New = 1404 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1405 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1406 .addImm(MergedOffset) // offset 1407 .addImm(JoinedFormat) // format 1408 .addImm(CI.CPol) // cpol 1409 .addImm(0) // tfe 1410 .addImm(0) // swz 1411 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1412 1413 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1414 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1415 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1416 1417 // Copy to the old destination registers. 1418 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1419 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1420 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1421 1422 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1423 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1424 .addReg(DestReg, 0, SubRegIdx0); 1425 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1426 .add(*Dest1) 1427 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1428 1429 CI.I->eraseFromParent(); 1430 Paired.I->eraseFromParent(); 1431 return New; 1432 } 1433 1434 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1435 CombineInfo &CI, CombineInfo &Paired, 1436 MachineBasicBlock::iterator InsertBefore) { 1437 MachineBasicBlock *MBB = CI.I->getParent(); 1438 DebugLoc DL = CI.I->getDebugLoc(); 1439 1440 const unsigned Opcode = getNewOpcode(CI, Paired); 1441 1442 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1443 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1444 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1445 1446 // Copy to the new source register. 1447 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1448 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1449 1450 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1451 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1452 1453 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1454 .add(*Src0) 1455 .addImm(SubRegIdx0) 1456 .add(*Src1) 1457 .addImm(SubRegIdx1); 1458 1459 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1460 .addReg(SrcReg, RegState::Kill); 1461 1462 AddressRegs Regs = getRegs(Opcode, *TII); 1463 1464 if (Regs.VAddr) 1465 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1466 1467 unsigned JoinedFormat = 1468 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1469 1470 // It shouldn't be possible to get this far if the two instructions 1471 // don't have a single memoperand, because MachineInstr::mayAlias() 1472 // will return true if this is the case. 1473 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1474 1475 MachineInstr *New = 1476 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1477 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1478 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1479 .addImm(JoinedFormat) // format 1480 .addImm(CI.CPol) // cpol 1481 .addImm(0) // tfe 1482 .addImm(0) // swz 1483 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1484 1485 CI.I->eraseFromParent(); 1486 Paired.I->eraseFromParent(); 1487 return New; 1488 } 1489 1490 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( 1491 CombineInfo &CI, CombineInfo &Paired, 1492 MachineBasicBlock::iterator InsertBefore) { 1493 MachineBasicBlock *MBB = CI.I->getParent(); 1494 DebugLoc DL = CI.I->getDebugLoc(); 1495 1496 const unsigned Opcode = getNewOpcode(CI, Paired); 1497 1498 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1499 Register DestReg = MRI->createVirtualRegister(SuperRC); 1500 1501 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1502 1503 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1504 MIB.add(*SAddr); 1505 1506 MachineInstr *New = 1507 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1508 .addImm(std::min(CI.Offset, Paired.Offset)) 1509 .addImm(CI.CPol) 1510 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1511 1512 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1513 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1514 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1515 1516 // Copy to the old destination registers. 
1517 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1518 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1519 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1520 1521 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1522 .add(*Dest0) // Copy to same destination including flags and sub reg. 1523 .addReg(DestReg, 0, SubRegIdx0); 1524 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1525 .add(*Dest1) 1526 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1527 1528 CI.I->eraseFromParent(); 1529 Paired.I->eraseFromParent(); 1530 return New; 1531 } 1532 1533 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( 1534 CombineInfo &CI, CombineInfo &Paired, 1535 MachineBasicBlock::iterator InsertBefore) { 1536 MachineBasicBlock *MBB = CI.I->getParent(); 1537 DebugLoc DL = CI.I->getDebugLoc(); 1538 1539 const unsigned Opcode = getNewOpcode(CI, Paired); 1540 1541 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1542 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1543 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1544 1545 // Copy to the new source register. 1546 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1547 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1548 1549 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1550 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1551 1552 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1553 .add(*Src0) 1554 .addImm(SubRegIdx0) 1555 .add(*Src1) 1556 .addImm(SubRegIdx1); 1557 1558 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1559 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1560 .addReg(SrcReg, RegState::Kill); 1561 1562 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1563 MIB.add(*SAddr); 1564 1565 MachineInstr *New = 1566 MIB.addImm(std::min(CI.Offset, Paired.Offset)) 1567 .addImm(CI.CPol) 1568 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1569 1570 CI.I->eraseFromParent(); 1571 Paired.I->eraseFromParent(); 1572 return New; 1573 } 1574 1575 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1576 const CombineInfo &Paired) { 1577 const unsigned Width = CI.Width + Paired.Width; 1578 1579 switch (CI.InstClass) { 1580 default: 1581 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1582 // FIXME: Handle d16 correctly 1583 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1584 Width); 1585 case TBUFFER_LOAD: 1586 case TBUFFER_STORE: 1587 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1588 Width); 1589 1590 case UNKNOWN: 1591 llvm_unreachable("Unknown instruction class"); 1592 case S_BUFFER_LOAD_IMM: 1593 switch (Width) { 1594 default: 1595 return 0; 1596 case 2: 1597 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1598 case 4: 1599 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1600 case 8: 1601 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; 1602 } 1603 case GLOBAL_LOAD: 1604 switch (Width) { 1605 default: 1606 return 0; 1607 case 2: 1608 return AMDGPU::GLOBAL_LOAD_DWORDX2; 1609 case 3: 1610 return AMDGPU::GLOBAL_LOAD_DWORDX3; 1611 case 4: 1612 return AMDGPU::GLOBAL_LOAD_DWORDX4; 1613 } 1614 case GLOBAL_LOAD_SADDR: 1615 switch (Width) { 1616 default: 1617 return 0; 1618 case 2: 1619 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; 1620 case 3: 1621 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; 1622 case 4: 1623 return 
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (CI.InstClass) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8:
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  case GLOBAL_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
    }
  case GLOBAL_LOAD_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
    }
  case GLOBAL_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4;
    }
  case GLOBAL_STORE_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
    }
  case FLAT_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_LOAD_DWORDX2;
    case 3:
      return AMDGPU::FLAT_LOAD_DWORDX3;
    case 4:
      return AMDGPU::FLAT_LOAD_DWORDX4;
    }
  case FLAT_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_STORE_DWORDX2;
    case 3:
      return AMDGPU::FLAT_STORE_DWORDX3;
    case 4:
      return AMDGPU::FLAT_STORE_DWORDX4;
    }
  case MIMG:
    assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}

std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) ==
                                   CI.Width + Paired.Width)) &&
         "No overlaps");

  unsigned Idx0;
  unsigned Idx1;

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
  };

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

  if (Paired < CI) {
    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
  } else {
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];
  }

  return std::make_pair(Idx0, Idx1);
}

const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
      return &AMDGPU::SGPR_256RegClass;
    case 16:
      return &AMDGPU::SGPR_512RegClass;
    }
  }

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
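  // AGPR data must stay in AGPRs: if the original access used accumulator
  // registers, pick an AGPR class of the combined width, otherwise a plain
  // VGPR class.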
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // tfe
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
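// The 64-bit add is expanded as V_ADD_CO_U32 for the low half and V_ADDC_U32
// (consuming the carry) for the high half; the two halves are tied back
// together with a REG_SEQUENCE into a 64-bit VGPR pair.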
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
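// The vaddr operand is redirected to NewBase and the offset immediate is
// rewritten in place; the kill flag is cleared because the new base may have
// further uses in instructions that share it.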
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
// - 32bit base registers, subregisters
// - 64bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
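  // Only global_* VMEM accesses are handled here: getGlobalSaddrOp() is
  // negative for opcodes that have no global SADDR form.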
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base-registers and a 64bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor (that has the
  // same base-registers) with the highest 13bit distance from MI's offset.
  // E.g. (64bit loads)
  // bb:
  //   addr1 = &a + 4096;  load1 = load(addr1, 0)
  //   addr2 = &a + 6144;  load2 = load(addr2, 0)
  //   addr3 = &a + 8192;  load3 = load(addr3, 0)
  //   addr4 = &a + 10240; load4 = load(addr4, 0)
  //   addr5 = &a + 12288; load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13bit distance. Both &a + 6144 and &a + 8192
  // have a 13bit distance from &a + 4096. The heuristic considers &a + 8192
  // as the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate bases.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288; load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store addresses
    // or any other load addresses where the opcodes are different.
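    // Only instructions with the same opcode and a still-zero immediate offset
    // are candidates; a nonzero offset means it has already been promoted.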
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potential mergeable instructions into lists. One list per base address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
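    // Promoting a constant offset rewrites the base so that nearby accesses to
    // the same underlying pointer share a base register and differ only in
    // their immediate offsets.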
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. Instructions after the barrier can still be merged, but only
    // as part of a separate merge list.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      // operands. However we are reporting that ds_write2 shall have
      // only VGPR data so that machine copy propagation does not
      // create an illegal instruction with VGPR and AGPR sources.
      // Consequently, if we create such an instruction the verifier
      // will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of Mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with I. If an
  // instruction is found, it is stored in the Paired field. If no instructions
  // are found, then the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets; this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
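// Each per-base-address list is optimized repeatedly: a successful merge can
// produce a wider access that may itself be merged again, so OptimizeAgain
// requests another pass over the remaining lists.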
bool SILoadStoreOptimizer::optimizeBlock(
                       std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
      NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_LOAD:
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_STORE:
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

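// Entry point: for each basic block, split the block at memory barriers,
// collect the mergeable instructions in each region, and re-run the merger
// until no further pairs are found.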
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}