//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in the 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

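// The optimizer works one basic block at a time: it first collects lists of
// mergeable memory instructions that share a base address, then repeatedly
// merges adjacent pairs from each list until no further progress is made.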
class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (AddrOp->getReg().isPhysical())
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
  };

  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
                                                    const CombineInfo &Paired);
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                          MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeGlobalLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base,
                                  MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                              std::list<std::list<CombineInfo> > &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties()
        .set(MachineFunctionProperties::Property::IsSSA);
  }
};

static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isMIMG(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return countPopulation(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
    return 2;
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return 8;
  case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
  case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
  case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH;
  case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
  case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH;
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}

/// Maps instruction opcode to enum InstClassEnum.
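/// Opcodes the pass does not know how to merge map to UNKNOWN.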
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isMIMG(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
        return UNKNOWN;
      // Ignore BVH instructions
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
    return GLOBAL_LOAD;
  }
}

/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may have
/// a different subclass but must have the same class.
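/// For example, every S_BUFFER_LOAD_DWORD*_IMM variant shares the subclass
/// S_BUFFER_LOAD_DWORD_IMM, so a DWORDX2 produced by an earlier merge can be
/// merged again with another load from the same list.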
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isMIMG(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
    return AMDGPU::GLOBAL_LOAD_DWORD;
  }
}

static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isMIMG(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
      Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}

void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}

} // end anonymous namespace.

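// The pass registration below declares a dependency on AliasAnalysis: before
// two memory operations are moved next to each other, canSwapInstructions()
// must prove that no aliasing access sits between them.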
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// This function assumes that \p A and \p B are identical except for
// size and offset, and they reference adjacent memory.
static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
                                                   const MachineMemOperand *A,
                                                   const MachineMemOperand *B) {
  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
  unsigned Size = A->getSize() + B->getSize();
  // This function adds the offset parameter to the existing offset for A,
  // so we pass 0 here as the offset and then manually set it to the correct
  // value after the call.
  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
  MMO->setOffset(MinOffset);
  return MMO;
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
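  // The smaller dmask must select only channels strictly below the lowest
  // channel selected by the larger dmask, e.g. 0b0011 can pair with 0b1100,
  // but 0b0011 cannot pair with 0b0110.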
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.CPol == Paired.CPol;
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
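  // For example, with EltSize 4, byte offsets 0 and 512 are element offsets 0
  // and 128; both are multiples of 64, so the ST64 form encodes them as
  // offset0:0 and offset1:2 (units of 64 elements).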
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    }
  }
}

const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}

/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
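/// On success, the returned CombineInfo identifies where the merged
/// instruction belongs: CI when the paired load is hoisted up to CI, or
/// Paired when the earlier store is sunk down to Paired.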
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction. This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
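    // For example, if CI is the load at offset:32 and Paired the load at
    // offset:16, the merged ds_read2 still uses offset0:4 offset1:8; the
    // destination sub-registers are swapped to match.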
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New = MIB.addMemOperand(
      combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  unsigned SubRegIdx0, SubRegIdx1;
  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // tfe
          .addImm(0)            // swz
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // tfe
          .addImm(0)            // swz
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // tfe
          .addImm(0)                                  // swz
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeGlobalLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
          .addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (CI.InstClass) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8:
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  case GLOBAL_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
    }
  case MIMG:
    assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}

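// Map each of the two original instructions to the sub-register range it
// covers within the merged result, ordered by offset (or by dmask for MIMG).
// For example, merging a two-dword access at the lower offset with a
// one-dword access at the higher offset yields (sub0_sub1, sub2).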
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  bool ReverseOrder;
  if (CI.InstClass == MIMG) {
    assert(
        (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
        "No overlaps");
    ReverseOrder = CI.DMask > Paired.DMask;
  } else {
    ReverseOrder = CI.Offset > Paired.Offset;
  }

  unsigned Idx0;
  unsigned Idx1;

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
  };

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

  if (ReverseOrder) {
    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
  } else {
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];
  }

  return std::make_pair(Idx0, Idx1);
}

const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
      return &AMDGPU::SGPR_256RegClass;
    case 16:
      return &AMDGPU::SGPR_512RegClass;
    }
  }

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // tfe
          .addImm(0)                                  // swz
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
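// The new base is Addr.Base + Addr.Offset, materialized as a 64-bit add:
// V_ADD_CO_U32_e64 for the low half, V_ADDC_U32_e64 consuming the carry for
// the high half, and a REG_SEQUENCE combining the two halves.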
// Compute base address using Addr and return the final register.
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}
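// For reference, the sequence built above looks roughly like this in MIR
// (register names are illustrative only):
//   %lo:vgpr_32, %carry:sreg_64_xexec = V_ADD_CO_U32_e64 %base_lo, <offset_lo>, 0
//   %hi:vgpr_32, %dead:sreg_64_xexec  = V_ADDC_U32_e64 %base_hi, <offset_hi>, %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// i.e. a 64-bit add of Addr.Offset onto the original {HiReg, LoReg} pair,
// whose result then serves as the new vaddr.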
// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
// - the 32-bit base registers and subregisters
// - the 64-bit constant offset
// The base computation is expected to look like:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32
//   %HI:vgpr_32 = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}
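// Worked example of the 64-bit reassembly above (illustrative values): if the
// low add contributed Offset0P = 0x1800 and the high add contributed
// Offset1 = 0x1, the combined constant is
//   Addr.Offset = (0x1800 & 0xffffffff) | (0x1 << 32) = 0x100001800.
// A zero high part simply leaves Addr.Offset equal to the low 32-bit value.
// A small compile-time check of that arithmetic (harmless if removed):
static_assert(((0x1800ull & 0xffffffffull) | (0x1ull << 32)) == 0x100001800ull,
              "64-bit offset reassembly example");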
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step 2: Traverse MI's basic block and find an anchor (an instruction with
  // the same base registers) whose offset is at the largest legal 13-bit
  // distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1, 0)
  //   addr2 = &a + 6144;   load2 = load(addr2, 0)
  //   addr3 = &a + 8192;   load3 = load(addr3, 0)
  //   addr4 = &a + 10240;  load4 = load(addr4, 0)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization tries to find a new base
  // from which (&a + 4096) is within a legal 13-bit distance. Both &a + 6144
  // and &a + 8192 are within 13-bit distance of &a + 4096. The heuristic
  // picks &a + 8192 as the new base (anchor) because the maximum distance can
  // presumably accommodate more intermediate addresses.
  //
  // Step 3: Re-compute (&a + 8192) above load1, then compute and promote
  // offsets relative to (&a + 8192) for load1, load2 and load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

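  // A sketch of the distance test performed in the scan below (values
  // illustrative): with the example above, the candidate &a + 8192 gives
  // Dist = 4096 - 8192 = -4096, which fits a signed 13-bit immediate range of
  // roughly [-4096, 4095], so isLegalGlobalAddressingMode() accepts it; a
  // candidate 16 KiB away would be rejected. The exact legal range comes from
  // the target's addressing-mode query, not from a constant in this pass.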
  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes differ.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

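// For illustration (schematic, not exact MIR syntax): given a block with
//   buffer_load_dword  %v0, base A, offset:4
//   buffer_load_dword  %v1, base A, offset:8
//   global_load_dword  %v2, base B, offset:0
// the two buffer loads share an instruction class and base address, so they
// are appended to the same list; the global load starts a new list. Lists
// that end up with fewer than two entries are discarded later, since a merge
// always needs a pair.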
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potentially mergeable instructions into lists, one list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. We can look for separate merges after the barrier.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: Nothing is illegal in a ds_write2 opcode with two AGPR data
      // operands. However, we report that ds_write2 shall have only VGPR
      // data so that machine copy propagation does not create an illegal
      // instruction with VGPR and AGPR sources. Consequently, if we created
      // such an instruction the verifier would complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort each list by offset and discard lists that contain only a
  // single instruction, since a merge always needs at least two. The actual
  // pairing of adjacent entries happens later, in
  // optimizeInstsWithSameBaseAddr().

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a
      // merge, so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets; this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

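// Worked example of the re-optimization loop driven by OptimizeListAgain in
// the functions below (a sketch; widths are dword counts): four mergeable
// s_buffer_load_dword entries of width 1 first merge pairwise into two
// width-2 loads; because 1 + 1 < 8 the list is processed again, and the two
// width-2 results merge into a single s_buffer_load_dwordx4. Once the
// combined width reaches the limit for the class, the list is dropped
// instead of being reprocessed.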
// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
                       std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
                                          std::list<CombineInfo> &MergeList,
                                          bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
      NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case GLOBAL_LOAD:
      NewMI = mergeGlobalLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}
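// Usage note (a sketch, assuming the usual AMDGPU pass registration): the
// pass can be exercised in isolation on MIR input with something like
//   llc -march=amdgcn -mcpu=gfx900 -run-pass=si-load-store-opt -o - input.mir
// where "si-load-store-opt" matches DEBUG_TYPE above; the authoritative pass
// name is whatever this file's INITIALIZE_PASS registration declares.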