//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit offset
//   fields, but are close enough together that they would fit after rebasing,
//   we can add to the base pointer and use the new, reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (AddrOp->getReg().isPhysical())
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
156 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 157 return false; 158 } 159 return true; 160 } 161 162 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); 163 }; 164 165 struct BaseRegisters { 166 Register LoReg; 167 Register HiReg; 168 169 unsigned LoSubReg = 0; 170 unsigned HiSubReg = 0; 171 }; 172 173 struct MemAddress { 174 BaseRegisters Base; 175 int64_t Offset = 0; 176 }; 177 178 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 179 180 private: 181 const GCNSubtarget *STM = nullptr; 182 const SIInstrInfo *TII = nullptr; 183 const SIRegisterInfo *TRI = nullptr; 184 MachineRegisterInfo *MRI = nullptr; 185 AliasAnalysis *AA = nullptr; 186 bool OptimizeAgain; 187 188 static bool dmasksCanBeCombined(const CombineInfo &CI, 189 const SIInstrInfo &TII, 190 const CombineInfo &Paired); 191 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 192 CombineInfo &Paired, bool Modify = false); 193 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 194 const CombineInfo &Paired); 195 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 196 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 197 const CombineInfo &Paired); 198 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 199 const CombineInfo &Paired); 200 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 201 202 bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, 203 SmallVectorImpl<MachineInstr *> &InstsToMove); 204 205 unsigned read2Opcode(unsigned EltSize) const; 206 unsigned read2ST64Opcode(unsigned EltSize) const; 207 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, 208 CombineInfo &Paired, 209 const SmallVectorImpl<MachineInstr *> &InstsToMove); 210 211 unsigned write2Opcode(unsigned EltSize) const; 212 unsigned write2ST64Opcode(unsigned EltSize) const; 213 MachineBasicBlock::iterator 214 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 215 const SmallVectorImpl<MachineInstr *> &InstsToMove); 216 MachineBasicBlock::iterator 217 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 218 const SmallVectorImpl<MachineInstr *> &InstsToMove); 219 MachineBasicBlock::iterator 220 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 221 const SmallVectorImpl<MachineInstr *> &InstsToMove); 222 MachineBasicBlock::iterator 223 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 224 const SmallVectorImpl<MachineInstr *> &InstsToMove); 225 MachineBasicBlock::iterator 226 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 227 const SmallVectorImpl<MachineInstr *> &InstsToMove); 228 MachineBasicBlock::iterator 229 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 230 const SmallVectorImpl<MachineInstr *> &InstsToMove); 231 MachineBasicBlock::iterator 232 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 233 const SmallVectorImpl<MachineInstr *> &InstsToMove); 234 235 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 236 int32_t NewOffset) const; 237 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 238 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 239 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 240 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; 241 /// Promotes constant offset to the immediate by adjusting the base. 
It 242 /// tries to use a base from the nearby instructions that allows it to have 243 /// a 13bit constant offset which gets promoted to the immediate. 244 bool promoteConstantOffsetToImm(MachineInstr &CI, 245 MemInfoMap &Visited, 246 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 247 void addInstToMergeableList(const CombineInfo &CI, 248 std::list<std::list<CombineInfo> > &MergeableInsts) const; 249 250 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 251 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 252 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 253 std::list<std::list<CombineInfo>> &MergeableInsts) const; 254 255 public: 256 static char ID; 257 258 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 259 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 260 } 261 262 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 263 bool &OptimizeListAgain); 264 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 265 266 bool runOnMachineFunction(MachineFunction &MF) override; 267 268 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 269 270 void getAnalysisUsage(AnalysisUsage &AU) const override { 271 AU.setPreservesCFG(); 272 AU.addRequired<AAResultsWrapperPass>(); 273 274 MachineFunctionPass::getAnalysisUsage(AU); 275 } 276 277 MachineFunctionProperties getRequiredProperties() const override { 278 return MachineFunctionProperties() 279 .set(MachineFunctionProperties::Property::IsSSA); 280 } 281 }; 282 283 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 284 const unsigned Opc = MI.getOpcode(); 285 286 if (TII.isMUBUF(Opc)) { 287 // FIXME: Handle d16 correctly 288 return AMDGPU::getMUBUFElements(Opc); 289 } 290 if (TII.isMIMG(MI)) { 291 uint64_t DMaskImm = 292 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 293 return countPopulation(DMaskImm); 294 } 295 if (TII.isMTBUF(Opc)) { 296 return AMDGPU::getMTBUFElements(Opc); 297 } 298 299 switch (Opc) { 300 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 301 return 1; 302 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 303 return 2; 304 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 305 return 4; 306 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 307 return 8; 308 case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH; 309 case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH; 310 case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH; 311 case AMDGPU::DS_WRITE_B32_gfx9: 312 return 1; 313 case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH; 314 case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH; 315 case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH; 316 case AMDGPU::DS_WRITE_B64_gfx9: 317 return 2; 318 default: 319 return 0; 320 } 321 } 322 323 /// Maps instruction opcode to enum InstClassEnum. 324 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 325 switch (Opc) { 326 default: 327 if (TII.isMUBUF(Opc)) { 328 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 329 default: 330 return UNKNOWN; 331 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 332 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 333 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 334 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 335 return BUFFER_LOAD; 336 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 337 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 338 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 339 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 340 return BUFFER_STORE; 341 } 342 } 343 if (TII.isMIMG(Opc)) { 344 // Ignore instructions encoded without vaddr. 
345 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 && 346 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1) 347 return UNKNOWN; 348 // Ignore BVH instructions 349 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) 350 return UNKNOWN; 351 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 352 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 353 TII.isGather4(Opc)) 354 return UNKNOWN; 355 return MIMG; 356 } 357 if (TII.isMTBUF(Opc)) { 358 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 359 default: 360 return UNKNOWN; 361 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 362 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 363 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 364 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 365 return TBUFFER_LOAD; 366 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 367 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 368 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 369 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 370 return TBUFFER_STORE; 371 } 372 } 373 return UNKNOWN; 374 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 375 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 376 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 377 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 378 return S_BUFFER_LOAD_IMM; 379 case AMDGPU::DS_READ_B32: 380 case AMDGPU::DS_READ_B32_gfx9: 381 case AMDGPU::DS_READ_B64: 382 case AMDGPU::DS_READ_B64_gfx9: 383 return DS_READ; 384 case AMDGPU::DS_WRITE_B32: 385 case AMDGPU::DS_WRITE_B32_gfx9: 386 case AMDGPU::DS_WRITE_B64: 387 case AMDGPU::DS_WRITE_B64_gfx9: 388 return DS_WRITE; 389 } 390 } 391 392 /// Determines instruction subclass from opcode. Only instructions 393 /// of the same subclass can be merged together. The merged instruction may have 394 /// a different subclass but must have the same class. 395 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 396 switch (Opc) { 397 default: 398 if (TII.isMUBUF(Opc)) 399 return AMDGPU::getMUBUFBaseOpcode(Opc); 400 if (TII.isMIMG(Opc)) { 401 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 402 assert(Info); 403 return Info->BaseOpcode; 404 } 405 if (TII.isMTBUF(Opc)) 406 return AMDGPU::getMTBUFBaseOpcode(Opc); 407 return -1; 408 case AMDGPU::DS_READ_B32: 409 case AMDGPU::DS_READ_B32_gfx9: 410 case AMDGPU::DS_READ_B64: 411 case AMDGPU::DS_READ_B64_gfx9: 412 case AMDGPU::DS_WRITE_B32: 413 case AMDGPU::DS_WRITE_B32_gfx9: 414 case AMDGPU::DS_WRITE_B64: 415 case AMDGPU::DS_WRITE_B64_gfx9: 416 return Opc; 417 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 418 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 419 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 420 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 421 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 422 } 423 } 424 425 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 426 AddressRegs Result; 427 428 if (TII.isMUBUF(Opc)) { 429 if (AMDGPU::getMUBUFHasVAddr(Opc)) 430 Result.VAddr = true; 431 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 432 Result.SRsrc = true; 433 if (AMDGPU::getMUBUFHasSoffset(Opc)) 434 Result.SOffset = true; 435 436 return Result; 437 } 438 439 if (TII.isMIMG(Opc)) { 440 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 441 if (VAddr0Idx >= 0) { 442 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 443 Result.NumVAddrs = SRsrcIdx - VAddr0Idx; 444 } else { 445 Result.VAddr = true; 446 } 447 Result.SRsrc = true; 448 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 449 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 450 Result.SSamp = true; 451 452 return Result; 453 } 
454 if (TII.isMTBUF(Opc)) { 455 if (AMDGPU::getMTBUFHasVAddr(Opc)) 456 Result.VAddr = true; 457 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 458 Result.SRsrc = true; 459 if (AMDGPU::getMTBUFHasSoffset(Opc)) 460 Result.SOffset = true; 461 462 return Result; 463 } 464 465 switch (Opc) { 466 default: 467 return Result; 468 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 469 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 470 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 471 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 472 Result.SBase = true; 473 return Result; 474 case AMDGPU::DS_READ_B32: 475 case AMDGPU::DS_READ_B64: 476 case AMDGPU::DS_READ_B32_gfx9: 477 case AMDGPU::DS_READ_B64_gfx9: 478 case AMDGPU::DS_WRITE_B32: 479 case AMDGPU::DS_WRITE_B64: 480 case AMDGPU::DS_WRITE_B32_gfx9: 481 case AMDGPU::DS_WRITE_B64_gfx9: 482 Result.Addr = true; 483 return Result; 484 } 485 } 486 487 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 488 const SILoadStoreOptimizer &LSO) { 489 I = MI; 490 unsigned Opc = MI->getOpcode(); 491 InstClass = getInstClass(Opc, *LSO.TII); 492 493 if (InstClass == UNKNOWN) 494 return; 495 496 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI)); 497 498 switch (InstClass) { 499 case DS_READ: 500 EltSize = 501 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 502 : 4; 503 break; 504 case DS_WRITE: 505 EltSize = 506 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 507 : 4; 508 break; 509 case S_BUFFER_LOAD_IMM: 510 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); 511 break; 512 default: 513 EltSize = 4; 514 break; 515 } 516 517 if (InstClass == MIMG) { 518 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 519 // Offset is not considered for MIMG instructions. 520 Offset = 0; 521 } else { 522 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 523 Offset = I->getOperand(OffsetIdx).getImm(); 524 } 525 526 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 527 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 528 529 Width = getOpcodeWidth(*I, *LSO.TII); 530 531 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 532 Offset &= 0xffff; 533 } else if (InstClass != MIMG) { 534 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); 535 } 536 537 AddressRegs Regs = getRegs(Opc, *LSO.TII); 538 539 NumAddresses = 0; 540 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 541 AddrIdx[NumAddresses++] = 542 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 543 if (Regs.Addr) 544 AddrIdx[NumAddresses++] = 545 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 546 if (Regs.SBase) 547 AddrIdx[NumAddresses++] = 548 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 549 if (Regs.SRsrc) 550 AddrIdx[NumAddresses++] = 551 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 552 if (Regs.SOffset) 553 AddrIdx[NumAddresses++] = 554 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 555 if (Regs.VAddr) 556 AddrIdx[NumAddresses++] = 557 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 558 if (Regs.SSamp) 559 AddrIdx[NumAddresses++] = 560 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); 561 assert(NumAddresses <= MaxAddressRegs); 562 563 for (unsigned J = 0; J < NumAddresses; J++) 564 AddrReg[J] = &I->getOperand(AddrIdx[J]); 565 } 566 567 } // end anonymous namespace. 

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() && Op.getReg().isPhysical())
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
                                  DenseSet<Register> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
                        (Use.isDef() && RegDefs.count(Use.getReg())) ||
                        (Use.isDef() && Use.getReg().isPhysical() &&
                         PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

// This function assumes that \p A and \p B are identical except for size and
// offset, and that they reference adjacent memory.
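// For example, combining a 4-byte access at offset 8 with a 4-byte access at
// offset 12 yields a single MMO of size 8 at offset 8.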
static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
                                                   const MachineMemOperand *A,
                                                   const MachineMemOperand *B) {
  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
  unsigned Size = A->getSize() + B->getSize();
  // getMachineMemOperand() adds the offset parameter to the existing offset of
  // A, so we pass 0 here as the offset and then manually set it to the correct
  // value after the call.
  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
  MMO->setOffset(MinOffset);
  return MMO;
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
751 if (CI.Offset == Paired.Offset) 752 return false; 753 754 // This won't be valid if the offset isn't aligned. 755 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0)) 756 return false; 757 758 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) { 759 760 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 = 761 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI); 762 if (!Info0) 763 return false; 764 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 = 765 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI); 766 if (!Info1) 767 return false; 768 769 if (Info0->BitsPerComp != Info1->BitsPerComp || 770 Info0->NumFormat != Info1->NumFormat) 771 return false; 772 773 // TODO: Should be possible to support more formats, but if format loads 774 // are not dword-aligned, the merged load might not be valid. 775 if (Info0->BitsPerComp != 32) 776 return false; 777 778 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0) 779 return false; 780 } 781 782 uint32_t EltOffset0 = CI.Offset / CI.EltSize; 783 uint32_t EltOffset1 = Paired.Offset / CI.EltSize; 784 CI.UseST64 = false; 785 CI.BaseOff = 0; 786 787 // Handle all non-DS instructions. 788 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { 789 return (EltOffset0 + CI.Width == EltOffset1 || 790 EltOffset1 + Paired.Width == EltOffset0) && 791 CI.CPol == Paired.CPol && 792 (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol); 793 } 794 795 // If the offset in elements doesn't fit in 8-bits, we might be able to use 796 // the stride 64 versions. 797 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && 798 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { 799 if (Modify) { 800 CI.Offset = EltOffset0 / 64; 801 Paired.Offset = EltOffset1 / 64; 802 CI.UseST64 = true; 803 } 804 return true; 805 } 806 807 // Check if the new offsets fit in the reduced 8-bit range. 808 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { 809 if (Modify) { 810 CI.Offset = EltOffset0; 811 Paired.Offset = EltOffset1; 812 } 813 return true; 814 } 815 816 // Try to shift base address to decrease offsets. 817 uint32_t Min = std::min(EltOffset0, EltOffset1); 818 uint32_t Max = std::max(EltOffset0, EltOffset1); 819 820 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64; 821 if (((Max - Min) & ~Mask) == 0) { 822 if (Modify) { 823 // From the range of values we could use for BaseOff, choose the one that 824 // is aligned to the highest power of two, to maximise the chance that 825 // the same offset can be reused for other load/store pairs. 826 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min); 827 // Copy the low bits of the offsets, so that when we adjust them by 828 // subtracting BaseOff they will be multiples of 64. 829 BaseOff |= Min & maskTrailingOnes<uint32_t>(6); 830 CI.BaseOff = BaseOff * CI.EltSize; 831 CI.Offset = (EltOffset0 - BaseOff) / 64; 832 Paired.Offset = (EltOffset1 - BaseOff) / 64; 833 CI.UseST64 = true; 834 } 835 return true; 836 } 837 838 if (isUInt<8>(Max - Min)) { 839 if (Modify) { 840 // From the range of values we could use for BaseOff, choose the one that 841 // is aligned to the highest power of two, to maximise the chance that 842 // the same offset can be reused for other load/store pairs. 
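      // For example, with EltSize 4 and element offsets 1000 and 1005, neither
      // offset fits in 8 bits but their difference does; the candidate range
      // is [1005 - 255, 1000], whose most aligned value is 768 (256-aligned),
      // giving new offsets 232 and 237 and CI.BaseOff = 768 * 4 = 3072 bytes.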
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    }
  }
}

const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}

/// This function assumes that CI comes before Paired in a basic block.
bool SILoadStoreOptimizer::checkAndPrepareMerge(
    CombineInfo &CI, CombineInfo &Paired,
    SmallVectorImpl<MachineInstr *> &InstsToMove) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return false;
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return false;

  // Check that both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return false;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return false;
  }

  DenseSet<Register> RegDefsToMove;
  DenseSet<Register> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
  for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
    if (MBBI == MBBE) {
      // CombineInfo::Order is a hint on the instruction ordering within the
      // basic block. This hint suggests that CI precedes Paired, which is
      // true most of the time. However, moveInstsAfter() processing a
      // previous list may have changed this order when it moved an
      // instruction that also exists in some other merge list. In this case
      // that instruction must be dependent.
      return false;
    }

    // Keep going as long as one of these conditions is met:
    // 1. It is safe to move I down past MBBI.
    // 2. It is safe to move MBBI down past the instruction that I will
    //    be merged into.

    if (MBBI->mayLoadOrStore() &&
        (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
         !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
      // We fail condition #1, but we may still be able to satisfy condition
      // #2. Add this instruction to the move list and then we will check
      // if condition #2 holds once we have selected the matching instruction.
      InstsToMove.push_back(&*MBBI);
      addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
      continue;
    }

    // When we match I with another load/store instruction, we will be moving I
    // down to the location of the matched instruction, so any uses of I will
    // need to be moved down as well.
    addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, InstsToMove);
  }

  // If Paired depends on any of the instructions we plan to move, give up.
  if (addToListsIfDependent(*Paired.I, RegDefsToMove, PhysRegUsesToMove,
                            InstsToMove))
    return false;

  // We need to go through the list of instructions that we plan to
  // move and make sure they are all safe to move down past the merged
  // instruction.
  if (!canMoveInstsAcrossMemOp(*Paired.I, InstsToMove, AA))
    return false;

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction. This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return true;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
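    // For example, offsets {8, 0} become offset0:0 offset1:8, and the
    // sub-register indices are swapped so each original destination still
    // receives its own element.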
1011 std::swap(NewOffset0, NewOffset1); 1012 std::swap(SubRegIdx0, SubRegIdx1); 1013 } 1014 1015 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1016 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1017 1018 const MCInstrDesc &Read2Desc = TII->get(Opc); 1019 1020 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1021 Register DestReg = MRI->createVirtualRegister(SuperRC); 1022 1023 DebugLoc DL = CI.I->getDebugLoc(); 1024 1025 Register BaseReg = AddrReg->getReg(); 1026 unsigned BaseSubReg = AddrReg->getSubReg(); 1027 unsigned BaseRegFlags = 0; 1028 if (CI.BaseOff) { 1029 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1030 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1031 .addImm(CI.BaseOff); 1032 1033 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1034 BaseRegFlags = RegState::Kill; 1035 1036 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1037 .addReg(ImmReg) 1038 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1039 .addImm(0); // clamp bit 1040 BaseSubReg = 0; 1041 } 1042 1043 MachineInstrBuilder Read2 = 1044 BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) 1045 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1046 .addImm(NewOffset0) // offset0 1047 .addImm(NewOffset1) // offset1 1048 .addImm(0) // gds 1049 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1050 1051 (void)Read2; 1052 1053 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1054 1055 // Copy to the old destination registers. 1056 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1057 .add(*Dest0) // Copy to same destination including flags and sub reg. 1058 .addReg(DestReg, 0, SubRegIdx0); 1059 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1060 .add(*Dest1) 1061 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1062 1063 moveInstsAfter(Copy1, InstsToMove); 1064 1065 CI.I->eraseFromParent(); 1066 Paired.I->eraseFromParent(); 1067 1068 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1069 return Read2; 1070 } 1071 1072 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1073 if (STM->ldsRequiresM0Init()) 1074 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1075 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1076 : AMDGPU::DS_WRITE2_B64_gfx9; 1077 } 1078 1079 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1080 if (STM->ldsRequiresM0Init()) 1081 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1082 : AMDGPU::DS_WRITE2ST64_B64; 1083 1084 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1085 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1086 } 1087 1088 MachineBasicBlock::iterator 1089 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 1090 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1091 MachineBasicBlock *MBB = CI.I->getParent(); 1092 1093 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1094 // sure we preserve the subregister index and any register flags set on them. 1095 const MachineOperand *AddrReg = 1096 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1097 const MachineOperand *Data0 = 1098 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1099 const MachineOperand *Data1 = 1100 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1101 1102 unsigned NewOffset0 = CI.Offset; 1103 unsigned NewOffset1 = Paired.Offset; 1104 unsigned Opc = 1105 CI.UseST64 ? 
write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1106 1107 if (NewOffset0 > NewOffset1) { 1108 // Canonicalize the merged instruction so the smaller offset comes first. 1109 std::swap(NewOffset0, NewOffset1); 1110 std::swap(Data0, Data1); 1111 } 1112 1113 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1114 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1115 1116 const MCInstrDesc &Write2Desc = TII->get(Opc); 1117 DebugLoc DL = CI.I->getDebugLoc(); 1118 1119 Register BaseReg = AddrReg->getReg(); 1120 unsigned BaseSubReg = AddrReg->getSubReg(); 1121 unsigned BaseRegFlags = 0; 1122 if (CI.BaseOff) { 1123 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1124 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1125 .addImm(CI.BaseOff); 1126 1127 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1128 BaseRegFlags = RegState::Kill; 1129 1130 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1131 .addReg(ImmReg) 1132 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1133 .addImm(0); // clamp bit 1134 BaseSubReg = 0; 1135 } 1136 1137 MachineInstrBuilder Write2 = 1138 BuildMI(*MBB, Paired.I, DL, Write2Desc) 1139 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1140 .add(*Data0) // data0 1141 .add(*Data1) // data1 1142 .addImm(NewOffset0) // offset0 1143 .addImm(NewOffset1) // offset1 1144 .addImm(0) // gds 1145 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1146 1147 moveInstsAfter(Write2, InstsToMove); 1148 1149 CI.I->eraseFromParent(); 1150 Paired.I->eraseFromParent(); 1151 1152 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1153 return Write2; 1154 } 1155 1156 MachineBasicBlock::iterator 1157 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1158 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1159 MachineBasicBlock *MBB = CI.I->getParent(); 1160 DebugLoc DL = CI.I->getDebugLoc(); 1161 const unsigned Opcode = getNewOpcode(CI, Paired); 1162 1163 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1164 1165 Register DestReg = MRI->createVirtualRegister(SuperRC); 1166 unsigned MergedDMask = CI.DMask | Paired.DMask; 1167 unsigned DMaskIdx = 1168 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1169 1170 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1171 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1172 if (I == DMaskIdx) 1173 MIB.addImm(MergedDMask); 1174 else 1175 MIB.add((*CI.I).getOperand(I)); 1176 } 1177 1178 // It shouldn't be possible to get this far if the two instructions 1179 // don't have a single memoperand, because MachineInstr::mayAlias() 1180 // will return true if this is the case. 1181 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1182 1183 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1184 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1185 1186 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1187 1188 unsigned SubRegIdx0, SubRegIdx1; 1189 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1190 1191 // Copy to the old destination registers. 
1192 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1193 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1194 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1195 1196 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1197 .add(*Dest0) // Copy to same destination including flags and sub reg. 1198 .addReg(DestReg, 0, SubRegIdx0); 1199 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1200 .add(*Dest1) 1201 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1202 1203 moveInstsAfter(Copy1, InstsToMove); 1204 1205 CI.I->eraseFromParent(); 1206 Paired.I->eraseFromParent(); 1207 return New; 1208 } 1209 1210 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( 1211 CombineInfo &CI, CombineInfo &Paired, 1212 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1213 MachineBasicBlock *MBB = CI.I->getParent(); 1214 DebugLoc DL = CI.I->getDebugLoc(); 1215 const unsigned Opcode = getNewOpcode(CI, Paired); 1216 1217 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1218 1219 Register DestReg = MRI->createVirtualRegister(SuperRC); 1220 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1221 1222 // It shouldn't be possible to get this far if the two instructions 1223 // don't have a single memoperand, because MachineInstr::mayAlias() 1224 // will return true if this is the case. 1225 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1226 1227 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1228 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1229 1230 MachineInstr *New = 1231 BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) 1232 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) 1233 .addImm(MergedOffset) // offset 1234 .addImm(CI.CPol) // cpol 1235 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1236 1237 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1238 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1239 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1240 1241 // Copy to the old destination registers. 1242 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1243 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1244 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1245 1246 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1247 .add(*Dest0) // Copy to same destination including flags and sub reg. 1248 .addReg(DestReg, 0, SubRegIdx0); 1249 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1250 .add(*Dest1) 1251 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1252 1253 moveInstsAfter(Copy1, InstsToMove); 1254 1255 CI.I->eraseFromParent(); 1256 Paired.I->eraseFromParent(); 1257 return New; 1258 } 1259 1260 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1261 CombineInfo &CI, CombineInfo &Paired, 1262 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1263 MachineBasicBlock *MBB = CI.I->getParent(); 1264 DebugLoc DL = CI.I->getDebugLoc(); 1265 1266 const unsigned Opcode = getNewOpcode(CI, Paired); 1267 1268 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1269 1270 // Copy to the new source register. 
1271 Register DestReg = MRI->createVirtualRegister(SuperRC); 1272 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1273 1274 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1275 1276 AddressRegs Regs = getRegs(Opcode, *TII); 1277 1278 if (Regs.VAddr) 1279 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1280 1281 // It shouldn't be possible to get this far if the two instructions 1282 // don't have a single memoperand, because MachineInstr::mayAlias() 1283 // will return true if this is the case. 1284 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1285 1286 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1287 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1288 1289 MachineInstr *New = 1290 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1291 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1292 .addImm(MergedOffset) // offset 1293 .addImm(CI.CPol) // cpol 1294 .addImm(0) // tfe 1295 .addImm(0) // swz 1296 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1297 1298 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1299 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1300 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1301 1302 // Copy to the old destination registers. 1303 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1304 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1305 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1306 1307 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1308 .add(*Dest0) // Copy to same destination including flags and sub reg. 1309 .addReg(DestReg, 0, SubRegIdx0); 1310 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1311 .add(*Dest1) 1312 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1313 1314 moveInstsAfter(Copy1, InstsToMove); 1315 1316 CI.I->eraseFromParent(); 1317 Paired.I->eraseFromParent(); 1318 return New; 1319 } 1320 1321 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1322 CombineInfo &CI, CombineInfo &Paired, 1323 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1324 MachineBasicBlock *MBB = CI.I->getParent(); 1325 DebugLoc DL = CI.I->getDebugLoc(); 1326 1327 const unsigned Opcode = getNewOpcode(CI, Paired); 1328 1329 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1330 1331 // Copy to the new source register. 1332 Register DestReg = MRI->createVirtualRegister(SuperRC); 1333 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1334 1335 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1336 1337 AddressRegs Regs = getRegs(Opcode, *TII); 1338 1339 if (Regs.VAddr) 1340 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1341 1342 unsigned JoinedFormat = 1343 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1344 1345 // It shouldn't be possible to get this far if the two instructions 1346 // don't have a single memoperand, because MachineInstr::mayAlias() 1347 // will return true if this is the case. 
1348 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1349 1350 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1351 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1352 1353 MachineInstr *New = 1354 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1355 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1356 .addImm(MergedOffset) // offset 1357 .addImm(JoinedFormat) // format 1358 .addImm(CI.CPol) // cpol 1359 .addImm(0) // tfe 1360 .addImm(0) // swz 1361 .addMemOperand( 1362 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1363 1364 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1365 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1366 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1367 1368 // Copy to the old destination registers. 1369 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1370 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1371 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1372 1373 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1374 .add(*Dest0) // Copy to same destination including flags and sub reg. 1375 .addReg(DestReg, 0, SubRegIdx0); 1376 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1377 .add(*Dest1) 1378 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1379 1380 moveInstsAfter(Copy1, InstsToMove); 1381 1382 CI.I->eraseFromParent(); 1383 Paired.I->eraseFromParent(); 1384 return New; 1385 } 1386 1387 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1388 CombineInfo &CI, CombineInfo &Paired, 1389 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1390 MachineBasicBlock *MBB = CI.I->getParent(); 1391 DebugLoc DL = CI.I->getDebugLoc(); 1392 1393 const unsigned Opcode = getNewOpcode(CI, Paired); 1394 1395 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1396 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1397 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1398 1399 // Copy to the new source register. 1400 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1401 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1402 1403 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1404 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1405 1406 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1407 .add(*Src0) 1408 .addImm(SubRegIdx0) 1409 .add(*Src1) 1410 .addImm(SubRegIdx1); 1411 1412 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1413 .addReg(SrcReg, RegState::Kill); 1414 1415 AddressRegs Regs = getRegs(Opcode, *TII); 1416 1417 if (Regs.VAddr) 1418 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1419 1420 unsigned JoinedFormat = 1421 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1422 1423 // It shouldn't be possible to get this far if the two instructions 1424 // don't have a single memoperand, because MachineInstr::mayAlias() 1425 // will return true if this is the case. 
1426 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1427 1428 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1429 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1430 1431 MachineInstr *New = 1432 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1433 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1434 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1435 .addImm(JoinedFormat) // format 1436 .addImm(CI.CPol) // cpol 1437 .addImm(0) // tfe 1438 .addImm(0) // swz 1439 .addMemOperand( 1440 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1441 1442 moveInstsAfter(MIB, InstsToMove); 1443 1444 CI.I->eraseFromParent(); 1445 Paired.I->eraseFromParent(); 1446 return New; 1447 } 1448 1449 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1450 const CombineInfo &Paired) { 1451 const unsigned Width = CI.Width + Paired.Width; 1452 1453 switch (CI.InstClass) { 1454 default: 1455 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1456 // FIXME: Handle d16 correctly 1457 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1458 Width); 1459 case TBUFFER_LOAD: 1460 case TBUFFER_STORE: 1461 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1462 Width); 1463 1464 case UNKNOWN: 1465 llvm_unreachable("Unknown instruction class"); 1466 case S_BUFFER_LOAD_IMM: 1467 switch (Width) { 1468 default: 1469 return 0; 1470 case 2: 1471 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1472 case 4: 1473 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1474 case 8: 1475 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; 1476 } 1477 case MIMG: 1478 assert((countPopulation(CI.DMask | Paired.DMask) == Width) && 1479 "No overlaps"); 1480 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1481 } 1482 } 1483 1484 std::pair<unsigned, unsigned> 1485 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, 1486 const CombineInfo &Paired) { 1487 bool ReverseOrder; 1488 if (CI.InstClass == MIMG) { 1489 assert( 1490 (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && 1491 "No overlaps"); 1492 ReverseOrder = CI.DMask > Paired.DMask; 1493 } else { 1494 ReverseOrder = CI.Offset > Paired.Offset; 1495 } 1496 1497 unsigned Idx0; 1498 unsigned Idx1; 1499 1500 static const unsigned Idxs[5][4] = { 1501 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1502 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, 1503 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, 1504 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, 1505 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, 1506 }; 1507 1508 assert(CI.Width >= 1 && CI.Width <= 4); 1509 assert(Paired.Width >= 1 && Paired.Width <= 4); 1510 1511 if (ReverseOrder) { 1512 Idx1 = Idxs[0][Paired.Width - 1]; 1513 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1514 } else { 1515 Idx0 = Idxs[0][CI.Width - 1]; 1516 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1517 } 1518 1519 return std::make_pair(Idx0, Idx1); 1520 } 1521 1522 const TargetRegisterClass * 1523 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1524 const CombineInfo &Paired) { 1525 if (CI.InstClass == S_BUFFER_LOAD_IMM) { 1526 switch (CI.Width + Paired.Width) { 1527 default: 1528 return nullptr; 1529 case 2: 1530 return &AMDGPU::SReg_64_XEXECRegClass; 1531 case 4: 1532 return 
&AMDGPU::SGPR_128RegClass; 1533 case 8: 1534 return &AMDGPU::SGPR_256RegClass; 1535 case 16: 1536 return &AMDGPU::SGPR_512RegClass; 1537 } 1538 } 1539 1540 unsigned BitWidth = 32 * (CI.Width + Paired.Width); 1541 return TRI->isAGPRClass(getDataRegClass(*CI.I)) 1542 ? TRI->getAGPRClassForBitWidth(BitWidth) 1543 : TRI->getVGPRClassForBitWidth(BitWidth); 1544 } 1545 1546 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1547 CombineInfo &CI, CombineInfo &Paired, 1548 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1549 MachineBasicBlock *MBB = CI.I->getParent(); 1550 DebugLoc DL = CI.I->getDebugLoc(); 1551 1552 const unsigned Opcode = getNewOpcode(CI, Paired); 1553 1554 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1555 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1556 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1557 1558 // Copy to the new source register. 1559 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1560 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1561 1562 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1563 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1564 1565 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1566 .add(*Src0) 1567 .addImm(SubRegIdx0) 1568 .add(*Src1) 1569 .addImm(SubRegIdx1); 1570 1571 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1572 .addReg(SrcReg, RegState::Kill); 1573 1574 AddressRegs Regs = getRegs(Opcode, *TII); 1575 1576 if (Regs.VAddr) 1577 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1578 1579 1580 // It shouldn't be possible to get this far if the two instructions 1581 // don't have a single memoperand, because MachineInstr::mayAlias() 1582 // will return true if this is the case. 1583 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1584 1585 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1586 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1587 1588 MachineInstr *New = 1589 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1590 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1591 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1592 .addImm(CI.CPol) // cpol 1593 .addImm(0) // tfe 1594 .addImm(0) // swz 1595 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1596 1597 moveInstsAfter(MIB, InstsToMove); 1598 1599 CI.I->eraseFromParent(); 1600 Paired.I->eraseFromParent(); 1601 return New; 1602 } 1603 1604 MachineOperand 1605 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 1606 APInt V(32, Val, true); 1607 if (TII->isInlineConstant(V)) 1608 return MachineOperand::CreateImm(Val); 1609 1610 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1611 MachineInstr *Mov = 1612 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1613 TII->get(AMDGPU::S_MOV_B32), Reg) 1614 .addImm(Val); 1615 (void)Mov; 1616 LLVM_DEBUG(dbgs() << " "; Mov->dump()); 1617 return MachineOperand::CreateReg(Reg, false); 1618 } 1619 1620 // Compute base address using Addr and return the final register. 
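// The 64-bit base is rebuilt by adding the two halves of Addr.Offset:
//   DestSub0 = V_ADD_CO_U32_e64 Addr.Base.LoReg, lo32(Addr.Offset)
//   DestSub1 = V_ADDC_U32_e64   Addr.Base.HiReg, hi32(Addr.Offset), carry
//   FullDestReg = REG_SEQUENCE DestSub0, sub0, DestSub1, sub1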
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
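// Note that after setReg() the vaddr operand refers to the new base, so the
// kill flag is cleared: the recomputed base register may still be used by
// other instructions that are rewritten against the same anchor.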
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
// - 32bit base registers, subregisters
// - 64bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
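  // getGlobalSaddrOp returns a negative value for opcodes that have no global
  // SADDR form, so this check effectively limits the promotion to GLOBAL_*
  // loads and stores.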
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base-registers and a 64bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
               " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
             << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor (that has the
  // same base-registers) with the highest 13bit distance from MI's offset.
  // E.g. (64bit loads)
  // bb:
  //   addr1 = &a + 4096;  load1 = load(addr1, 0)
  //   addr2 = &a + 6144;  load2 = load(addr2, 0)
  //   addr3 = &a + 8192;  load3 = load(addr3, 0)
  //   addr4 = &a + 10240; load4 = load(addr4, 0)
  //   addr5 = &a + 12288; load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
  // have 13bit distance from &a + 4096. The heuristic considers &a + 8192
  // as the new-base (anchor) because of the maximum distance, which can
  // presumably accommodate more intermediate bases.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288; load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
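    // Only consider candidates with the same opcode whose immediate offset
    // field is still zero, i.e. whose constant offset has not been folded yet.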
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
               << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
    std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potentially mergeable instructions into lists. One list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. We can still look for merges after such a barrier, in a
    // separate merge list.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      // operands. However we are reporting that ds_write2 shall have
      // only VGPR data so that machine copy propagation does not
      // create an illegal instruction with VGPR and AGPR sources.
      // Consequently, if we create such an instruction the verifier
      // will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with it. If an
  // instruction is found, it is stored in the Paired field. If no instructions
  // are found, then the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offset; this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent memory operations with constant offsets
// from the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
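// E.g. (illustrative): four mergeable one-dword buffer loads are first
// combined pairwise into two x2 loads; since 1 + 1 is below the maximum width
// of 4, OptimizeListAgain is set and the list is kept, so a subsequent pass
// merges the two x2 loads into a single x4 load.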
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    SmallVector<MachineInstr *, 8> InstsToMove;
    if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, InstsToMove);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, InstsToMove);
      break;
    case S_BUFFER_LOAD_IMM:
      NewMI = mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, InstsToMove);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, InstsToMove);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, InstsToMove);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, InstsToMove);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, InstsToMove);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Paired.Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}